http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index b9efd9b..efaf29b 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -1,16 +1,16 @@ // // Generated by NVIDIA NVVM Compiler // -// Compiler Build ID: CL-19856038 -// Cuda compilation tools, release 7.5, V7.5.17 +// Compiler Build ID: CL-21124049 +// Cuda compilation tools, release 8.0, V8.0.44 // Based on LLVM 3.4svn // -.version 4.3 +.version 5.0 .target sm_30 .address_size 64 - // .globl _Z6reduceI5SumOpEvPdS1_jT_d + // .globl copy_u2l_dense .func (.param .b64 func_retval0) __internal_accurate_pow ( .param .b64 __internal_accurate_pow_param_0, @@ -19,1692 +19,10 @@ ; .extern .shared .align 8 .b8 sdata[]; -.visible .func _Z6reduceI5SumOpEvPdS1_jT_d( - .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_0, - .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_1, - .param .b32 _Z6reduceI5SumOpEvPdS1_jT_d_param_2, - .param .align 1 .b8 _Z6reduceI5SumOpEvPdS1_jT_d_param_3[1], - .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_4 -) -{ - .reg .pred %p<20>; - .reg .b32 %r<33>; - .reg .f64 %fd<79>; - .reg .b64 %rd<12>; - - - ld.param.u64 %rd2, [_Z6reduceI5SumOpEvPdS1_jT_d_param_0]; - ld.param.u64 %rd3, [_Z6reduceI5SumOpEvPdS1_jT_d_param_1]; - ld.param.u32 %r5, [_Z6reduceI5SumOpEvPdS1_jT_d_param_2]; - ld.param.f64 %fd76, [_Z6reduceI5SumOpEvPdS1_jT_d_param_4]; - mov.u32 %r6, %tid.x; - mov.u32 %r7, %ctaid.x; - shl.b32 %r8, %r7, 1; - mov.u32 %r9, %ntid.x; - mad.lo.s32 %r32, %r8, %r9, %r6; - setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB0_5; - - mov.f64 %fd77, %fd76; - -BB0_2: - mov.f64 %fd1, %fd77; - mul.wide.u32 %rd4, %r32, 8; - add.s64 %rd5, %rd2, %rd4; - ld.f64 %fd29, [%rd5]; - add.f64 %fd78, %fd1, %fd29; - add.s32 %r3, %r32, %r9; - setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB0_4; - - mul.wide.u32 %rd6, %r3, 8; - add.s64 %rd7, %rd2, %rd6; - ld.f64 %fd30, [%rd7]; - add.f64 %fd78, %fd78, %fd30; - -BB0_4: - mov.f64 %fd77, %fd78; - shl.b32 %r12, %r9, 1; - mov.u32 %r13, %nctaid.x; - mad.lo.s32 %r32, %r12, %r13, %r32; - setp.lt.u32 %p3, %r32, %r5; - mov.f64 %fd76, %fd77; - @%p3 bra BB0_2; - -BB0_5: - mov.f64 %fd74, %fd76; - mul.wide.u32 %rd8, %r6, 8; - mov.u64 %rd9, sdata; - add.s64 %rd1, %rd9, %rd8; - st.shared.f64 [%rd1], %fd74; - bar.sync 0; - setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB0_9; - - setp.gt.u32 %p5, %r6, 511; - mov.f64 %fd75, %fd74; - @%p5 bra BB0_8; - - ld.shared.f64 %fd31, [%rd1+4096]; - add.f64 %fd75, %fd74, %fd31; - st.shared.f64 [%rd1], %fd75; - -BB0_8: - mov.f64 %fd74, %fd75; - bar.sync 0; - -BB0_9: - mov.f64 %fd72, %fd74; - setp.lt.u32 %p6, %r9, 512; - @%p6 bra BB0_13; - - setp.gt.u32 %p7, %r6, 255; - mov.f64 %fd73, %fd72; - @%p7 bra BB0_12; - - ld.shared.f64 %fd32, [%rd1+2048]; - add.f64 %fd73, %fd72, %fd32; - st.shared.f64 [%rd1], %fd73; - -BB0_12: - mov.f64 %fd72, %fd73; - bar.sync 0; - -BB0_13: - mov.f64 %fd70, %fd72; - setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB0_17; - - setp.gt.u32 %p9, %r6, 127; - mov.f64 %fd71, %fd70; - @%p9 bra BB0_16; - - ld.shared.f64 %fd33, [%rd1+1024]; - add.f64 %fd71, %fd70, %fd33; - st.shared.f64 [%rd1], %fd71; - -BB0_16: - mov.f64 %fd70, %fd71; - bar.sync 0; - -BB0_17: - mov.f64 %fd68, %fd70; - setp.lt.u32 %p10, %r9, 128; - @%p10 bra BB0_21; - - setp.gt.u32 %p11, %r6, 63; - mov.f64 %fd69, %fd68; - @%p11 bra BB0_20; - - ld.shared.f64 %fd34, [%rd1+512]; - add.f64 %fd69, %fd68, %fd34; - st.shared.f64 [%rd1], %fd69; - -BB0_20: - mov.f64 %fd68, %fd69; - bar.sync 0; - -BB0_21: - mov.f64 %fd67, %fd68; - setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB0_34; - - setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB0_24; - - ld.volatile.shared.f64 %fd35, [%rd1+256]; - add.f64 %fd67, %fd67, %fd35; - st.volatile.shared.f64 [%rd1], %fd67; - -BB0_24: - mov.f64 %fd66, %fd67; - setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB0_26; - - ld.volatile.shared.f64 %fd36, [%rd1+128]; - add.f64 %fd66, %fd66, %fd36; - st.volatile.shared.f64 [%rd1], %fd66; - -BB0_26: - mov.f64 %fd65, %fd66; - setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB0_28; - - ld.volatile.shared.f64 %fd37, [%rd1+64]; - add.f64 %fd65, %fd65, %fd37; - st.volatile.shared.f64 [%rd1], %fd65; - -BB0_28: - mov.f64 %fd64, %fd65; - setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB0_30; - - ld.volatile.shared.f64 %fd38, [%rd1+32]; - add.f64 %fd64, %fd64, %fd38; - st.volatile.shared.f64 [%rd1], %fd64; - -BB0_30: - mov.f64 %fd63, %fd64; - setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB0_32; - - ld.volatile.shared.f64 %fd39, [%rd1+16]; - add.f64 %fd63, %fd63, %fd39; - st.volatile.shared.f64 [%rd1], %fd63; - -BB0_32: - setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB0_34; - - ld.volatile.shared.f64 %fd40, [%rd1+8]; - add.f64 %fd41, %fd63, %fd40; - st.volatile.shared.f64 [%rd1], %fd41; - -BB0_34: - setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB0_36; - - ld.shared.f64 %fd42, [sdata]; - mul.wide.u32 %rd10, %r7, 8; - add.s64 %rd11, %rd3, %rd10; - st.f64 [%rd11], %fd42; - -BB0_36: - ret; -} - - // .globl _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d -.visible .func _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d( - .param .b64 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_0, - .param .b64 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_1, - .param .b32 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_2, - .param .b32 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_3, - .param .align 1 .b8 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1], - .param .align 1 .b8 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1], - .param .b64 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6 -) -{ - .reg .pred %p<20>; - .reg .b32 %r<29>; - .reg .f64 %fd<41>; - .reg .b64 %rd<10>; - - - ld.param.u64 %rd2, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_0]; - ld.param.u64 %rd3, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_1]; - ld.param.u32 %r7, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_2]; - ld.param.u32 %r6, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_3]; - ld.param.f64 %fd40, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6]; - mov.u32 %r1, %ctaid.x; - setp.ge.u32 %p1, %r1, %r7; - @%p1 bra BB1_34; - - mov.u32 %r28, %tid.x; - mul.lo.s32 %r3, %r1, %r6; - setp.ge.u32 %p2, %r28, %r6; - @%p2 bra BB1_3; - -BB1_2: - add.s32 %r8, %r28, %r3; - mul.wide.u32 %rd4, %r8, 8; - add.s64 %rd5, %rd2, %rd4; - ld.f64 %fd27, [%rd5]; - add.f64 %fd40, %fd40, %fd27; - mov.u32 %r9, %ntid.x; - add.s32 %r28, %r9, %r28; - setp.lt.u32 %p3, %r28, %r6; - @%p3 bra BB1_2; - -BB1_3: - mov.u32 %r10, %tid.x; - mul.wide.u32 %rd6, %r10, 8; - mov.u64 %rd7, sdata; - add.s64 %rd1, %rd7, %rd6; - st.shared.f64 [%rd1], %fd40; - bar.sync 0; - mov.u32 %r11, %ntid.x; - setp.lt.u32 %p4, %r11, 1024; - @%p4 bra BB1_7; - - setp.gt.u32 %p5, %r10, 511; - @%p5 bra BB1_6; - - ld.shared.f64 %fd28, [%rd1+4096]; - add.f64 %fd40, %fd40, %fd28; - st.shared.f64 [%rd1], %fd40; - -BB1_6: - bar.sync 0; - -BB1_7: - setp.lt.u32 %p6, %r11, 512; - @%p6 bra BB1_11; - - setp.gt.u32 %p7, %r10, 255; - @%p7 bra BB1_10; - - ld.shared.f64 %fd29, [%rd1+2048]; - add.f64 %fd40, %fd40, %fd29; - st.shared.f64 [%rd1], %fd40; - -BB1_10: - bar.sync 0; - -BB1_11: - setp.lt.u32 %p8, %r11, 256; - @%p8 bra BB1_15; - - setp.gt.u32 %p9, %r10, 127; - @%p9 bra BB1_14; - - ld.shared.f64 %fd30, [%rd1+1024]; - add.f64 %fd40, %fd40, %fd30; - st.shared.f64 [%rd1], %fd40; - -BB1_14: - bar.sync 0; - -BB1_15: - setp.lt.u32 %p10, %r11, 128; - @%p10 bra BB1_19; - - setp.gt.u32 %p11, %r10, 63; - @%p11 bra BB1_18; - - ld.shared.f64 %fd31, [%rd1+512]; - add.f64 %fd40, %fd40, %fd31; - st.shared.f64 [%rd1], %fd40; - -BB1_18: - bar.sync 0; - -BB1_19: - setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB1_32; - - setp.lt.u32 %p13, %r11, 64; - @%p13 bra BB1_22; - - ld.volatile.shared.f64 %fd32, [%rd1+256]; - add.f64 %fd40, %fd40, %fd32; - st.volatile.shared.f64 [%rd1], %fd40; - -BB1_22: - setp.lt.u32 %p14, %r11, 32; - @%p14 bra BB1_24; - - ld.volatile.shared.f64 %fd33, [%rd1+128]; - add.f64 %fd40, %fd40, %fd33; - st.volatile.shared.f64 [%rd1], %fd40; - -BB1_24: - setp.lt.u32 %p15, %r11, 16; - @%p15 bra BB1_26; - - ld.volatile.shared.f64 %fd34, [%rd1+64]; - add.f64 %fd40, %fd40, %fd34; - st.volatile.shared.f64 [%rd1], %fd40; - -BB1_26: - setp.lt.u32 %p16, %r11, 8; - @%p16 bra BB1_28; - - ld.volatile.shared.f64 %fd35, [%rd1+32]; - add.f64 %fd40, %fd40, %fd35; - st.volatile.shared.f64 [%rd1], %fd40; - -BB1_28: - setp.lt.u32 %p17, %r11, 4; - @%p17 bra BB1_30; - - ld.volatile.shared.f64 %fd36, [%rd1+16]; - add.f64 %fd40, %fd40, %fd36; - st.volatile.shared.f64 [%rd1], %fd40; - -BB1_30: - setp.lt.u32 %p18, %r11, 2; - @%p18 bra BB1_32; - - ld.volatile.shared.f64 %fd37, [%rd1+8]; - add.f64 %fd38, %fd40, %fd37; - st.volatile.shared.f64 [%rd1], %fd38; - -BB1_32: - setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB1_34; - - ld.shared.f64 %fd39, [sdata]; - mul.wide.u32 %rd8, %r1, 8; - add.s64 %rd9, %rd3, %rd8; - st.f64 [%rd9], %fd39; - -BB1_34: - ret; -} - - // .globl _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d -.visible .func _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d( - .param .b64 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_0, - .param .b64 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_1, - .param .b32 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_2, - .param .b32 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_3, - .param .align 1 .b8 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1], - .param .align 1 .b8 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1], - .param .b64 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6 -) -{ - .reg .pred %p<4>; - .reg .b32 %r<11>; - .reg .f64 %fd<7>; - .reg .b64 %rd<7>; - - - ld.param.u64 %rd1, [_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_0]; - ld.param.u64 %rd2, [_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_1]; - ld.param.u32 %r5, [_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_2]; - ld.param.u32 %r6, [_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_3]; - ld.param.f64 %fd6, [_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6]; - mov.u32 %r7, %ctaid.x; - mov.u32 %r8, %ntid.x; - mov.u32 %r9, %tid.x; - mad.lo.s32 %r1, %r8, %r7, %r9; - setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB2_5; - - mul.lo.s32 %r2, %r6, %r5; - setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB2_4; - - mov.u32 %r10, %r1; - -BB2_3: - mov.u32 %r3, %r10; - mul.wide.u32 %rd3, %r3, 8; - add.s64 %rd4, %rd1, %rd3; - ld.f64 %fd5, [%rd4]; - add.f64 %fd6, %fd6, %fd5; - add.s32 %r4, %r3, %r6; - setp.lt.u32 %p3, %r4, %r2; - mov.u32 %r10, %r4; - @%p3 bra BB2_3; - -BB2_4: - mul.wide.u32 %rd5, %r1, 8; - add.s64 %rd6, %rd2, %rd5; - st.f64 [%rd6], %fd6; - -BB2_5: - ret; -} - - // .globl _Z6reduceI5MaxOpEvPdS1_jT_d -.visible .func _Z6reduceI5MaxOpEvPdS1_jT_d( - .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_0, - .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_1, - .param .b32 _Z6reduceI5MaxOpEvPdS1_jT_d_param_2, - .param .align 1 .b8 _Z6reduceI5MaxOpEvPdS1_jT_d_param_3[1], - .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_4 -) -{ - .reg .pred %p<20>; - .reg .b32 %r<33>; - .reg .f64 %fd<79>; - .reg .b64 %rd<12>; - - - ld.param.u64 %rd2, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_0]; - ld.param.u64 %rd3, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_1]; - ld.param.u32 %r5, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_2]; - ld.param.f64 %fd76, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_4]; - mov.u32 %r6, %tid.x; - mov.u32 %r7, %ctaid.x; - shl.b32 %r8, %r7, 1; - mov.u32 %r9, %ntid.x; - mad.lo.s32 %r32, %r8, %r9, %r6; - setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB3_5; - - mov.f64 %fd77, %fd76; - -BB3_2: - mov.f64 %fd1, %fd77; - mul.wide.u32 %rd4, %r32, 8; - add.s64 %rd5, %rd2, %rd4; - ld.f64 %fd29, [%rd5]; - max.f64 %fd78, %fd1, %fd29; - add.s32 %r3, %r32, %r9; - setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB3_4; - - mul.wide.u32 %rd6, %r3, 8; - add.s64 %rd7, %rd2, %rd6; - ld.f64 %fd30, [%rd7]; - max.f64 %fd78, %fd78, %fd30; - -BB3_4: - mov.f64 %fd77, %fd78; - shl.b32 %r12, %r9, 1; - mov.u32 %r13, %nctaid.x; - mad.lo.s32 %r32, %r12, %r13, %r32; - setp.lt.u32 %p3, %r32, %r5; - mov.f64 %fd76, %fd77; - @%p3 bra BB3_2; - -BB3_5: - mov.f64 %fd74, %fd76; - mul.wide.u32 %rd8, %r6, 8; - mov.u64 %rd9, sdata; - add.s64 %rd1, %rd9, %rd8; - st.shared.f64 [%rd1], %fd74; - bar.sync 0; - setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB3_9; - - setp.gt.u32 %p5, %r6, 511; - mov.f64 %fd75, %fd74; - @%p5 bra BB3_8; - - ld.shared.f64 %fd31, [%rd1+4096]; - max.f64 %fd75, %fd74, %fd31; - st.shared.f64 [%rd1], %fd75; - -BB3_8: - mov.f64 %fd74, %fd75; - bar.sync 0; - -BB3_9: - mov.f64 %fd72, %fd74; - setp.lt.u32 %p6, %r9, 512; - @%p6 bra BB3_13; - - setp.gt.u32 %p7, %r6, 255; - mov.f64 %fd73, %fd72; - @%p7 bra BB3_12; - - ld.shared.f64 %fd32, [%rd1+2048]; - max.f64 %fd73, %fd72, %fd32; - st.shared.f64 [%rd1], %fd73; - -BB3_12: - mov.f64 %fd72, %fd73; - bar.sync 0; - -BB3_13: - mov.f64 %fd70, %fd72; - setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB3_17; - - setp.gt.u32 %p9, %r6, 127; - mov.f64 %fd71, %fd70; - @%p9 bra BB3_16; - - ld.shared.f64 %fd33, [%rd1+1024]; - max.f64 %fd71, %fd70, %fd33; - st.shared.f64 [%rd1], %fd71; - -BB3_16: - mov.f64 %fd70, %fd71; - bar.sync 0; - -BB3_17: - mov.f64 %fd68, %fd70; - setp.lt.u32 %p10, %r9, 128; - @%p10 bra BB3_21; - - setp.gt.u32 %p11, %r6, 63; - mov.f64 %fd69, %fd68; - @%p11 bra BB3_20; - - ld.shared.f64 %fd34, [%rd1+512]; - max.f64 %fd69, %fd68, %fd34; - st.shared.f64 [%rd1], %fd69; - -BB3_20: - mov.f64 %fd68, %fd69; - bar.sync 0; - -BB3_21: - mov.f64 %fd67, %fd68; - setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB3_34; - - setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB3_24; - - ld.volatile.shared.f64 %fd35, [%rd1+256]; - max.f64 %fd67, %fd67, %fd35; - st.volatile.shared.f64 [%rd1], %fd67; - -BB3_24: - mov.f64 %fd66, %fd67; - setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB3_26; - - ld.volatile.shared.f64 %fd36, [%rd1+128]; - max.f64 %fd66, %fd66, %fd36; - st.volatile.shared.f64 [%rd1], %fd66; - -BB3_26: - mov.f64 %fd65, %fd66; - setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB3_28; - - ld.volatile.shared.f64 %fd37, [%rd1+64]; - max.f64 %fd65, %fd65, %fd37; - st.volatile.shared.f64 [%rd1], %fd65; - -BB3_28: - mov.f64 %fd64, %fd65; - setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB3_30; - - ld.volatile.shared.f64 %fd38, [%rd1+32]; - max.f64 %fd64, %fd64, %fd38; - st.volatile.shared.f64 [%rd1], %fd64; - -BB3_30: - mov.f64 %fd63, %fd64; - setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB3_32; - - ld.volatile.shared.f64 %fd39, [%rd1+16]; - max.f64 %fd63, %fd63, %fd39; - st.volatile.shared.f64 [%rd1], %fd63; - -BB3_32: - setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB3_34; - - ld.volatile.shared.f64 %fd40, [%rd1+8]; - max.f64 %fd41, %fd63, %fd40; - st.volatile.shared.f64 [%rd1], %fd41; - -BB3_34: - setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB3_36; - - ld.shared.f64 %fd42, [sdata]; - mul.wide.u32 %rd10, %r7, 8; - add.s64 %rd11, %rd3, %rd10; - st.f64 [%rd11], %fd42; - -BB3_36: - ret; -} - - // .globl _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d -.visible .func _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d( - .param .b64 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_0, - .param .b64 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_1, - .param .b32 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_2, - .param .b32 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_3, - .param .align 1 .b8 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1], - .param .align 1 .b8 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1], - .param .b64 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6 -) -{ - .reg .pred %p<20>; - .reg .b32 %r<29>; - .reg .f64 %fd<41>; - .reg .b64 %rd<10>; - - - ld.param.u64 %rd2, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_0]; - ld.param.u64 %rd3, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_1]; - ld.param.u32 %r7, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_2]; - ld.param.u32 %r6, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_3]; - ld.param.f64 %fd40, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6]; - mov.u32 %r1, %ctaid.x; - setp.ge.u32 %p1, %r1, %r7; - @%p1 bra BB4_34; - - mov.u32 %r28, %tid.x; - mul.lo.s32 %r3, %r1, %r6; - setp.ge.u32 %p2, %r28, %r6; - @%p2 bra BB4_3; - -BB4_2: - add.s32 %r8, %r28, %r3; - mul.wide.u32 %rd4, %r8, 8; - add.s64 %rd5, %rd2, %rd4; - ld.f64 %fd27, [%rd5]; - max.f64 %fd40, %fd40, %fd27; - mov.u32 %r9, %ntid.x; - add.s32 %r28, %r9, %r28; - setp.lt.u32 %p3, %r28, %r6; - @%p3 bra BB4_2; - -BB4_3: - mov.u32 %r10, %tid.x; - mul.wide.u32 %rd6, %r10, 8; - mov.u64 %rd7, sdata; - add.s64 %rd1, %rd7, %rd6; - st.shared.f64 [%rd1], %fd40; - bar.sync 0; - mov.u32 %r11, %ntid.x; - setp.lt.u32 %p4, %r11, 1024; - @%p4 bra BB4_7; - - setp.gt.u32 %p5, %r10, 511; - @%p5 bra BB4_6; - - ld.shared.f64 %fd28, [%rd1+4096]; - max.f64 %fd40, %fd40, %fd28; - st.shared.f64 [%rd1], %fd40; - -BB4_6: - bar.sync 0; - -BB4_7: - setp.lt.u32 %p6, %r11, 512; - @%p6 bra BB4_11; - - setp.gt.u32 %p7, %r10, 255; - @%p7 bra BB4_10; - - ld.shared.f64 %fd29, [%rd1+2048]; - max.f64 %fd40, %fd40, %fd29; - st.shared.f64 [%rd1], %fd40; - -BB4_10: - bar.sync 0; - -BB4_11: - setp.lt.u32 %p8, %r11, 256; - @%p8 bra BB4_15; - - setp.gt.u32 %p9, %r10, 127; - @%p9 bra BB4_14; - - ld.shared.f64 %fd30, [%rd1+1024]; - max.f64 %fd40, %fd40, %fd30; - st.shared.f64 [%rd1], %fd40; - -BB4_14: - bar.sync 0; - -BB4_15: - setp.lt.u32 %p10, %r11, 128; - @%p10 bra BB4_19; - - setp.gt.u32 %p11, %r10, 63; - @%p11 bra BB4_18; - - ld.shared.f64 %fd31, [%rd1+512]; - max.f64 %fd40, %fd40, %fd31; - st.shared.f64 [%rd1], %fd40; - -BB4_18: - bar.sync 0; - -BB4_19: - setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB4_32; - - setp.lt.u32 %p13, %r11, 64; - @%p13 bra BB4_22; - - ld.volatile.shared.f64 %fd32, [%rd1+256]; - max.f64 %fd40, %fd40, %fd32; - st.volatile.shared.f64 [%rd1], %fd40; - -BB4_22: - setp.lt.u32 %p14, %r11, 32; - @%p14 bra BB4_24; - - ld.volatile.shared.f64 %fd33, [%rd1+128]; - max.f64 %fd40, %fd40, %fd33; - st.volatile.shared.f64 [%rd1], %fd40; - -BB4_24: - setp.lt.u32 %p15, %r11, 16; - @%p15 bra BB4_26; - - ld.volatile.shared.f64 %fd34, [%rd1+64]; - max.f64 %fd40, %fd40, %fd34; - st.volatile.shared.f64 [%rd1], %fd40; - -BB4_26: - setp.lt.u32 %p16, %r11, 8; - @%p16 bra BB4_28; - - ld.volatile.shared.f64 %fd35, [%rd1+32]; - max.f64 %fd40, %fd40, %fd35; - st.volatile.shared.f64 [%rd1], %fd40; - -BB4_28: - setp.lt.u32 %p17, %r11, 4; - @%p17 bra BB4_30; - - ld.volatile.shared.f64 %fd36, [%rd1+16]; - max.f64 %fd40, %fd40, %fd36; - st.volatile.shared.f64 [%rd1], %fd40; - -BB4_30: - setp.lt.u32 %p18, %r11, 2; - @%p18 bra BB4_32; - - ld.volatile.shared.f64 %fd37, [%rd1+8]; - max.f64 %fd38, %fd40, %fd37; - st.volatile.shared.f64 [%rd1], %fd38; - -BB4_32: - setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB4_34; - - ld.shared.f64 %fd39, [sdata]; - mul.wide.u32 %rd8, %r1, 8; - add.s64 %rd9, %rd3, %rd8; - st.f64 [%rd9], %fd39; - -BB4_34: - ret; -} - - // .globl _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d -.visible .func _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d( - .param .b64 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_0, - .param .b64 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_1, - .param .b32 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_2, - .param .b32 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_3, - .param .align 1 .b8 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1], - .param .align 1 .b8 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1], - .param .b64 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6 -) -{ - .reg .pred %p<4>; - .reg .b32 %r<11>; - .reg .f64 %fd<7>; - .reg .b64 %rd<7>; - - - ld.param.u64 %rd1, [_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_0]; - ld.param.u64 %rd2, [_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_1]; - ld.param.u32 %r5, [_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_2]; - ld.param.u32 %r6, [_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_3]; - ld.param.f64 %fd6, [_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6]; - mov.u32 %r7, %ctaid.x; - mov.u32 %r8, %ntid.x; - mov.u32 %r9, %tid.x; - mad.lo.s32 %r1, %r8, %r7, %r9; - setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB5_5; - - mul.lo.s32 %r2, %r6, %r5; - setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB5_4; - - mov.u32 %r10, %r1; - -BB5_3: - mov.u32 %r3, %r10; - mul.wide.u32 %rd3, %r3, 8; - add.s64 %rd4, %rd1, %rd3; - ld.f64 %fd5, [%rd4]; - max.f64 %fd6, %fd6, %fd5; - add.s32 %r4, %r3, %r6; - setp.lt.u32 %p3, %r4, %r2; - mov.u32 %r10, %r4; - @%p3 bra BB5_3; - -BB5_4: - mul.wide.u32 %rd5, %r1, 8; - add.s64 %rd6, %rd2, %rd5; - st.f64 [%rd6], %fd6; - -BB5_5: - ret; -} - - // .globl _Z6reduceI5MinOpEvPdS1_jT_d -.visible .func _Z6reduceI5MinOpEvPdS1_jT_d( - .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_0, - .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_1, - .param .b32 _Z6reduceI5MinOpEvPdS1_jT_d_param_2, - .param .align 1 .b8 _Z6reduceI5MinOpEvPdS1_jT_d_param_3[1], - .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_4 -) -{ - .reg .pred %p<20>; - .reg .b32 %r<33>; - .reg .f64 %fd<79>; - .reg .b64 %rd<12>; - - - ld.param.u64 %rd2, [_Z6reduceI5MinOpEvPdS1_jT_d_param_0]; - ld.param.u64 %rd3, [_Z6reduceI5MinOpEvPdS1_jT_d_param_1]; - ld.param.u32 %r5, [_Z6reduceI5MinOpEvPdS1_jT_d_param_2]; - ld.param.f64 %fd76, [_Z6reduceI5MinOpEvPdS1_jT_d_param_4]; - mov.u32 %r6, %tid.x; - mov.u32 %r7, %ctaid.x; - shl.b32 %r8, %r7, 1; - mov.u32 %r9, %ntid.x; - mad.lo.s32 %r32, %r8, %r9, %r6; - setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB6_5; - - mov.f64 %fd77, %fd76; - -BB6_2: - mov.f64 %fd1, %fd77; - mul.wide.u32 %rd4, %r32, 8; - add.s64 %rd5, %rd2, %rd4; - ld.f64 %fd29, [%rd5]; - min.f64 %fd78, %fd1, %fd29; - add.s32 %r3, %r32, %r9; - setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB6_4; - - mul.wide.u32 %rd6, %r3, 8; - add.s64 %rd7, %rd2, %rd6; - ld.f64 %fd30, [%rd7]; - min.f64 %fd78, %fd78, %fd30; - -BB6_4: - mov.f64 %fd77, %fd78; - shl.b32 %r12, %r9, 1; - mov.u32 %r13, %nctaid.x; - mad.lo.s32 %r32, %r12, %r13, %r32; - setp.lt.u32 %p3, %r32, %r5; - mov.f64 %fd76, %fd77; - @%p3 bra BB6_2; - -BB6_5: - mov.f64 %fd74, %fd76; - mul.wide.u32 %rd8, %r6, 8; - mov.u64 %rd9, sdata; - add.s64 %rd1, %rd9, %rd8; - st.shared.f64 [%rd1], %fd74; - bar.sync 0; - setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB6_9; - - setp.gt.u32 %p5, %r6, 511; - mov.f64 %fd75, %fd74; - @%p5 bra BB6_8; - - ld.shared.f64 %fd31, [%rd1+4096]; - min.f64 %fd75, %fd74, %fd31; - st.shared.f64 [%rd1], %fd75; - -BB6_8: - mov.f64 %fd74, %fd75; - bar.sync 0; - -BB6_9: - mov.f64 %fd72, %fd74; - setp.lt.u32 %p6, %r9, 512; - @%p6 bra BB6_13; - - setp.gt.u32 %p7, %r6, 255; - mov.f64 %fd73, %fd72; - @%p7 bra BB6_12; - - ld.shared.f64 %fd32, [%rd1+2048]; - min.f64 %fd73, %fd72, %fd32; - st.shared.f64 [%rd1], %fd73; - -BB6_12: - mov.f64 %fd72, %fd73; - bar.sync 0; - -BB6_13: - mov.f64 %fd70, %fd72; - setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB6_17; - - setp.gt.u32 %p9, %r6, 127; - mov.f64 %fd71, %fd70; - @%p9 bra BB6_16; - - ld.shared.f64 %fd33, [%rd1+1024]; - min.f64 %fd71, %fd70, %fd33; - st.shared.f64 [%rd1], %fd71; - -BB6_16: - mov.f64 %fd70, %fd71; - bar.sync 0; - -BB6_17: - mov.f64 %fd68, %fd70; - setp.lt.u32 %p10, %r9, 128; - @%p10 bra BB6_21; - - setp.gt.u32 %p11, %r6, 63; - mov.f64 %fd69, %fd68; - @%p11 bra BB6_20; - - ld.shared.f64 %fd34, [%rd1+512]; - min.f64 %fd69, %fd68, %fd34; - st.shared.f64 [%rd1], %fd69; - -BB6_20: - mov.f64 %fd68, %fd69; - bar.sync 0; - -BB6_21: - mov.f64 %fd67, %fd68; - setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB6_34; - - setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB6_24; - - ld.volatile.shared.f64 %fd35, [%rd1+256]; - min.f64 %fd67, %fd67, %fd35; - st.volatile.shared.f64 [%rd1], %fd67; - -BB6_24: - mov.f64 %fd66, %fd67; - setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB6_26; - - ld.volatile.shared.f64 %fd36, [%rd1+128]; - min.f64 %fd66, %fd66, %fd36; - st.volatile.shared.f64 [%rd1], %fd66; - -BB6_26: - mov.f64 %fd65, %fd66; - setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB6_28; - - ld.volatile.shared.f64 %fd37, [%rd1+64]; - min.f64 %fd65, %fd65, %fd37; - st.volatile.shared.f64 [%rd1], %fd65; - -BB6_28: - mov.f64 %fd64, %fd65; - setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB6_30; - - ld.volatile.shared.f64 %fd38, [%rd1+32]; - min.f64 %fd64, %fd64, %fd38; - st.volatile.shared.f64 [%rd1], %fd64; - -BB6_30: - mov.f64 %fd63, %fd64; - setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB6_32; - - ld.volatile.shared.f64 %fd39, [%rd1+16]; - min.f64 %fd63, %fd63, %fd39; - st.volatile.shared.f64 [%rd1], %fd63; - -BB6_32: - setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB6_34; - - ld.volatile.shared.f64 %fd40, [%rd1+8]; - min.f64 %fd41, %fd63, %fd40; - st.volatile.shared.f64 [%rd1], %fd41; - -BB6_34: - setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB6_36; - - ld.shared.f64 %fd42, [sdata]; - mul.wide.u32 %rd10, %r7, 8; - add.s64 %rd11, %rd3, %rd10; - st.f64 [%rd11], %fd42; - -BB6_36: - ret; -} - - // .globl _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d -.visible .func _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d( - .param .b64 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_0, - .param .b64 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_1, - .param .b32 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_2, - .param .b32 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_3, - .param .align 1 .b8 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1], - .param .align 1 .b8 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1], - .param .b64 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6 -) -{ - .reg .pred %p<20>; - .reg .b32 %r<29>; - .reg .f64 %fd<41>; - .reg .b64 %rd<10>; - - - ld.param.u64 %rd2, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_0]; - ld.param.u64 %rd3, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_1]; - ld.param.u32 %r7, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_2]; - ld.param.u32 %r6, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_3]; - ld.param.f64 %fd40, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6]; - mov.u32 %r1, %ctaid.x; - setp.ge.u32 %p1, %r1, %r7; - @%p1 bra BB7_34; - - mov.u32 %r28, %tid.x; - mul.lo.s32 %r3, %r1, %r6; - setp.ge.u32 %p2, %r28, %r6; - @%p2 bra BB7_3; - -BB7_2: - add.s32 %r8, %r28, %r3; - mul.wide.u32 %rd4, %r8, 8; - add.s64 %rd5, %rd2, %rd4; - ld.f64 %fd27, [%rd5]; - min.f64 %fd40, %fd40, %fd27; - mov.u32 %r9, %ntid.x; - add.s32 %r28, %r9, %r28; - setp.lt.u32 %p3, %r28, %r6; - @%p3 bra BB7_2; - -BB7_3: - mov.u32 %r10, %tid.x; - mul.wide.u32 %rd6, %r10, 8; - mov.u64 %rd7, sdata; - add.s64 %rd1, %rd7, %rd6; - st.shared.f64 [%rd1], %fd40; - bar.sync 0; - mov.u32 %r11, %ntid.x; - setp.lt.u32 %p4, %r11, 1024; - @%p4 bra BB7_7; - - setp.gt.u32 %p5, %r10, 511; - @%p5 bra BB7_6; - - ld.shared.f64 %fd28, [%rd1+4096]; - min.f64 %fd40, %fd40, %fd28; - st.shared.f64 [%rd1], %fd40; - -BB7_6: - bar.sync 0; - -BB7_7: - setp.lt.u32 %p6, %r11, 512; - @%p6 bra BB7_11; - - setp.gt.u32 %p7, %r10, 255; - @%p7 bra BB7_10; - - ld.shared.f64 %fd29, [%rd1+2048]; - min.f64 %fd40, %fd40, %fd29; - st.shared.f64 [%rd1], %fd40; - -BB7_10: - bar.sync 0; - -BB7_11: - setp.lt.u32 %p8, %r11, 256; - @%p8 bra BB7_15; - - setp.gt.u32 %p9, %r10, 127; - @%p9 bra BB7_14; - - ld.shared.f64 %fd30, [%rd1+1024]; - min.f64 %fd40, %fd40, %fd30; - st.shared.f64 [%rd1], %fd40; - -BB7_14: - bar.sync 0; - -BB7_15: - setp.lt.u32 %p10, %r11, 128; - @%p10 bra BB7_19; - - setp.gt.u32 %p11, %r10, 63; - @%p11 bra BB7_18; - - ld.shared.f64 %fd31, [%rd1+512]; - min.f64 %fd40, %fd40, %fd31; - st.shared.f64 [%rd1], %fd40; - -BB7_18: - bar.sync 0; - -BB7_19: - setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB7_32; - - setp.lt.u32 %p13, %r11, 64; - @%p13 bra BB7_22; - - ld.volatile.shared.f64 %fd32, [%rd1+256]; - min.f64 %fd40, %fd40, %fd32; - st.volatile.shared.f64 [%rd1], %fd40; - -BB7_22: - setp.lt.u32 %p14, %r11, 32; - @%p14 bra BB7_24; - - ld.volatile.shared.f64 %fd33, [%rd1+128]; - min.f64 %fd40, %fd40, %fd33; - st.volatile.shared.f64 [%rd1], %fd40; - -BB7_24: - setp.lt.u32 %p15, %r11, 16; - @%p15 bra BB7_26; - - ld.volatile.shared.f64 %fd34, [%rd1+64]; - min.f64 %fd40, %fd40, %fd34; - st.volatile.shared.f64 [%rd1], %fd40; - -BB7_26: - setp.lt.u32 %p16, %r11, 8; - @%p16 bra BB7_28; - - ld.volatile.shared.f64 %fd35, [%rd1+32]; - min.f64 %fd40, %fd40, %fd35; - st.volatile.shared.f64 [%rd1], %fd40; - -BB7_28: - setp.lt.u32 %p17, %r11, 4; - @%p17 bra BB7_30; - - ld.volatile.shared.f64 %fd36, [%rd1+16]; - min.f64 %fd40, %fd40, %fd36; - st.volatile.shared.f64 [%rd1], %fd40; - -BB7_30: - setp.lt.u32 %p18, %r11, 2; - @%p18 bra BB7_32; - - ld.volatile.shared.f64 %fd37, [%rd1+8]; - min.f64 %fd38, %fd40, %fd37; - st.volatile.shared.f64 [%rd1], %fd38; - -BB7_32: - setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB7_34; - - ld.shared.f64 %fd39, [sdata]; - mul.wide.u32 %rd8, %r1, 8; - add.s64 %rd9, %rd3, %rd8; - st.f64 [%rd9], %fd39; - -BB7_34: - ret; -} - - // .globl _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d -.visible .func _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d( - .param .b64 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_0, - .param .b64 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_1, - .param .b32 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_2, - .param .b32 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_3, - .param .align 1 .b8 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1], - .param .align 1 .b8 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1], - .param .b64 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6 -) -{ - .reg .pred %p<4>; - .reg .b32 %r<11>; - .reg .f64 %fd<7>; - .reg .b64 %rd<7>; - - - ld.param.u64 %rd1, [_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_0]; - ld.param.u64 %rd2, [_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_1]; - ld.param.u32 %r5, [_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_2]; - ld.param.u32 %r6, [_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_3]; - ld.param.f64 %fd6, [_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6]; - mov.u32 %r7, %ctaid.x; - mov.u32 %r8, %ntid.x; - mov.u32 %r9, %tid.x; - mad.lo.s32 %r1, %r8, %r7, %r9; - setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB8_5; - - mul.lo.s32 %r2, %r6, %r5; - setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB8_4; - - mov.u32 %r10, %r1; - -BB8_3: - mov.u32 %r3, %r10; - mul.wide.u32 %rd3, %r3, 8; - add.s64 %rd4, %rd1, %rd3; - ld.f64 %fd5, [%rd4]; - min.f64 %fd6, %fd6, %fd5; - add.s32 %r4, %r3, %r6; - setp.lt.u32 %p3, %r4, %r2; - mov.u32 %r10, %r4; - @%p3 bra BB8_3; - -BB8_4: - mul.wide.u32 %rd5, %r1, 8; - add.s64 %rd6, %rd2, %rd5; - st.f64 [%rd6], %fd6; - -BB8_5: - ret; -} - - // .globl _Z6reduceI9ProductOpEvPdS1_jT_d -.visible .func _Z6reduceI9ProductOpEvPdS1_jT_d( - .param .b64 _Z6reduceI9ProductOpEvPdS1_jT_d_param_0, - .param .b64 _Z6reduceI9ProductOpEvPdS1_jT_d_param_1, - .param .b32 _Z6reduceI9ProductOpEvPdS1_jT_d_param_2, - .param .align 1 .b8 _Z6reduceI9ProductOpEvPdS1_jT_d_param_3[1], - .param .b64 _Z6reduceI9ProductOpEvPdS1_jT_d_param_4 -) -{ - .reg .pred %p<20>; - .reg .b32 %r<33>; - .reg .f64 %fd<79>; - .reg .b64 %rd<12>; - - - ld.param.u64 %rd2, [_Z6reduceI9ProductOpEvPdS1_jT_d_param_0]; - ld.param.u64 %rd3, [_Z6reduceI9ProductOpEvPdS1_jT_d_param_1]; - ld.param.u32 %r5, [_Z6reduceI9ProductOpEvPdS1_jT_d_param_2]; - ld.param.f64 %fd76, [_Z6reduceI9ProductOpEvPdS1_jT_d_param_4]; - mov.u32 %r6, %tid.x; - mov.u32 %r7, %ctaid.x; - shl.b32 %r8, %r7, 1; - mov.u32 %r9, %ntid.x; - mad.lo.s32 %r32, %r8, %r9, %r6; - setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB9_5; - - mov.f64 %fd77, %fd76; - -BB9_2: - mov.f64 %fd1, %fd77; - mul.wide.u32 %rd4, %r32, 8; - add.s64 %rd5, %rd2, %rd4; - ld.f64 %fd29, [%rd5]; - mul.f64 %fd78, %fd1, %fd29; - add.s32 %r3, %r32, %r9; - setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB9_4; - - mul.wide.u32 %rd6, %r3, 8; - add.s64 %rd7, %rd2, %rd6; - ld.f64 %fd30, [%rd7]; - mul.f64 %fd78, %fd78, %fd30; - -BB9_4: - mov.f64 %fd77, %fd78; - shl.b32 %r12, %r9, 1; - mov.u32 %r13, %nctaid.x; - mad.lo.s32 %r32, %r12, %r13, %r32; - setp.lt.u32 %p3, %r32, %r5; - mov.f64 %fd76, %fd77; - @%p3 bra BB9_2; - -BB9_5: - mov.f64 %fd74, %fd76; - mul.wide.u32 %rd8, %r6, 8; - mov.u64 %rd9, sdata; - add.s64 %rd1, %rd9, %rd8; - st.shared.f64 [%rd1], %fd74; - bar.sync 0; - setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB9_9; - - setp.gt.u32 %p5, %r6, 511; - mov.f64 %fd75, %fd74; - @%p5 bra BB9_8; - - ld.shared.f64 %fd31, [%rd1+4096]; - mul.f64 %fd75, %fd74, %fd31; - st.shared.f64 [%rd1], %fd75; - -BB9_8: - mov.f64 %fd74, %fd75; - bar.sync 0; - -BB9_9: - mov.f64 %fd72, %fd74; - setp.lt.u32 %p6, %r9, 512; - @%p6 bra BB9_13; - - setp.gt.u32 %p7, %r6, 255; - mov.f64 %fd73, %fd72; - @%p7 bra BB9_12; - - ld.shared.f64 %fd32, [%rd1+2048]; - mul.f64 %fd73, %fd72, %fd32; - st.shared.f64 [%rd1], %fd73; - -BB9_12: - mov.f64 %fd72, %fd73; - bar.sync 0; - -BB9_13: - mov.f64 %fd70, %fd72; - setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB9_17; - - setp.gt.u32 %p9, %r6, 127; - mov.f64 %fd71, %fd70; - @%p9 bra BB9_16; - - ld.shared.f64 %fd33, [%rd1+1024]; - mul.f64 %fd71, %fd70, %fd33; - st.shared.f64 [%rd1], %fd71; - -BB9_16: - mov.f64 %fd70, %fd71; - bar.sync 0; - -BB9_17: - mov.f64 %fd68, %fd70; - setp.lt.u32 %p10, %r9, 128; - @%p10 bra BB9_21; - - setp.gt.u32 %p11, %r6, 63; - mov.f64 %fd69, %fd68; - @%p11 bra BB9_20; - - ld.shared.f64 %fd34, [%rd1+512]; - mul.f64 %fd69, %fd68, %fd34; - st.shared.f64 [%rd1], %fd69; - -BB9_20: - mov.f64 %fd68, %fd69; - bar.sync 0; - -BB9_21: - mov.f64 %fd67, %fd68; - setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB9_34; - - setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB9_24; - - ld.volatile.shared.f64 %fd35, [%rd1+256]; - mul.f64 %fd67, %fd67, %fd35; - st.volatile.shared.f64 [%rd1], %fd67; - -BB9_24: - mov.f64 %fd66, %fd67; - setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB9_26; - - ld.volatile.shared.f64 %fd36, [%rd1+128]; - mul.f64 %fd66, %fd66, %fd36; - st.volatile.shared.f64 [%rd1], %fd66; - -BB9_26: - mov.f64 %fd65, %fd66; - setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB9_28; - - ld.volatile.shared.f64 %fd37, [%rd1+64]; - mul.f64 %fd65, %fd65, %fd37; - st.volatile.shared.f64 [%rd1], %fd65; - -BB9_28: - mov.f64 %fd64, %fd65; - setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB9_30; - - ld.volatile.shared.f64 %fd38, [%rd1+32]; - mul.f64 %fd64, %fd64, %fd38; - st.volatile.shared.f64 [%rd1], %fd64; - -BB9_30: - mov.f64 %fd63, %fd64; - setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB9_32; - - ld.volatile.shared.f64 %fd39, [%rd1+16]; - mul.f64 %fd63, %fd63, %fd39; - st.volatile.shared.f64 [%rd1], %fd63; - -BB9_32: - setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB9_34; - - ld.volatile.shared.f64 %fd40, [%rd1+8]; - mul.f64 %fd41, %fd63, %fd40; - st.volatile.shared.f64 [%rd1], %fd41; - -BB9_34: - setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB9_36; - - ld.shared.f64 %fd42, [sdata]; - mul.wide.u32 %rd10, %r7, 8; - add.s64 %rd11, %rd3, %rd10; - st.f64 [%rd11], %fd42; - -BB9_36: - ret; -} - - // .globl _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d -.visible .func _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d( - .param .b64 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_0, - .param .b64 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_1, - .param .b32 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_2, - .param .b32 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_3, - .param .align 1 .b8 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_4[1], - .param .align 8 .b8 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_5[8], - .param .b64 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6 -) -{ - .reg .pred %p<20>; - .reg .b32 %r<30>; - .reg .f64 %fd<43>; - .reg .b64 %rd<11>; - - - ld.param.u64 %rd2, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_0]; - ld.param.u64 %rd3, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_1]; - ld.param.u32 %r6, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_2]; - ld.param.u32 %r5, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_3]; - ld.param.u64 %rd4, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_5]; - ld.param.f64 %fd42, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6]; - mov.u32 %r7, %ctaid.x; - setp.ge.u32 %p1, %r7, %r6; - @%p1 bra BB10_34; - - mov.u32 %r29, %tid.x; - mul.lo.s32 %r2, %r7, %r5; - setp.ge.u32 %p2, %r29, %r5; - @%p2 bra BB10_3; - -BB10_2: - add.s32 %r9, %r29, %r2; - mul.wide.u32 %rd5, %r9, 8; - add.s64 %rd6, %rd2, %rd5; - ld.f64 %fd27, [%rd6]; - add.f64 %fd42, %fd42, %fd27; - mov.u32 %r10, %ntid.x; - add.s32 %r29, %r10, %r29; - setp.lt.u32 %p3, %r29, %r5; - @%p3 bra BB10_2; - -BB10_3: - mov.u32 %r11, %tid.x; - mul.wide.u32 %rd7, %r11, 8; - mov.u64 %rd8, sdata; - add.s64 %rd1, %rd8, %rd7; - st.shared.f64 [%rd1], %fd42; - bar.sync 0; - mov.u32 %r12, %ntid.x; - setp.lt.u32 %p4, %r12, 1024; - @%p4 bra BB10_7; - - setp.gt.u32 %p5, %r11, 511; - @%p5 bra BB10_6; - - ld.shared.f64 %fd28, [%rd1+4096]; - add.f64 %fd42, %fd42, %fd28; - st.shared.f64 [%rd1], %fd42; - -BB10_6: - bar.sync 0; - -BB10_7: - setp.lt.u32 %p6, %r12, 512; - @%p6 bra BB10_11; - - setp.gt.u32 %p7, %r11, 255; - @%p7 bra BB10_10; - - ld.shared.f64 %fd29, [%rd1+2048]; - add.f64 %fd42, %fd42, %fd29; - st.shared.f64 [%rd1], %fd42; - -BB10_10: - bar.sync 0; - -BB10_11: - setp.lt.u32 %p8, %r12, 256; - @%p8 bra BB10_15; - - setp.gt.u32 %p9, %r11, 127; - @%p9 bra BB10_14; - - ld.shared.f64 %fd30, [%rd1+1024]; - add.f64 %fd42, %fd42, %fd30; - st.shared.f64 [%rd1], %fd42; - -BB10_14: - bar.sync 0; - -BB10_15: - setp.lt.u32 %p10, %r12, 128; - @%p10 bra BB10_19; - - setp.gt.u32 %p11, %r11, 63; - @%p11 bra BB10_18; - - ld.shared.f64 %fd31, [%rd1+512]; - add.f64 %fd42, %fd42, %fd31; - st.shared.f64 [%rd1], %fd42; - -BB10_18: - bar.sync 0; - -BB10_19: - setp.gt.u32 %p12, %r11, 31; - @%p12 bra BB10_32; - - setp.lt.u32 %p13, %r12, 64; - @%p13 bra BB10_22; - - ld.volatile.shared.f64 %fd32, [%rd1+256]; - add.f64 %fd42, %fd42, %fd32; - st.volatile.shared.f64 [%rd1], %fd42; - -BB10_22: - setp.lt.u32 %p14, %r12, 32; - @%p14 bra BB10_24; - - ld.volatile.shared.f64 %fd33, [%rd1+128]; - add.f64 %fd42, %fd42, %fd33; - st.volatile.shared.f64 [%rd1], %fd42; - -BB10_24: - setp.lt.u32 %p15, %r12, 16; - @%p15 bra BB10_26; - - ld.volatile.shared.f64 %fd34, [%rd1+64]; - add.f64 %fd42, %fd42, %fd34; - st.volatile.shared.f64 [%rd1], %fd42; - -BB10_26: - setp.lt.u32 %p16, %r12, 8; - @%p16 bra BB10_28; - - ld.volatile.shared.f64 %fd35, [%rd1+32]; - add.f64 %fd42, %fd42, %fd35; - st.volatile.shared.f64 [%rd1], %fd42; - -BB10_28: - setp.lt.u32 %p17, %r12, 4; - @%p17 bra BB10_30; - - ld.volatile.shared.f64 %fd36, [%rd1+16]; - add.f64 %fd42, %fd42, %fd36; - st.volatile.shared.f64 [%rd1], %fd42; - -BB10_30: - setp.lt.u32 %p18, %r12, 2; - @%p18 bra BB10_32; - - ld.volatile.shared.f64 %fd37, [%rd1+8]; - add.f64 %fd38, %fd42, %fd37; - st.volatile.shared.f64 [%rd1], %fd38; - -BB10_32: - setp.ne.s32 %p19, %r11, 0; - @%p19 bra BB10_34; - - ld.shared.f64 %fd39, [sdata]; - cvt.rn.f64.s64 %fd40, %rd4; - div.rn.f64 %fd41, %fd39, %fd40; - mul.wide.u32 %rd9, %r7, 8; - add.s64 %rd10, %rd3, %rd9; - st.f64 [%rd10], %fd41; - -BB10_34: - ret; -} - - // .globl _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d -.visible .func _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d( - .param .b64 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_0, - .param .b64 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_1, - .param .b32 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_2, - .param .b32 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_3, - .param .align 1 .b8 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_4[1], - .param .align 8 .b8 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_5[8], - .param .b64 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6 -) -{ - .reg .pred %p<4>; - .reg .b32 %r<11>; - .reg .f64 %fd<9>; - .reg .b64 %rd<8>; - - - ld.param.u64 %rd1, [_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_0]; - ld.param.u64 %rd2, [_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_1]; - ld.param.u32 %r5, [_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_2]; - ld.param.u32 %r6, [_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_3]; - ld.param.u64 %rd3, [_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_5]; - ld.param.f64 %fd8, [_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6]; - mov.u32 %r7, %ntid.x; - mov.u32 %r8, %ctaid.x; - mov.u32 %r9, %tid.x; - mad.lo.s32 %r1, %r7, %r8, %r9; - setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB11_5; - - mul.lo.s32 %r2, %r6, %r5; - setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB11_4; - - mov.u32 %r10, %r1; - -BB11_3: - mov.u32 %r3, %r10; - mul.wide.u32 %rd4, %r3, 8; - add.s64 %rd5, %rd1, %rd4; - ld.f64 %fd5, [%rd5]; - add.f64 %fd8, %fd8, %fd5; - add.s32 %r4, %r3, %r6; - setp.lt.u32 %p3, %r4, %r2; - mov.u32 %r10, %r4; - @%p3 bra BB11_3; - -BB11_4: - cvt.rn.f64.s64 %fd6, %rd3; - div.rn.f64 %fd7, %fd8, %fd6; - mul.wide.u32 %rd6, %r1, 8; - add.s64 %rd7, %rd2, %rd6; - st.f64 [%rd7], %fd7; - -BB11_5: - ret; -} - - // .globl copyUpperToLowerTriangleDense -.visible .entry copyUpperToLowerTriangleDense( - .param .u64 copyUpperToLowerTriangleDense_param_0, - .param .u32 copyUpperToLowerTriangleDense_param_1, - .param .u32 copyUpperToLowerTriangleDense_param_2 +.visible .entry copy_u2l_dense( + .param .u64 copy_u2l_dense_param_0, + .param .u32 copy_u2l_dense_param_1, + .param .u32 copy_u2l_dense_param_2 ) { .reg .pred %p<4>; @@ -1713,9 +31,9 @@ BB11_5: .reg .b64 %rd<7>; - ld.param.u64 %rd1, [copyUpperToLowerTriangleDense_param_0]; - ld.param.u32 %r4, [copyUpperToLowerTriangleDense_param_1]; - ld.param.u32 %r5, [copyUpperToLowerTriangleDense_param_2]; + ld.param.u64 %rd1, [copy_u2l_dense_param_0]; + ld.param.u32 %r4, [copy_u2l_dense_param_1]; + ld.param.u32 %r5, [copy_u2l_dense_param_2]; mov.u32 %r6, %ntid.x; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %tid.x; @@ -1728,10 +46,10 @@ BB11_5: setp.gt.s32 %p1, %r2, %r1; setp.lt.s32 %p2, %r3, %r5; and.pred %p3, %p1, %p2; - @!%p3 bra BB12_2; - bra.uni BB12_1; + @!%p3 bra BB0_2; + bra.uni BB0_1; -BB12_1: +BB0_1: cvta.to.global.u64 %rd2, %rd1; mad.lo.s32 %r12, %r1, %r4, %r2; mul.wide.s32 %rd3, %r12, 8; @@ -1741,93 +59,7 @@ BB12_1: add.s64 %rd6, %rd2, %rd5; st.global.f64 [%rd6], %fd1; -BB12_2: - ret; -} - - // .globl dense_matrix_set -.visible .entry dense_matrix_set( - .param .u64 dense_matrix_set_param_0, - .param .f64 dense_matrix_set_param_1, - .param .u32 dense_matrix_set_param_2, - .param .u32 dense_matrix_set_param_3 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<13>; - .reg .f64 %fd<2>; - .reg .b64 %rd<5>; - - - ld.param.u64 %rd1, [dense_matrix_set_param_0]; - ld.param.f64 %fd1, [dense_matrix_set_param_1]; - ld.param.u32 %r2, [dense_matrix_set_param_2]; - ld.param.u32 %r3, [dense_matrix_set_param_3]; - mov.u32 %r4, %ctaid.x; - mov.u32 %r5, %ntid.x; - mov.u32 %r6, %tid.x; - mad.lo.s32 %r7, %r5, %r4, %r6; - mov.u32 %r8, %ntid.y; - mov.u32 %r9, %ctaid.y; - mov.u32 %r10, %tid.y; - mad.lo.s32 %r11, %r7, %r3, %r10; - mad.lo.s32 %r1, %r8, %r9, %r11; - mul.lo.s32 %r12, %r3, %r2; - setp.ge.s32 %p1, %r1, %r12; - @%p1 bra BB13_2; - - cvta.to.global.u64 %rd2, %rd1; - mul.wide.s32 %rd3, %r1, 8; - add.s64 %rd4, %rd2, %rd3; - st.global.f64 [%rd4], %fd1; - -BB13_2: - ret; -} - - // .globl dense_matrix_copy -.visible .entry dense_matrix_copy( - .param .u64 dense_matrix_copy_param_0, - .param .u64 dense_matrix_copy_param_1, - .param .u32 dense_matrix_copy_param_2, - .param .u32 dense_matrix_copy_param_3 -) -{ - .reg .pred %p<4>; - .reg .b32 %r<12>; - .reg .f64 %fd<2>; - .reg .b64 %rd<8>; - - - ld.param.u64 %rd1, [dense_matrix_copy_param_0]; - ld.param.u64 %rd2, [dense_matrix_copy_param_1]; - ld.param.u32 %r2, [dense_matrix_copy_param_2]; - ld.param.u32 %r3, [dense_matrix_copy_param_3]; - mov.u32 %r4, %ctaid.x; - mov.u32 %r5, %ntid.x; - mov.u32 %r6, %tid.x; - mad.lo.s32 %r7, %r5, %r4, %r6; - mov.u32 %r8, %ntid.y; - mov.u32 %r9, %ctaid.y; - mov.u32 %r10, %tid.y; - mad.lo.s32 %r11, %r8, %r9, %r10; - mad.lo.s32 %r1, %r7, %r3, %r11; - setp.lt.s32 %p1, %r7, %r2; - setp.lt.s32 %p2, %r11, %r3; - and.pred %p3, %p1, %p2; - @!%p3 bra BB14_2; - bra.uni BB14_1; - -BB14_1: - cvta.to.global.u64 %rd3, %rd1; - mul.wide.s32 %rd4, %r1, 8; - add.s64 %rd5, %rd3, %rd4; - ld.global.f64 %fd1, [%rd5]; - cvta.to.global.u64 %rd6, %rd2; - add.s64 %rd7, %rd6, %rd4; - st.global.f64 [%rd7], %fd1; - -BB14_2: +BB0_2: ret; } @@ -1860,10 +92,10 @@ BB14_2: setp.lt.s32 %p1, %r1, %r4; setp.lt.s32 %p2, %r2, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB15_2; - bra.uni BB15_1; + @!%p3 bra BB1_2; + bra.uni BB1_1; -BB15_1: +BB1_1: cvta.to.global.u64 %rd3, %rd1; mad.lo.s32 %r11, %r1, %r3, %r2; mul.wide.s32 %rd4, %r11, 8; @@ -1875,75 +107,78 @@ BB15_1: add.s64 %rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd3; -BB15_2: +BB1_2: ret; } - // .globl reluBackward -.visible .entry reluBackward( - .param .u64 reluBackward_param_0, - .param .u64 reluBackward_param_1, - .param .u64 reluBackward_param_2, - .param .u32 reluBackward_param_3, - .param .u32 reluBackward_param_4 + // .globl relu_backward +.visible .entry relu_backward( + .param .u64 relu_backward_param_0, + .param .u64 relu_backward_param_1, + .param .u64 relu_backward_param_2, + .param .u32 relu_backward_param_3, + .param .u32 relu_backward_param_4 ) { .reg .pred %p<5>; .reg .b32 %r<12>; .reg .f64 %fd<6>; - .reg .b64 %rd<13>; + .reg .b64 %rd<14>; - ld.param.u64 %rd1, [reluBackward_param_0]; - ld.param.u64 %rd2, [reluBackward_param_1]; - ld.param.u64 %rd3, [reluBackward_param_2]; - ld.param.u32 %r5, [reluBackward_param_3]; - ld.param.u32 %r4, [reluBackward_param_4]; - mov.u32 %r6, %ntid.x; - mov.u32 %r7, %ctaid.x; - mov.u32 %r8, %tid.x; - mad.lo.s32 %r1, %r6, %r7, %r8; - mov.u32 %r9, %ntid.y; - mov.u32 %r10, %ctaid.y; - mov.u32 %r11, %tid.y; - mad.lo.s32 %r2, %r9, %r10, %r11; - setp.lt.s32 %p1, %r1, %r5; - setp.lt.s32 %p2, %r2, %r4; + ld.param.u64 %rd2, [relu_backward_param_0]; + ld.param.u64 %rd3, [relu_backward_param_1]; + ld.param.u64 %rd4, [relu_backward_param_2]; + ld.param.u32 %r4, [relu_backward_param_3]; + ld.param.u32 %r3, [relu_backward_param_4]; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %tid.x; + mad.lo.s32 %r1, %r5, %r6, %r7; + mov.u32 %r8, %ntid.y; + mov.u32 %r9, %ctaid.y; + mov.u32 %r10, %tid.y; + mad.lo.s32 %r2, %r8, %r9, %r10; + setp.lt.s32 %p1, %r1, %r4; + setp.lt.s32 %p2, %r2, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB16_4; - bra.uni BB16_1; + @!%p3 bra BB2_4; + bra.uni BB2_1; -BB16_1: - cvta.to.global.u64 %rd4, %rd1; - mad.lo.s32 %r3, %r1, %r4, %r2; - mul.wide.s32 %rd5, %r3, 8; - add.s64 %rd6, %rd4, %rd5; - ld.global.f64 %fd4, [%rd6]; +BB2_1: + cvta.to.global.u64 %rd5, %rd2; + mad.lo.s32 %r11, %r1, %r3, %r2; + cvt.s64.s32 %rd1, %r11; + mul.wide.s32 %rd6, %r11, 8; + add.s64 %rd7, %rd5, %rd6; + ld.global.f64 %fd4, [%rd7]; mov.f64 %fd5, 0d0000000000000000; setp.leu.f64 %p4, %fd4, 0d0000000000000000; - @%p4 bra BB16_3; + @%p4 bra BB2_3; - cvta.to.global.u64 %rd7, %rd2; - add.s64 %rd9, %rd7, %rd5; - ld.global.f64 %fd5, [%rd9]; + cvta.to.global.u64 %rd8, %rd3; + shl.b64 %rd9, %rd1, 3; + add.s64 %rd10, %rd8, %rd9; + ld.global.f64 %fd5, [%rd10]; -BB16_3: - cvta.to.global.u64 %rd10, %rd3; - add.s64 %rd12, %rd10, %rd5; - st.global.f64 [%rd12], %fd5; +BB2_3: + cvta.to.global.u64 %rd11, %rd4; + shl.b64 %rd12, %rd1, 3; + add.s64 %rd13, %rd11, %rd12; + st.global.f64 [%rd13], %fd5; -BB16_4: +BB2_4: ret; } - // .globl biasAdd -.visible .entry biasAdd( - .param .u64 biasAdd_param_0, - .param .u64 biasAdd_param_1, - .param .u64 biasAdd_param_2, - .param .u32 biasAdd_param_3, - .param .u32 biasAdd_param_4, - .param .u32 biasAdd_param_5 + // .globl bias_add +.visible .entry bias_add( + .param .u64 bias_add_param_0, + .param .u64 bias_add_param_1, + .param .u64 bias_add_param_2, + .param .u32 bias_add_param_3, + .param .u32 bias_add_param_4, + .param .u32 bias_add_param_5 ) { .reg .pred %p<4>; @@ -1952,12 +187,12 @@ BB16_4: .reg .b64 %rd<12>; - ld.param.u64 %rd1, [biasAdd_param_0]; - ld.param.u64 %rd2, [biasAdd_param_1]; - ld.param.u64 %rd3, [biasAdd_param_2]; - ld.param.u32 %r5, [biasAdd_param_3]; - ld.param.u32 %r3, [biasAdd_param_4]; - ld.param.u32 %r4, [biasAdd_param_5]; + ld.param.u64 %rd1, [bias_add_param_0]; + ld.param.u64 %rd2, [bias_add_param_1]; + ld.param.u64 %rd3, [bias_add_param_2]; + ld.param.u32 %r5, [bias_add_param_3]; + ld.param.u32 %r3, [bias_add_param_4]; + ld.param.u32 %r4, [bias_add_param_5]; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; @@ -1969,10 +204,10 @@ BB16_4: setp.lt.s32 %p1, %r1, %r5; setp.lt.s32 %p2, %r2, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB17_2; - bra.uni BB17_1; + @!%p3 bra BB3_2; + bra.uni BB3_1; -BB17_1: +BB3_1: cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r12, %r1, %r3, %r2; mul.wide.s32 %rd5, %r12, 8; @@ -1988,21 +223,21 @@ BB17_1: add.s64 %rd11, %rd10, %rd5; st.global.f64 [%rd11], %fd3; -BB17_2: +BB3_2: ret; } - // .globl compareAndSet -.visible .entry compareAndSet( - .param .u64 compareAndSet_param_0, - .param .u64 compareAndSet_param_1, - .param .u32 compareAndSet_param_2, - .param .u32 compareAndSet_param_3, - .param .f64 compareAndSet_param_4, - .param .f64 compareAndSet_param_5, - .param .f64 compareAndSet_param_6, - .param .f64 compareAndSet_param_7, - .param .f64 compareAndSet_param_8 + // .globl compare_and_set +.visible .entry compare_and_set( + .param .u64 compare_and_set_param_0, + .param .u64 compare_and_set_param_1, + .param .u32 compare_and_set_param_2, + .param .u32 compare_and_set_param_3, + .param .f64 compare_and_set_param_4, + .param .f64 compare_and_set_param_5, + .param .f64 compare_and_set_param_6, + .param .f64 compare_and_set_param_7, + .param .f64 compare_and_set_param_8 ) { .reg .pred %p<6>; @@ -2011,15 +246,15 @@ BB17_2: .reg .b64 %rd<8>; - ld.param.u64 %rd2, [compareAndSet_param_0]; - ld.param.u64 %rd3, [compareAndSet_param_1]; - ld.param.u32 %r2, [compareAndSet_param_2]; - ld.param.u32 %r3, [compareAndSet_param_3]; - ld.param.f64 %fd2, [compareAndSet_param_4]; - ld.param.f64 %fd3, [compareAndSet_param_5]; - ld.param.f64 %fd4, [compareAndSet_param_6]; - ld.param.f64 %fd5, [compareAndSet_param_7]; - ld.param.f64 %fd6, [compareAndSet_param_8]; + ld.param.u64 %rd2, [compare_and_set_param_0]; + ld.param.u64 %rd3, [compare_and_set_param_1]; + ld.param.u32 %r2, [compare_and_set_param_2]; + ld.param.u32 %r3, [compare_and_set_param_3]; + ld.param.f64 %fd2, [compare_and_set_param_4]; + ld.param.f64 %fd3, [compare_and_set_param_5]; + ld.param.f64 %fd4, [compare_and_set_param_6]; + ld.param.f64 %fd5, [compare_and_set_param_7]; + ld.param.f64 %fd6, [compare_and_set_param_8]; mov.u32 %r4, %ctaid.x; mov.u32 %r5, %ntid.x; mov.u32 %r6, %tid.x; @@ -2032,10 +267,10 @@ BB17_2: setp.lt.s32 %p1, %r7, %r2; setp.lt.s32 %p2, %r11, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB18_6; - bra.uni BB18_1; + @!%p3 bra BB4_6; + bra.uni BB4_1; -BB18_1: +BB4_1: cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 8; add.s64 %rd6, %rd4, %rd5; @@ -2045,26 +280,26 @@ BB18_1: setp.lt.f64 %p4, %fd8, %fd3; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd1, %rd7, %rd5; - @%p4 bra BB18_5; - bra.uni BB18_2; + @%p4 bra BB4_5; + bra.uni BB4_2; -BB18_5: +BB4_5: st.global.f64 [%rd1], %fd4; - bra.uni BB18_6; + bra.uni BB4_6; -BB18_2: +BB4_2: setp.lt.f64 %p5, %fd1, %fd2; - @%p5 bra BB18_4; - bra.uni BB18_3; + @%p5 bra BB4_4; + bra.uni BB4_3; -BB18_4: +BB4_4: st.global.f64 [%rd1], %fd5; - bra.uni BB18_6; + bra.uni BB4_6; -BB18_3: +BB4_3: st.global.f64 [%rd1], %fd6; -BB18_6: +BB4_6: ret; } @@ -2080,9 +315,9 @@ BB18_6: .param .u32 matrix_matrix_cellwise_op_param_7 ) { - .reg .pred %p<52>; - .reg .b32 %r<56>; - .reg .f64 %fd<40>; + .reg .pred %p<54>; + .reg .b32 %r<55>; + .reg .f64 %fd<39>; .reg .b64 %rd<15>; @@ -2105,93 +340,93 @@ BB18_6: setp.lt.s32 %p2, %r1, %r14; setp.lt.s32 %p3, %r2, %r10; and.pred %p4, %p2, %p3; - @!%p4 bra BB19_55; - bra.uni BB19_1; + @!%p4 bra BB5_53; + bra.uni BB5_1; -BB19_1: +BB5_1: mad.lo.s32 %r3, %r1, %r10, %r2; setp.eq.s32 %p5, %r11, 1; - mov.u32 %r54, %r1; - @%p5 bra BB19_5; + mov.u32 %r53, %r1; + @%p5 bra BB5_5; setp.ne.s32 %p6, %r11, 2; - mov.u32 %r55, %r3; - @%p6 bra BB19_4; + mov.u32 %r54, %r3; + @%p6 bra BB5_4; - mov.u32 %r55, %r2; + mov.u32 %r54, %r2; -BB19_4: - mov.u32 %r49, %r55; - mov.u32 %r4, %r49; - mov.u32 %r54, %r4; +BB5_4: + mov.u32 %r48, %r54; + mov.u32 %r4, %r48; + mov.u32 %r53, %r4; -BB19_5: - mov.u32 %r5, %r54; +BB5_5: + mov.u32 %r5, %r53; setp.eq.s32 %p7, %r12, 1; - mov.u32 %r52, %r1; - @%p7 bra BB19_9; + mov.u32 %r51, %r1; + @%p7 bra BB5_9; setp.ne.s32 %p8, %r12, 2; - mov.u32 %r53, %r3; - @%p8 bra BB19_8; + mov.u32 %r52, %r3; + @%p8 bra BB5_8; - mov.u32 %r53, %r2; + mov.u32 %r52, %r2; -BB19_8: - mov.u32 %r52, %r53; +BB5_8: + mov.u32 %r51, %r52; -BB19_9: +BB5_9: cvta.to.global.u64 %rd5, %rd3; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r5, 8; add.s64 %rd8, %rd6, %rd7; ld.global.f64 %fd1, [%rd8]; - mul.wide.s32 %rd9, %r52, 8; + mul.wide.s32 %rd9, %r51, 8; add.s64 %rd10, %rd5, %rd9; ld.global.f64 %fd2, [%rd10]; - mov.f64 %fd39, 0d7FEFFFFFFFFFFFFF; + mov.f64 %fd38, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p9, %r13, 5; - @%p9 bra BB19_19; + @%p9 bra BB5_19; setp.gt.s32 %p19, %r13, 2; - @%p19 bra BB19_15; + @%p19 bra BB5_15; setp.eq.s32 %p23, %r13, 0; - @%p23 bra BB19_53; + @%p23 bra BB5_51; setp.eq.s32 %p24, %r13, 1; - @%p24 bra BB19_52; - bra.uni BB19_13; + @%p24 bra BB5_50; + bra.uni BB5_13; -BB19_52: - sub.f64 %fd39, %fd1, %fd2; - bra.uni BB19_54; +BB5_50: + sub.f64 %fd38, %fd1, %fd2; + bra.uni BB5_52; -BB19_19: +BB5_19: setp.gt.s32 %p10, %r13, 8; - @%p10 bra BB19_24; + @%p10 bra BB5_24; setp.eq.s32 %p16, %r13, 6; - @%p16 bra BB19_34; + @%p16 bra BB5_34; setp.eq.s32 %p17, %r13, 7; - @%p17 bra BB19_33; - bra.uni BB19_22; + @%p17 bra BB5_33; + bra.uni BB5_22; -BB19_33: +BB5_33: setp.gt.f64 %p29, %fd1, %fd2; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29; - bra.uni BB19_54; + selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29; + bra.uni BB5_52; -BB19_15: +BB5_15: setp.eq.s32 %p20, %r13, 3; - @%p20 bra BB19_51; + @%p20 bra BB5_49; setp.eq.s32 %p21, %r13, 4; - @%p21 bra BB19_35; - bra.uni BB19_17; + @%p21 bra BB5_35; + bra.uni BB5_17; -BB19_35: +BB5_35: { .reg .b32 %temp; mov.b64 {%temp, %r8}, %fd1; @@ -2221,133 +456,133 @@ BB19_35: param0, param1 ); - ld.param.f64 %fd38, [retval0+0]; + ld.param.f64 %fd37, [retval0+0]; //{ }// Callseq End 0 setp.lt.s32 %p33, %r8, 0; and.pred %p1, %p33, %p32; - @!%p1 bra BB19_37; - bra.uni BB19_36; + @!%p1 bra BB5_37; + bra.uni BB5_36; -BB19_36: +BB5_36: { .reg .b32 %temp; - mov.b64 {%temp, %r23}, %fd38; + mov.b64 {%temp, %r23}, %fd37; } xor.b32 %r24, %r23, -2147483648; { .reg .b32 %temp; - mov.b64 {%r25, %temp}, %fd38; + mov.b64 {%r25, %temp}, %fd37; } - mov.b64 %fd38, {%r25, %r24}; + mov.b64 %fd37, {%r25, %r24}; -BB19_37: - mov.f64 %fd37, %fd38; +BB5_37: + mov.f64 %fd36, %fd37; setp.eq.f64 %p34, %fd1, 0d0000000000000000; - @%p34 bra BB19_40; - bra.uni BB19_38; + @%p34 bra BB5_40; + bra.uni BB5_38; -BB19_40: +BB5_40: selp.b32 %r26, %r8, 0, %p32; or.b32 %r27, %r26, 2146435072; setp.lt.s32 %p38, %r9, 0; selp.b32 %r28, %r27, %r26, %p38; mov.u32 %r29, 0; - mov.b64 %fd37, {%r29, %r28}; - bra.uni BB19_41; + mov.b64 %fd36, {%r29, %r28}; + bra.uni BB5_41; -BB19_24: +BB5_24: setp.gt.s32 %p11, %r13, 10; - @%p11 bra BB19_28; + @%p11 bra BB5_28; setp.eq.s32 %p14, %r13, 9; - @%p14 bra BB19_32; - bra.uni BB19_26; + @%p14 bra BB5_32; + bra.uni BB5_26; -BB19_32: +BB5_32: setp.eq.f64 %p27, %fd1, %fd2; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27; - bra.uni BB19_54; + selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27; + bra.uni BB5_52; -BB19_28: +BB5_28: setp.eq.s32 %p12, %r13, 11; - @%p12 bra BB19_31; - bra.uni BB19_29; + @%p12 bra BB5_31; + bra.uni BB5_29; -BB19_31: - min.f64 %fd39, %fd1, %fd2; - bra.uni BB19_54; +BB5_31: + min.f64 %fd38, %fd1, %fd2; + bra.uni BB5_52; -BB19_53: - add.f64 %fd39, %fd1, %fd2; - bra.uni BB19_54; +BB5_51: + add.f64 %fd38, %fd1, %fd2; + bra.uni BB5_52; -BB19_13: +BB5_13: setp.eq.s32 %p25, %r13, 2; - @%p25 bra BB19_14; - bra.uni BB19_54; + @%p25 bra BB5_14; + bra.uni BB5_52; -BB19_14: - mul.f64 %fd39, %fd1, %fd2; - bra.uni BB19_54; +BB5_14: + mul.f64 %fd38, %fd1, %fd2; + bra.uni BB5_52; -BB19_34: +BB5_34: setp.le.f64 %p30, %fd1, %fd2; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30; - bra.uni BB19_54; + selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30; + bra.uni BB5_52; -BB19_22: +BB5_22: setp.eq.s32 %p18, %r13, 8; - @%p18 bra BB19_23; - bra.uni BB19_54; + @%p18 bra BB5_23; + bra.uni BB5_52; -BB19_23: +BB5_23: setp.ge.f64 %p28, %fd1, %fd2; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28; - bra.uni BB19_54; + selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28; + bra.uni BB5_52; -BB19_51: - div.rn.f64 %fd39, %fd1, %fd2; - bra.uni BB19_54; +BB5_49: + div.rn.f64 %fd38, %fd1, %fd2; + bra.uni BB5_52; -BB19_17: +BB5_17: setp.eq.s32 %p22, %r13, 5; - @%p22 bra BB19_18; - bra.uni BB19_54; + @%p22 bra BB5_18; + bra.uni BB5_52; -BB19_18: +BB5_18: setp.lt.f64 %p31, %fd1, %fd2; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31; - bra.uni BB19_54; + selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31; + bra.uni BB5_52; -BB19_26: +BB5_26: setp.eq.s32 %p15, %r13, 10; - @%p15 bra BB19_27; - bra.uni BB19_54; + @%p15 bra BB5_27; + bra.uni BB5_52; -BB19_27: +BB5_27: setp.neu.f64 %p26, %fd1, %fd2; - selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB19_54; + selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26; + bra.uni BB5_52; -BB19_29: +BB5_29: setp.ne.s32 %p13, %r13, 12; - @%p13 bra BB19_54; + @%p13 bra BB5_52; - max.f64 %fd39, %fd1, %fd2; - bra.uni BB19_54; + max.f64 %fd38, %fd1, %fd2; + bra.uni BB5_52; -BB19_38: +BB5_38: setp.gt.s32 %p35, %r8, -1; - @%p35 bra BB19_41; + @%p35 bra BB5_41; cvt.rzi.f64.f64 %fd29, %fd2; setp.neu.f64 %p36, %fd29, %fd2; - selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p36; + selp.f64 %fd36, 0dFFF8000000000000, %fd36, %p36; -BB19_41: - mov.f64 %fd17, %fd37; +BB5_41: + mov.f64 %fd17, %fd36; add.f64 %fd18, %fd1, %fd2; { .reg .b32 %temp; @@ -2355,79 +590,77 @@ BB19_41: } and.b32 %r31, %r30, 2146435072; setp.ne.s32 %p39, %r31, 2146435072; - mov.f64 %fd36, %fd17; - @%p39 bra BB19_50; + mov.f64 %fd35, %fd17; + @%p39 bra BB5_48; setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000; - mov.f64 %fd36, %fd18; - @%p40 bra BB19_50; + mov.f64 %fd35, %fd18; + @%p40 bra BB5_48; abs.f64 %fd30, %fd2; setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000; - mov.f64 %fd35, %fd18; - mov.f64 %fd36, %fd35; - @%p41 bra BB19_50; - - and.b32 %r32, %r9, 2147483647; - setp.ne.s32 %p42, %r32, 2146435072; - @%p42 bra BB19_46; + mov.f64 %fd34, %fd18; + mov.f64 %fd35, %fd34; + @%p41 bra BB5_48; { .reg .b32 %temp; - mov.b64 {%r33, %temp}, %fd2; + mov.b64 {%r32, %temp}, %fd2; } - setp.eq.s32 %p43, %r33, 0; - @%p43 bra BB19_49; - -BB19_46: - and.b32 %r34, %r8, 2147483647; - setp.ne.s32 %p44, %r34, 2146435072; - mov.f64 %fd33, %fd17; - mov.f64 %fd36, %fd33; - @%p44 bra BB19_50; - + and.b32 %r33, %r9, 2147483647; + setp.eq.s32 %p42, %r33, 2146435072; + setp.eq.s32 %p43, %r32, 0; + and.pred %p44, %p42, %p43; + @%p44 bra BB5_47; + bra.uni BB5_45; + +BB5_47: + setp.gt.f64 %p48, %fd11, 0d3FF0000000000000; + selp.b32 %r41, 2146435072, 0, %p48; + xor.b32 %r42, %r41, 2146435072; + setp.lt.s32 %p49, %r9, 0; + selp.b32 %r43, %r42, %r41, %p49; + setp.eq.f64 %p50, %fd1, 0dBFF0000000000000; + selp.b32 %r44, 1072693248, %r43, %p50; + mov.u32 %r45, 0; + mov.b64 %fd35, {%r45, %r44}; + bra.uni BB5_48; + +BB5_45: { .reg .b32 %temp; - mov.b64 {%r35, %temp}, %fd1; + mov.b64 {%r34, %temp}, %fd1; } - setp.ne.s32 %p45, %r35, 0; - mov.f64 %fd36, %fd17; - @%p45 bra BB19_50; - + and.b32 %r35, %r8, 2147483647; + setp.eq.s32 %p45, %r35, 2146435072; + setp.eq.s32 %p46, %r34, 0; + and.pred %p47, %p45, %p46; + mov.f64 %fd35, %fd17; + @!%p47 bra BB5_48; + bra.uni BB5_46; + +BB5_46: shr.s32 %r36, %r9, 31; and.b32 %r37, %r36, -2146435072; - add.s32 %r38, %r37, 2146435072; - or.b32 %r39, %r38, -2147483648; - selp.b32 %r40, %r39, %r38, %p1; - mov.u32 %r41, 0; - mov.b64 %fd36, {%r41, %r40}; - bra.uni BB19_50; - -BB19_49: - setp.gt.f64 %p46, %fd11, 0d3FF0000000000000; - selp.b32 %r42, 2146435072, 0, %p46; - xor.b32 %r43, %r42, 2146435072; - setp.lt.s32 %p47, %r9, 0; - selp.b32 %r44, %r43, %r42, %p47; - setp.eq.f64 %p48, %fd1, 0dBFF0000000000000; - selp.b32 %r45, 1072693248, %r44, %p48; - mov.u32 %r46, 0; - mov.b64 %fd36, {%r46, %r45}; - -BB19_50: - setp.eq.f64 %p49, %fd2, 0d0000000000000000; - setp.eq.f64 %p50, %fd1, 0d3FF0000000000000; - or.pred %p51, %p50, %p49; - selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p51; - -BB19_54: + selp.b32 %r38, -1048576, 2146435072, %p1; + add.s32 %r39, %r38, %r37; + mov.u32 %r40, 0; + mov.b64 %fd35, {%r40, %r39}; + +BB5_48: + setp.eq.f64 %p51, %fd2, 0d0000000000000000; + setp.eq.f64 %p52, %fd1, 0d3FF0000000000000; + or.pred %p53, %p52, %p51; + selp.f64 %fd38, 0d3FF0000000000000, %fd35, %p53; + +BB5_52: cvta.to.global.u64 %rd12, %rd4; mul.wide.s32 %rd13, %r3, 8; add.s64 %rd14, %rd12, %rd13; - st.global.f64 [%rd14], %fd39; + st.global.f64 [%rd14], %fd38; bar.sync 0; -BB19_55: +BB5_53: ret; } @@ -2441,9 +674,9 @@ BB19_55: .param .u32 matrix_scalar_op_param_5 ) { - .reg .pred %p<91>; - .reg .b32 %r<64>; - .reg .f64 %fd<77>; + .reg .pred %p<95>; + .reg .b32 %r<62>; + .reg .f64 %fd<75>; .reg .b64 %rd<12>; @@ -2458,7 +691,7 @@ BB19_55: mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; setp.ge.s32 %p3, %r1, %r8; - @%p3 bra BB20_94; + @%p3 bra BB6_90; cvta.to.global.u64 %rd6, %rd5; cvta.to.global.u64 %rd7, %rd4; @@ -2467,86 +700,86 @@ BB19_55: ld.global.f64 %fd1, [%rd9]; add.s64 %rd1, %rd6, %rd8; setp.eq.s32 %p4, %r7, 0; - @%p4 bra BB20_48; + @%p4 bra BB6_46; - mov.f64 %fd67, 0d7FEFFFFFFFFFFFFF; + mov.f64 %fd66, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p5, %r6, 5; - @%p5 bra BB20_12; + @%p5 bra BB6_12; setp.gt.s32 %p15, %r6, 2; - @%p15 bra BB20_8; + @%p15 bra BB6_8; setp.eq.s32 %p19, %r6, 0; - @%p19 bra BB20_46; + @%p19 bra BB6_44; setp.eq.s32 %p20, %r6, 1; - @%p20 bra BB20_45; - bra.uni BB20_6; + @%p20 bra BB6_43; + bra.uni BB6_6; -BB20_45: - sub.f64 %fd67, %fd52, %fd1; - bra.uni BB20_47; +BB6_43: + sub.f64 %fd66, %fd52, %fd1; + bra.uni BB6_45; -BB20_48: - mov.f64 %fd76, 0d7FEFFFFFFFFFFFFF; - setp.gt.s32 %p48, %r6, 5; - @%p48 bra BB20_58; +BB6_46: + mov.f64 %fd74, 0d7FEFFFFFFFFFFFFF; + setp.gt.s32 %p50, %r6, 5; + @%p50 bra BB6_56; - setp.gt.s32 %p58, %r6, 2; - @%p58 bra BB20_54; + setp.gt.s32 %p60, %r6, 2; + @%p60 bra BB6_52; - setp.eq.s32 %p62, %r6, 0; - @%p62 bra BB20_92; + setp.eq.s32 %p64, %r6, 0; + @%p64 bra BB6_88; - setp.eq.s32 %p63, %r6, 1; - @%p63 bra BB20_91; - bra.uni BB20_52; + setp.eq.s32 %p65, %r6, 1; + @%p65 bra BB6_87; + bra.uni BB6_50; -BB20_91: - sub.f64 %fd76, %fd1, %fd52; - bra.uni BB20_93; +BB6_87: + sub.f64 %fd74, %fd1, %fd52; + bra.uni BB6_89; -BB20_12: +BB6_12: setp.gt.s32 %p6, %r6, 8; - @%p6 bra BB20_17; + @%p6 bra BB6_17; setp.eq.s32 %p12, %r6, 6; - @%p12 bra BB20_27; + @%p12 bra BB6_27; setp.eq.s32 %p13, %r6, 7; - @%p13 bra BB20_26; - bra.uni BB20_15; + @%p13 bra BB6_26; + bra.uni BB6_15; -BB20_26: +BB6_26: setp.lt.f64 %p25, %fd1, %fd52; - selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25; - bra.uni BB20_47; + selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25; + bra.uni BB6_45; -BB20_58: - setp.gt.s32 %p49, %r6, 8; - @%p49 bra BB20_63; +BB6_56: + setp.gt.s32 %p51, %r6, 8; + @%p51 bra BB6_61; - setp.eq.s32 %p55, %r6, 6; - @%p55 bra BB20_73; + setp.eq.s32 %p57, %r6, 6; + @%p57 bra BB6_71; - setp.eq.s32 %p56, %r6, 7; - @%p56 bra BB20_72; - bra.uni BB20_61; + setp.eq.s32 %p58, %r6, 7; + @%p58 bra BB6_70; + bra.uni BB6_59; -BB20_72: - setp.gt.f64 %p68, %fd1, %fd52; - selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68; - bra.uni BB20_93; +BB6_70: + setp.gt.f64 %p70, %fd1, %fd52; + selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p70; + bra.uni BB6_89; -BB20_8: +BB6_8: setp.eq.s32 %p16, %r6, 3; - @%p16 bra BB20_44; + @%p16 bra BB6_42; setp.eq.s32 %p17, %r6, 4; - @%p17 bra BB20_28; - bra.uni BB20_10; + @%p17 bra BB6_28; + bra.uni BB6_10; -BB20_28: +BB6_28: { .reg .b32 %temp; mov.b64 {%temp, %r2}, %fd52; @@ -2576,64 +809,64 @@ BB20_28: param0, param1 ); - ld.param.f64 %fd66, [retval0+0]; + ld.param.f64 %fd65, [retval0+0]; //{ }// Callseq End 1 setp.lt.s32 %p29, %r2, 0; and.pred %p1, %p29, %p28; - @!%p1 bra BB20_30; - bra.uni BB20_29; + @!%p1 bra BB6_30; + bra.uni BB6_29; -BB20_29: +BB6_29: { .reg .b32 %temp; - mov.b64 {%temp, %r14}, %fd66; + mov.b64 {%temp, %r14}, %fd65; } xor.b32 %r15, %r14, -2147483648; { .reg .b32 %temp; - mov.b64 {%r16, %temp}, %fd66; + mov.b64 {%r16, %temp}, %fd65; } - mov.b64 %fd66, {%r16, %r15}; + mov.b64 %fd65, {%r16, %r15}; -BB20_30: - mov.f64 %fd65, %fd66; +BB6_30: + mov.f64 %fd64, %fd65; setp.eq.f64 %p30, %fd52, 0d0000000000000000; - @%p30 bra BB20_33; - bra.uni BB20_31; + @%p30 bra BB6_33; + bra.uni BB6_31; -BB20_33: +BB6_33: selp.b32 %r17, %r2, 0, %p28; or.b32 %r18, %r17, 2146435072; setp.lt.s32 %p34, %r3, 0; selp.b32 %r19, %r18, %r17, %p34; mov.u32 %r20, 0; - mov.b64 %fd65, {%r20, %r19}; - bra.uni BB20_34; + mov.b64 %fd64, {%r20, %r19}; + bra.uni BB6_34; -BB20_17: +BB6_17: setp.gt.s32 %p7, %r6, 10; - @%p7 bra BB20_21; + @%p7 bra BB6_21; setp.eq.s32 %p10, %r6, 9; - @%p10 bra BB20_25; - bra.uni BB20_19; + @%p10 bra BB6_25; + bra.uni BB6_19; -BB20_25: +BB6_25: setp.eq.f64 %p23, %fd1, %fd52; - selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23; - bra.uni BB20_47; + selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23; + bra.uni BB6_45; -BB20_54: - setp.eq.s32 %p59, %r6, 3; - @%p59 bra BB20_90; +BB6_52: + setp.eq.s32 %p61, %r6, 3; + @%p61 bra BB6_86; - setp.eq.s32 %p60, %r6, 4; - @%p60 bra BB20_74; - bra.uni BB20_56; + setp.eq.s32 %p62, %r6, 4; + @%p62 bra BB6_72; + bra.uni BB6_54; -BB20_74: +BB6_72: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; @@ -2642,11 +875,11 @@ BB20_74: .reg .b32 %temp; mov.b64 {%temp, %r5}, %fd52; } - bfe.u32 %r38, %r5, 20, 11; - add.s32 %r39, %r38, -1012; + bfe.u32 %r37, %r5, 20, 11; + add.s32 %r38, %r37, -1012; mov.b64 %rd11, %fd52; - shl.b64 %rd3, %rd11, %r39; - setp.eq.s64 %p71, %rd3, -9223372036854775808; + shl.b64 %rd3, %rd11, %r38; + setp.eq.s64 %p73, %rd3, -9223372036854775808; abs.f64 %fd35, %fd1; // Callseq Start 2 { @@ -2663,201 +896,201 @@ BB20_74: param0, param1 ); - ld.param.f64 %fd75, [retval0+0]; + ld.param.f64 %fd73, [retval0+0]; //{ }// Callseq End 2 - setp.lt.s32 %p72, %r4, 0; - and.pred %p2, %p72, %p71; - @!%p2 bra BB20_76; - bra.uni BB20_75; + setp.lt.s32 %p74, %r4, 0; + and.pred %p2, %p74, %p73; + @!%p2 bra BB6_74; + bra.uni BB6_73; -BB20_75: +BB6_73: { .reg .b32 %temp; - mov.b64 {%temp, %r40}, %fd75; + mov.b64 {%temp, %r39}, %fd73; } - xor.b32 %r41, %r40, -2147483648; + xor.b32 %r40, %r39, -2147483648; { .reg .b32 %temp; - mov.b64 {%r42, %temp}, %fd75; + mov.b64 {%r41, %temp}, %fd73; } - mov.b64 %fd75, {%r42, %r41}; + mov.b64 %fd73, {%r41, %r40}; -BB20_76: - mov.f64 %fd74, %fd75; - setp.eq.f64 %p73, %fd1, 0d0000000000000000; - @%p73 bra BB20_79; - bra.uni BB20_77; - -BB20_79: - selp.b32 %r43, %r4, 0, %p71; - or.b32 %r44, %r43, 2146435072; - setp.lt.s32 %p77, %r5, 0; - selp.b32 %r45, %r44, %r43, %p77; - mov.u32 %r46, 0; - mov.b64 %fd74, {%r46, %r45}; - bra.uni BB20_80; - -BB20_63: - setp.gt.s32 %p50, %r6, 10; - @%p50 bra BB20_67; - - setp.eq.s32 %p53, %r6, 9; - @%p53 bra BB20_71; - bra.uni BB20_65; - -BB20_71: - setp.eq.f64 %p66, %fd1, %fd52; - selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66; - bra.uni BB20_93; - -BB20_21: +BB6_74: + mov.f64 %fd72, %fd73; + setp.eq.f64 %p75, %fd1, 0d0000000000000000; + @%p75 bra BB6_77; + bra.uni BB6_75; + +BB6_77: + selp.b32 %r42, %r4, 0, %p73; + or.b32 %r43, %r42, 2146435072; + setp.lt.s32 %p79, %r5, 0; + selp.b32 %r44, %r43, %r42, %p79; + mov.u32 %r45, 0; + mov.b64 %fd72, {%r45, %r44}; + bra.uni BB6_78; + +BB6_61: + setp.gt.s32 %p52, %r6, 10; + @%p52 bra BB6_65; + + setp.eq.s32 %p55, %r6, 9; + @%p55 bra BB6_69; + bra.uni BB6_63; + +BB6_69: + setp.eq.f64 %p68, %fd1, %fd52; + selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p68; + bra.uni BB6_89; + +BB6_21: setp.eq.s32 %p8, %r6, 11; - @%p8 bra BB20_24; - bra.uni BB20_22; + @%p8 bra BB6_24; + bra.uni BB6_22; -BB20_24: - min.f64 %fd67, %fd52, %fd1; - bra.uni BB20_47; +BB6_24: + min.f64 %fd66, %fd52, %fd1; + bra.uni BB6_45; -BB20_46: - add.f64 %fd67, %fd1, %fd52; - bra.uni BB20_47; +BB6_44: + add.f64 %fd66, %fd1, %fd52; + bra.uni BB6_45; -BB20_6: +BB6_6: setp.eq.s32 %p21, %r6, 2; - @%p21 bra BB20_7; - bra.uni BB20_47; + @%p21 bra BB6_7; + bra.uni BB6_45; -BB20_7: - mul.f64 %fd67, %fd1, %fd52; - bra.uni BB20_47; +BB6_7: + mul.f64 %fd66, %fd1, %fd52; + bra.uni BB6_45; -BB20_27: +BB6_27: setp.ge.f64 %p26, %fd1, %fd52; - selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB20_47; + selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26; + bra.uni BB6_45; -BB20_15: +BB6_15: setp.eq.s32 %p14, %r6, 8; - @%p14 bra BB20_16; - bra.uni BB20_47; + @%p14 bra BB6_16; + bra.uni BB6_45; -BB20_16: +BB6_16: setp.le.f64 %p24, %fd1, %fd52; - selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24; - bra.uni BB20_47; + selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24; + bra.uni BB6_45; -BB20_44: - div.rn.f64 %fd67, %fd52, %fd1; - bra.uni BB20_47; +BB6_42: + div.rn.f64 %fd66, %fd52, %fd1; + bra.uni BB6_45; -BB20_10: +BB6_10: setp.eq.s32 %p18, %r6, 5; - @%p18 bra BB20_11; - bra.uni BB20_47; + @%p18 bra BB6_11; + bra.uni BB6_45; -BB20_11: +BB6_11: setp.gt.f64 %p27, %fd1, %fd52; - selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p27; - bra.uni BB20_47; + selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p27; + bra.uni BB6_45; -BB20_67: - setp.eq.s32 %p51, %r6, 11; - @%p51 bra BB20_70; - bra.uni BB20_68; +BB6_65: + setp.eq.s32 %p53, %r6, 11; + @%p53 bra BB6_68; + bra.uni BB6_66; -BB20_70: - min.f64 %fd76, %fd1, %fd52; - bra.uni BB20_93; +BB6_68: + min.f64 %fd74, %fd1, %fd52; + bra.uni BB6_89; -BB20_19: +BB6_19: setp.eq.s32 %p11, %r6, 10; - @%p11 bra BB20_20; - bra.uni BB20_47; + @%p11 bra BB6_20; + bra.uni BB6_45; -BB20_20: +BB6_20: setp.neu.f64 %p22, %fd1, %fd52; - selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22; - bra.uni BB20_47; + selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22; + bra.uni BB6_45; -BB20_22: +BB6_22: setp.ne.s32 %p9, %r6, 12; - @%p9 bra BB20_47; - - max.f64 %fd67, %fd52, %fd1; - bra.uni BB20_47; - -BB20_92: - add.f64 %fd76, %fd1, %fd52; - bra.uni BB20_93; - -BB20_52: - setp.eq.s32 %p64, %r6, 2; - @%p64 bra BB20_53; - bra.uni BB20_93; - -BB20_53: - mul.f64 %fd76, %fd1, %fd52; - bra.uni BB20_93; - -BB20_73: - setp.le.f64 %p69, %fd1, %fd52; - selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p69; - bra.uni BB20_93; - -BB20_61: - setp.eq.s32 %p57, %r6, 8; - @%p57 bra BB20_62; - bra.uni BB20_93; - -BB20_62: - setp.ge.f64 %p67, %fd1, %fd52; - selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67; - bra.uni BB20_93; - -BB20_90: - div.rn.f64 %fd76, %fd1, %fd52; - bra.uni BB20_93; - -BB20_56: - setp.eq.s32 %p61, %r6, 5; - @%p61 bra BB20_57; - bra.uni BB20_93; - -BB20_57: - setp.lt.f64 %p70, %fd1, %fd52; - selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p70; - bra.uni BB20_93; - -BB20_65: - setp.eq.s32 %p54, %r6, 10; - @%p54 bra BB20_66; - bra.uni BB20_93; - -BB20_66: - setp.neu.f64 %p65, %fd1, %fd52; - selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65; - bra.uni BB20_93; - -BB20_68: - setp.ne.s32 %p52, %r6, 12; - @%p52 bra BB20_93; - - max.f64 %fd76, %fd1, %fd52; - bra.uni BB20_93; - -BB20_31: + @%p9 bra BB6_45; + + max.f64 %fd66, %fd52, %fd1; + bra.uni BB6_45; + +BB6_88: + add.f64 %fd74, %fd1, %fd52; + bra.uni BB6_89; + +BB6_50: + setp.eq.s32 %p66, %r6, 2; + @%p66 bra BB6_51; + bra.uni BB6_89; + +BB6_51: + mul.f64 %fd74, %fd1, %fd52; + bra.uni BB6_89; + +BB6_71: + setp.le.f64 %p71, %fd1, %fd52; + selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p71; + bra.uni BB6_89; + +BB6_59: + setp.eq.s32 %p59, %r6, 8; + @%p59 bra BB6_60; + bra.uni BB6_89; + +BB6_60: + setp.ge.f64 %p69, %fd1, %fd52; + selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p69; + bra.uni BB6_89; + +BB6_86: + div.rn.f64 %fd74, %fd1, %fd52; + bra.uni BB6_89; + +BB6_54: + setp.eq.s32 %p63, %r6, 5; + @%p63 bra BB6_55; + bra.uni BB6_89; + +BB6_55: + setp.lt.f64 %p72, %fd1, %fd52; + selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p72; + bra.uni BB6_89; + +BB6_63: + setp.eq.s32 %p56, %r6, 10; + @%p56 bra BB6_64; + bra.uni BB6_89; + +BB6_64: + setp.neu.f64 %p67, %fd1, %fd52; + selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p67; + bra.uni BB6_89; + +BB6_66: + setp.ne.s32 %p54, %r6, 12; + @%p54 bra BB6_89; + + max.f64 %fd74, %fd1, %fd52; + bra.uni BB6_89; + +BB6_31: setp.gt.s32 %p31, %r2, -1; - @%p31 bra BB20_34; + @%p31 bra BB6_34; cvt.rzi.f64.f64 %fd54, %fd1; setp.neu.f64 %p32, %fd54, %fd1; - selp.f64 %fd65, 0dFFF8000000000000, %fd65, %p32; + selp.f64 %fd64, 0dFFF8000000000000, %fd64, %p32; -BB20_34: - mov.f64 %fd16, %fd65; +BB6_34: + mov.f64 %fd16, %fd64; add.f64 %fd17, %fd1, %fd52; { .reg .b32 %temp; @@ -2865,161 +1098,157 @@ BB20_34: } and.b32 %r22, %r21, 2146435072; setp.ne.s32 %p35, %r22, 2146435072; - mov.f64 %fd64, %fd16; - @%p35 bra BB20_43; + mov.f64 %fd63, %fd16; + @%p35 bra BB6_41; setp.gtu.f64 %p36, %fd10, 0d7FF0000000000000; - mov.f64 %fd64, %fd17; - @%p36 bra BB20_43; + mov.f64 %fd63, %fd17; + @%p36 bra BB6_41; abs.f64 %fd55, %fd1; setp.gtu.f64 %p37, %fd55, 0d7FF0000000000000; - mov.f64 %fd63, %fd17; - mov.f64 %fd64, %fd63; - @%p37 bra BB20_43; - - and.b32 %r23, %r3, 2147483647; - setp.ne.s32 %p38, %r23, 2146435072; - @%p38 bra BB20_39; - - { - .reg .b32 %temp; - mov.b64 {%r24, %temp}, %fd1; - } - setp.eq.s32 %p39, %r24, 0; - @%p39 bra BB20_42; - -BB20_39: - and.b32 %r25, %r2, 2147483647; - setp.ne.s32 %p40, %r25, 2146435072; - mov.f64 %fd61, %fd16; - mov.f64 %fd64, %fd61; - @%p40 bra BB20_43; + mov.f64 %fd62, %fd17; + mov.f64 %fd63, %fd62; + @%p37 bra BB6_41; { .reg .b32 %temp; - mov.b64 {%r26, %temp}, %fd52; + mov.b64 {%r23, %temp}, %fd1; } - setp.ne.s32 %p41, %r26, 0; - mov.f64 %fd64, %fd16; - @%p41 bra BB20_43; - - shr.s32 %r27, %r3, 31; - and.b32 %r28, %r27, -2146435072; - add.s32 %r29, %r28, 2146435072; - or.b32 %r30, %r29, -2147483648; - selp.b32 %r31, %r30, %r29, %p1; - mov.u32 %r32, 0; - mov.b64 %fd64, {%r32, %r31}; - bra.uni BB20_43; - -BB20_77: - setp.gt.s32 %p74, %r4, -1; - @%p74 bra BB20_80; + and.b32 %r24, %r3, 2147483647; + setp.eq.s32 %p38, %r24, 2146435072; + setp.eq.s32 %p39, %r23, 0; + and.pred %p40, %p38, %p39; + @%p40 bra BB6_40; + bra.uni BB6_38; + +BB6_40: + setp.gt.f64 %p44, %fd10, 0d3FF0000000000000; + selp.b32 %r32, 2146435072, 0, %p44; + xor.b32 %r33, %r32, 2146435072; + setp.lt.s32 %p45, %r3, 0; + selp.b32 %r34, %r33, %r32, %p45; + setp.eq.f64 %p46, %fd52, 0dBFF0000000000000; + selp.b32 %r35, 1072693248, %r34, %p46; + mov.u32 %r36, 0; + mov.b64 %fd63, {%r36, %r35}; + bra.uni BB6_41; + +BB6_75: + setp.gt.s32 %p76, %r4, -1; + @%p76 bra BB6_78; cvt.rzi.f64.f64 %fd57, %fd52; - setp.neu.f64 %p75, %fd57, %fd52; - selp.f64 %fd74, 0dFFF8000000000000, %fd74, %p75; + setp.neu.f64 %p77, %fd57, %fd52; + selp.f64 %fd72, 0dFFF8000000000000, %fd72, %p77; -BB20_80: - mov.f64 %fd41, %fd74; +BB6_78: + mov.f64 %fd41, %fd72; add.f64 %fd42, %fd1, %fd52; { .reg .b32 %temp; - mov.b64 {%temp, %r47}, %fd42; + mov.b64 {%temp, %r46}, %fd42; } - and.b32 %r48, %r47, 2146435072; - setp.ne.s32 %p78, %r48, 2146435072; - mov.f64 %fd73, %fd41; - @%p78 bra BB20_89; + and.b32 %r47, %r46, 2146435072; + setp.ne.s32 %p80, %r47, 2146435072; + mov.f64 %fd71, %fd41; + @%p80 bra BB6_85; - setp.gtu.f64 %p79, %fd35, 0d7FF0000000000000; - mov.f64 %fd73, %fd42; - @%p79 bra BB20_89; + setp.gtu.f64 %p81, %fd35, 0d7FF0000000000000; + mov.f64 %fd71, %fd42; + @%p81 bra BB6_85; abs.f64 %fd58, %fd52; - setp.gtu.f64 %p80, %fd58, 0d7FF0000000000000; - mov.f64 %fd72, %fd42; - mov.f64 %fd73, %fd72; - @%p80 bra BB20_89; + setp.gtu.f64 %p82, %fd58, 0d7FF0000000000000; + mov.f64 %fd70, %fd42; + mov.f64 %fd71, %fd70; + @%p82 bra BB6_85; + { + .reg .b32 %temp; + mov.b64 {%r48, %temp}, %fd52; + } and.b32 %r49, %r5, 2147483647; - setp.ne.s32 %p81, %r49, 2146435072; - @%p81 bra BB20_85; - + setp.eq.s32 %p83, %r49, 2146435072; + setp.eq.s32 %p84, %r48, 0; + and.pred %p85, %p83, %p84; + @%p85 bra BB6_84; + bra.uni BB6_82; + +BB6_84: + setp.gt.f64 %p89, %fd35, 0d3FF0000000000000; + selp.b32 %r57, 2146435072, 0, %p89; + xor.b32 %r58, %r57, 2146435072; + setp.lt.s32 %p90, %r5, 0; + selp.b32 %r59, %r58, %r57, %p90; + setp.eq.f64 %p91, %fd1, 0dBFF0000000000000; + selp.b32 %r60, 1072693248, %r59, %p91; + mov.u32 %r61, 0; + mov.b64 %fd71, {%r61, %r60}; + bra.uni BB6_85; + +BB6_38: { .reg .b32 %temp; - mov.b64 {%r50, %temp}, %fd52; + mov.b64 {%r25, %temp}, %fd52; } - setp.eq.s32 %p82, %r50, 0; - @%p82 bra BB20_88; - -BB20_85: - and.b32 %r51, %r4, 2147483647; - setp.ne.s32 %p83, %r51, 2146435072; - mov.f64 %fd70, %fd41; - mov.f64 %fd73, %fd70; - @%p83 bra BB20_89; - + and.b32 %r26, %r2, 2147483647; + setp.eq.s32 %p41, %r26, 2146435072; + setp.eq.s32 %p42, %r25, 0; + and.pred %p43, %p41, %p42; + mov.f64 %fd63, %fd16; + @!%p43 bra BB6_41; + bra.uni BB6_39; + +BB6_39: + shr.s32 %r27, %r3, 31; + and.b32 %r28, %r27, -2146435072; + selp.b32 %r29, -1048576, 2146435072, %p1; + add.s32 %r30, %r29, %r28; + mov.u32 %r31, 0; + mov.b64 %fd63, {%r31, %r30}; + +BB6_41: + setp.eq.f64 %p47, %fd1, 0d0000000000000000; + setp.eq.f64 %p48, %fd52, 0d3FF0000000000000; + or.pred %p49, %p48, %p47; + selp.f64 %fd66, 0d3FF0000000000000, %fd63, %p49; + +BB6_45: + st.global.f64 [%rd1], %fd66; + bra.uni BB6_90; + +BB6_82: { .reg .b32 %temp; - mov.b64 {%r52, %temp}, %fd1; + mov.b64 {%r50, %temp}, %fd1; } - setp.ne.s32 %p84, %r52, 0; - mov.f64 %fd73, %fd41; - @%p84 bra BB20_89; - - shr.s32 %r53, %r5, 31; - and.b32 %r54, %r53, -2146435072; - add.s32 %r55, %r54, 2146435072; - or.b32 %r56, %r55, -2147483648; - selp.b32 %r57, %r56, %r55, %p2; - mov.u32 %r58, 0; - mov.b64 %fd73, {%r58, %r57}; - bra.uni BB20_89; - -BB20_42: - setp.gt.f64 %p42, %fd10, 0d3FF0000000000000; - selp.b32 %r33, 2146435072, 0, %p42; - xor.b32 %r34, %r33, 2146435072; - setp.lt.s32 %p43, %r3, 0; - selp.b32 %r35, %r34, %r33, %p43; - setp.eq.f64 %p44, %fd52, 0dBFF0000000000000; - selp.b32 %r36, 1072693248, %r35, %p44; - mov.u32 %r37, 0; - mov.b64 %fd64, {%r37, %r36}; - -BB20_43: - setp.eq.f64 %p45, %fd1, 0d0000000000000000; - setp.eq.f64 %p46, %fd52, 0d3FF0000000000000; - or.pred %p47, %p46, %p45; - selp.f64 %fd67, 0d3FF0000000000000, %fd64, %p47; - -BB20_47: - st.global.f64 [%rd1], %fd67; - bra.uni BB20_94; - -BB20_88: - setp.gt.f64 %p85, %fd35, 0d3FF0000000000000; - selp.b32 %r59, 2146435072, 0, %p85; - xor.b32 %r60, %r59, 2146435072; - setp.lt.s32 %p86, %r5, 0; - selp.b32 %r61, %r60, %r59, %p86; - setp.eq.f64 %p87, %fd1, 0dBFF0000000000000; - selp.b32 %r62, 1072693248, %r61, %p87; - mov.u32 %r63, 0; - mov.b64 %fd73, {%r63, %r62}; - -BB20_89: - setp.eq.f64 %p88, %fd52, 0d0000000000000000; - setp.eq.f64 %p89, %fd1, 0d3FF0000000000000; - or.pred %p90, %p89, %p88; - selp.f64 %fd76, 0d3FF0000000000000, %fd73, %p90; - -BB20_93: - st.global.f64 [%rd1], %fd76; - -BB20_94: + and.b32 %r51, %r4, 2147483647; + setp.eq.s32 %p86, %r51, 2146435072; + setp.eq.s32 %p87, %r50, 0; + and.pred %p88, %p86, %p87; + mov.f64 %fd71, %fd41; + @!%p88 bra BB6_85; + bra.uni BB6_83; + +BB6_83: + shr.s32 %r52, %r5, 31; + and.b32 %r53, %r52, -2146435072; + selp.b32 %r54, -1048576, 2146435072, %p2; + add.s32 %r55, %r54, %r53; + mov.u32 %r56, 0; + mov.b64 %fd71, {%r56, %r55}; + +BB6_85: + setp.eq.f64 %p92, %fd52, 0d0000000000000000; + setp.eq.f64 %p93, %fd1, 0d3FF0000000000000; + or.pred %p94, %p93, %p92; + selp.f64 %fd74, 0d3FF0000000000000, %fd71, %p94; + +BB6_89: + st.global.f64 [%rd1], %fd74; + +BB6_90: bar.sync 0; ret; } @@ -3045,14 +1274,14 @@ BB20_94: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.s32 %p1, %r1, %r2; - @%p1 bra BB21_2; + @%p1 bra BB7_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 8; add.s64 %rd4, %rd2, %rd3; st.global.f64 [%rd4], %fd1; -BB21_2: +BB7_2: ret; } @@ -3080,9 +1309,9 @@ BB21_2: mov.f64 %fd76, 0d0000000000000000; mov.f64 %fd77, %fd76; setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB22_4; + @%p1 bra BB8_4; -BB22_1: +BB8_1: mov.f64 %fd1, %fd77; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r32, 8; @@ -3091,23 +1320,23 @@ BB22_1: add.f64 %fd78, %fd1, %fd30; add.s32 %r3, %r32, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB22_3; + @%p2 bra BB8_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd31, [%rd9]; add.f64 %fd78, %fd78, %fd31; -BB22_3: +BB8_3: mov.f64 %fd77, %fd78; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r32, %r12, %r13, %r32; setp.lt.u32 %p3, %r32, %r5; mov.f64 %fd76, %fd77; - @%p3 bra BB22_1; + @%p3 bra BB8_1; -BB22_4: +BB8_4: mov.f64 %fd74, %fd76; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -3115,130 +1344,130 @@ BB22_4: st.shared.f64 [%rd1], %fd74; bar.sync 0; setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB22_8; + @%p4 bra BB8_8; setp.gt.u32 %p5, %r6, 511; mov.f64 %fd75, %fd74; - @%p5 bra BB22_7; + @%p5 bra BB8_7; ld.shared.f64 %fd32, [%rd1+4096]; add.f64 %fd75, %fd74, %fd32; st.shared.f64 [%rd1], %fd75; -BB22_7: +BB8_7: mov.f64 %fd74, %fd75; bar.sync 0; -BB22_8: +BB8_8: mov.f64 %fd72, %fd74; setp.lt.u32 %p6, %r9, 512; - @%p6 bra BB22_12; + @%p6 bra BB8_12; setp.gt.u32 %p7, %r6, 255; mov.f64 %fd73, %fd72; - @%p7 bra BB22_11; + @%p7 bra BB8_11; ld.shared.f64 %fd33, [%rd1+2048]; add.f64 %fd73, %fd72, %fd33; st.shared.f64 [%rd1], %fd73; -BB22_11: +BB8_11: mov.f64 %fd72, %fd73; bar.sync 0; -BB22_12: +BB8_12: mov.f64 %fd70, %fd72; setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB22_16; + @%p8 bra BB8_16; setp.gt.u32 %p9, %r6, 127; mov.f64 %fd71, %fd70; - @%p9 bra BB22_15; + @%p9 bra BB8_15; ld.shared.f64 %fd34, [%rd1+1024]; add.f64 %fd71, %fd70, %fd34; st.shared.f64 [%rd1], %fd71; -BB22_15: +BB8_15: mov.f64 %fd70, %fd71; bar.sync 0; -BB22_16: +BB8_16: mov.f64 %fd68, %fd70; setp.lt.u32 %p10, %r9, 128; - @%p10 bra BB22_20; + @%p10 bra BB8_20; setp.gt.u32 %p11, %r6, 63; mov.f64 %fd69, %fd68; - @%p11 bra BB22_19; + @%p11 bra BB8_19; ld.shared.f64 %fd35, [%rd1+512]; add.f64 %fd69, %fd68, %fd35; st.shared.f64 [%rd1], %fd69; -BB22_19: +BB8_19: mov.f64 %fd68, %fd69; bar.sync 0; -BB22_20: +BB8_20: mov.f64 %fd67, %fd68; setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB22_33; + @%p12 bra BB8_33; setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB22_23; + @%p13 bra BB8_23; ld.volatile.shared.f64 %fd36, [%rd1+256]; add.f64 %fd67, %fd67, %fd36; st.volatile.shared.f64 [%rd1], %fd67; -BB22_23: +BB8_23: mov.f64 %fd66, %fd67; setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB22_25; + @%p14 bra BB8_25; ld.volatile.shared.f64 %fd37, [%rd1+128]; add.f64 %fd66, %fd66, %fd37; st.volatile.shared.f64 [%rd1], %fd66; -BB22_25: +BB8_25: mov.f64 %fd65, %fd66; setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB22_27; + @%p15 bra BB8_27; ld.volatile.shared.f64 %fd38, [%rd1+64]; add.f64 %fd65, %fd65, %fd38; st.volatile.shared.f64 [%rd1], %fd65; -BB22_27: +BB8_27: mov.f64 %fd64, %fd65; setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB22_29; + @%p16 bra BB8_29; ld.volatile.shared.f64 %fd39, [%rd1+32]; add.f64 %fd64, %fd64, %fd39; st.volatile.shared.f64 [%rd1], %fd64; -BB22_29: +BB8_29: mov.f64 %fd63, %fd64; setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB22_31; + @%p17 bra BB8_31; ld.volatile.shared.f64 %fd40, [%rd1+16]; add.f64 %fd63, %fd63, %fd40; st.volatile.shared.f64 [%rd1], %fd63; -BB22_31: +BB8_31: setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB22_33; + @%p18 bra BB8_33; ld.volatile.shared.f64 %fd41, [%rd1+8]; add.f64 %fd42, %fd63, %fd41; st.volatile.shared.f64 [%rd1], %fd42; -BB22_33: +BB8_33: setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB22_35; + @%p19 bra BB8_35; ld.shared.f64 %fd43, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -3246,7 +1475,7 @@ BB22_33: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd43; -BB22_35: +BB8_35: ret; } @@ -3270,17 +1499,17 @@ BB22_35: ld.param.u32 %r4, [reduce_row_sum_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB23_35; + @%p1 bra BB9_35; mov.u32 %r38, %tid.x; mov.f64 %fd72, 0d0000000000000000; mov.f64 %fd73, %fd72; setp.ge.u32 %p2, %r38, %r4; - @%p2 bra BB23_4; + @%p2 bra BB9_4; cvta.to.global.u64 %rd3, %rd1; -BB23_3: +BB9_3: mad.lo.s32 %r8, %r6, %r4, %r38; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -3290,9 +1519,9 @@ BB23_3: add.s32 %r38, %r9, %r38; setp.lt.u32 %p3, %r38, %r4; mov.f64 %fd72, %fd73; - @%p3 bra BB23_3; + @%p3 bra BB9_3; -BB23_4: +BB9_4: mov.f64 %fd70, %fd72; mov.u32 %r10, %tid.x; mul.wide.u32 %rd6, %r10, 8; @@ -3302,130 +1531,130 @@ BB23_4: bar.sync 0; mov.u32 %r11, %ntid.x; setp.lt.u32 %p4, %r11, 1024; - @%p4 bra BB23_8; + @%p4 bra BB9_8; setp.gt.u32 %p5, %r10, 511; mov.f64 %fd71, %fd70; - @%p5 bra BB23_7; + @%p5 bra BB9_7; ld.shared.f64 %fd29, [%rd8+4096]; add.f64 %fd71, %fd70, %fd29; st.shared.f64 [%rd8], %fd71; -BB23_7: +BB9_7: mov.f64 %fd70, %fd71; bar.sync 0; -BB23_8: +BB9_8: mov.f64 %fd68, %fd70; setp.lt.u32 %p6, %r11, 512; - @%p6 bra BB23_12; + @%p6 bra BB9_12; setp.gt.u32 %p7, %r10, 255; mov.f64 %fd69, %fd68; - @%p7 bra BB23_11; + @%p7 bra BB9_11; ld.shared.f64 %fd30, [%rd8+2048]; add.f64 %fd69, %fd68, %fd30; st.shared.f64 [%rd8], %fd69; -BB23_11: +BB9_11: mov.f64 %fd68, %fd69; bar.sync 0; -BB23_12: +BB9_12: mov.f64 %fd66, %fd68; setp.lt.u32 %p8, %r11, 256; - @%p8 bra BB23_16; + @%p8 bra BB9_16; setp.gt.u32 %p9, %r10, 127; mov.f64 %fd67, %fd66; - @%p9 bra BB23_15; + @%p9 bra BB9_15; ld.shared.f64 %fd31, [%rd8+1024]; add.f64 %fd67, %fd66, %fd31; st.shared.f64 [%rd8], %fd67; -BB23_15: +BB9_15: mov.f64 %fd66, %fd67; bar.sync 0; -BB23_16: +BB9_16: mov.f64 %fd64, %fd66; setp.lt.u32 %p10, %r11, 128; - @%p10 bra BB23_20; + @%p10 bra BB9_20; setp.gt.u32 %p11, %r10, 63; mov.f64 %fd65, %fd64; - @%p11 bra BB23_19; + @%p11 bra BB9_19; ld.shared.f64 %fd32, [%rd8+512]; add.f64 %fd65, %fd64, %fd32; st.shared.f64 [%rd8], %fd65; -BB23_19: +BB9_19: mov.f64 %fd64, %fd65; bar.sync 0; -BB23_20: +BB9_20: mov.f64 %fd63, %fd64; setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB23_33; + @%p12 bra BB9_33; setp.lt.u32 %p13, %r11, 64; - @%p13 bra BB23_23; + @%p13 bra BB9_23; ld.volatile.shared.f64 %fd33, [%rd8+256]; add.f64 %fd63, %fd63, %fd33; st.volatile.shared.f64 [%rd8], %fd63; -BB23_23: +BB9_23: mov.f64 %fd62, %fd63; setp.lt.u32 %p14, %r11, 32; - @%p14 bra BB23_25; + @%p14 b
<TRUNCATED>
