http://git-wip-us.apache.org/repos/asf/systemml/blob/61139e40/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index 1ab32f5..ac04967 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -4595,6 +4595,1739 @@ BB31_126: ret; } + // .globl sparse_dense_matrix_scalar_op_d +.visible .entry sparse_dense_matrix_scalar_op_d( + .param .u64 sparse_dense_matrix_scalar_op_d_param_0, + .param .u64 sparse_dense_matrix_scalar_op_d_param_1, + .param .u64 sparse_dense_matrix_scalar_op_d_param_2, + .param .f64 sparse_dense_matrix_scalar_op_d_param_3, + .param .u64 sparse_dense_matrix_scalar_op_d_param_4, + .param .u32 sparse_dense_matrix_scalar_op_d_param_5, + .param .u32 sparse_dense_matrix_scalar_op_d_param_6, + .param .u32 sparse_dense_matrix_scalar_op_d_param_7, + .param .u32 sparse_dense_matrix_scalar_op_d_param_8 +) +{ + .reg .pred %p<133>; + .reg .b32 %r<92>; + .reg .f64 %fd<99>; + .reg .b64 %rd<28>; + + + ld.param.u64 %rd4, [sparse_dense_matrix_scalar_op_d_param_0]; + ld.param.u64 %rd5, [sparse_dense_matrix_scalar_op_d_param_1]; + ld.param.u64 %rd6, [sparse_dense_matrix_scalar_op_d_param_2]; + ld.param.f64 %fd68, [sparse_dense_matrix_scalar_op_d_param_3]; + ld.param.u64 %rd7, [sparse_dense_matrix_scalar_op_d_param_4]; + ld.param.u32 %r9, [sparse_dense_matrix_scalar_op_d_param_5]; + ld.param.u32 %r6, [sparse_dense_matrix_scalar_op_d_param_6]; + ld.param.u32 %r7, [sparse_dense_matrix_scalar_op_d_param_7]; + ld.param.u32 %r8, [sparse_dense_matrix_scalar_op_d_param_8]; + mov.u32 %r10, %ntid.x; + mov.u32 %r11, %ctaid.x; + mov.u32 %r12, %tid.x; + mad.lo.s32 %r1, %r10, %r11, %r12; + setp.ge.s32 %p3, %r1, %r9; + @%p3 bra BB32_142; + + cvta.to.global.u64 %rd8, %rd7; + cvta.to.global.u64 %rd9, %rd6; + mul.wide.s32 %rd10, %r1, 8; + add.s64 %rd11, %rd9, %rd10; + ld.global.f64 %fd1, [%rd11]; + cvta.to.global.u64 %rd12, %rd4; + mul.wide.s32 %rd13, %r1, 4; + add.s64 %rd14, %rd12, %rd13; + ld.global.u32 %r13, [%rd14]; + cvta.to.global.u64 %rd15, %rd5; + add.s64 %rd16, %rd15, %rd13; + ld.global.u32 %r14, [%rd16]; + mad.lo.s32 %r15, %r13, %r6, %r14; + mul.wide.s32 %rd17, %r15, 8; + add.s64 %rd1, %rd8, %rd17; + setp.eq.s32 %p4, %r8, 0; + @%p4 bra BB32_72; + + mov.f64 %fd94, 0d7FEFFFFFFFFFFFFF; + setp.gt.s32 %p5, %r7, 8; + @%p5 bra BB32_19; + + setp.gt.s32 %p19, %r7, 3; + @%p19 bra BB32_11; + + setp.gt.s32 %p26, %r7, 1; + @%p26 bra BB32_8; + + setp.eq.s32 %p29, %r7, 0; + @%p29 bra BB32_70; + bra.uni BB32_6; + +BB32_70: + add.f64 %fd94, %fd1, %fd68; + bra.uni BB32_71; + +BB32_72: + mov.f64 %fd98, 0d7FEFFFFFFFFFFFFF; + setp.gt.s32 %p69, %r7, 8; + @%p69 bra BB32_89; + + setp.gt.s32 %p83, %r7, 3; + @%p83 bra BB32_81; + + setp.gt.s32 %p90, %r7, 1; + @%p90 bra BB32_78; + + setp.eq.s32 %p93, %r7, 0; + @%p93 bra BB32_140; + bra.uni BB32_76; + +BB32_140: + add.f64 %fd98, %fd1, %fd68; + bra.uni BB32_141; + +BB32_19: + setp.gt.s32 %p6, %r7, 13; + @%p6 bra BB32_28; + + setp.gt.s32 %p13, %r7, 10; + @%p13 bra BB32_24; + + setp.eq.s32 %p17, %r7, 9; + @%p17 bra BB32_48; + bra.uni BB32_22; + +BB32_48: + setp.eq.f64 %p44, %fd1, %fd68; + selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p44; + bra.uni BB32_71; + +BB32_89: + setp.gt.s32 %p70, %r7, 13; + @%p70 bra BB32_98; + + setp.gt.s32 %p77, %r7, 10; + @%p77 bra BB32_94; + + setp.eq.s32 %p81, %r7, 9; + @%p81 bra BB32_118; + bra.uni BB32_92; + +BB32_118: + setp.eq.f64 %p108, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p108; + bra.uni BB32_141; + +BB32_11: + setp.gt.s32 %p20, %r7, 5; + @%p20 bra BB32_15; + + setp.eq.s32 %p24, %r7, 4; + @%p24 bra BB32_51; + bra.uni BB32_13; + +BB32_51: + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd68; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r3}, %fd1; + } + bfe.u32 %r28, %r3, 20, 11; + add.s32 %r29, %r28, -1012; + mov.b64 %rd22, %fd1; + shl.b64 %rd2, %rd22, %r29; + setp.eq.s64 %p49, %rd2, -9223372036854775808; + abs.f64 %fd18, %fd68; + // Callseq Start 3 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.f64 [param0+0], %fd18; + .param .b64 param1; + st.param.f64 [param1+0], %fd1; + .param .b64 retval0; + call.uni (retval0), + __internal_accurate_pow, + ( + param0, + param1 + ); + ld.param.f64 %fd24, [retval0+0]; + + //{ + }// Callseq End 3 + setp.lt.s32 %p50, %r2, 0; + and.pred %p1, %p50, %p49; + @!%p1 bra BB32_53; + bra.uni BB32_52; + +BB32_52: + { + .reg .b32 %temp; + mov.b64 {%temp, %r30}, %fd24; + } + xor.b32 %r31, %r30, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%r32, %temp}, %fd24; + } + mov.b64 %fd24, {%r32, %r31}; + +BB32_53: + setp.eq.f64 %p51, %fd68, 0d0000000000000000; + @%p51 bra BB32_56; + bra.uni BB32_54; + +BB32_56: + selp.b32 %r33, %r2, 0, %p49; + or.b32 %r34, %r33, 2146435072; + setp.lt.s32 %p55, %r3, 0; + selp.b32 %r35, %r34, %r33, %p55; + mov.u32 %r36, 0; + mov.b64 %fd24, {%r36, %r35}; + bra.uni BB32_57; + +BB32_28: + setp.gt.s32 %p7, %r7, 15; + @%p7 bra BB32_32; + + setp.eq.s32 %p11, %r7, 14; + @%p11 bra BB32_45; + bra.uni BB32_30; + +BB32_45: + cvt.rni.s64.f64 %rd18, %fd68; + cvt.u32.u64 %r22, %rd18; + cvt.rni.s64.f64 %rd19, %fd1; + cvt.u32.u64 %r23, %rd19; + or.b32 %r24, %r23, %r22; + setp.eq.s32 %p41, %r24, 0; + selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p41; + bra.uni BB32_71; + +BB32_81: + setp.gt.s32 %p84, %r7, 5; + @%p84 bra BB32_85; + + setp.eq.s32 %p88, %r7, 4; + @%p88 bra BB32_121; + bra.uni BB32_83; + +BB32_121: + { + .reg .b32 %temp; + mov.b64 {%temp, %r4}, %fd1; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r5}, %fd68; + } + bfe.u32 %r66, %r5, 20, 11; + add.s32 %r67, %r66, -1012; + mov.b64 %rd27, %fd68; + shl.b64 %rd3, %rd27, %r67; + setp.eq.s64 %p113, %rd3, -9223372036854775808; + abs.f64 %fd51, %fd1; + // Callseq Start 4 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.f64 [param0+0], %fd51; + .param .b64 param1; + st.param.f64 [param1+0], %fd68; + .param .b64 retval0; + call.uni (retval0), + __internal_accurate_pow, + ( + param0, + param1 + ); + ld.param.f64 %fd57, [retval0+0]; + + //{ + }// Callseq End 4 + setp.lt.s32 %p114, %r4, 0; + and.pred %p2, %p114, %p113; + @!%p2 bra BB32_123; + bra.uni BB32_122; + +BB32_122: + { + .reg .b32 %temp; + mov.b64 {%temp, %r68}, %fd57; + } + xor.b32 %r69, %r68, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%r70, %temp}, %fd57; + } + mov.b64 %fd57, {%r70, %r69}; + +BB32_123: + setp.eq.f64 %p115, %fd1, 0d0000000000000000; + @%p115 bra BB32_126; + bra.uni BB32_124; + +BB32_126: + selp.b32 %r71, %r4, 0, %p113; + or.b32 %r72, %r71, 2146435072; + setp.lt.s32 %p119, %r5, 0; + selp.b32 %r73, %r72, %r71, %p119; + mov.u32 %r74, 0; + mov.b64 %fd57, {%r74, %r73}; + bra.uni BB32_127; + +BB32_98: + setp.gt.s32 %p71, %r7, 15; + @%p71 bra BB32_102; + + setp.eq.s32 %p75, %r7, 14; + @%p75 bra BB32_115; + bra.uni BB32_100; + +BB32_115: + cvt.rni.s64.f64 %rd23, %fd1; + cvt.u32.u64 %r60, %rd23; + cvt.rni.s64.f64 %rd24, %fd68; + cvt.u32.u64 %r61, %rd24; + or.b32 %r62, %r61, %r60; + setp.eq.s32 %p105, %r62, 0; + selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p105; + bra.uni BB32_141; + +BB32_8: + setp.eq.s32 %p27, %r7, 2; + @%p27 bra BB32_69; + bra.uni BB32_9; + +BB32_69: + mul.f64 %fd94, %fd1, %fd68; + bra.uni BB32_71; + +BB32_24: + setp.eq.s32 %p14, %r7, 11; + @%p14 bra BB32_47; + + setp.eq.s32 %p15, %r7, 12; + @%p15 bra BB32_46; + bra.uni BB32_26; + +BB32_46: + max.f64 %fd94, %fd68, %fd1; + bra.uni BB32_71; + +BB32_15: + setp.eq.s32 %p21, %r7, 6; + @%p21 bra BB32_50; + + setp.eq.s32 %p22, %r7, 7; + @%p22 bra BB32_49; + bra.uni BB32_17; + +BB32_49: + setp.lt.f64 %p46, %fd1, %fd68; + selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p46; + bra.uni BB32_71; + +BB32_32: + setp.eq.s32 %p8, %r7, 16; + @%p8 bra BB32_44; + + setp.eq.s32 %p9, %r7, 17; + @%p9 bra BB32_39; + bra.uni BB32_34; + +BB32_39: + setp.eq.f64 %p34, %fd1, 0d0000000000000000; + setp.eq.f64 %p35, %fd1, 0d8000000000000000; + or.pred %p36, %p34, %p35; + mov.f64 %fd94, 0d7FF8000000000000; + @%p36 bra BB32_71; + + div.rn.f64 %fd94, %fd68, %fd1; + abs.f64 %fd72, %fd94; + setp.gtu.f64 %p37, %fd72, 0d7FF0000000000000; + @%p37 bra BB32_71; + + { + .reg .b32 %temp; + mov.b64 {%temp, %r19}, %fd94; + } + and.b32 %r20, %r19, 2147483647; + setp.ne.s32 %p38, %r20, 2146435072; + @%p38 bra BB32_43; + + { + .reg .b32 %temp; + mov.b64 {%r21, %temp}, %fd94; + } + setp.eq.s32 %p39, %r21, 0; + @%p39 bra BB32_71; + +BB32_43: + cvt.rmi.f64.f64 %fd73, %fd94; + mul.f64 %fd74, %fd1, %fd73; + sub.f64 %fd94, %fd68, %fd74; + bra.uni BB32_71; + +BB32_78: + setp.eq.s32 %p91, %r7, 2; + @%p91 bra BB32_139; + bra.uni BB32_79; + +BB32_139: + mul.f64 %fd98, %fd1, %fd68; + bra.uni BB32_141; + +BB32_94: + setp.eq.s32 %p78, %r7, 11; + @%p78 bra BB32_117; + + setp.eq.s32 %p79, %r7, 12; + @%p79 bra BB32_116; + bra.uni BB32_96; + +BB32_116: + max.f64 %fd98, %fd1, %fd68; + bra.uni BB32_141; + +BB32_85: + setp.eq.s32 %p85, %r7, 6; + @%p85 bra BB32_120; + + setp.eq.s32 %p86, %r7, 7; + @%p86 bra BB32_119; + bra.uni BB32_87; + +BB32_119: + setp.gt.f64 %p110, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p110; + bra.uni BB32_141; + +BB32_102: + setp.eq.s32 %p72, %r7, 16; + @%p72 bra BB32_114; + + setp.eq.s32 %p73, %r7, 17; + @%p73 bra BB32_109; + bra.uni BB32_104; + +BB32_109: + setp.eq.f64 %p98, %fd68, 0d0000000000000000; + setp.eq.f64 %p99, %fd68, 0d8000000000000000; + or.pred %p100, %p98, %p99; + mov.f64 %fd98, 0d7FF8000000000000; + @%p100 bra BB32_141; + + div.rn.f64 %fd98, %fd1, %fd68; + abs.f64 %fd83, %fd98; + setp.gtu.f64 %p101, %fd83, 0d7FF0000000000000; + @%p101 bra BB32_141; + + { + .reg .b32 %temp; + mov.b64 {%temp, %r57}, %fd98; + } + and.b32 %r58, %r57, 2147483647; + setp.ne.s32 %p102, %r58, 2146435072; + @%p102 bra BB32_113; + + { + .reg .b32 %temp; + mov.b64 {%r59, %temp}, %fd98; + } + setp.eq.s32 %p103, %r59, 0; + @%p103 bra BB32_141; + +BB32_113: + cvt.rmi.f64.f64 %fd84, %fd98; + mul.f64 %fd85, %fd84, %fd68; + sub.f64 %fd98, %fd1, %fd85; + bra.uni BB32_141; + +BB32_6: + setp.eq.s32 %p30, %r7, 1; + @%p30 bra BB32_7; + bra.uni BB32_71; + +BB32_7: + sub.f64 %fd94, %fd68, %fd1; + bra.uni BB32_71; + +BB32_22: + setp.eq.s32 %p18, %r7, 10; + @%p18 bra BB32_23; + bra.uni BB32_71; + +BB32_23: + setp.neu.f64 %p43, %fd1, %fd68; + selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p43; + bra.uni BB32_71; + +BB32_13: + setp.eq.s32 %p25, %r7, 5; + @%p25 bra BB32_14; + bra.uni BB32_71; + +BB32_14: + setp.gt.f64 %p48, %fd1, %fd68; + selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p48; + bra.uni BB32_71; + +BB32_30: + setp.eq.s32 %p12, %r7, 15; + @%p12 bra BB32_31; + bra.uni BB32_71; + +BB32_31: + mul.f64 %fd76, %fd1, %fd68; + mov.f64 %fd77, 0d3FF0000000000000; + sub.f64 %fd94, %fd77, %fd76; + bra.uni BB32_71; + +BB32_9: + setp.eq.s32 %p28, %r7, 3; + @%p28 bra BB32_10; + bra.uni BB32_71; + +BB32_10: + div.rn.f64 %fd94, %fd68, %fd1; + bra.uni BB32_71; + +BB32_47: + min.f64 %fd94, %fd68, %fd1; + bra.uni BB32_71; + +BB32_26: + setp.eq.s32 %p16, %r7, 13; + @%p16 bra BB32_27; + bra.uni BB32_71; + +BB32_27: + cvt.rni.s64.f64 %rd20, %fd68; + cvt.u32.u64 %r25, %rd20; + cvt.rni.s64.f64 %rd21, %fd1; + cvt.u32.u64 %r26, %rd21; + and.b32 %r27, %r26, %r25; + setp.eq.s32 %p42, %r27, 0; + selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p42; + bra.uni BB32_71; + +BB32_50: + setp.ltu.f64 %p47, %fd1, %fd68; + selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p47; + bra.uni BB32_71; + +BB32_17: + setp.eq.s32 %p23, %r7, 8; + @%p23 bra BB32_18; + bra.uni BB32_71; + +BB32_18: + setp.gtu.f64 %p45, %fd1, %fd68; + selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p45; + bra.uni BB32_71; + +BB32_44: + setp.neu.f64 %p40, %fd68, 0d0000000000000000; + sub.f64 %fd75, %fd68, %fd1; + selp.f64 %fd94, %fd75, 0d0000000000000000, %p40; + bra.uni BB32_71; + +BB32_34: + setp.ne.s32 %p10, %r7, 18; + @%p10 bra BB32_71; + + div.rn.f64 %fd94, %fd68, %fd1; + abs.f64 %fd70, %fd94; + setp.gtu.f64 %p31, %fd70, 0d7FF0000000000000; + @%p31 bra BB32_71; + + { + .reg .b32 %temp; + mov.b64 {%temp, %r16}, %fd94; + } + and.b32 %r17, %r16, 2147483647; + setp.ne.s32 %p32, %r17, 2146435072; + @%p32 bra BB32_38; + + { + .reg .b32 %temp; + mov.b64 {%r18, %temp}, %fd94; + } + setp.eq.s32 %p33, %r18, 0; + @%p33 bra BB32_71; + +BB32_38: + cvt.rmi.f64.f64 %fd94, %fd94; + bra.uni BB32_71; + +BB32_76: + setp.eq.s32 %p94, %r7, 1; + @%p94 bra BB32_77; + bra.uni BB32_141; + +BB32_77: + sub.f64 %fd98, %fd1, %fd68; + bra.uni BB32_141; + +BB32_92: + setp.eq.s32 %p82, %r7, 10; + @%p82 bra BB32_93; + bra.uni BB32_141; + +BB32_93: + setp.neu.f64 %p107, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p107; + bra.uni BB32_141; + +BB32_83: + setp.eq.s32 %p89, %r7, 5; + @%p89 bra BB32_84; + bra.uni BB32_141; + +BB32_84: + setp.lt.f64 %p112, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p112; + bra.uni BB32_141; + +BB32_100: + setp.eq.s32 %p76, %r7, 15; + @%p76 bra BB32_101; + bra.uni BB32_141; + +BB32_101: + mul.f64 %fd87, %fd1, %fd68; + mov.f64 %fd88, 0d3FF0000000000000; + sub.f64 %fd98, %fd88, %fd87; + bra.uni BB32_141; + +BB32_79: + setp.eq.s32 %p92, %r7, 3; + @%p92 bra BB32_80; + bra.uni BB32_141; + +BB32_80: + div.rn.f64 %fd98, %fd1, %fd68; + bra.uni BB32_141; + +BB32_117: + min.f64 %fd98, %fd1, %fd68; + bra.uni BB32_141; + +BB32_96: + setp.eq.s32 %p80, %r7, 13; + @%p80 bra BB32_97; + bra.uni BB32_141; + +BB32_97: + cvt.rni.s64.f64 %rd25, %fd1; + cvt.u32.u64 %r63, %rd25; + cvt.rni.s64.f64 %rd26, %fd68; + cvt.u32.u64 %r64, %rd26; + and.b32 %r65, %r64, %r63; + setp.eq.s32 %p106, %r65, 0; + selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p106; + bra.uni BB32_141; + +BB32_120: + setp.gtu.f64 %p111, %fd1, %fd68; + selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p111; + bra.uni BB32_141; + +BB32_87: + setp.eq.s32 %p87, %r7, 8; + @%p87 bra BB32_88; + bra.uni BB32_141; + +BB32_88: + setp.ltu.f64 %p109, %fd1, %fd68; + selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p109; + bra.uni BB32_141; + +BB32_114: + setp.neu.f64 %p104, %fd1, 0d0000000000000000; + sub.f64 %fd86, %fd1, %fd68; + selp.f64 %fd98, %fd86, 0d0000000000000000, %p104; + bra.uni BB32_141; + +BB32_104: + setp.ne.s32 %p74, %r7, 18; + @%p74 bra BB32_141; + + div.rn.f64 %fd98, %fd1, %fd68; + abs.f64 %fd81, %fd98; + setp.gtu.f64 %p95, %fd81, 0d7FF0000000000000; + @%p95 bra BB32_141; + + { + .reg .b32 %temp; + mov.b64 {%temp, %r54}, %fd98; + } + and.b32 %r55, %r54, 2147483647; + setp.ne.s32 %p96, %r55, 2146435072; + @%p96 bra BB32_108; + + { + .reg .b32 %temp; + mov.b64 {%r56, %temp}, %fd98; + } + setp.eq.s32 %p97, %r56, 0; + @%p97 bra BB32_141; + +BB32_108: + cvt.rmi.f64.f64 %fd98, %fd98; + bra.uni BB32_141; + +BB32_54: + setp.gt.s32 %p52, %r2, -1; + @%p52 bra BB32_57; + + cvt.rzi.f64.f64 %fd78, %fd1; + setp.neu.f64 %p53, %fd78, %fd1; + selp.f64 %fd24, 0dFFF8000000000000, %fd24, %p53; + +BB32_57: + add.f64 %fd93, %fd1, %fd68; + { + .reg .b32 %temp; + mov.b64 {%temp, %r37}, %fd93; + } + and.b32 %r38, %r37, 2146435072; + setp.ne.s32 %p56, %r38, 2146435072; + @%p56 bra BB32_58; + + setp.gtu.f64 %p57, %fd18, 0d7FF0000000000000; + @%p57 bra BB32_68; + + abs.f64 %fd79, %fd1; + setp.gtu.f64 %p58, %fd79, 0d7FF0000000000000; + @%p58 bra BB32_68; + + and.b32 %r39, %r3, 2147483647; + setp.ne.s32 %p59, %r39, 2146435072; + @%p59 bra BB32_63; + + { + .reg .b32 %temp; + mov.b64 {%r40, %temp}, %fd1; + } + setp.eq.s32 %p60, %r40, 0; + @%p60 bra BB32_67; + +BB32_63: + and.b32 %r41, %r2, 2147483647; + setp.ne.s32 %p61, %r41, 2146435072; + @%p61 bra BB32_64; + + { + .reg .b32 %temp; + mov.b64 {%r42, %temp}, %fd68; + } + setp.ne.s32 %p62, %r42, 0; + mov.f64 %fd93, %fd24; + @%p62 bra BB32_68; + + shr.s32 %r43, %r3, 31; + and.b32 %r44, %r43, -2146435072; + add.s32 %r45, %r44, 2146435072; + or.b32 %r46, %r45, -2147483648; + selp.b32 %r47, %r46, %r45, %p1; + mov.u32 %r48, 0; + mov.b64 %fd93, {%r48, %r47}; + bra.uni BB32_68; + +BB32_58: + mov.f64 %fd93, %fd24; + +BB32_68: + setp.eq.f64 %p66, %fd1, 0d0000000000000000; + setp.eq.f64 %p67, %fd68, 0d3FF0000000000000; + or.pred %p68, %p67, %p66; + selp.f64 %fd94, 0d3FF0000000000000, %fd93, %p68; + +BB32_71: + st.global.f64 [%rd1], %fd94; + bra.uni BB32_142; + +BB32_124: + setp.gt.s32 %p116, %r4, -1; + @%p116 bra BB32_127; + + cvt.rzi.f64.f64 %fd89, %fd68; + setp.neu.f64 %p117, %fd89, %fd68; + selp.f64 %fd57, 0dFFF8000000000000, %fd57, %p117; + +BB32_127: + add.f64 %fd97, %fd1, %fd68; + { + .reg .b32 %temp; + mov.b64 {%temp, %r75}, %fd97; + } + and.b32 %r76, %r75, 2146435072; + setp.ne.s32 %p120, %r76, 2146435072; + @%p120 bra BB32_128; + + setp.gtu.f64 %p121, %fd51, 0d7FF0000000000000; + @%p121 bra BB32_138; + + abs.f64 %fd90, %fd68; + setp.gtu.f64 %p122, %fd90, 0d7FF0000000000000; + @%p122 bra BB32_138; + + and.b32 %r77, %r5, 2147483647; + setp.ne.s32 %p123, %r77, 2146435072; + @%p123 bra BB32_133; + + { + .reg .b32 %temp; + mov.b64 {%r78, %temp}, %fd68; + } + setp.eq.s32 %p124, %r78, 0; + @%p124 bra BB32_137; + +BB32_133: + and.b32 %r79, %r4, 2147483647; + setp.ne.s32 %p125, %r79, 2146435072; + @%p125 bra BB32_134; + + { + .reg .b32 %temp; + mov.b64 {%r80, %temp}, %fd1; + } + setp.ne.s32 %p126, %r80, 0; + mov.f64 %fd97, %fd57; + @%p126 bra BB32_138; + + shr.s32 %r81, %r5, 31; + and.b32 %r82, %r81, -2146435072; + add.s32 %r83, %r82, 2146435072; + or.b32 %r84, %r83, -2147483648; + selp.b32 %r85, %r84, %r83, %p2; + mov.u32 %r86, 0; + mov.b64 %fd97, {%r86, %r85}; + bra.uni BB32_138; + +BB32_128: + mov.f64 %fd97, %fd57; + +BB32_138: + setp.eq.f64 %p130, %fd68, 0d0000000000000000; + setp.eq.f64 %p131, %fd1, 0d3FF0000000000000; + or.pred %p132, %p131, %p130; + selp.f64 %fd98, 0d3FF0000000000000, %fd97, %p132; + +BB32_141: + st.global.f64 [%rd1], %fd98; + +BB32_142: + bar.sync 0; + ret; + +BB32_64: + mov.f64 %fd93, %fd24; + bra.uni BB32_68; + +BB32_134: + mov.f64 %fd97, %fd57; + bra.uni BB32_138; + +BB32_67: + setp.gt.f64 %p63, %fd18, 0d3FF0000000000000; + selp.b32 %r49, 2146435072, 0, %p63; + xor.b32 %r50, %r49, 2146435072; + setp.lt.s32 %p64, %r3, 0; + selp.b32 %r51, %r50, %r49, %p64; + setp.eq.f64 %p65, %fd68, 0dBFF0000000000000; + selp.b32 %r52, 1072693248, %r51, %p65; + mov.u32 %r53, 0; + mov.b64 %fd93, {%r53, %r52}; + bra.uni BB32_68; + +BB32_137: + setp.gt.f64 %p127, %fd51, 0d3FF0000000000000; + selp.b32 %r87, 2146435072, 0, %p127; + xor.b32 %r88, %r87, 2146435072; + setp.lt.s32 %p128, %r5, 0; + selp.b32 %r89, %r88, %r87, %p128; + setp.eq.f64 %p129, %fd1, 0dBFF0000000000000; + selp.b32 %r90, 1072693248, %r89, %p129; + mov.u32 %r91, 0; + mov.b64 %fd97, {%r91, %r90}; + bra.uni BB32_138; +} + + // .globl sparse_dense_matrix_scalar_op_f +.visible .entry sparse_dense_matrix_scalar_op_f( + .param .u64 sparse_dense_matrix_scalar_op_f_param_0, + .param .u64 sparse_dense_matrix_scalar_op_f_param_1, + .param .u64 sparse_dense_matrix_scalar_op_f_param_2, + .param .f64 sparse_dense_matrix_scalar_op_f_param_3, + .param .u64 sparse_dense_matrix_scalar_op_f_param_4, + .param .u32 sparse_dense_matrix_scalar_op_f_param_5, + .param .u32 sparse_dense_matrix_scalar_op_f_param_6, + .param .u32 sparse_dense_matrix_scalar_op_f_param_7, + .param .u32 sparse_dense_matrix_scalar_op_f_param_8 +) +{ + .reg .pred %p<139>; + .reg .f32 %f<267>; + .reg .b32 %r<62>; + .reg .f64 %fd<2>; + .reg .b64 %rd<23>; + + + ld.param.u64 %rd2, [sparse_dense_matrix_scalar_op_f_param_0]; + ld.param.u64 %rd3, [sparse_dense_matrix_scalar_op_f_param_1]; + ld.param.u64 %rd4, [sparse_dense_matrix_scalar_op_f_param_2]; + ld.param.f64 %fd1, [sparse_dense_matrix_scalar_op_f_param_3]; + ld.param.u64 %rd5, [sparse_dense_matrix_scalar_op_f_param_4]; + ld.param.u32 %r5, [sparse_dense_matrix_scalar_op_f_param_5]; + ld.param.u32 %r2, [sparse_dense_matrix_scalar_op_f_param_6]; + ld.param.u32 %r3, [sparse_dense_matrix_scalar_op_f_param_7]; + ld.param.u32 %r4, [sparse_dense_matrix_scalar_op_f_param_8]; + cvt.rn.f32.f64 %f1, %fd1; + mov.u32 %r6, %ntid.x; + mov.u32 %r7, %ctaid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r6, %r7, %r8; + setp.ge.s32 %p3, %r1, %r5; + @%p3 bra BB33_126; + + cvta.to.global.u64 %rd6, %rd5; + cvta.to.global.u64 %rd7, %rd4; + mul.wide.s32 %rd8, %r1, 4; + add.s64 %rd9, %rd7, %rd8; + ld.global.f32 %f2, [%rd9]; + cvta.to.global.u64 %rd10, %rd2; + add.s64 %rd11, %rd10, %rd8; + ld.global.u32 %r9, [%rd11]; + cvta.to.global.u64 %rd12, %rd3; + add.s64 %rd13, %rd12, %rd8; + ld.global.u32 %r10, [%rd13]; + mad.lo.s32 %r11, %r9, %r2, %r10; + mul.wide.s32 %rd14, %r11, 4; + add.s64 %rd1, %rd6, %rd14; + setp.eq.s32 %p4, %r4, 0; + @%p4 bra BB33_64; + + mov.f32 %f262, 0f7F7FFFFF; + setp.gt.s32 %p5, %r3, 8; + @%p5 bra BB33_19; + + setp.gt.s32 %p19, %r3, 3; + @%p19 bra BB33_11; + + setp.gt.s32 %p26, %r3, 1; + @%p26 bra BB33_8; + + setp.eq.s32 %p29, %r3, 0; + @%p29 bra BB33_62; + bra.uni BB33_6; + +BB33_62: + add.f32 %f262, %f1, %f2; + bra.uni BB33_63; + +BB33_64: + mov.f32 %f266, 0f7F7FFFFF; + setp.gt.s32 %p72, %r3, 8; + @%p72 bra BB33_81; + + setp.gt.s32 %p86, %r3, 3; + @%p86 bra BB33_73; + + setp.gt.s32 %p93, %r3, 1; + @%p93 bra BB33_70; + + setp.eq.s32 %p96, %r3, 0; + @%p96 bra BB33_124; + bra.uni BB33_68; + +BB33_124: + add.f32 %f266, %f1, %f2; + bra.uni BB33_125; + +BB33_19: + setp.gt.s32 %p6, %r3, 13; + @%p6 bra BB33_28; + + setp.gt.s32 %p13, %r3, 10; + @%p13 bra BB33_24; + + setp.eq.s32 %p17, %r3, 9; + @%p17 bra BB33_44; + bra.uni BB33_22; + +BB33_44: + setp.eq.f32 %p40, %f1, %f2; + selp.f32 %f262, 0f3F800000, 0f00000000, %p40; + bra.uni BB33_63; + +BB33_81: + setp.gt.s32 %p73, %r3, 13; + @%p73 bra BB33_90; + + setp.gt.s32 %p80, %r3, 10; + @%p80 bra BB33_86; + + setp.eq.s32 %p84, %r3, 9; + @%p84 bra BB33_106; + bra.uni BB33_84; + +BB33_106: + setp.eq.f32 %p107, %f2, %f1; + selp.f32 %f266, 0f3F800000, 0f00000000, %p107; + bra.uni BB33_125; + +BB33_11: + setp.gt.s32 %p20, %r3, 5; + @%p20 bra BB33_15; + + setp.eq.s32 %p24, %r3, 4; + @%p24 bra BB33_47; + bra.uni BB33_13; + +BB33_47: + mul.f32 %f88, %f2, 0f3F000000; + cvt.rzi.f32.f32 %f89, %f88; + fma.rn.f32 %f90, %f89, 0fC0000000, %f2; + abs.f32 %f19, %f90; + abs.f32 %f20, %f1; + setp.lt.f32 %p45, %f20, 0f00800000; + mul.f32 %f91, %f20, 0f4B800000; + selp.f32 %f92, 0fC3170000, 0fC2FE0000, %p45; + selp.f32 %f93, %f91, %f20, %p45; + mov.b32 %r18, %f93; + and.b32 %r19, %r18, 8388607; + or.b32 %r20, %r19, 1065353216; + mov.b32 %f94, %r20; + shr.u32 %r21, %r18, 23; + cvt.rn.f32.u32 %f95, %r21; + add.f32 %f96, %f92, %f95; + setp.gt.f32 %p46, %f94, 0f3FB504F3; + mul.f32 %f97, %f94, 0f3F000000; + add.f32 %f98, %f96, 0f3F800000; + selp.f32 %f99, %f97, %f94, %p46; + selp.f32 %f100, %f98, %f96, %p46; + add.f32 %f101, %f99, 0fBF800000; + add.f32 %f87, %f99, 0f3F800000; + // inline asm + rcp.approx.ftz.f32 %f86,%f87; + // inline asm + add.f32 %f102, %f101, %f101; + mul.f32 %f103, %f86, %f102; + mul.f32 %f104, %f103, %f103; + mov.f32 %f105, 0f3C4CAF63; + mov.f32 %f106, 0f3B18F0FE; + fma.rn.f32 %f107, %f106, %f104, %f105; + mov.f32 %f108, 0f3DAAAABD; + fma.rn.f32 %f109, %f107, %f104, %f108; + mul.rn.f32 %f110, %f109, %f104; + mul.rn.f32 %f111, %f110, %f103; + sub.f32 %f112, %f101, %f103; + neg.f32 %f113, %f103; + add.f32 %f114, %f112, %f112; + fma.rn.f32 %f115, %f113, %f101, %f114; + mul.rn.f32 %f116, %f86, %f115; + add.f32 %f117, %f111, %f103; + sub.f32 %f118, %f103, %f117; + add.f32 %f119, %f111, %f118; + add.f32 %f120, %f116, %f119; + add.f32 %f121, %f117, %f120; + sub.f32 %f122, %f117, %f121; + add.f32 %f123, %f120, %f122; + mov.f32 %f124, 0f3F317200; + mul.rn.f32 %f125, %f100, %f124; + mov.f32 %f126, 0f35BFBE8E; + mul.rn.f32 %f127, %f100, %f126; + add.f32 %f128, %f125, %f121; + sub.f32 %f129, %f125, %f128; + add.f32 %f130, %f121, %f129; + add.f32 %f131, %f123, %f130; + add.f32 %f132, %f127, %f131; + add.f32 %f133, %f128, %f132; + sub.f32 %f134, %f128, %f133; + add.f32 %f135, %f132, %f134; + abs.f32 %f21, %f2; + setp.gt.f32 %p47, %f21, 0f77F684DF; + mul.f32 %f136, %f2, 0f39000000; + selp.f32 %f137, %f136, %f2, %p47; + mul.rn.f32 %f138, %f137, %f133; + neg.f32 %f139, %f138; + fma.rn.f32 %f140, %f137, %f133, %f139; + fma.rn.f32 %f141, %f137, %f135, %f140; + mov.f32 %f142, 0f00000000; + fma.rn.f32 %f143, %f142, %f133, %f141; + add.rn.f32 %f144, %f138, %f143; + neg.f32 %f145, %f144; + add.rn.f32 %f146, %f138, %f145; + add.rn.f32 %f147, %f146, %f143; + mov.b32 %r22, %f144; + setp.eq.s32 %p48, %r22, 1118925336; + add.s32 %r23, %r22, -1; + mov.b32 %f148, %r23; + add.f32 %f149, %f147, 0f37000000; + selp.f32 %f150, %f148, %f144, %p48; + selp.f32 %f22, %f149, %f147, %p48; + mul.f32 %f151, %f150, 0f3FB8AA3B; + cvt.rzi.f32.f32 %f152, %f151; + mov.f32 %f153, 0fBF317200; + fma.rn.f32 %f154, %f152, %f153, %f150; + mov.f32 %f155, 0fB5BFBE8E; + fma.rn.f32 %f156, %f152, %f155, %f154; + mul.f32 %f157, %f156, 0f3FB8AA3B; + ex2.approx.ftz.f32 %f158, %f157; + add.f32 %f159, %f152, 0f00000000; + ex2.approx.f32 %f160, %f159; + mul.f32 %f161, %f158, %f160; + setp.lt.f32 %p49, %f150, 0fC2D20000; + selp.f32 %f162, 0f00000000, %f161, %p49; + setp.gt.f32 %p50, %f150, 0f42D20000; + selp.f32 %f259, 0f7F800000, %f162, %p50; + setp.eq.f32 %p51, %f259, 0f7F800000; + @%p51 bra BB33_49; + + fma.rn.f32 %f259, %f259, %f22, %f259; + +BB33_49: + setp.lt.f32 %p52, %f1, 0f00000000; + setp.eq.f32 %p53, %f19, 0f3F800000; + and.pred %p1, %p52, %p53; + mov.b32 %r24, %f259; + xor.b32 %r25, %r24, -2147483648; + mov.b32 %f163, %r25; + selp.f32 %f261, %f163, %f259, %p1; + setp.eq.f32 %p54, %f1, 0f00000000; + @%p54 bra BB33_52; + bra.uni BB33_50; + +BB33_52: + add.f32 %f165, %f1, %f1; + mov.b32 %r26, %f165; + selp.b32 %r27, %r26, 0, %p53; + or.b32 %r28, %r27, 2139095040; + setp.lt.f32 %p58, %f2, 0f00000000; + selp.b32 %r29, %r28, %r27, %p58; + mov.b32 %f261, %r29; + bra.uni BB33_53; + +BB33_28: + setp.gt.s32 %p7, %r3, 15; + @%p7 bra BB33_32; + + setp.eq.s32 %p11, %r3, 14; + @%p11 bra BB33_41; + bra.uni BB33_30; + +BB33_41: + cvt.rni.s64.f32 %rd15, %f1; + cvt.u32.u64 %r12, %rd15; + cvt.rni.s64.f32 %rd16, %f2; + cvt.u32.u64 %r13, %rd16; + or.b32 %r14, %r13, %r12; + setp.eq.s32 %p37, %r14, 0; + selp.f32 %f262, 0f00000000, 0f3F800000, %p37; + bra.uni BB33_63; + +BB33_73: + setp.gt.s32 %p87, %r3, 5; + @%p87 bra BB33_77; + + setp.eq.s32 %p91, %r3, 4; + @%p91 bra BB33_109; + bra.uni BB33_75; + +BB33_109: + mul.f32 %f179, %f1, 0f3F000000; + cvt.rzi.f32.f32 %f180, %f179; + fma.rn.f32 %f181, %f180, 0fC0000000, %f1; + abs.f32 %f56, %f181; + abs.f32 %f57, %f2; + setp.lt.f32 %p112, %f57, 0f00800000; + mul.f32 %f182, %f57, 0f4B800000; + selp.f32 %f183, 0fC3170000, 0fC2FE0000, %p112; + selp.f32 %f184, %f182, %f57, %p112; + mov.b32 %r43, %f184; + and.b32 %r44, %r43, 8388607; + or.b32 %r45, %r44, 1065353216; + mov.b32 %f185, %r45; + shr.u32 %r46, %r43, 23; + cvt.rn.f32.u32 %f186, %r46; + add.f32 %f187, %f183, %f186; + setp.gt.f32 %p113, %f185, 0f3FB504F3; + mul.f32 %f188, %f185, 0f3F000000; + add.f32 %f189, %f187, 0f3F800000; + selp.f32 %f190, %f188, %f185, %p113; + selp.f32 %f191, %f189, %f187, %p113; + add.f32 %f192, %f190, 0fBF800000; + add.f32 %f178, %f190, 0f3F800000; + // inline asm + rcp.approx.ftz.f32 %f177,%f178; + // inline asm + add.f32 %f193, %f192, %f192; + mul.f32 %f194, %f177, %f193; + mul.f32 %f195, %f194, %f194; + mov.f32 %f196, 0f3C4CAF63; + mov.f32 %f197, 0f3B18F0FE; + fma.rn.f32 %f198, %f197, %f195, %f196; + mov.f32 %f199, 0f3DAAAABD; + fma.rn.f32 %f200, %f198, %f195, %f199; + mul.rn.f32 %f201, %f200, %f195; + mul.rn.f32 %f202, %f201, %f194; + sub.f32 %f203, %f192, %f194; + neg.f32 %f204, %f194; + add.f32 %f205, %f203, %f203; + fma.rn.f32 %f206, %f204, %f192, %f205; + mul.rn.f32 %f207, %f177, %f206; + add.f32 %f208, %f202, %f194; + sub.f32 %f209, %f194, %f208; + add.f32 %f210, %f202, %f209; + add.f32 %f211, %f207, %f210; + add.f32 %f212, %f208, %f211; + sub.f32 %f213, %f208, %f212; + add.f32 %f214, %f211, %f213; + mov.f32 %f215, 0f3F317200; + mul.rn.f32 %f216, %f191, %f215; + mov.f32 %f217, 0f35BFBE8E; + mul.rn.f32 %f218, %f191, %f217; + add.f32 %f219, %f216, %f212; + sub.f32 %f220, %f216, %f219; + add.f32 %f221, %f212, %f220; + add.f32 %f222, %f214, %f221; + add.f32 %f223, %f218, %f222; + add.f32 %f224, %f219, %f223; + sub.f32 %f225, %f219, %f224; + add.f32 %f226, %f223, %f225; + abs.f32 %f58, %f1; + setp.gt.f32 %p114, %f58, 0f77F684DF; + mul.f32 %f227, %f1, 0f39000000; + selp.f32 %f228, %f227, %f1, %p114; + mul.rn.f32 %f229, %f228, %f224; + neg.f32 %f230, %f229; + fma.rn.f32 %f231, %f228, %f224, %f230; + fma.rn.f32 %f232, %f228, %f226, %f231; + mov.f32 %f233, 0f00000000; + fma.rn.f32 %f234, %f233, %f224, %f232; + add.rn.f32 %f235, %f229, %f234; + neg.f32 %f236, %f235; + add.rn.f32 %f237, %f229, %f236; + add.rn.f32 %f238, %f237, %f234; + mov.b32 %r47, %f235; + setp.eq.s32 %p115, %r47, 1118925336; + add.s32 %r48, %r47, -1; + mov.b32 %f239, %r48; + add.f32 %f240, %f238, 0f37000000; + selp.f32 %f241, %f239, %f235, %p115; + selp.f32 %f59, %f240, %f238, %p115; + mul.f32 %f242, %f241, 0f3FB8AA3B; + cvt.rzi.f32.f32 %f243, %f242; + mov.f32 %f244, 0fBF317200; + fma.rn.f32 %f245, %f243, %f244, %f241; + mov.f32 %f246, 0fB5BFBE8E; + fma.rn.f32 %f247, %f243, %f246, %f245; + mul.f32 %f248, %f247, 0f3FB8AA3B; + ex2.approx.ftz.f32 %f249, %f248; + add.f32 %f250, %f243, 0f00000000; + ex2.approx.f32 %f251, %f250; + mul.f32 %f252, %f249, %f251; + setp.lt.f32 %p116, %f241, 0fC2D20000; + selp.f32 %f253, 0f00000000, %f252, %p116; + setp.gt.f32 %p117, %f241, 0f42D20000; + selp.f32 %f263, 0f7F800000, %f253, %p117; + setp.eq.f32 %p118, %f263, 0f7F800000; + @%p118 bra BB33_111; + + fma.rn.f32 %f263, %f263, %f59, %f263; + +BB33_111: + setp.lt.f32 %p119, %f2, 0f00000000; + setp.eq.f32 %p120, %f56, 0f3F800000; + and.pred %p2, %p119, %p120; + mov.b32 %r49, %f263; + xor.b32 %r50, %r49, -2147483648; + mov.b32 %f254, %r50; + selp.f32 %f265, %f254, %f263, %p2; + setp.eq.f32 %p121, %f2, 0f00000000; + @%p121 bra BB33_114; + bra.uni BB33_112; + +BB33_114: + add.f32 %f256, %f2, %f2; + mov.b32 %r51, %f256; + selp.b32 %r52, %r51, 0, %p120; + or.b32 %r53, %r52, 2139095040; + setp.lt.f32 %p125, %f1, 0f00000000; + selp.b32 %r54, %r53, %r52, %p125; + mov.b32 %f265, %r54; + bra.uni BB33_115; + +BB33_90: + setp.gt.s32 %p74, %r3, 15; + @%p74 bra BB33_94; + + setp.eq.s32 %p78, %r3, 14; + @%p78 bra BB33_103; + bra.uni BB33_92; + +BB33_103: + cvt.rni.s64.f32 %rd19, %f2; + cvt.u32.u64 %r37, %rd19; + cvt.rni.s64.f32 %rd20, %f1; + cvt.u32.u64 %r38, %rd20; + or.b32 %r39, %r38, %r37; + setp.eq.s32 %p104, %r39, 0; + selp.f32 %f266, 0f00000000, 0f3F800000, %p104; + bra.uni BB33_125; + +BB33_8: + setp.eq.s32 %p27, %r3, 2; + @%p27 bra BB33_61; + bra.uni BB33_9; + +BB33_61: + mul.f32 %f262, %f1, %f2; + bra.uni BB33_63; + +BB33_24: + setp.eq.s32 %p14, %r3, 11; + @%p14 bra BB33_43; + + setp.eq.s32 %p15, %r3, 12; + @%p15 bra BB33_42; + bra.uni BB33_26; + +BB33_42: + max.f32 %f262, %f1, %f2; + bra.uni BB33_63; + +BB33_15: + setp.eq.s32 %p21, %r3, 6; + @%p21 bra BB33_46; + + setp.eq.s32 %p22, %r3, 7; + @%p22 bra BB33_45; + bra.uni BB33_17; + +BB33_45: + setp.gt.f32 %p42, %f1, %f2; + selp.f32 %f262, 0f3F800000, 0f00000000, %p42; + bra.uni BB33_63; + +BB33_32: + setp.eq.s32 %p8, %r3, 16; + @%p8 bra BB33_40; + + setp.eq.s32 %p9, %r3, 17; + @%p9 bra BB33_37; + bra.uni BB33_34; + +BB33_37: + setp.eq.f32 %p32, %f2, 0f00000000; + setp.eq.f32 %p33, %f2, 0f80000000; + or.pred %p34, %p32, %p33; + mov.f32 %f262, 0f7FC00000; + @%p34 bra BB33_63; + + div.rn.f32 %f262, %f1, %f2; + abs.f32 %f80, %f262; + setp.geu.f32 %p35, %f80, 0f7F800000; + @%p35 bra BB33_63; + + cvt.rmi.f32.f32 %f81, %f262; + mul.f32 %f82, %f2, %f81; + sub.f32 %f262, %f1, %f82; + bra.uni BB33_63; + +BB33_70: + setp.eq.s32 %p94, %r3, 2; + @%p94 bra BB33_123; + bra.uni BB33_71; + +BB33_123: + mul.f32 %f266, %f1, %f2; + bra.uni BB33_125; + +BB33_86: + setp.eq.s32 %p81, %r3, 11; + @%p81 bra BB33_105; + + setp.eq.s32 %p82, %r3, 12; + @%p82 bra BB33_104; + bra.uni BB33_88; + +BB33_104: + max.f32 %f266, %f2, %f1; + bra.uni BB33_125; + +BB33_77: + setp.eq.s32 %p88, %r3, 6; + @%p88 bra BB33_108; + + setp.eq.s32 %p89, %r3, 7; + @%p89 bra BB33_107; + bra.uni BB33_79; + +BB33_107: + setp.gt.f32 %p109, %f2, %f1; + selp.f32 %f266, 0f3F800000, 0f00000000, %p109; + bra.uni BB33_125; + +BB33_94: + setp.eq.s32 %p75, %r3, 16; + @%p75 bra BB33_102; + + setp.eq.s32 %p76, %r3, 17; + @%p76 bra BB33_99; + bra.uni BB33_96; + +BB33_99: + setp.eq.f32 %p99, %f1, 0f00000000; + setp.eq.f32 %p100, %f1, 0f80000000; + or.pred %p101, %p99, %p100; + mov.f32 %f266, 0f7FC00000; + @%p101 bra BB33_125; + + div.rn.f32 %f266, %f2, %f1; + abs.f32 %f171, %f266; + setp.geu.f32 %p102, %f171, 0f7F800000; + @%p102 bra BB33_125; + + cvt.rmi.f32.f32 %f172, %f266; + mul.f32 %f173, %f1, %f172; + sub.f32 %f266, %f2, %f173; + bra.uni BB33_125; + +BB33_6: + setp.eq.s32 %p30, %r3, 1; + @%p30 bra BB33_7; + bra.uni BB33_63; + +BB33_7: + sub.f32 %f262, %f1, %f2; + bra.uni BB33_63; + +BB33_22: + setp.eq.s32 %p18, %r3, 10; + @%p18 bra BB33_23; + bra.uni BB33_63; + +BB33_23: + setp.neu.f32 %p39, %f1, %f2; + selp.f32 %f262, 0f3F800000, 0f00000000, %p39; + bra.uni BB33_63; + +BB33_13: + setp.eq.s32 %p25, %r3, 5; + @%p25 bra BB33_14; + bra.uni BB33_63; + +BB33_14: + setp.lt.f32 %p44, %f1, %f2; + selp.f32 %f262, 0f3F800000, 0f00000000, %p44; + bra.uni BB33_63; + +BB33_30: + setp.eq.s32 %p12, %r3, 15; + @%p12 bra BB33_31; + bra.uni BB33_63; + +BB33_31: + mul.f32 %f84, %f1, %f2; + mov.f32 %f85, 0f3F800000; + sub.f32 %f262, %f85, %f84; + bra.uni BB33_63; + +BB33_9: + setp.eq.s32 %p28, %r3, 3; + @%p28 bra BB33_10; + bra.uni BB33_63; + +BB33_10: + div.rn.f32 %f262, %f1, %f2; + bra.uni BB33_63; + +BB33_43: + min.f32 %f262, %f1, %f2; + bra.uni BB33_63; + +BB33_26: + setp.eq.s32 %p16, %r3, 13; + @%p16 bra BB33_27; + bra.uni BB33_63; + +BB33_27: + cvt.rni.s64.f32 %rd17, %f1; + cvt.u32.u64 %r15, %rd17; + cvt.rni.s64.f32 %rd18, %f2; + cvt.u32.u64 %r16, %rd18; + and.b32 %r17, %r16, %r15; + setp.eq.s32 %p38, %r17, 0; + selp.f32 %f262, 0f00000000, 0f3F800000, %p38; + bra.uni BB33_63; + +BB33_46: + setp.gtu.f32 %p43, %f1, %f2; + selp.f32 %f262, 0f00000000, 0f3F800000, %p43; + bra.uni BB33_63; + +BB33_17: + setp.eq.s32 %p23, %r3, 8; + @%p23 bra BB33_18; + bra.uni BB33_63; + +BB33_18: + setp.ltu.f32 %p41, %f1, %f2; + selp.f32 %f262, 0f00000000, 0f3F800000, %p41; + bra.uni BB33_63; + +BB33_40: + setp.neu.f32 %p36, %f1, 0f00000000; + sub.f32 %f83, %f1, %f2; + selp.f32 %f262, %f83, 0f00000000, %p36; + bra.uni BB33_63; + +BB33_34: + setp.ne.s32 %p10, %r3, 18; + @%p10 bra BB33_63; + + div.rn.f32 %f262, %f1, %f2; + abs.f32 %f78, %f262; + setp.geu.f32 %p31, %f78, 0f7F800000; + @%p31 bra BB33_63; + + cvt.rmi.f32.f32 %f262, %f262; + bra.uni BB33_63; + +BB33_68: + setp.eq.s32 %p97, %r3, 1; + @%p97 bra BB33_69; + bra.uni BB33_125; + +BB33_69: + sub.f32 %f266, %f2, %f1; + bra.uni BB33_125; + +BB33_84: + setp.eq.s32 %p85, %r3, 10; + @%p85 bra BB33_85; + bra.uni BB33_125; + +BB33_85: + setp.neu.f32 %p106, %f2, %f1; + selp.f32 %f266, 0f3F800000, 0f00000000, %p106; + bra.uni BB33_125; + +BB33_75: + setp.eq.s32 %p92, %r3, 5; + @%p92 bra BB33_76; + bra.uni BB33_125; + +BB33_76: + setp.lt.f32 %p111, %f2, %f1; + selp.f32 %f266, 0f3F800000, 0f00000000, %p111; + bra.uni BB33_125; + +BB33_92: + setp.eq.s32 %p79, %r3, 15; + @%p79 bra BB33_93; + bra.uni BB33_125; + +BB33_93: + mul.f32 %f175, %f1, %f2; + mov.f32 %f176, 0f3F800000; + sub.f32 %f266, %f176, %f175; + bra.uni BB33_125; + +BB33_71: + setp.eq.s32 %p95, %r3, 3; + @%p95 bra BB33_72; + bra.uni BB33_125; + +BB33_72: + div.rn.f32 %f266, %f2, %f1; + bra.uni BB33_125; + +BB33_105: + min.f32 %f266, %f2, %f1; + bra.uni BB33_125; + +BB33_88: + setp.eq.s32 %p83, %r3, 13; + @%p83 bra BB33_89; + bra.uni BB33_125; + +BB33_89: + cvt.rni.s64.f32 %rd21, %f2; + cvt.u32.u64 %r40, %rd21; + cvt.rni.s64.f32 %rd22, %f1; + cvt.u32.u64 %r41, %rd22; + and.b32 %r42, %r41, %r40; + setp.eq.s32 %p105, %r42, 0; + selp.f32 %f266, 0f00000000, 0f3F800000, %p105; + bra.uni BB33_125; + +BB33_108: + setp.gtu.f32 %p110, %f2, %f1; + selp.f32 %f266, 0f00000000, 0f3F800000, %p110; + bra.uni BB33_125; + +BB33_79: + setp.eq.s32 %p90, %r3, 8; + @%p90 bra BB33_80; + bra.uni BB33_125; + +BB33_80: + setp.ltu.f32 %p108, %f2, %f1; + selp.f32 %f266, 0f00000000, 0f3F800000, %p108; + bra.uni BB33_125; + +BB33_102: + setp.neu.f32 %p103, %f2, 0f00000000; + sub.f32 %f174, %f2, %f1; + selp.f32 %f266, %f174, 0f00000000, %p103; + bra.uni BB33_125; + +BB33_96: + setp.ne.s32 %p77, %r3, 18; + @%p77 bra BB33_125; + + div.rn.f32 %f266, %f2, %f1; + abs.f32 %f169, %f266; + setp.geu.f32 %p98, %f169, 0f7F800000; + @%p98 bra BB33_125; + + cvt.rmi.f32.f32 %f266, %f266; + bra.uni BB33_125; + +BB33_50: + setp.geu.f32 %p55, %f1, 0f00000000; + @%p55 bra BB33_53; + + cvt.rzi.f32.f32 %f164, %f2; + setp.neu.f32 %p56, %f164, %f2; + selp.f32 %f261, 0f7FFFFFFF, %f261, %p56; + +BB33_53: + add.f32 %f166, %f20, %f21; + mov.b32 %r30, %f166; + setp.lt.s32 %p59, %r30, 2139095040; + @%p59 bra BB33_60; + + setp.gtu.f32 %p60, %f20, 0f7F800000; + setp.gtu.f32 %p61, %f21, 0f7F800000; + or.pred %p62, %p60, %p61; + @%p62 bra BB33_59; + bra.uni BB33_55; + +BB33_59: + add.f32 %f261, %f1, %f2; + bra.uni BB33_60; + +BB33_55: + setp.eq.f32 %p63, %f21, 0f7F800000; + @%p63 bra BB33_58; + bra.uni BB33_56; + +BB33_58: + setp.gt.f32 %p66, %f20, 0f3F800000; + selp.b32 %r34, 2139095040, 0, %p66; + xor.b32 %r35, %r34, 2139095040; + setp.lt.f32 %p67, %f2, 0f00000000; + selp.b32 %r36, %r35, %r34, %p67; + mov.b32 %f167, %r36; + setp.eq.f32 %p68, %f1, 0fBF800000; + selp.f32 %f261, 0f3F800000, %f167, %p68; + bra.uni BB33_60; + +BB33_112: + setp.geu.f32 %p122, %f2, 0f00000000; + @%p122 bra BB33_115; + + cvt.rzi.f32.f32 %f255, %f1; + setp.neu.f32 %p123, %f255, %f1; + selp.f32 %f265, 0f7FFFFFFF, %f265, %p123; + +BB33_115: + add.f32 %f257, %f57, %f58; + mov.b32 %r55, %f257; + setp.lt.s32 %p126, %r55, 2139095040; + @%p126 bra BB33_122; + + setp.gtu.f32 %p127, %f57, 0f7F800000; + setp.gtu.f32 %p128, %f58, 0f7F800000; + or.pred %p129, %p127, %p128; + @%p129 bra BB33_121; + bra.uni BB33_117; + +BB33_121: + add.f32 %f265, %f1, %f2; + bra.uni BB33_122; + +BB33_117: + setp.eq.f32 %p130, %f58, 0f7F800000; + @%p130 bra BB33_120; + bra.uni BB33_118; + +BB33_120: + setp.gt.f32 %p133, %f57, 0f3F800000; + selp.b32 %r59, 2139095040, 0, %p133; + xor.b32 %r60, %r59, 2139095040; + setp.lt.f32 %p134, %f1, 0f00000000; + selp.b32 %r61, %r60, %r59, %p134; + mov.b32 %f258, %r61; + setp.eq.f32 %p135, %f2, 0fBF800000; + selp.f32 %f265, 0f3F800000, %f258, %p135; + bra.uni BB33_122; + +BB33_56: + setp.neu.f32 %p64, %f20, 0f7F800000; + @%p64 bra BB33_60; + + setp.ltu.f32 %p65, %f2, 0f00000000; + selp.b32 %r31, 0, 2139095040, %p65; + or.b32 %r32, %r31, -2147483648; + selp.b32 %r33, %r32, %r31, %p1; + mov.b32 %f261, %r33; + +BB33_60: + setp.eq.f32 %p69, %f2, 0f00000000; + setp.eq.f32 %p70, %f1, 0f3F800000; + or.pred %p71, %p70, %p69; + selp.f32 %f262, 0f3F800000, %f261, %p71; + +BB33_63: + st.global.f32 [%rd1], %f262; + bra.uni BB33_126; + +BB33_118: + setp.neu.f32 %p131, %f57, 0f7F800000; + @%p131 bra BB33_122; + + setp.ltu.f32 %p132, %f1, 0f00000000; + selp.b32 %r56, 0, 2139095040, %p132; + or.b32 %r57, %r56, -2147483648; + selp.b32 %r58, %r57, %r56, %p2; + mov.b32 %f265, %r58; + +BB33_122: + setp.eq.f32 %p136, %f1, 0f00000000; + setp.eq.f32 %p137, %f2, 0f3F800000; + or.pred %p138, %p137, %p136; + selp.f32 %f266, 0f3F800000, %f265, %p138; + +BB33_125: + st.global.f32 [%rd1], %f266; + +BB33_126: + bar.sync 0; + ret; +} + // .globl fill_d .visible .entry fill_d( .param .u64 fill_d_param_0, @@ -4616,14 +6349,14 @@ BB31_126: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.s32 %p1, %r1, %r2; - @%p1 bra BB32_2; + @%p1 bra BB34_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 8; add.s64 %rd4, %rd2, %rd3; st.global.f64 [%rd4], %fd1; -BB32_2: +BB34_2: ret; } @@ -4649,7 +6382,7 @@ BB32_2: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.s32 %p1, %r1, %r2; - @%p1 bra BB33_2; + @%p1 bra BB35_2; cvt.rn.f32.f64 %f1, %fd1; cvta.to.global.u64 %rd2, %rd1; @@ -4657,7 +6390,7 @@ BB32_2: add.s64 %rd4, %rd2, %rd3; st.global.f32 [%rd4], %f1; -BB33_2: +BB35_2: ret; } @@ -4697,10 +6430,10 @@ BB33_2: setp.lt.s32 %p1, %r1, %r7; setp.lt.s32 %p2, %r2, %r4; and.pred %p3, %p1, %p2; - @!%p3 bra BB34_2; - bra.uni BB34_1; + @!%p3 bra BB36_2; + bra.uni BB36_1; -BB34_1: +BB36_1: cvta.to.global.u64 %rd5, %rd2; mad.lo.s32 %r13, %r1, %r4, %r2; mul.wide.s32 %rd6, %r13, 8; @@ -4711,14 +6444,14 @@ BB34_1: add.s64 %rd9, %rd1, %rd8; st.global.f64 [%rd9], %fd1; -BB34_2: +BB36_2: setp.lt.s32 %p4, %r1, %r5; setp.lt.s32 %p5, %r2, %r6; and.pred %p6, %p4, %p5; - @!%p6 bra BB34_4; - bra.uni BB34_3; + @!%p6 bra BB36_4; + bra.uni BB36_3; -BB34_3: +BB36_3: cvta.to.global.u64 %rd10, %rd3; mad.lo.s32 %r15, %r1, %r6, %r2; mul.wide.s32 %rd11, %r15, 8; @@ -4730,7 +6463,7 @@ BB34_3: add.s64 %rd14, %rd1, %rd13; st.global.f64 [%rd14], %fd2; -BB34_4: +BB36_4: ret; } @@ -4770,10 +6503,10 @@ BB34_4: setp.lt.s32 %p1, %r1, %r7; setp.lt.s32 %p2, %r2, %r4; and.pred %p3, %p1, %p2; - @!%p3 bra BB35_2; - bra.uni BB35_1; + @!%p3 bra BB37_2; + bra.uni BB37_1; -BB35_1: +BB37_1: cvta.to.global.u64 %rd5, %rd2; mad.lo.s32 %r13, %r1, %r4, %r2; mul.wide.s32 %rd6, %r13, 4; @@ -4784,14 +6517,14 @@ BB35_1: add.s64 %rd9, %rd1, %rd8; st.global.f32 [%rd9], %f1; -BB35_2: +BB37_2: setp.lt.s32 %p4, %r1, %r5; setp.lt.s32 %p5, %r2, %r6; and.pred %p6, %p4, %p5; - @!%p6 bra BB35_4; - bra.uni BB35_3; + @!%p6 bra BB37_4; + bra.uni BB37_3; -BB35_3: +BB37_3: cvta.to.global.u64 %rd10, %rd3; mad.lo.s32 %r15, %r1, %r6, %r2; mul.wide.s32 %rd11, %r15, 4; @@ -4803,7 +6536,7 @@ BB35_3: add.s64 %rd14, %rd1, %rd13; st.global.f32 [%rd14], %f2; -BB35_4: +BB37_4: ret; } @@ -4842,10 +6575,10 @@ BB35_4: setp.lt.s32 %p1, %r1, %r3; setp.lt.s32 %p2, %r2, %r4; and.pred %p3, %p1, %p2; - @!%p3 bra BB36_2; - bra.uni BB36_1; + @!%p3 bra BB38_2; + bra.uni BB38_1; -BB36_1: +BB38_1: cvta.to.global.u64 %rd5, %rd2; mad.lo.s32 %r12, %r1, %r4, %r2; mul.wide.s32 %rd6, %r12, 8; @@ -4854,14 +6587,14 @@ BB36_1: add.s64 %rd8, %rd1, %rd6; st.global.f64 [%rd8], %fd1; -BB36_2: +BB38_2: setp.lt.s32 %p4, %r1, %r5; setp.lt.s32 %p5, %r2, %r6; and.pred %p6, %p4, %p5; - @!%p6 bra BB36_4; - bra.uni BB36_3; + @!%p6 bra BB38_4; + bra.uni BB38_3; -BB36_3: +BB38_3: cvta.to.global.u64 %rd9, %rd3; mad.lo.s32 %r13, %r1, %r6, %r2; mul.wide.s32 %rd10, %r13, 8; @@ -4873,7 +6606,7 @@ BB36_3: add.s64 %rd13, %rd1, %rd12; st.global.f64 [%rd13], %fd2; -BB36_4: +BB38_4: ret; } @@ -4912,10 +6645,10 @@ BB36_4: setp.lt.s32 %p1, %r1, %r3; setp.lt.s32 %p2, %r2, %r4; and.pred %p3, %p1, %p2; - @!%p3 bra BB37_2; - bra.uni BB37_1; + @!%p3 bra BB39_2; + bra.uni BB39_1; -BB37_1: +BB39_1: cvta.to.global.u64 %rd5, %rd2; mad.lo.s32 %r12, %r1, %r4, %r2; mul.wide.s32 %rd6, %r12, 4; @@ -4924,14 +6657,14 @@ BB37_1: add.s64 %rd8, %rd1, %rd6; st.global.f32 [%rd8], %f1; -BB37_2: +BB39_2: setp.lt.s32 %p4, %r1, %r5; setp.lt.s32 %p5, %r2, %r6; and.pred %p6, %p4, %p5; - @!%p6 bra BB37_4; - bra.uni BB37_3; + @!%p6 bra BB39_4; + bra.uni BB39_3; -BB37_3: +BB39_3: cvta.to.global.u64 %rd9, %rd3; mad.lo.s32 %r13, %r1, %r6, %r2; mul.wide.s32 %rd10, %r13, 4; @@ -4943,7 +6676,7 @@ BB37_3: add.s64 %rd13, %rd1, %rd12; st.global.f32 [%rd13], %f2; -BB37_4: +BB39_4: ret; } @@ -4970,9 +6703,9 @@ BB37_4: mad.lo.s32 %r35, %r9, %r10, %r7; mov.f64 %fd44, 0d0000000000000000; setp.ge.u32 %p1, %r35, %r6; - @%p1 bra BB38_4; + @%p1 bra BB40_4; -BB38_1: +BB40_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.u32 %rd4, %r35, 8; add.s64 %rd5, %rd3, %rd4; @@ -4980,135 +6713,135 @@ BB38_1: add.f64 %fd44, %fd44, %fd30; add.s32 %r3, %r35, %r10; setp.ge.u32 %p2, %r3, %r6; - @%p2 bra BB38_3; + @%p2 bra BB40_3; mul.wide.u32 %rd7, %r3, 8; add.s64 %rd8, %rd3, %rd7; ld.global.f64 %fd31, [%rd8]; add.f64 %fd44, %fd44, %fd31; -BB38_3: +BB40_3: shl.b32 %r13, %r10, 1; mov.u32 %r14, %nctaid.x; mad.lo.s32 %r35, %r13, %r14, %r35; setp.lt.u32 %p3, %r35, %r6; - @%p3 bra BB38_1; + @%p3 bra BB40_1; -BB38_4: +BB40_4: shl.b32 %r16, %r7, 3; mov.u32 %r17, my_sdata; add.s32 %r5, %r17, %r16; st.shared.f64 [%r5], %fd44; bar.sync 0; setp.lt.u32 %p4, %r10, 1024; - @%p4 bra BB38_8; + @%p4 bra BB40_8; setp.gt.u32 %p5, %r7, 511; - @%p5 bra BB38_7; + @%p5 bra BB40_7; ld.shared.f64 %fd32, [%r5+4096]; add.f64 %fd44, %fd44, %fd32; st.shared.f64 [%r5], %fd44; -BB38_7: +BB40_7: bar.sync 0; -BB38_8: +BB40_8: setp.lt.u32 %p6, %r10, 512; - @%p6 bra BB38_12; + @%p6 bra BB40_12; setp.gt.u32 %p7, %r7, 255; - @%p7 bra BB38_11; + @%p7 bra BB40_11; ld.shared.f64 %fd33, [%r5+2048]; add.f64 %fd44, %fd44, %fd33; st.shared.f64 [%r5], %fd44; -BB38_11: +BB40_11: bar.sync 0; -BB38_12: +BB40_12: setp.lt.u32 %p8, %r10, 256; - @%p8 bra BB38_16; + @%p8 bra BB40_16; setp.gt.u32 %p9, %r7, 127; - @%p9 bra BB38_15; + @%p9 bra BB40_15; ld.shared.f64 %fd34, [%r5+1024]; add.f64 %fd44, %fd44, %fd34; st.shared.f64 [%r5], %fd44; -BB38_15: +BB40_15: bar.sync 0; -BB38_16: +BB40_16: setp.lt.u32 %p10, %r10, 128; - @%p10 bra BB38_20; + @%p10 bra BB40_20; setp.gt.u32 %p11, %r7, 63; - @%p11 bra BB38_19; + @%p11 bra BB40_19; ld.shared.f64 %fd35, [%r5+512]; add.f64 %fd44, %fd44, %fd35; st.shared.f64 [%r5], %fd44; -BB38_19: +BB40_19: bar.sync 0; -BB38_20: +BB40_20: setp.gt.u32 %p12, %r7, 31; - @%p12 bra BB38_33; + @%p12 bra BB40_33; setp.lt.u32 %p13, %r10, 64; - @%p13 bra BB38_23; + @%p13 bra BB40_23; ld.volatile.shared.f64 %fd36, [%r5+256]; add.f64 %fd44, %fd44, %fd36; st.volatile.shared.f64 [%r5], %fd44; -BB38_23: +BB40_23: setp.lt.u32 %p14, %r10, 32; - @%p14 bra BB38_25; + @%p14 bra BB40_25; ld.volatile.shared.f64 %fd37, [%r5+128]; add.f64 %fd44, %fd44, %fd37; st.volatile.shared.f64 [%r5], %fd44; -BB38_25: +BB40_25: setp.lt.u32 %p15, %r10, 16; - @%p15 bra BB38_27; + @%p15 bra BB40_27; ld.volatile.shared.f64 %fd38, [%r5+64]; add.f64 %fd44, %fd44, %fd38; st.volatile.shared.f64 [%r5], %fd44; -BB38_27: +BB40_27: setp.lt.u32 %p16, %r10, 8; - @%p16 bra BB38_29; + @%p16 bra BB40_29; ld.volatile.shared.f64 %fd39, [%r5+32]; add.f64 %fd44, %fd44, %fd39; st.volatile.shared.f64 [%r5], %fd44; -BB38_29: +BB40_29: setp.lt.u32 %p17, %r10, 4; - @%p17 bra BB38_31; + @%p17 bra BB40_31; ld.volatile.shared.f64 %fd40, [%r5+16]; add.f64 %fd44, %fd44, %fd40; st.volatile.shared.f64 [%r5], %fd44; -BB38_31: +BB40_31: setp.lt.u32 %p18, %r10, 2; - @%p18 bra BB38_33; + @%p18 bra BB40_33; ld.volatile.shared.f64 %fd41, [%r5+8]; add.f64 %fd42, %fd44, %fd41; st.volatile.shared.f64 [%r5], %fd42; -BB38_33: +BB40_33: setp.ne.s32 %p19, %r7, 0; - @%p19 bra BB38_35; + @%p19 bra BB40_35; ld.shared.f64 %fd43, [my_sdata]; cvta.to.global.u64 %rd9, %rd2; @@ -5116,7 +6849,7 @@ BB38_33: add.s64 %rd11, %rd9, %rd10; st.global.f64 [%rd11], %fd43; -BB38_35: +BB40_35: ret; } @@ -5143,9 +6876,9 @@ BB38_35: mad.lo.s32 %r35, %r9, %r10, %r7; mov.f32 %f44, 0f00000000; setp.ge.u32 %p1, %r35, %r6; - @%p1 bra BB39_4; + @%p1 bra BB41_4; -BB39_1: +BB41_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.u32 %rd4, %r35, 4; add.s64 %rd5, %rd3, %rd4; @@ -5153,135 +6886,135 @@ BB39_1: add.f32 %f44, %f44, %f30; add.s32 %r3, %r35, %r10; setp.ge.u32 %p2, %r3, %r6; - @%p2 bra BB39_3; + @%p2 bra BB41_3; mul.wide.u32 %rd7, %r3, 4; add.s64 %rd8, %rd3, %rd7; ld.global.f32 %f31, [%rd8]; add.f32 %f44, %f44, %f31; -BB39_3: +BB41_3: shl.b32 %r13, %r10, 1; mov.u32 %r14, %nctaid.x; mad.lo.s32 %r35, %r13, %r14, %r35; setp.lt.u32 %p3, %r35, %r6; - @%p3 bra BB39_1; + @%p3 bra BB41_1; -BB39_4: +BB41_4: shl.b32 %r16, %r7, 2; mov.u32 %r17, my_sdata; add.s32 %r5, %r17, %r16; st.shared.f32 [%r5], %f44; bar.sync 0; setp.lt.u32 %p4, %r10, 1024; - @%p4 bra BB39_8; + @%p4 bra BB41_8; setp.gt.u32 %p5, %r7, 511; - @%p5 bra BB39_7; + @%p5 bra BB41_7; ld.shared.f32 %f32, [%r5+2048]; add.f32 %f44, %f44, %f32; st.shared.f32 [%r5], %f44; -BB39_7: +BB41_7: bar.sync 0; -BB39_8: +BB41_8: setp.lt.u32 %p6, %r10, 512; - @%p6 bra BB39_12; + @%p6 bra BB41_12; setp.gt.u32 %p7, %r7, 255; - @%p7 bra BB39_11; + @%p7 bra BB41_11; ld.shared.f32 %f33, [%r5+1024]; add.f32 %f44, %f44, %f33; st.shared.f32 [%r5], %f44; -BB39_11: +BB41_11: bar.sync 0; -BB39_12: +BB41_12: setp.lt.u32 %p8, %r10, 256; - @%p8 bra BB39_16; + @%p8 bra BB41_16; setp.gt.u32 %p9, %r7, 127; - @%p9 bra BB39_15; + @%p9 bra BB41_15; ld.shared.f32 %f34, [%r5+512]; add.f32 %f44, %f44, %f34; st.shared.f32 [%r5], %f44; -BB39_15: +BB41_15: bar.sync 0; -BB39_16: +BB41_16: setp.lt.u32 %p10, %r10, 128; - @%p10 bra BB39_20; + @%p10 bra BB41_20; setp.gt.u32 %p11, %r7, 63; - @%p11 bra BB39_19; + @%p11 bra BB41_19; ld.shared.f32 %f35, [%r5+256]; add.f32 %f44, %f44, %f35; st.shared.f32 [%r5], %f44; -BB39_19: +BB41_19: bar.sync 0; -BB39_20: +BB41_20: setp.gt.u32 %p12, %r7, 31; - @%p12 bra BB39_33; + @%p12 bra BB41_33; setp.lt.u32 %p13, %r10, 64; - @%p13 bra BB39_23; + @%p13 bra BB41_23; ld.volatile.shared.f32 %f36, [%r5+128]; add.f32 %f44, %f44, %f36; st.volatile.shared.f32 [%r5], %f44; -BB39_23: +BB41_23: setp.lt.u32 %p14, %r10, 32; - @%p14 bra BB39_25; + @%p14 bra BB41_25; ld.volatile.shared.f32 %f37, [%r5+64]; add.f32 %f44, %f44, %f37; st.volatile.shared.f32 [%r5], %f44; -BB39_25: +BB41_25: setp.lt.u32 %p15, %r10, 16; - @%p15 bra BB39_27; + @%p15 bra BB41_27; ld.volatile.shared.f32 %f38, [%r5+32]; add.f32 %f44, %f44, %f38; st.volatile.shared.f32 [%r5], %f44; -BB39_27: +BB41_27: setp.lt.u32 %p16, %r10, 8; - @%p16 bra BB39_29; + @%p16 bra BB41_29; ld.volatile.shared.f32 %f39, [%r5+16]; add.f32 %f44, %f44, %f39; st.volatile.shared.f32 [%r5], %f44; -BB39_29: +BB41_29: setp.lt.u32 %p17, %r10, 4; - @%p17 bra BB39_31; + @%p17 bra BB41_31; ld.volatile.shared.f32 %f40, [%r5+8]; add.f32 %f44, %f44, %f40; st.volatile.shared.f32 [%r5], %f44; -BB39_31: +BB41_31: setp.lt.u32 %p18, %r10, 2; - @%p18 bra BB39_33; + @%p18 bra BB41_33; ld.volatile.shared.f32 %f41, [%r5+4]; add.f32 %f42, %f44, %f41; st.volatile.shared.f32 [%r5], %f42; -BB39_33: +BB41_33: setp.ne.s32 %p19, %r7, 0; - @%p19 bra BB39_35; + @%p19 bra BB41_35; ld.shared.f32 %f43, [my_sdata]; cvta.to.global.u64 %rd9, %rd2; @@ -5289,7 +7022,7 @@ BB39_33: add.s64 %rd11, %rd9, %rd10; st.global.f32 [%rd11], %f43; -BB39_35: +BB41_35: ret; } @@ -5313,16 +7046,16 @@ BB39_35: ld.param.u32 %r4, [reduce_row_sum_d_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB40_35; + @%p1 bra BB42_35; mov.u32 %r71, %tid.x; mov.f64 %fd6, 0d0000000000000000; setp.ge.u32 %p2, %r71, %r4; - @%p2 bra BB40_4; + @%p2 bra BB42_4; cvta.to.global.u64 %rd3, %rd1; -BB40_3: +BB42_3: mad.lo.s32 %r8, %r6, %r4, %r71; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -5331,9 +7064,9 @@ BB40_3: mov.u32 %r9, %ntid.x; add.s32 %r71, %r9, %r71; setp.lt.u32 %p3, %r71, %r4; - @%p3 bra BB40_3; + @%p3 bra BB42_3; -BB40_4: +BB42_4: mov.u32 %r10, %tid.x; shl.b32 %r11, %r10, 3; mov.u32 %r12, my_sdata; @@ -5342,114 +7075,114 @@ BB40_4: bar.sync 0; mov.u32 %r14, %ntid.x; setp.lt.u32 %p4, %r14, 1024; - @%p4 bra BB40_8; + @%p4 bra BB42_8; setp.gt.u32 %p5, %r10, 511; - @%p5 bra BB40_7; + @%p5 bra BB42_7; ld.shared.f64 %fd29, [%r13+4096]; add.f64 %fd6, %fd6, %fd29; st.shared.f64 [%r13], %fd6; -BB40_7: +BB42_7: bar.sync 0; -BB40_8: +BB42_8: setp.lt.u32 %p6, %r14, 512; - @%p6 bra BB40_12; + @%p6 bra BB42_12; setp.gt.u32 %p7, %r10, 255; - @%p7 bra BB40_11; + @%p7 bra BB42_11; ld.shared.f64 %fd30, [%r13+2048]; add.f64 %fd6, %fd6, %fd30; st.shared.f64 [%r13], %fd6; -BB40_11: +BB42_11: bar.sync 0; -BB40_12: +BB42_12: setp.lt.u32 %p8, %r14, 256; - @%p8 bra BB40_16; + @%p8 bra BB42_16; setp.gt.u32 %p9, %r10, 127; - @%p9 bra BB40_15; + @%p9 bra BB42_15; ld.shared.f64 %fd31, [%r13+1024]; add.f64 %fd6, %fd6, %fd31; st.shared.f64 [%r13], %fd6; -BB40_15: +BB42_15: bar.sync 0; -BB40_16: +BB42_16: setp.lt.u32 %p10, %r14, 128; - @%p10 bra BB40_20; + @%p10 bra BB42_20; setp.gt.u32 %p11, %r10, 63; - @%p11 bra BB40_19; + @%p11 bra BB42_19; ld.shared.f64 %fd32, [%r13+512]; add.f64 %fd6, %fd6, %fd32; st.shared.f64 [%r13], %fd6; -BB40_19: +BB42_19: bar.sync 0; -BB40_20: +BB42_20: setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB40_33; + @%p12 bra BB42_33; setp.lt.u32 %p13, %r14, 64; - @%p13 bra BB40_23; + @%p13 bra BB42_23; ld.volatile.shared.f64 %fd33, [%r13+256]; add.f64 %fd6, %fd6, %fd33; st.volatile.shared.f64 [%r13], %fd6; -BB40_23: +BB42_23: setp.lt.u32 %p14, %r14, 32; - @%p14 bra BB40_25; + @%p14 bra BB42_25; ld.volatile.shared.f64 %fd34, [%r13+128]; add.f64 %fd6, %fd6, %fd34; st.volatile.shared.f64 [%r13], %fd6; -BB40_25: +BB42_25: setp.lt.u32 %p15, %r14, 16; - @%p15 bra BB40_27; + @%p15 bra BB42_27; ld.volatile.shared.f64 %fd35, [%r13+64]; add.f64 %fd6, %fd6, %fd35; st.volatile.shared.f64 [%r13], %fd6; -BB40_27: +BB42_27: setp.lt.u32 %p16, %r14, 8; - @%p16 bra BB40_29; + @%p16 bra BB42_29; ld.volatile.shared.f64 %fd36, [%r13+32]; add.f64 %fd6, %fd6, %fd36; st.volatile.shared.f64 [%r13], %fd6; -BB40_29: +BB42_29: setp.lt.u32 %p17, %r14, 4; - @%p17 bra BB40_31; + @%p17 bra BB42_31; ld.volatile.shared.f64 %fd37, [%r13+16]; add.f64 %fd6, %fd6, %fd37; st.volatile.shared.f64 [%r13], %fd6; -BB40_31: +BB42_31: setp.lt.u32 %p18, %r14, 2; - @%p18 bra BB40_33; + @%p18 bra BB42_33; ld.volatile.shared.f64 %fd38, [%r13+8]; add.f64 %fd39, %fd6, %fd38; st.volatile.shared.f64 [%r13], %fd39; -BB40_33: +BB42_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB40_35; + @%p19 bra BB42_35; ld.shared.f64 %fd40, [my_sdata]; cvta.to.global.u64 %rd6, %rd2; @@ -5457,7 +7190,7 @@ BB40_33: add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd40; -BB40_35: +BB42_35: ret; } @@ -5481,16 +7214,16 @@ BB40_35: ld.param.u32 %r4, [reduce_row_sum_f_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB41_35; + @%p1 bra BB43_35; mov.u32 %r71, %tid.x; mov.f32 %f6, 0f00000000; setp.ge.u32 %p2, %r71, %r4; - @%p2 bra BB41_4; + @%p2 bra BB43_4; cvta.to.global.u64 %rd3, %rd1; -BB41_3: +BB43_3: mad.lo.s32 %r8, %r6, %r4, %r71; mul.wide.u32 %rd4, %r8, 4; add.s64 %rd5, %rd3, %rd4; @@ -5499,9 +7232,9 @@ BB41_3: mov.u32 %r9, %ntid.x; add.s32 %r71, %r9, %r71; setp.lt.u32 %p3, %r71, %r4; - @%p3 bra BB41_3; + @%p3 bra BB43_3; -BB41_4: +BB43_4: mov.u32 %r10, %tid.x; shl.b32 %r11, %r10, 2; mov.u32 %r12, my_sdata; @@ -5510,114 +7243,114 @@ BB41_4: bar.sync 0; mov.u32 %r14, %ntid.x; setp.lt.u32 %p4, %r14, 1024; - @%p4 bra BB41_8; + @%p4 bra BB43_8; setp.gt.u32 %p5, %r10, 511; - @%p5 bra BB41_7; + @%p5 bra BB43_7; ld.shared.f32 %f29, [%r13+2048]; add.f32 %f6, %f6, %f29; st.shared.f32 [%r13], %f6; -BB41_7: +BB43_7: bar.sync 0; -BB41_8: +BB43_8: setp.lt.u32 %p6, %r14, 512; - @%p6 bra BB41_12; + @%p6 bra BB43_12; setp.gt.u32 %p7, %r10, 255; - @%p7 bra BB41_11; + @%p7 bra BB43_11; ld.shared.f32 %f30, [%r13+1024]; add.f32 %f6, %f6, %f30; st.shared.f32 [%r13], %f6; -BB41_11: +BB43_11: bar.sync 0; -BB41_12: +BB43_12: setp.lt.u32 %p8, %r14, 256; - @%p8 bra BB41_16; + @%p8 bra BB43_16; setp.gt.u32 %p9, %r10, 127; - @%p9 bra BB41_15; + @%p9 bra BB43_15; ld.shared.f32 %f31, [%r13+512]; add.f32 %f6, %f6, %f31; st.shared.f32 [%r13], %f6; -BB41_15: +BB43_15: bar.sync 0; -BB41_16: +BB43_16: setp.lt.u32 %p10, %r14, 128; - @%p10 bra BB41_20; + @%p10 bra BB43_20; setp.gt.u32 %p11, %r10, 63; - @%p11 bra BB41_19; + @%p11 bra BB43_19; ld.shared.f32 %f32, [%r13+256]; add.f32 %f6, %f6, %f32; st.shared.f32 [%r13], %f6; -BB41_19: +BB43_19: bar.sync 0; -BB41_20: +BB43_20: setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB41_33; + @%p12 bra BB43_33; setp.lt.u32 %p13, %r14, 64; - @%p13 bra BB41_23; + @%p13 bra BB43_23; ld.volatile.shared.f32 %f33, [%r13+128]; add.f32 %f6, %f6, %f33; st.volatile.shared.f32 [%r13], %f6; -BB41_23: +BB43_23: setp.lt.u32 %p14, %r14, 32; - @%p14 bra BB41_25; + @%p14 bra BB43_25; ld.volatile.shared.f32 %f34, [%r13+64]; add.f32 %f6, %f6, %f34; st.volatile.shared.f32 [%r13], %f6; -BB41_25: +BB43_25: setp.lt.u32 %p15, %r14, 16; - @%p15 bra BB41_27; + @%p15 bra BB43_27; ld.volatile.shared.f32 %f35, [%r13+32]; add.f32 %f6, %f6, %f35; st.volatile.shared.f32 [%r13], %f6; -BB41_27: +BB43_27: setp.lt.u32 %p16, %r14, 8; - @%p16 bra BB41_29; + @%p16 bra BB43_29; ld.volatile.shared.f32 %f36, [%r13+16]; add.f32 %f6, %f6, %f36; st.volatile.shared.f32 [%r13], %f6; -BB41_29: +BB43_29: setp.lt.u32 %p17, %r14, 4; - @%p17 bra BB41_31; + @%p17 bra BB43_31; ld.volatile.shared.f32 %f37, [%r13+8]; add.f32 %f6, %f6, %f37; st.volatile.shared.f32 [%r13], %f6; -BB41_31: +BB43_31: setp.lt.u32 %p18, %r14, 2; - @%p18 bra BB41_33; + @%p18 bra BB43_33; ld.volatile.shared.f32 %f38, [%r13+4]; add.f32 %f39, %f6, %f38; st.volatile.shared.f32 [%r13], %f39; -BB41_33: +BB43_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB41_35; + @%p19 bra BB43_35; ld.shared.f32 %f40, [my_sdata]; cvta.to.global.u64 %rd6, %rd2; @@ -5625,7 +7358,7 @@ BB41_33: add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f40; -BB41_35: +BB43_35: ret; } @@ -5652,32 +7385,32 @@ BB41_35: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB42_5; + @%p1 bra BB44_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f64 %fd8, 0d0000000000000000; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB42_4; + @%p2 bra BB44_4; mov.u32 %r10, %r1; -BB42_3: +BB44_3: mul.wide.u32 %rd4, %r10, 8; add.s64 %rd5, %rd1, %rd4; ld.global.f64 %fd6, [%rd5]; add.f64 %fd8, %fd8, %fd6; add.s32 %r10, %r10, %r6; setp.lt.u32 %p3, %r10, %r2; - @%p3 bra BB42_3; + @%p3 bra BB44_3; -BB42_4: +BB44_4: cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r1, 8; add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd8; -BB42_5: +BB44_5: ret; } @@ -5704,32 +7437,32 @@ BB42_5: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB43_5; + @%p1 bra BB45_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f32 %f8, 0f00000000; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB43_4; + @%p2 bra BB45_4; mov.u32 %r10, %r1; -BB43_3: +BB45_3: mul.wide.u32 %rd4, %r10, 4; add.s64 %rd5, %rd1, %rd4; ld.global.f32 %f6, [%rd5]; add.f32 %f8, %f8, %f6; add.s32 %r10, %r10, %r6; setp.lt.u32 %p3, %r10, %r2; - @%p3 bra BB43_3; + @%p3 bra BB45_3; -BB43_4: +BB45_4: cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f8; -BB43_5: +BB45_5: ret; } @@ -5756,9 +7489,9 @@ BB43_5: mad.lo.s32 %r35, %r9, %r10, %r7; mov.f64 %fd44, 0dFFEFFFFFFFFFFFFF; setp.ge.u32 %p1, %r35, %r6; - @%p1 bra BB44_4; + @%p1 bra BB46_4; -BB44_1: +BB46_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.u32 %rd4, %r35, 8; add.s64 %rd5, %rd3, %rd4; @@ -5766,135 +7499,135 @@ BB44_1: max.f64 %fd44, %fd44, %fd30; add.s32 %r3, %r35, %r10; setp.ge.u32 %p2, %r3, %r6; - @%p2 bra BB44_3; + @%p2 bra BB46_3; mul.wide.u32 %rd7, %r3, 8; add.s64 %rd8, %rd3, %rd7; ld.global.f64 %fd31, [%rd8]; max.f64 %fd44, %fd44, %fd31; -BB44_3: +BB46_3: shl.b32 %r13, %r10, 1; mov.u32 %r14, %nctaid.x; mad.lo.s32 %r35, %r13, %r14, %r35; setp.lt.u32 %p3, %r35, %r6; - @%p3 bra BB44_1; + @%p3 bra BB46_1; -BB44_4: +BB46_4: shl.b32 %r16, %r7, 3; mov.u32 %r17, my_sdata; add.s32 %r5, %r17, %r16; st.shared.f64 [%r5], %fd44; bar.sync 0; setp.lt.u32 %p4, %r10, 1024; - @%p4 bra BB44_8; + @%p4 bra BB46_8; setp.gt.u32 %p5, %r7, 511; - @%p5 bra BB44_7; + @%p5 bra BB46_7; ld.shared.f64 %fd32, [%r5+4096]; max.f64 %fd44, %fd44, %fd32; st.shared.f64 [%r5], %fd44; -BB44_7: +BB46_7: bar.sync 0; -BB44_8: +BB46_8: setp.lt.u32 %p6, %r10, 512; - @%p6 bra BB44_12; + @%p6 bra BB46_12; setp.gt.u32 %p7, %r7, 255; - @%p7 bra BB44_11; + @%p7 bra BB46_11; ld.shared.f64 %fd33, [%r5+2048]; max.f64 %fd44, %fd44, %fd33; st.shared.f64 [%r5], %fd44; -BB44_11: +BB46_11: bar.sync 0; -BB44_12: +BB46_12: setp.lt.u32 %p8, %r10, 256; - @%p8 bra BB44_16; + @%p8 bra BB46_16; setp.gt.u32 %p9, %r7, 127; - @%p9 bra BB44_15; + @%p9 bra BB46_15; ld.shared.f64 %fd34, [%r5+1024]; max.f64 %fd44, %fd44, %fd34; st.shared.f64 [%r5], %fd44; -BB44_15: +BB46_15: bar.sync 0; -BB44_16: +BB46_16: setp.lt.u32 %p10, %r10, 128; - @%p10 bra BB44_20; + @%p10 bra BB46_20; setp.gt.u32 %p11, %r7, 63; - @%p11 bra BB44_19; + @%p11 bra BB46_19; ld.shared.f64 %fd35, [%r5+512]; max.f64 %fd44, %fd44, %fd35; st.shared.f64 [%r5], %fd44; -BB44_19: +BB46_19: bar.sync 0; -BB44_20: +BB46_20: setp.gt.u32 %p12, %r7, 31; - @%p12 bra BB44_33; + @%p12 bra BB46_33; setp.lt.u32 %p13, %r10, 64; - @%p13 bra BB44_23; + @%p13 bra BB46_23; ld.volatile.shared.f64 %fd36, [%r5+256]; max.f64 %fd44, %fd44, %fd36; st.volatile.shared.f64 [%r5], %fd44; -BB44_23: +BB46_23: setp.lt.u32 %p14, %r10, 32; - @%p14 bra BB44_25; + @%p14 bra BB46_25; ld.volatile.shared.f64 %fd37, [%r5+128]; max.f64 %fd44, %fd44, %fd37; st.volatile.shared.f64 [%r5], %fd44; -BB44_25: +BB46_25: setp.lt.u32 %p15, %r10, 16; - @%p15 bra BB44_27; + @%p15 bra BB46_27; ld.volatile.shared.f64 %fd38, [%r5+64]; max.f64 %fd44, %fd44, %fd38; st.volatile.shared.f64 [%r5], %fd44; -BB44_27: +BB46_27: setp.lt.u32 %p16, %r10, 8; - @%p16 bra BB44_29; + @%p16 bra BB46_29; ld.volatile.shared.f64 %fd39, [%r5+32]; max.f64 %fd44, %fd44, %fd39; st.volatile.shared.f64 [%r5], %fd44; -BB44_29: +BB46_29: setp.lt.u32 %p17, %r10, 4; - @%p17 bra BB44_31; + @%p17 bra BB46_31; ld.volatile.shared.f64 %fd40, [%r5+16]; max.f64 %fd44, %fd44, %fd40; st.volatile.shared.f64 [%r5], %fd44; -BB44_31: +BB46_31: setp.lt.u32 %p18, %r10, 2; - @%p18 bra BB44_33; + @%p18 bra BB46_33; ld.volatile.shared.f64 %fd41, [%r5+8]; max.f64 %fd42, %fd44, %fd41; st.volatile.shared.f64 [%r5], %fd42; -BB44_33: +BB46_33: setp.ne.s32 %p19, %r7, 0; - @%p19 bra BB44_35; + @%p19 bra BB46_35; ld.shared.f64 %fd43, [my_sdata]; cvta.to.global.u64 %rd9, %rd2; @@ -5902,7 +7635,7 @@ BB44_33: add.s64 %rd11, %rd9, %rd10; st.global.f64 [%rd11], %fd43; -BB44_35: +BB46_35: ret; } @@ -5929,9 +7662,9 @@ BB44_35: mad.lo.s32 %r35, %r9, %r10, %r7; mov.f32 %f44, 0fFF7FFFFF; setp.ge.u32 %p1, %r35, %r6; - @%p1 bra BB45_4; + @%p1 bra BB47_4; -BB45_1: +BB47_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.u32 %rd4, %r35, 4; add.s64 %rd5, %rd3, %rd4; @@ -5939,135 +7672,135 @@ BB45_1: max.f32 %f44, %f44, %f30; add.s32 %r3, %r35, %r10; setp.ge.u32 %p2, %r3, %r6; - @%p2 bra BB45_3; + @%p2 bra BB47_3; mul.wide.u32 %rd7, %r3, 4; add.s64 %rd8, %rd3, %rd7; ld.global.f32 %f31, [%rd8]; max.f32 %f44, %f44, %f31; -BB45_3: +BB47_3: shl.b32 %r13, %r10, 1; mov.u32 %r14, %nctaid.x; mad.lo.s32 %r35, %r13, %r14, %r35; setp.lt.u32 %p3, %r35, %r6; - @%p3 bra BB45_1; + @%p3 bra BB47_1; -BB45_4: +BB47_4: shl.b32 %r16, %r7, 2; mov.u32 %r17, my_sdata; add.s32 %r5, %r17, %r16; st.shared.f32 [%r5], %f44; bar.sync 0; setp.lt.u32 %p4, %r10, 1024; - @%p4 bra BB45_8; + @%p4 bra BB47_8; setp.gt.u32 %p5, %r7, 511; - @%p5 bra BB45_7; + @%p5 bra BB47_7; ld.shared.f32 %f32, [%r5+2048]; max.f32 %f44, %f44, %f32; st.shared.f32 [%r5], %f44; -BB45_7: +BB47_7: bar.sync 0; -BB45_8: +BB47_8: setp.lt.u32 %p6, %r10, 512; - @%p6 bra BB45_12; + @%p6 bra BB47_12; setp.gt.u32 %p7, %r7, 255; - @%p7 bra BB45_11; + @%p7 bra BB47_11; ld.shared.f32 %f33, [%r5+1024]; max.f32 %f44, %f44, %f33; st.shared.f32 [%r5], %f44; -BB45_11: +BB47_11: bar.sync 0; -BB45_12: +BB47_12: setp.lt.u32 %p8, %r10, 256; - @%p8 bra BB45_16; + @%p8 bra BB47_16; setp.gt.u32 %p9, %r7, 127; - @%p9 bra BB45_15; + @%p9 bra BB47_15; ld.shared.f32 %f34, [%r5+512]; max.f32 %f44, %f44, %f34; st.shared.f32 [%r5], %f44; -BB45_15: +BB47_15: bar.sync 0; -BB45_16: +BB47_16: setp.lt.u32 %p10, %r10, 128; - @%p10 bra BB45_20; + @%p10 bra BB47_20; setp.gt.u32 %p11, %r7, 63; - @%p11 bra BB45_19; + @%p11 bra BB47_19; ld.shared.f32 %f35, [%r5+256]; max.f32 %f44, %f44, %f35; st.shared.f32 [%r5], %f44; -BB45_19: +BB47_19: bar.sync 0; -BB45_20: +BB47_20: setp.gt.u32 %p12, %r7, 31; - @%p12 bra BB45_33; + @%p12 bra BB47_33; setp.lt.u32 %p13, %r10, 64; - @%p13 bra BB45_23; + @%p13 bra BB47_23; ld.volatile.shared.f32 %f36, [%r5+128]; max.f32 %f44, %f44, %f36; st.volatile.shared.f32 [%r5], %f44; -BB45_23: +BB47_23: setp.lt.u32 %p14, %r10, 32; - @%p14 bra BB45_25; + @%p14 bra BB47_25; ld.volatile.shared.f32 %f37, [%r5+64]; max.f32 %f44, %f44, %f37; st.volatile.shared.f32 [%r5], %f44; -BB45_25: +BB47_25: setp.lt.u32 %p15, %r10, 16; - @%p15 bra BB45_27; + @%p15 bra BB47_27; ld.volatile.shared.f32 %f38, [%r5+32]; max.f32 %f44, %f44, %f38; st.volatile.shared.f32 [%r5], %f44; -BB45_27: +BB47_27: setp.lt.u32 %p16, %r10, 8; - @%p16 bra BB45_29; + @%p16 bra BB47_29; ld.volatile.shared.f32 %f39, [%r5+16]; max.f32 %f44, %f44, %f39; st.volatile.shared.f32 [%r5], %f44; -BB45_29: +BB47_29: setp.lt.u32 %p17, %r10, 4; - @%p17 bra BB45_31; + @%p17 bra BB47_31; ld.volatile.shared.f32 %f40, [%r5+8]; max.f32 %f44, %f44, %f40; st.volatile.shared.f32 [%r5], %f44; -BB45_31: +BB47_31: setp.lt.u32 %p18, %r10, 2; - @%p18 bra BB45_33; + @%p18 bra BB47_33; ld.volatile.shared.f32 %f41, [%r5+4]; max.f32 %f42, %f44, %f41; st.volatile.shared.f32 [%r5], %f42; -BB45_33: +BB47_33: setp.ne.s32 %p19, %r7, 0; - @%p19 bra BB45_35; + @%p19 bra BB47_35; ld.shared.f32 %f43, [my_sdata]; cvta.to.global.u64 %rd9, %rd2; @@ -6075,7 +7808,7 @@ BB45_33: add.s64 %rd11, %rd9, %rd10; st.global.f32 [%rd11], %f43; -BB45_35: +BB47_35: ret; } @@ -6099,16 +7832,16 @@ BB45_35: ld.param.u32 %r4, [reduce_row_max_d_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB46_35; + @%p1 bra BB48_35; mov.u32 %r71, %tid.x; mov.f64 %fd6, 0dFFEFFFFFFFFFFFFF; setp.ge.u32 %p2, %r71, %r4; - @%p2 bra BB46_4; + @%p2 bra BB48_4; cvta.to.global.u64 %rd3, %rd1; -BB46_3: +BB48_3: mad.lo.s32 %r8, %r6, %r4, %r71; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -6117,9 +7850,9 @@ BB46_3: mov.u32 %r9, %ntid.x; add.s32 %r71, %r9, %r71; setp.lt.u32 %p3, %r71, %r4; - @%p3 bra BB46_3; + @%p3 bra BB48_3; -BB46_4: +BB48_4: mov.u32 %r10, %tid.x; shl.b32 %r11, %r10, 3; mov.u32 %r12, my_sdata; @@ -6128,114 +7861,114 @@ BB46_4: bar.sync 0; mov.u32 %r14, %ntid.x; setp.lt.u32 %p4, %r14, 1024; - @%p4 bra BB46_8; + @%p4 bra BB48_8; setp.gt.u32 %p5, %r10, 511; - @%p5 bra BB46_7; + @%p5 bra BB48_7; ld.shared.f64 %fd29, [%r13+4096]; max.f64 %fd6, %fd6, %fd29; st.shared.f64 [%r13], %fd6; -BB46_7: +BB48_7: bar.sync 0; -BB46_8: +BB48_8: setp.lt.u32 %p6, %r14, 512; - @%p6 bra BB46_12; + @%p6 bra BB48_12; setp.gt.u32 %p7, %r10, 255; - @%p7 bra BB46_11; + @%p7 bra BB48_11; ld.shared.f64 %fd30, [%r13+2048]; max.f64 %fd6, %fd6, %fd30; st.shared.f64 [%r13], %fd6; -BB46_11: +BB48_11: bar.sync 0; -BB46_12: +BB48_12: setp.lt.u32 %p8, %r14, 256; - @%p8 bra BB46_16; + @%p8 bra BB48_16; setp.gt.u32 %p9, %r10, 127; - @%p9 bra BB46_15; + @%p9 bra BB48_15; ld.shared.f64 %fd31, [%r13+1024]; max.f64 %fd6, %fd6, %fd31; st.shared.f64 [%r13], %fd6; -BB46_15: +BB48_15: bar.sync 0; -BB46_16: +BB48_16: setp.lt.u32 %p10, %r14, 128; - @%p10 bra BB46_20; + @%p10 bra BB48_20; setp.gt.u32 %p11, %r10, 63; - @%p11 bra BB46_19; + @%p11 bra BB48_19; ld.shared.f64 %fd32, [%r13+512]; max.f64 %fd6, %fd6, %fd32; st.shared.f64 [%r13], %fd6; -BB46_19: +BB48_19: bar.sync 0; -BB46_20: +BB48_20: setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB46_33; + @%p12 bra BB48_33; setp.lt.u32 %p13, %r14, 64; - @%p13 bra BB46_23; + @%p13 bra BB48_23; ld.volatile.shared.f64 %fd33, [%r13+256]; max.f64 %fd6, %fd6, %fd33; st.volatile.shared.f64 [%r13], %fd6; -BB46_23: +BB48_23: setp.lt.u32 %p14, %r14, 32; - @%p14 bra BB46_25; + @%p14 bra BB48_25; ld.volatile.shared.f64 %fd34, [%r13+128]; max.f64 %fd6, %fd6, %fd34; st.volatile.shared.f64 [%r13], %fd6; -BB46_25: +BB48_25: setp.lt.u32 %p15, %r14, 16; - @%p15 bra BB46_27; + @%p15 bra BB48_27; ld.volatile.shared.f64 %fd35, [%r13+64]; max.f64 %fd6, %fd6, %fd35; st.volatile.shared.f64 [%r13], %fd6; -BB46_27: +BB48_27: setp.lt.u32 %p16, %r14, 8; - @%p16 bra BB46_29; + @%p16 bra BB48_29; ld.volatile.shared.f64 %fd36, [%r13+32]; max.f64 %fd6, %fd6, %fd36; st.volatile.shared.f64 [%r13], %fd6; -BB46_29: +BB48_29: setp.lt.u32 %p17, %r14, 4; - @%p17 bra BB46_31; + @%p17 bra BB48_31; ld.volatile.shared.f64 %fd37, [%r13+16]; max.f64 %fd6, %fd6, %fd37; st.volatile.shared.f64 [%r13], %fd6; -BB46_31: +BB48_31: setp.lt.u32 %p18, %r14, 2; - @%p18 bra BB46_33; + @%p18 bra BB48_33; ld.volatile.shared.f64 %fd38, [%r13+8]; max.f64 %fd39, %fd6, %fd38; st.volatile.shared.f64 [%r13], %fd39; -BB46_33: +BB48_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB46_35; + @%p19 bra BB48_35; ld.shared.f64 %fd40, [my_sdata]; cvta.to.global.u64 %rd6, %rd2; @@ -6243,7 +7976,7 @@ BB46_33: add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd40; -BB46_35: +BB48_35: ret; } @@ -6267,16 +8000,16 @@ BB46_35: ld.param.u32 %r4, [reduce_row_max_f_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB47_35; + @%p1 bra BB49_35; mov.u32 %r71, %tid.x; mov.f32 %f6, 0fFF7FFFFF; setp.ge.u32 %p2, %r71, %r4; - @%p2 bra BB47_4; + @%p2 bra BB49_4; cvta.to.global.u64 %rd3, %rd1; -BB47_3: +BB49_3: mad.lo.s32 %r8, %r6, %r4, %r71; mul.wide.u32 %rd4, %r8, 4; add.s64 %rd5, %rd3, %rd4; @@ -6285,9 +8018,9 @@ BB47_3: mov.u32 %r9, %ntid.x; add.s32 %r71, %r9, %r71; setp.lt.u32 %p3, %r71, %r4; - @%p3 bra BB47_3; + @%p3 bra BB49_3; -BB47_4: +BB49_4: mov.u32 %r10, %tid.x; shl.b32 %r11, %r10, 2; mov.u32 %r12, my_sdata; @@ -6296,114 +8029,114 @@ BB47_4: bar.sync 0; mov.u32 %r14, %ntid.x; setp.lt.u32 %p4, %r14, 1024; - @%p4 bra BB47_8; + @%p4 bra BB49_8; setp.gt.u32 %p5, %r10, 511; - @%p5 bra BB47_7; + @%p5 bra BB49_7; ld.shared.f32 %f29, [%r13+2048]; max.f32 %f6, %f6, %f29; st.shared.f32 [%r13], %f6; -BB47_7: +BB49_7: bar.sync 0; -BB47_8: +BB49_8: setp.lt.u32 %p6, %r14, 512; - @%p6 bra BB47_12; + @%p6 bra BB49_12; setp.gt.u32 %p7, %r10, 255; - @%p7 bra BB47_11; + @%p7 bra BB49_11; ld.shared.f32 %f30, [%r13+1024]; max.f32 %f6, %f6, %f30; st.shared.f32 [%r13], %f6; -BB47_11: +BB49_11: bar.sync 0; -BB47_12: +BB49_12: setp.lt.u32 %p8, %r14, 256; - @%p8 bra BB47_16; + @%p8 bra BB49_16; setp.gt.u32 %p9, %r10, 127; - @%p9 bra BB47_15; + @%p9 bra BB49_15; ld.shared.f32 %f31, [%r13+512]; max.f32 %f6, %f6, %f31; st.shared.f32 [%r13], %f6; -BB47_15: +BB49_15: bar.sync 0; -BB47_16: +BB49_16: setp.lt.u32 %p10, %r14, 128; - @%p10 bra BB47_20; + @%p10 bra BB49_20; setp.gt.u32 %p11, %r10, 63; - @%p11 bra BB47_19; + @%p11 bra BB49_19; ld.shared.f32 %f32, [%r13+256]; max.f32 %f6, %f6, %f32; st.shared.f32 [%r13], %f6; -BB47_19: +BB49_19: bar.sync 0; -BB47_20: +BB49_20: setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB47_33; + @%p12 bra BB49_33; setp.lt.u32 %p13, %r14, 64; - @%p13 bra BB47_23; + @%p13 bra BB49_23; ld.volatile.shared.f32 %f33, [%r13+128]; max.f32 %f6, %f6, %f33; st.volatile.shared.f32 [%r13], %f6; -BB47_23: +BB49_23: setp.lt.u32 %p14, %r14, 32; - @%p14 bra BB47_25; + @%p14 bra BB49_25; ld.volatile.shared.f32 %f34, [%r13+64]; max.f32 %f6, %f6, %f34; st.volatile.shared.f32 [%r13], %f6; -BB47_25: +BB49_25: setp.lt.u32 %p15, %r14, 16; - @%p15 bra BB47_27; + @%p15 bra BB49_27; ld.volatile.shared.f32 %f35, [%r13+32]; max.f32 %f6, %f6, %f35; st.volatile.shared.f32 [%r13], %f6; -BB47_27: +BB49_27: setp.lt.u32 %p16, %r14, 8; - @%p16 bra BB47_29; + @%p16 bra BB49_29; ld.volatile.shared.f32 %f36, [%r13+16]; max.f32 %f6, %f6, %f36; st.volatile.shared.f32 [%r13], %f6; -BB47_29: +BB49_29: setp.lt.u32 %p17, %r14, 4; - @%p17 bra BB47_31; + @%p17 bra BB49_31; ld.volatile.shared.f32 %f37, [%r13+8]; max.f32 %f6, %f6, %f37; st.volatile.shared.f32 [%r13], %f6; -BB47_31: +BB49_31: setp.lt.u32 %p18, %r14, 2; - @%p18 bra BB47_33; + @%p18 bra BB49_33; ld.volatile.shared.f32 %f38, [%r13+4]; max.f32 %f39, %f6, %f38; st.volatile.shared.f32 [%r13], %f39; -BB47_33: +BB49_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB47_35; + @%p19 bra BB49_35; ld.shared.f32 %f40, [my_sdata]; cvta.to.global.u64 %rd6, %rd2; @@ -6411,7 +8144,7 @@ BB47_33: add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f40; -BB47_35: +BB49_35: ret; } @@ -6438,32 +8171,32 @@ BB47_35: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB48_5; + @%p1 bra BB50_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f64 %fd8, 0dFFEFFFFFFFFFFFFF; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB48_4; + @%p2 bra BB50_4; mov.u32 %r10, %r1; -BB48_3: +BB50_3: mul.wide.u32 %rd4, %r10, 8; add.s64 %rd5, %rd1, %rd4; ld.global.f64 %fd6, [%rd5]; max.f64 %fd8, %fd8, %fd6; add.s32 %r10, %r10, %r6; setp.lt.u32 %p3, %r10, %r2; - @%p3 bra BB48_3; + @%p3 bra BB50_3; -BB48_4: +BB50_4: cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r1, 8; add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd8; -BB48_5: +BB50_5: ret; } @@ -6490,32 +8223,32 @@ BB48_5: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB49_5; + @%p1 bra BB51_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f32 %f8, 0fFF7FFFFF; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB49_4; + @%p2 bra BB51_4; mov.u32 %r10, %r1; -BB49_3: +BB51_3: mul.wide.u32 %rd4, %r10, 4; add.s64 %rd5, %rd1, %rd4; ld.global.f32 %f6, [%rd5]; max.f32 %f8, %f8, %f6; add.s32 %r10, %r10, %r6; setp.lt.u32 %p3, %r10, %r2; - @%p3 bra BB49_3; + @%p3 bra BB51_3; -BB49_4: +BB51_4: cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f8; -BB49_5: +BB51_5: ret; } @@ -6542,9 +8275,9 @@ BB49_5: mad.lo.s32 %r35, %r9, %r10, %r7; mov.f64 %fd44, 0d7FEFFFFFFFFFFFFF; setp.ge.u32 %p1, %r35, %r6; - @%p1 bra BB50_4; + @%p1 bra BB52_4; -BB50_1: +BB52_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.u32 %rd4, %r35, 8; add.s64 %rd5, %rd3, %rd4; @@ -6552,135 +8285,135 @@ BB50_1: min.f64 %fd44, %fd44, %fd30; add.s32 %r3, %r35, %r10; setp.ge.u32 %p2, %r3, %r6; - @%p2 bra BB50_3; + @%p2 bra BB52_3; mul.wide.u32 %rd7, %r3, 8; add.s64 %rd8, %rd3, %rd7; ld.global.f64 %fd31, [%rd8]; min.f64 %fd44, %fd44, %fd31; -BB50_3: +BB52_3: shl.b32 %r13, %r10, 1; mov.u32 %r14, %nctaid.x; mad.lo.s32 %r35, %r13, %r14, %r35; setp.lt.u32 %p3, %r35, %r6; - @%p3 bra BB50_1; + @%p3 bra BB52_1; -BB50_4: +BB52_4: shl.b32 %r16, %r7, 3; mov.u32 %r17, my_sdata; add.s32 %r5, %r17, %r16; st.shared.f64 [%r5], %fd44; bar.sync 0; setp.lt.u32 %p4, %r10, 1024; - @%p4 bra BB50_8; + @%p4 bra BB52_8; setp.gt.u32 %p5, %r7, 511; - @%p5 bra BB50_7; + @%p5 bra BB52_7; ld.shared.f64 %fd32, [%r5+4096]; min.f64 %fd44, %fd44, %fd32; st.shared.f64 [%r5], %fd44; -BB50_7: +BB52_7: bar.sync 0; -BB50_8: +BB52_8: setp.lt.u32 %p6, %r10, 512; - @%p6 bra BB50_12; + @%p6 bra BB52_12; setp.gt.u32 %p7, %r7, 255; - @%p7 bra BB50_11; + @%p7 bra BB52_11; ld.shared.f64 %fd33, [%r5+2048]; min.f64 %fd44, %fd44, %fd33; st.shared.f64 [%r5], %fd44; -BB50_11: +BB52_11: bar.sync 0; -BB50_12: +BB52_12: setp.lt.u32 %p8, %r10, 256; - @%p8 bra BB50_16; + @%p8 bra BB52_16; setp.gt.u32 %p9, %r7, 127; - @%p9 bra BB50_15; + @%p9 bra BB52_15; ld.shared.f64 %fd34, [%r5+1024]; min.f64 %fd44, %fd44, %fd34; st.shared.f64 [%r5], %fd44; -BB50_15: +BB52_15: bar.sync 0; -BB50_16: +BB52_16: setp.lt.u32 %p10, %r10, 128; - @%p10 bra BB50_20; + @%p10 bra BB52_20; setp.gt.u32 %p11, %r7, 63; - @%p11 bra BB50_19; + @%p11 bra BB52_19; ld.shared.f64 %fd35, [%r5+512]; min.f64 %fd44, %fd44, %fd35; st.shared.f64 [%r5], %fd44; -BB50_19: +BB52_19: bar.sync 0; -BB50_20: +BB52_20: setp.gt.u32 %p12, %r7, 31; - @%p12 bra BB50_33; + @%p12 bra BB52_33; setp.lt.u32 %p13, %r10, 64; - @%p13 bra BB50_23; + @%p13 bra BB52_23; ld.volatile.shared.f64 %fd36, [%r5+256]; min.f64 %fd44, %fd44, %fd36; st.volatile.shared.f64 [%r5], %fd44; -BB50_23: +BB52_23: setp.lt.u32 %p14, %r10, 32; - @%p14 bra BB50_25; + @%p14 bra BB52_25; ld.volatile.shared.f64 %fd37, [%r5+128]; min.f64 %fd44, %fd44, %fd37; st.volatile.shared.f64 [%r5], %fd44; -BB50_25: +BB52_25: setp.lt.u32 %p15, %r10, 16; - @%p15 bra BB50_27; + @%p15 bra BB52_27; ld.volatile.shared.f64 %fd38, [%r5+64]; min.f64 %fd44, %fd44, %fd38; st.volatile.shared.f64 [%r5], %fd44; -BB50_27: +BB52_27: setp.lt.u32 %p16, %r10, 8; - @%p16 bra BB50_29; + @%p16 bra BB52_29; ld.volatile.shared.f64 %fd39, [%r5+32]; min.f64 %fd44, %fd44, %fd39; st.volatile.shared.f64 [%r5], %fd44; -BB50_29: +BB52_29: setp.lt.u32 %p17, %r10, 4; - @%p17 bra BB50_31; + @%p17 bra BB52_31; ld.volatile.shared.f64 %fd40, [%r5+16]; min.f64 %fd44, %fd44, %fd40; st.volatile.shared.f64 [%r5], %fd44; -BB50_31: +BB52_31: setp.lt.u32 %p18, %r10, 2; - @%p18 bra BB50_33; + @%p18 bra BB52_33; ld.volatile.shared.f64 %fd41, [%r5+8]; min.f64 %fd42, %fd44, %fd41; st.volatile.shared.f64 [%r5], %fd42; -BB50_33: +BB52_33: setp.ne.s32 %p19, %r7, 0; - @%p19 bra BB50_35; + @%p19 bra BB52_35; ld.shared.f64 %fd43, [my_sdata]; cvta.to.global.u64 %rd9, %rd2; @@ -6688,7 +8421,7 @@ BB50_33: add.s64 %rd11, %rd9, %rd10; st.global.f64 [%rd11], %fd43; -BB50_35: +BB52_35: ret; } @@ -6715,9 +8448,9 @@ BB50_35: mad.lo.s32 %r35, %r9, %r10, %r7; mov.f32 %f44, 0f7F7FFFFF; setp.ge.u32 %p1, %r35, %r6; - @%p1 bra BB51_4; + @%p1 bra BB53_4; -BB51_1: +BB53_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.u32 %rd4, %r35, 4; add.s64 %rd5, %rd3, %rd4; @@ -6725,135 +8458,135 @@ BB51_1: min.f32 %f44, %f44, %f30; add.s32 %r3, %r35, %r10; setp.ge.u32 %p2, %r3, %r6; - @%p2 bra BB51_3; + @%p2 bra BB53_3; mul.wide.u32 %rd7, %r3, 4; add.s64 %rd8, %rd3, %rd7; ld.global.f32 %f31, [%rd8]; min.f32 %f44, %f44, %f31; -BB51_3: +BB53_3: shl.b32 %r13, %r10, 1; mov.u32 %r14, %nctaid.x; mad.lo.s32 %r35, %r13, %r14, %r35; setp.lt.u32 %p3, %r35, %r6; - @%p3 bra BB51_1; + @%p3 bra BB53_1; -BB51_4: +BB53_4: shl.b32 %r16, %r7, 2; mov.u32 %r17, my_sdata; add.s32 %r5, %r17, %r16; st.shared.f32 [%r5], %f44; bar.sync 0; setp.lt.u32 %p4, %r10, 1024; - @%p4 bra BB51_8; + @%p4 bra BB53_8; setp.gt.u32 %p5, %r7, 511; - @%p5 bra BB51_7; + @%p5 bra BB53_7; ld.shared.f32 %f32, [%r5+2048]; min.f32 %f44, %f44, %f32; st.shared.f32 [%r5], %f44; -BB51_7: +BB53_7: bar.sync 0; -BB51_8: +BB53_8: setp.lt.u32 %p6, %r10, 512; - @%p6 bra BB51_12; + @%p6 bra BB53_12; setp.gt.u32 %p7, %r7, 255; - @%p7 bra BB51_11; + @%p7 bra BB53_11; ld.shared.f32 %f33, [%r5+1024]; min.f32 %f44, %f44, %f33; st.shared.f32 [%r5], %f44; -BB51_11: +BB53_11: bar.sync 0; -BB51_12: +BB53_12: setp.lt.u32 %p8, %r10, 256; - @%p8 bra BB51_16; + @%p8 bra BB53_16; setp.gt.u32 %p9, %r7, 127; - @%p9 bra BB51_15; + @%p9 bra BB53_15; ld.shared.f32 %f34, [%r5+512]; min.f32 %f44, %f44, %f34; st.shared.f32 [%r5], %f44; -BB51_15: +BB53_15: bar.sync 0; -BB51_16: +BB53_16: setp.lt.u32 %p10, %r10, 128; - @%p10 bra BB51_20; + @%p10 bra BB53_20; setp.gt.u32 %p11, %r7, 63; - @%p11 bra BB51_19; + @%p11 bra BB53_19; ld.shared.f32 %f35, [%r5+256]; min.f32 %f44, %f44, %f35; st.shared.f32 [%r5], %f44; -BB51_19: +BB53_19: bar.sync 0; -BB51_20: +BB53_20: setp.gt.u32 %p12, %r7, 31; - @%p12 bra BB51_33; + @%p12 bra BB53_33; setp.lt.u32 %p13, %r10, 64; - @%p13 bra BB51_23; + @%p13 bra BB53_23; ld.volatile.shared.f32 %f36, [%r5+128]; min.f32 %f44, %f44, %f36; st.volatile.shared.f32 [%r5], %f44; -BB51_23: +BB53_23: setp.lt.u32 %p14, %r10, 32; - @%p14 bra BB51_25; + @%p14 bra BB53_25; ld.volatile.shared.f32 %f37, [%r5+64]; min.f32 %f44, %f44, %f37; st.volatile.shared.f32 [%r5], %f44; -BB51_25: +BB53_25: setp.lt.u32 %p15, %r10, 16; - @%p15 bra BB51_27; + @%p15 bra BB53_27; ld.volatile.shared.f32 %f38, [%r5+32]; min.f32 %f44, %f44, %f38; st.volatile.shared.f32 [%r5], %f44; -BB51_27: +BB53_27: setp.lt.u32 %p16, %r10, 8; - @%p16 bra BB51_29; + @%p16 bra BB53_29; ld.volatile.shared.f32 %f39, [%r5+16]; min.f32 %f44, %f44, %f39; st.volatile.shared.f32 [%r5], %f44; -BB51_29: +BB53_29: setp.lt.u32 %p17, %r10, 4; - @%p17 bra BB51_31; + @%p17 bra BB53_31; ld.volatile.shared.f32 %f40, [%r5+8]; min.f32 %f44, %f44, %f40; st.volatile.shared.f32 [%r5], %f44; -BB51_31: +BB53_31: setp.lt.u32 %p18, %r10, 2; - @%p18 bra BB51_33; + @%p18 bra BB53_33; ld.volatile.shared.f32 %f41, [%r5+4]; min.f32 %f42, %f44, %f41; st.volatile.shared.f32 [%r5], %f42; -BB51_33: +BB53_33: setp.ne.s32 %p19, %r7, 0; - @%p19 bra BB51_35; + @%p19 bra BB53_35; ld.shared.f32 %f43, [my_sdata]; cvta.to.global.u64 %rd9, %rd2; @@ -6861,7 +8594,7 @@ BB51_33: add.s64 %rd11, %rd9, %rd10; st.global.f32 [%rd11], %f43; -BB51_35: +BB53_35: ret; } @@ -6885,16 +8618,16 @@ BB51_35: ld.param.u32 %r4, [reduce_row_min_d_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB52_35; + @%p1 bra BB54_35; mov.u32 %r71, %tid.x; mov.f64 %fd6, 0d7FEFFFFFFFFFFFFF; setp.ge.u32 %p2, %r71, %r4; - @%p2 bra BB52_4; + @%p2 bra BB54_4; cvta.to.global.u64 %rd3, %rd1; -BB52_3: +BB54_3: mad.lo.s32 %r8, %r6, %r4, %r71; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -6903,9 +8636,9 @@ BB52_3: mov.u32 %r9, %ntid.x; add.s32 %r71, %r9, %r71; setp.lt.u32 %p3, %r71, %r4; - @%p3 bra BB52_3; + @%p3 bra BB54_3; -BB52_4: +BB54_4: mov.u32 %r10, %tid.x; shl.b32 %r11, %r10, 3; mov.u32 %r12, my_sdata; @@ -6914,114 +8647,114 @@ BB52_4: bar.sync 0; mov.u32 %r14, %ntid.x; setp.lt.u32 %p4, %r14, 1024; - @%p4 bra BB52_8; + @%p4 bra BB54_8; setp.gt.u32 %p5, %r10, 511; - @%p5 bra BB52_7; + @%p5 bra BB54_7; ld.shared.f64 %fd29, [%r13+4096]; min.f64 %fd6, %fd6, %fd29; st.shared.f64 [%r13], %fd6; -BB52_7: +BB54_7: bar.sync 0; -BB52_8: +BB54_8: setp.lt.u32 %p6, %r14, 512; - @%p6 bra BB52_12; + @%p6 bra BB54_12; setp.gt.u32 %p7, %r10, 255; - @%p7 bra BB52_11; + @%p7 bra BB54_11; ld.shared.f64 %fd30, [%r13+2048]; min.f64 %fd6, %fd6, %fd30; st.shared.f64 [%r13], %fd6; -BB52_11: +BB54_11: bar.sync 0; -BB52_12: +BB54_12: setp.lt.u32 %p8, %r14, 256; - @%p8 bra BB52_16; + @%p8 bra BB54_16; setp.gt.u32 %p9, %r10, 127; - @%p9 bra BB52_15; + @%p9 bra BB54_15; ld.shared.f64 %fd31, [%r13+1024]; min.f64 %fd6, %fd6, %fd31; st.shared.f64 [%r13], %fd6; -BB52_15: +BB54_15: bar.sync 0; -BB52_16: +BB54_16: setp.lt.u32 %p10, %r14, 128; - @%p10 bra BB52_20; + @%p10 bra BB54_20; setp.gt.u32 %p11, %r10, 63; - @%p11 bra BB52_19; + @%p11 bra BB54_19; ld.shared.f64 %fd32, [%r13+512]; min.f64 %fd6, %fd6, %fd32; st.shared.f64 [%r13], %fd6; -BB52_19: +BB54_19: bar.sync 0; -BB52_20: +BB54_20: setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB52_33; + @%p12 bra BB54_33; setp.lt.u32 %p13, %r14, 64; - @%p13 bra BB52_23; + @%p13 bra BB54_23; ld.volatile.shared.f64 %fd33, [%r13+256]; min.f64 %fd6, %fd6, %fd33; st.volatile.shared.f64 [%r13], %fd6; -BB52_23: +BB54_23: setp.lt.u32 %p14, %r14, 32; - @%p14 bra BB52_25; + @%p14 bra BB54_25; ld.volatile.shared.f64 %fd34, [%r13+128]; min.f64 %fd6, %fd6, %fd34; st.volatile.shared.f64 [%r13], %fd6; -BB52_25: +BB54_25: setp.lt.u32 %p15, %r14, 16; - @%p15 bra BB52_27; + @%p15 bra BB54_27; ld.volatile.shared.f64 %fd35, [%r13+64]; min.f64 %fd6, %fd6, %fd35; st.volatile.shared.f64 [%r13], %fd6; -BB52_27: +BB54_27: setp.lt.u32 %p16, %r14, 8; - @%p16 bra BB52_29; + @%p16 bra BB54_29; ld.volatile.shared.f64 %fd36, [%r13+32]; min.f64 %fd6, %fd6, %fd36; st.volatile.shared.f64 [%r13], %fd6; -BB52_29: +BB54_29: setp.lt.u32 %p17, %r14, 4; - @%p17 bra BB52_31; + @%p17 bra BB54_31; ld.volatile.shared.f64 %fd37, [%r13+16]; min.f64 %fd6, %fd6, %fd37; st.volatile.shared.f64 [%r13], %fd6; -BB52_31: +BB54_31: setp.lt.u32 %p18, %r14, 2; - @%p18 bra BB52_33; + @%p18 bra BB54_33; ld.volatile.shared.f64 %fd38, [%r13+8]; min.f64 %fd39, %fd6, %fd38; st.volatile.shared.f64 [%r13], %fd39; -BB52_33: +BB54_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB52_35; + @%p19 bra BB54_35; ld.shared.f64 %fd40, [my_sdata]; cvta.to.global.u64 %rd6, %rd2; @@ -7029,7 +8762,7 @@ BB52_33: add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd40; -BB52_35: +BB54_35: ret; } @@ -7053,16 +8786,16 @@ BB52_35: ld.param.u32 %r4, [reduce_row_min_f_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB53_35; + @%p1 bra BB55_35; mov.u32 %r71, %tid.x; mov.f32 %f6, 0f7F7FFFFF; setp.ge.u32 %p2, %r71, %r4; - @%p2 bra BB53_4; + @%p2 bra BB55_4; cvta.to.global.u64 %rd3, %rd1; -BB53_3: +BB55_3: mad.lo.s32 %r8, %r6, %r4, %r71; mul.wide.u32 %rd4, %r8, 4; add.s64 %rd5, %rd3, %rd4; @@ -7071,9 +8804,9 @@ BB53_3: mov.u32 %r9, %ntid.x; add.s32 %r71, %r9, %r71; setp.lt.u32 %p3, %r71, %r4; - @%p3 bra BB53_3; + @%p3 bra BB55_3; -BB53_4: +BB55_4: mov.u32 %r10, %tid.x; shl.b32 %r11, %r10, 2; mov.u32 %r12, my_sdata; @@ -7082,114 +8815,114 @@ BB53_4: bar.sync 0; mov.u32 %r14, %ntid.x; setp.lt.u32 %p4, %r14, 1024; - @%p4 bra BB53_8; + @%p4 bra BB55_8; setp.gt.u32 %p5, %r10, 511; - @%p5 bra BB53_7; + @%p5 bra BB55_7; ld.shared.f32 %f29, [%r13+2048]; min.f32 %f6, %f6, %f29; st.shared.f32 [%r13], %f6; -BB53_7: +BB55_7: bar.sync 0; -BB53_8: +BB55_8: setp.lt.u32 %p6, %r14, 512; - @%p6 bra BB53_12; + @%p6 bra BB55_12; setp.gt.u32 %p7, %r10, 255; - @%p7 bra BB53_11; + @%p7 bra BB55_11; ld.shared.f32 %f30, [%r13+1024]; min.f32 %f6, %f6, %f30; st.shared.f32 [%r13], %f6; -BB53_11: +BB55_11: bar.sync 0; -BB53_12: +BB55_12: setp.lt.u32 %p8, %r14, 256; - @%p8 bra BB53_16; + @%p8 bra BB55_16; setp.gt.u32 %p9, %r10, 127; - @%p9 bra BB53_15; + @%p9 bra BB55_15; ld.shared.f32 %f31, [%r13+512]; min.f32 %f6, %f6, %f31; st.shared.f32 [%r13], %f6; -BB53_15: +BB55_15: bar.sync 0; -BB53_16: +BB55_16: setp.lt.u32 %p10, %r14, 128; - @%p10 bra BB53_20; + @%p10 bra BB55_20; setp.gt.u32 %p11, %r10, 63; - @%p11 bra BB53_19; + @%p11 bra BB55_19; ld.shared.f32 %f32, [%r13+256
<TRUNCATED>