http://git-wip-us.apache.org/repos/asf/systemml/blob/61139e40/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx 
b/src/main/cpp/kernels/SystemML.ptx
index 1ab32f5..ac04967 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -4595,6 +4595,1739 @@ BB31_126:
        ret;
 }
 
+       // .globl       sparse_dense_matrix_scalar_op_d
+.visible .entry sparse_dense_matrix_scalar_op_d(  // f64 kernel: each in-range thread loads one value, combines it with the scalar param_3 using the op selected by param_7, and stores one f64 result
+       .param .u64 sparse_dense_matrix_scalar_op_d_param_0,  // base address of a u32 array read at [thread index]; the loaded value is multiplied by param_6 (presumably row indices - confirm against the caller)
+       .param .u64 sparse_dense_matrix_scalar_op_d_param_1,  // base address of a u32 array read at [thread index]; the loaded value is added to the product above (presumably column indices - confirm)
+       .param .u64 sparse_dense_matrix_scalar_op_d_param_2,  // base address of the f64 input values, read at [thread index]
+       .param .f64 sparse_dense_matrix_scalar_op_d_param_3,  // f64 scalar operand
+       .param .u64 sparse_dense_matrix_scalar_op_d_param_4,  // base address of the f64 output, written at [param_0[i] * param_6 + param_1[i]]
+       .param .u32 sparse_dense_matrix_scalar_op_d_param_5,  // exclusive upper bound on the thread index; threads at or beyond it store nothing
+       .param .u32 sparse_dense_matrix_scalar_op_d_param_6,  // multiplier applied to the param_0 entry (presumably the output row stride - confirm)
+       .param .u32 sparse_dense_matrix_scalar_op_d_param_7,  // op selector; 0..18 are handled, any other value stores the default 0d7FEFFFFFFFFFFFFF
+       .param .u32 sparse_dense_matrix_scalar_op_d_param_8  // operand-order flag: 0 -> (value OP scalar), nonzero -> (scalar OP value)
+)
+{
+       .reg .pred      %p<133>;
+       .reg .b32       %r<92>;
+       .reg .f64       %fd<99>;
+       .reg .b64       %rd<28>;
+
+
+       ld.param.u64    %rd4, [sparse_dense_matrix_scalar_op_d_param_0];
+       ld.param.u64    %rd5, [sparse_dense_matrix_scalar_op_d_param_1];
+       ld.param.u64    %rd6, [sparse_dense_matrix_scalar_op_d_param_2];
+       ld.param.f64    %fd68, [sparse_dense_matrix_scalar_op_d_param_3];  // %fd68 = scalar operand for the whole kernel
+       ld.param.u64    %rd7, [sparse_dense_matrix_scalar_op_d_param_4];
+       ld.param.u32    %r9, [sparse_dense_matrix_scalar_op_d_param_5];
+       ld.param.u32    %r6, [sparse_dense_matrix_scalar_op_d_param_6];
+       ld.param.u32    %r7, [sparse_dense_matrix_scalar_op_d_param_7];  // %r7 = op selector tested by every dispatch branch below
+       ld.param.u32    %r8, [sparse_dense_matrix_scalar_op_d_param_8];
+       mov.u32         %r10, %ntid.x;
+       mov.u32         %r11, %ctaid.x;
+       mov.u32         %r12, %tid.x;
+       mad.lo.s32      %r1, %r10, %r11, %r12;  // %r1 = ntid.x * ctaid.x + tid.x (this thread's index)
+       setp.ge.s32     %p3, %r1, %r9;
+       @%p3 bra        BB32_142;  // index >= param_5: skip straight to the final barrier
+
+       cvta.to.global.u64      %rd8, %rd7;
+       cvta.to.global.u64      %rd9, %rd6;
+       mul.wide.s32    %rd10, %r1, 8;
+       add.s64         %rd11, %rd9, %rd10;
+       ld.global.f64   %fd1, [%rd11];  // %fd1 = input value for this thread
+       cvta.to.global.u64      %rd12, %rd4;
+       mul.wide.s32    %rd13, %r1, 4;
+       add.s64         %rd14, %rd12, %rd13;
+       ld.global.u32   %r13, [%rd14];  // %r13 = param_0[index]
+       cvta.to.global.u64      %rd15, %rd5;
+       add.s64         %rd16, %rd15, %rd13;
+       ld.global.u32   %r14, [%rd16];  // %r14 = param_1[index]
+       mad.lo.s32      %r15, %r13, %r6, %r14;  // %r15 = %r13 * param_6 + %r14 (32-bit output element index)
+       mul.wide.s32    %rd17, %r15, 8;
+       add.s64         %rd1, %rd8, %rd17;  // %rd1 = address of the output element, used by both stores below
+       setp.eq.s32     %p4, %r8, 0;
+       @%p4 bra        BB32_72;  // param_8 == 0: use the (value OP scalar) dispatch
+
+       mov.f64         %fd94, 0d7FEFFFFFFFFFFFFF;  // scalar-first path: default result (largest finite f64) kept for unhandled op codes
+       setp.gt.s32     %p5, %r7, 8;
+       @%p5 bra        BB32_19;  // ops above 8
+
+       setp.gt.s32     %p19, %r7, 3;
+       @%p19 bra       BB32_11;  // ops 4..8
+
+       setp.gt.s32     %p26, %r7, 1;
+       @%p26 bra       BB32_8;  // ops 2..3
+
+       setp.eq.s32     %p29, %r7, 0;
+       @%p29 bra       BB32_70;
+       bra.uni         BB32_6;
+
+BB32_70:  // op 0: value + scalar
+       add.f64         %fd94, %fd1, %fd68;
+       bra.uni         BB32_71;
+
+BB32_72:  // value-first path (param_8 == 0); mirrors the dispatch above with result register %fd98
+       mov.f64         %fd98, 0d7FEFFFFFFFFFFFFF;  // default result (largest finite f64) kept for unhandled op codes
+       setp.gt.s32     %p69, %r7, 8;
+       @%p69 bra       BB32_89;  // ops above 8
+
+       setp.gt.s32     %p83, %r7, 3;
+       @%p83 bra       BB32_81;  // ops 4..8
+
+       setp.gt.s32     %p90, %r7, 1;
+       @%p90 bra       BB32_78;  // ops 2..3
+
+       setp.eq.s32     %p93, %r7, 0;
+       @%p93 bra       BB32_140;
+       bra.uni         BB32_76;
+
+BB32_140:  // op 0: value + scalar
+       add.f64         %fd98, %fd1, %fd68;
+       bra.uni         BB32_141;
+
+BB32_19:  // scalar-first, ops above 8
+       setp.gt.s32     %p6, %r7, 13;
+       @%p6 bra        BB32_28;
+
+       setp.gt.s32     %p13, %r7, 10;
+       @%p13 bra       BB32_24;
+
+       setp.eq.s32     %p17, %r7, 9;
+       @%p17 bra       BB32_48;
+       bra.uni         BB32_22;
+
+BB32_48:  // op 9: 1.0 if value == scalar, else 0.0
+       setp.eq.f64     %p44, %fd1, %fd68;
+       selp.f64        %fd94, 0d3FF0000000000000, 0d0000000000000000, %p44;
+       bra.uni         BB32_71;
+
+BB32_89:  // value-first, ops above 8
+       setp.gt.s32     %p70, %r7, 13;
+       @%p70 bra       BB32_98;
+
+       setp.gt.s32     %p77, %r7, 10;
+       @%p77 bra       BB32_94;
+
+       setp.eq.s32     %p81, %r7, 9;
+       @%p81 bra       BB32_118;
+       bra.uni         BB32_92;
+
+BB32_118:  // op 9: 1.0 if value == scalar, else 0.0
+       setp.eq.f64     %p108, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p108;
+       bra.uni         BB32_141;
+
+BB32_11:  // scalar-first, ops 4..8
+       setp.gt.s32     %p20, %r7, 5;
+       @%p20 bra       BB32_15;
+
+       setp.eq.s32     %p24, %r7, 4;
+       @%p24 bra       BB32_51;
+       bra.uni         BB32_13;
+
+BB32_51:  // op 4: power with base = scalar (%fd68) and exponent = value (%fd1)
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r2}, %fd68;  // %r2 = high 32 bits of the base
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r3}, %fd1;  // %r3 = high 32 bits of the exponent
+       }
+       bfe.u32         %r28, %r3, 20, 11;  // biased exponent field of the exponent operand
+       add.s32         %r29, %r28, -1012;
+       mov.b64          %rd22, %fd1;
+       shl.b64         %rd2, %rd22, %r29;
+       setp.eq.s64     %p49, %rd2, -9223372036854775808;  // %p49: shifted bits equal 0x8000000000000000 (looks like an odd-integer-exponent test - confirm)
+       abs.f64         %fd18, %fd68;  // pow is evaluated on |base|; the sign is handled afterwards
+       // Callseq Start 3
+       {
+       .reg .b32 temp_param_reg;
+       // <end>}
+       .param .b64 param0;
+       st.param.f64    [param0+0], %fd18;
+       .param .b64 param1;
+       st.param.f64    [param1+0], %fd1;
+       .param .b64 retval0;
+       call.uni (retval0), 
+       __internal_accurate_pow, 
+       (
+       param0, 
+       param1
+       );
+       ld.param.f64    %fd24, [retval0+0];  // %fd24 = __internal_accurate_pow(|base|, exponent)
+       
+       //{
+       }// Callseq End 3
+       setp.lt.s32     %p50, %r2, 0;  // sign bit of the base is set
+       and.pred        %p1, %p50, %p49;
+       @!%p1 bra       BB32_53;
+       bra.uni         BB32_52;
+
+BB32_52:  // %p1 set: flip the sign bit of the pow result
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r30}, %fd24;
+       }
+       xor.b32         %r31, %r30, -2147483648;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r32, %temp}, %fd24;
+       }
+       mov.b64         %fd24, {%r32, %r31};
+
+BB32_53:
+       setp.eq.f64     %p51, %fd68, 0d0000000000000000;
+       @%p51 bra       BB32_56;  // base compares equal to zero
+       bra.uni         BB32_54;
+
+BB32_56:  // zero base: build the result from bits (high word 0x7FF00000 = infinity when the exponent is negative, else zero; base sign bits kept when %p49)
+       selp.b32        %r33, %r2, 0, %p49;
+       or.b32          %r34, %r33, 2146435072;
+       setp.lt.s32     %p55, %r3, 0;
+       selp.b32        %r35, %r34, %r33, %p55;
+       mov.u32         %r36, 0;
+       mov.b64         %fd24, {%r36, %r35};
+       bra.uni         BB32_57;
+
+BB32_28:  // scalar-first, ops above 13
+       setp.gt.s32     %p7, %r7, 15;
+       @%p7 bra        BB32_32;
+
+       setp.eq.s32     %p11, %r7, 14;
+       @%p11 bra       BB32_45;
+       bra.uni         BB32_30;
+
+BB32_45:  // op 14: both operands rounded to nearest integer and truncated to 32 bits; 0.0 if their bitwise OR is zero, else 1.0
+       cvt.rni.s64.f64 %rd18, %fd68;
+       cvt.u32.u64     %r22, %rd18;
+       cvt.rni.s64.f64 %rd19, %fd1;
+       cvt.u32.u64     %r23, %rd19;
+       or.b32          %r24, %r23, %r22;
+       setp.eq.s32     %p41, %r24, 0;
+       selp.f64        %fd94, 0d0000000000000000, 0d3FF0000000000000, %p41;
+       bra.uni         BB32_71;
+
+BB32_81:  // value-first, ops 4..8
+       setp.gt.s32     %p84, %r7, 5;
+       @%p84 bra       BB32_85;
+
+       setp.eq.s32     %p88, %r7, 4;
+       @%p88 bra       BB32_121;
+       bra.uni         BB32_83;
+
+BB32_121:  // op 4: power with base = value (%fd1) and exponent = scalar (%fd68)
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r4}, %fd1;  // %r4 = high 32 bits of the base
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r5}, %fd68;  // %r5 = high 32 bits of the exponent
+       }
+       bfe.u32         %r66, %r5, 20, 11;  // biased exponent field of the exponent operand
+       add.s32         %r67, %r66, -1012;
+       mov.b64          %rd27, %fd68;
+       shl.b64         %rd3, %rd27, %r67;
+       setp.eq.s64     %p113, %rd3, -9223372036854775808;  // %p113: shifted bits equal 0x8000000000000000 (looks like an odd-integer-exponent test - confirm)
+       abs.f64         %fd51, %fd1;  // pow is evaluated on |base|; the sign is handled afterwards
+       // Callseq Start 4
+       {
+       .reg .b32 temp_param_reg;
+       // <end>}
+       .param .b64 param0;
+       st.param.f64    [param0+0], %fd51;
+       .param .b64 param1;
+       st.param.f64    [param1+0], %fd68;
+       .param .b64 retval0;
+       call.uni (retval0), 
+       __internal_accurate_pow, 
+       (
+       param0, 
+       param1
+       );
+       ld.param.f64    %fd57, [retval0+0];  // %fd57 = __internal_accurate_pow(|base|, exponent)
+       
+       //{
+       }// Callseq End 4
+       setp.lt.s32     %p114, %r4, 0;  // sign bit of the base is set
+       and.pred        %p2, %p114, %p113;
+       @!%p2 bra       BB32_123;
+       bra.uni         BB32_122;
+
+BB32_122:  // %p2 set: flip the sign bit of the pow result
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r68}, %fd57;
+       }
+       xor.b32         %r69, %r68, -2147483648;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r70, %temp}, %fd57;
+       }
+       mov.b64         %fd57, {%r70, %r69};
+
+BB32_123:
+       setp.eq.f64     %p115, %fd1, 0d0000000000000000;
+       @%p115 bra      BB32_126;  // base compares equal to zero
+       bra.uni         BB32_124;
+
+BB32_126:  // zero base: build the result from bits (high word 0x7FF00000 = infinity when the exponent is negative, else zero; base sign bits kept when %p113)
+       selp.b32        %r71, %r4, 0, %p113;
+       or.b32          %r72, %r71, 2146435072;
+       setp.lt.s32     %p119, %r5, 0;
+       selp.b32        %r73, %r72, %r71, %p119;
+       mov.u32         %r74, 0;
+       mov.b64         %fd57, {%r74, %r73};
+       bra.uni         BB32_127;
+
+BB32_98:  // value-first, ops above 13
+       setp.gt.s32     %p71, %r7, 15;
+       @%p71 bra       BB32_102;
+
+       setp.eq.s32     %p75, %r7, 14;
+       @%p75 bra       BB32_115;
+       bra.uni         BB32_100;
+
+BB32_115:  // op 14: both operands rounded to nearest integer and truncated to 32 bits; 0.0 if their bitwise OR is zero, else 1.0
+       cvt.rni.s64.f64 %rd23, %fd1;
+       cvt.u32.u64     %r60, %rd23;
+       cvt.rni.s64.f64 %rd24, %fd68;
+       cvt.u32.u64     %r61, %rd24;
+       or.b32          %r62, %r61, %r60;
+       setp.eq.s32     %p105, %r62, 0;
+       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p105;
+       bra.uni         BB32_141;
+
+BB32_8:  // scalar-first, ops 2..3
+       setp.eq.s32     %p27, %r7, 2;
+       @%p27 bra       BB32_69;
+       bra.uni         BB32_9;
+
+BB32_69:  // op 2: value * scalar
+       mul.f64         %fd94, %fd1, %fd68;
+       bra.uni         BB32_71;
+
+BB32_24:  // scalar-first, ops 11..13
+       setp.eq.s32     %p14, %r7, 11;
+       @%p14 bra       BB32_47;
+
+       setp.eq.s32     %p15, %r7, 12;
+       @%p15 bra       BB32_46;
+       bra.uni         BB32_26;
+
+BB32_46:  // op 12: max(scalar, value)
+       max.f64         %fd94, %fd68, %fd1;
+       bra.uni         BB32_71;
+
+BB32_15:  // scalar-first, ops 6..8
+       setp.eq.s32     %p21, %r7, 6;
+       @%p21 bra       BB32_50;
+
+       setp.eq.s32     %p22, %r7, 7;
+       @%p22 bra       BB32_49;
+       bra.uni         BB32_17;
+
+BB32_49:  // op 7: 1.0 if scalar > value (tested as value < scalar), else 0.0
+       setp.lt.f64     %p46, %fd1, %fd68;
+       selp.f64        %fd94, 0d3FF0000000000000, 0d0000000000000000, %p46;
+       bra.uni         BB32_71;
+
+BB32_32:  // scalar-first, ops above 15
+       setp.eq.s32     %p8, %r7, 16;
+       @%p8 bra        BB32_44;
+
+       setp.eq.s32     %p9, %r7, 17;
+       @%p9 bra        BB32_39;
+       bra.uni         BB32_34;
+
+BB32_39:  // op 17: scalar - value * floor(scalar / value), with special cases for a zero divisor and a NaN/infinite quotient
+       setp.eq.f64     %p34, %fd1, 0d0000000000000000;
+       setp.eq.f64     %p35, %fd1, 0d8000000000000000;
+       or.pred         %p36, %p34, %p35;
+       mov.f64         %fd94, 0d7FF8000000000000;  // NaN result when the divisor (value) is zero
+       @%p36 bra       BB32_71;
+
+       div.rn.f64      %fd94, %fd68, %fd1;  // quotient = scalar / value
+       abs.f64         %fd72, %fd94;
+       setp.gtu.f64    %p37, %fd72, 0d7FF0000000000000;  // gtu is true for unordered: NaN quotient is stored as-is
+       @%p37 bra       BB32_71;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r19}, %fd94;
+       }
+       and.b32         %r20, %r19, 2147483647;
+       setp.ne.s32     %p38, %r20, 2146435072;  // high word magnitude != 0x7FF00000: quotient is not infinite
+       @%p38 bra       BB32_43;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r21, %temp}, %fd94;
+       }
+       setp.eq.s32     %p39, %r21, 0;  // low word also zero: quotient is +/-infinity and is stored as-is
+       @%p39 bra       BB32_71;
+
+BB32_43:
+       cvt.rmi.f64.f64 %fd73, %fd94;  // floor(quotient) (round toward minus infinity)
+       mul.f64         %fd74, %fd1, %fd73;
+       sub.f64         %fd94, %fd68, %fd74;
+       bra.uni         BB32_71;
+
+BB32_78:  // value-first, ops 2..3
+       setp.eq.s32     %p91, %r7, 2;
+       @%p91 bra       BB32_139;
+       bra.uni         BB32_79;
+
+BB32_139:  // op 2: value * scalar
+       mul.f64         %fd98, %fd1, %fd68;
+       bra.uni         BB32_141;
+
+BB32_94:  // value-first, ops 11..13
+       setp.eq.s32     %p78, %r7, 11;
+       @%p78 bra       BB32_117;
+
+       setp.eq.s32     %p79, %r7, 12;
+       @%p79 bra       BB32_116;
+       bra.uni         BB32_96;
+
+BB32_116:  // op 12: max(value, scalar)
+       max.f64         %fd98, %fd1, %fd68;
+       bra.uni         BB32_141;
+
+BB32_85:  // value-first, ops 6..8
+       setp.eq.s32     %p85, %r7, 6;
+       @%p85 bra       BB32_120;
+
+       setp.eq.s32     %p86, %r7, 7;
+       @%p86 bra       BB32_119;
+       bra.uni         BB32_87;
+
+BB32_119:  // op 7: 1.0 if value > scalar, else 0.0
+       setp.gt.f64     %p110, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p110;
+       bra.uni         BB32_141;
+
+BB32_102:  // value-first, ops above 15
+       setp.eq.s32     %p72, %r7, 16;
+       @%p72 bra       BB32_114;
+
+       setp.eq.s32     %p73, %r7, 17;
+       @%p73 bra       BB32_109;
+       bra.uni         BB32_104;
+
+BB32_109:  // op 17: value - floor(value / scalar) * scalar, with special cases for a zero divisor and a NaN/infinite quotient
+       setp.eq.f64     %p98, %fd68, 0d0000000000000000;
+       setp.eq.f64     %p99, %fd68, 0d8000000000000000;
+       or.pred         %p100, %p98, %p99;
+       mov.f64         %fd98, 0d7FF8000000000000;  // NaN result when the divisor (scalar) is zero
+       @%p100 bra      BB32_141;
+
+       div.rn.f64      %fd98, %fd1, %fd68;  // quotient = value / scalar
+       abs.f64         %fd83, %fd98;
+       setp.gtu.f64    %p101, %fd83, 0d7FF0000000000000;  // gtu is true for unordered: NaN quotient is stored as-is
+       @%p101 bra      BB32_141;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r57}, %fd98;
+       }
+       and.b32         %r58, %r57, 2147483647;
+       setp.ne.s32     %p102, %r58, 2146435072;  // high word magnitude != 0x7FF00000: quotient is not infinite
+       @%p102 bra      BB32_113;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r59, %temp}, %fd98;
+       }
+       setp.eq.s32     %p103, %r59, 0;  // low word also zero: quotient is +/-infinity and is stored as-is
+       @%p103 bra      BB32_141;
+
+BB32_113:
+       cvt.rmi.f64.f64 %fd84, %fd98;  // floor(quotient) (round toward minus infinity)
+       mul.f64         %fd85, %fd84, %fd68;
+       sub.f64         %fd98, %fd1, %fd85;
+       bra.uni         BB32_141;
+
+BB32_6:  // scalar-first, op codes <= 1 other than 0; anything but 1 keeps the default
+       setp.eq.s32     %p30, %r7, 1;
+       @%p30 bra       BB32_7;
+       bra.uni         BB32_71;
+
+BB32_7:  // op 1: scalar - value
+       sub.f64         %fd94, %fd68, %fd1;
+       bra.uni         BB32_71;
+
+BB32_22:
+       setp.eq.s32     %p18, %r7, 10;
+       @%p18 bra       BB32_23;
+       bra.uni         BB32_71;
+
+BB32_23:  // op 10: 1.0 if value != scalar or either is NaN (neu), else 0.0
+       setp.neu.f64    %p43, %fd1, %fd68;
+       selp.f64        %fd94, 0d3FF0000000000000, 0d0000000000000000, %p43;
+       bra.uni         BB32_71;
+
+BB32_13:
+       setp.eq.s32     %p25, %r7, 5;
+       @%p25 bra       BB32_14;
+       bra.uni         BB32_71;
+
+BB32_14:  // op 5: 1.0 if scalar < value (tested as value > scalar), else 0.0
+       setp.gt.f64     %p48, %fd1, %fd68;
+       selp.f64        %fd94, 0d3FF0000000000000, 0d0000000000000000, %p48;
+       bra.uni         BB32_71;
+
+BB32_30:
+       setp.eq.s32     %p12, %r7, 15;
+       @%p12 bra       BB32_31;
+       bra.uni         BB32_71;
+
+BB32_31:  // op 15: 1.0 - value * scalar
+       mul.f64         %fd76, %fd1, %fd68;
+       mov.f64         %fd77, 0d3FF0000000000000;
+       sub.f64         %fd94, %fd77, %fd76;
+       bra.uni         BB32_71;
+
+BB32_9:
+       setp.eq.s32     %p28, %r7, 3;
+       @%p28 bra       BB32_10;
+       bra.uni         BB32_71;
+
+BB32_10:  // op 3: scalar / value
+       div.rn.f64      %fd94, %fd68, %fd1;
+       bra.uni         BB32_71;
+
+BB32_47:  // op 11: min(scalar, value)
+       min.f64         %fd94, %fd68, %fd1;
+       bra.uni         BB32_71;
+
+BB32_26:
+       setp.eq.s32     %p16, %r7, 13;
+       @%p16 bra       BB32_27;
+       bra.uni         BB32_71;
+
+BB32_27:  // op 13: both operands rounded to nearest integer and truncated to 32 bits; 0.0 if their bitwise AND is zero, else 1.0
+       cvt.rni.s64.f64 %rd20, %fd68;
+       cvt.u32.u64     %r25, %rd20;
+       cvt.rni.s64.f64 %rd21, %fd1;
+       cvt.u32.u64     %r26, %rd21;
+       and.b32         %r27, %r26, %r25;
+       setp.eq.s32     %p42, %r27, 0;
+       selp.f64        %fd94, 0d0000000000000000, 0d3FF0000000000000, %p42;
+       bra.uni         BB32_71;
+
+BB32_50:  // op 6: 1.0 if scalar <= value; 0.0 if value < scalar or either is NaN (ltu)
+       setp.ltu.f64    %p47, %fd1, %fd68;
+       selp.f64        %fd94, 0d0000000000000000, 0d3FF0000000000000, %p47;
+       bra.uni         BB32_71;
+
+BB32_17:
+       setp.eq.s32     %p23, %r7, 8;
+       @%p23 bra       BB32_18;
+       bra.uni         BB32_71;
+
+BB32_18:  // op 8: 1.0 if scalar >= value; 0.0 if value > scalar or either is NaN (gtu)
+       setp.gtu.f64    %p45, %fd1, %fd68;
+       selp.f64        %fd94, 0d0000000000000000, 0d3FF0000000000000, %p45;
+       bra.uni         BB32_71;
+
+BB32_44:  // op 16: scalar - value unless the scalar compares equal to zero, in which case 0.0
+       setp.neu.f64    %p40, %fd68, 0d0000000000000000;
+       sub.f64         %fd75, %fd68, %fd1;
+       selp.f64        %fd94, %fd75, 0d0000000000000000, %p40;
+       bra.uni         BB32_71;
+
+BB32_34:  // op 18 (any other code keeps the default): floor(scalar / value); a NaN or infinite quotient is stored unchanged
+       setp.ne.s32     %p10, %r7, 18;
+       @%p10 bra       BB32_71;
+
+       div.rn.f64      %fd94, %fd68, %fd1;
+       abs.f64         %fd70, %fd94;
+       setp.gtu.f64    %p31, %fd70, 0d7FF0000000000000;  // NaN quotient
+       @%p31 bra       BB32_71;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r16}, %fd94;
+       }
+       and.b32         %r17, %r16, 2147483647;
+       setp.ne.s32     %p32, %r17, 2146435072;  // high word magnitude != 0x7FF00000: not infinite
+       @%p32 bra       BB32_38;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r18, %temp}, %fd94;
+       }
+       setp.eq.s32     %p33, %r18, 0;  // low word zero as well: +/-infinity
+       @%p33 bra       BB32_71;
+
+BB32_38:
+       cvt.rmi.f64.f64 %fd94, %fd94;  // floor (round toward minus infinity)
+       bra.uni         BB32_71;
+
+BB32_76:  // value-first, op codes <= 1 other than 0; anything but 1 keeps the default
+       setp.eq.s32     %p94, %r7, 1;
+       @%p94 bra       BB32_77;
+       bra.uni         BB32_141;
+
+BB32_77:  // op 1: value - scalar
+       sub.f64         %fd98, %fd1, %fd68;
+       bra.uni         BB32_141;
+
+BB32_92:
+       setp.eq.s32     %p82, %r7, 10;
+       @%p82 bra       BB32_93;
+       bra.uni         BB32_141;
+
+BB32_93:  // op 10: 1.0 if value != scalar or either is NaN (neu), else 0.0
+       setp.neu.f64    %p107, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p107;
+       bra.uni         BB32_141;
+
+BB32_83:
+       setp.eq.s32     %p89, %r7, 5;
+       @%p89 bra       BB32_84;
+       bra.uni         BB32_141;
+
+BB32_84:  // op 5: 1.0 if value < scalar, else 0.0
+       setp.lt.f64     %p112, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p112;
+       bra.uni         BB32_141;
+
+BB32_100:
+       setp.eq.s32     %p76, %r7, 15;
+       @%p76 bra       BB32_101;
+       bra.uni         BB32_141;
+
+BB32_101:  // op 15: 1.0 - value * scalar
+       mul.f64         %fd87, %fd1, %fd68;
+       mov.f64         %fd88, 0d3FF0000000000000;
+       sub.f64         %fd98, %fd88, %fd87;
+       bra.uni         BB32_141;
+
+BB32_79:
+       setp.eq.s32     %p92, %r7, 3;
+       @%p92 bra       BB32_80;
+       bra.uni         BB32_141;
+
+BB32_80:  // op 3: value / scalar
+       div.rn.f64      %fd98, %fd1, %fd68;
+       bra.uni         BB32_141;
+
+BB32_117:  // op 11: min(value, scalar)
+       min.f64         %fd98, %fd1, %fd68;
+       bra.uni         BB32_141;
+
+BB32_96:
+       setp.eq.s32     %p80, %r7, 13;
+       @%p80 bra       BB32_97;
+       bra.uni         BB32_141;
+
+BB32_97:  // op 13: both operands rounded to nearest integer and truncated to 32 bits; 0.0 if their bitwise AND is zero, else 1.0
+       cvt.rni.s64.f64 %rd25, %fd1;
+       cvt.u32.u64     %r63, %rd25;
+       cvt.rni.s64.f64 %rd26, %fd68;
+       cvt.u32.u64     %r64, %rd26;
+       and.b32         %r65, %r64, %r63;
+       setp.eq.s32     %p106, %r65, 0;
+       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p106;
+       bra.uni         BB32_141;
+
+BB32_120:  // op 6: 1.0 if value <= scalar; 0.0 if value > scalar or either is NaN (gtu)
+       setp.gtu.f64    %p111, %fd1, %fd68;
+       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p111;
+       bra.uni         BB32_141;
+
+BB32_87:
+       setp.eq.s32     %p87, %r7, 8;
+       @%p87 bra       BB32_88;
+       bra.uni         BB32_141;
+
+BB32_88:  // op 8: 1.0 if value >= scalar; 0.0 if value < scalar or either is NaN (ltu)
+       setp.ltu.f64    %p109, %fd1, %fd68;
+       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p109;
+       bra.uni         BB32_141;
+
+BB32_114:  // op 16: value - scalar unless the value compares equal to zero, in which case 0.0
+       setp.neu.f64    %p104, %fd1, 0d0000000000000000;
+       sub.f64         %fd86, %fd1, %fd68;
+       selp.f64        %fd98, %fd86, 0d0000000000000000, %p104;
+       bra.uni         BB32_141;
+
+BB32_104:  // op 18 (any other code keeps the default): floor(value / scalar); a NaN or infinite quotient is stored unchanged
+       setp.ne.s32     %p74, %r7, 18;
+       @%p74 bra       BB32_141;
+
+       div.rn.f64      %fd98, %fd1, %fd68;
+       abs.f64         %fd81, %fd98;
+       setp.gtu.f64    %p95, %fd81, 0d7FF0000000000000;  // NaN quotient
+       @%p95 bra       BB32_141;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r54}, %fd98;
+       }
+       and.b32         %r55, %r54, 2147483647;
+       setp.ne.s32     %p96, %r55, 2146435072;  // high word magnitude != 0x7FF00000: not infinite
+       @%p96 bra       BB32_108;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r56, %temp}, %fd98;
+       }
+       setp.eq.s32     %p97, %r56, 0;  // low word zero as well: +/-infinity
+       @%p97 bra       BB32_141;
+
+BB32_108:
+       cvt.rmi.f64.f64 %fd98, %fd98;  // floor (round toward minus infinity)
+       bra.uni         BB32_141;
+
+BB32_54:  // scalar-first pow, nonzero base
+       setp.gt.s32     %p52, %r2, -1;
+       @%p52 bra       BB32_57;  // base sign bit clear: keep the pow result
+
+       cvt.rzi.f64.f64 %fd78, %fd1;
+       setp.neu.f64    %p53, %fd78, %fd1;
+       selp.f64        %fd24, 0dFFF8000000000000, %fd24, %p53;  // negative base with a non-integer exponent: NaN
+
+BB32_57:
+       add.f64         %fd93, %fd1, %fd68;  // base + exponent; its exponent field is all ones only if an operand is NaN/infinite or the sum overflows
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r37}, %fd93;
+       }
+       and.b32         %r38, %r37, 2146435072;
+       setp.ne.s32     %p56, %r38, 2146435072;
+       @%p56 bra       BB32_58;  // sum is finite: use the pow result computed above
+
+       setp.gtu.f64    %p57, %fd18, 0d7FF0000000000000;
+       @%p57 bra       BB32_68;  // NaN base: %fd93 (the sum) is the result
+
+       abs.f64         %fd79, %fd1;
+       setp.gtu.f64    %p58, %fd79, 0d7FF0000000000000;
+       @%p58 bra       BB32_68;  // NaN exponent: %fd93 (the sum) is the result
+
+       and.b32         %r39, %r3, 2147483647;
+       setp.ne.s32     %p59, %r39, 2146435072;
+       @%p59 bra       BB32_63;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r40, %temp}, %fd1;
+       }
+       setp.eq.s32     %p60, %r40, 0;
+       @%p60 bra       BB32_67;  // exponent is +/-infinity
+
+BB32_63:
+       and.b32         %r41, %r2, 2147483647;
+       setp.ne.s32     %p61, %r41, 2146435072;
+       @%p61 bra       BB32_64;  // base is not infinite: use the pow result
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r42, %temp}, %fd68;
+       }
+       setp.ne.s32     %p62, %r42, 0;
+       mov.f64         %fd93, %fd24;
+       @%p62 bra       BB32_68;  // low word of the base nonzero: base is not infinite, use the pow result
+
+       shr.s32         %r43, %r3, 31;  // infinite base: high word becomes 0 for a negative exponent, 0x7FF00000 (infinity) otherwise
+       and.b32         %r44, %r43, -2146435072;
+       add.s32         %r45, %r44, 2146435072;
+       or.b32          %r46, %r45, -2147483648;
+       selp.b32        %r47, %r46, %r45, %p1;  // sign bit set when %p1 (negative base and %p49)
+       mov.u32         %r48, 0;
+       mov.b64         %fd93, {%r48, %r47};
+       bra.uni         BB32_68;
+
+BB32_58:
+       mov.f64         %fd93, %fd24;
+
+BB32_68:  // final pow overrides: exponent (value) == 0 or base (scalar) == 1 gives 1.0
+       setp.eq.f64     %p66, %fd1, 0d0000000000000000;
+       setp.eq.f64     %p67, %fd68, 0d3FF0000000000000;
+       or.pred         %p68, %p67, %p66;
+       selp.f64        %fd94, 0d3FF0000000000000, %fd93, %p68;
+
+BB32_71:  // common exit of the scalar-first path: store the result
+       st.global.f64   [%rd1], %fd94;
+       bra.uni         BB32_142;
+
+BB32_124:  // value-first pow, nonzero base
+       setp.gt.s32     %p116, %r4, -1;
+       @%p116 bra      BB32_127;  // base sign bit clear: keep the pow result
+
+       cvt.rzi.f64.f64 %fd89, %fd68;
+       setp.neu.f64    %p117, %fd89, %fd68;
+       selp.f64        %fd57, 0dFFF8000000000000, %fd57, %p117;  // negative base with a non-integer exponent: NaN
+
+BB32_127:
+       add.f64         %fd97, %fd1, %fd68;  // base + exponent; its exponent field is all ones only if an operand is NaN/infinite or the sum overflows
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r75}, %fd97;
+       }
+       and.b32         %r76, %r75, 2146435072;
+       setp.ne.s32     %p120, %r76, 2146435072;
+       @%p120 bra      BB32_128;  // sum is finite: use the pow result computed above
+
+       setp.gtu.f64    %p121, %fd51, 0d7FF0000000000000;
+       @%p121 bra      BB32_138;  // NaN base: %fd97 (the sum) is the result
+
+       abs.f64         %fd90, %fd68;
+       setp.gtu.f64    %p122, %fd90, 0d7FF0000000000000;
+       @%p122 bra      BB32_138;  // NaN exponent: %fd97 (the sum) is the result
+
+       and.b32         %r77, %r5, 2147483647;
+       setp.ne.s32     %p123, %r77, 2146435072;
+       @%p123 bra      BB32_133;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r78, %temp}, %fd68;
+       }
+       setp.eq.s32     %p124, %r78, 0;
+       @%p124 bra      BB32_137;  // exponent is +/-infinity
+
+BB32_133:
+       and.b32         %r79, %r4, 2147483647;
+       setp.ne.s32     %p125, %r79, 2146435072;
+       @%p125 bra      BB32_134;  // base is not infinite: use the pow result
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r80, %temp}, %fd1;
+       }
+       setp.ne.s32     %p126, %r80, 0;
+       mov.f64         %fd97, %fd57;
+       @%p126 bra      BB32_138;  // low word of the base nonzero: base is not infinite, use the pow result
+
+       shr.s32         %r81, %r5, 31;  // infinite base: high word becomes 0 for a negative exponent, 0x7FF00000 (infinity) otherwise
+       and.b32         %r82, %r81, -2146435072;
+       add.s32         %r83, %r82, 2146435072;
+       or.b32          %r84, %r83, -2147483648;
+       selp.b32        %r85, %r84, %r83, %p2;  // sign bit set when %p2 (negative base and %p113)
+       mov.u32         %r86, 0;
+       mov.b64         %fd97, {%r86, %r85};
+       bra.uni         BB32_138;
+
+BB32_128:
+       mov.f64         %fd97, %fd57;
+
+BB32_138:  // final pow overrides: exponent (scalar) == 0 or base (value) == 1 gives 1.0
+       setp.eq.f64     %p130, %fd68, 0d0000000000000000;
+       setp.eq.f64     %p131, %fd1, 0d3FF0000000000000;
+       or.pred         %p132, %p131, %p130;
+       selp.f64        %fd98, 0d3FF0000000000000, %fd97, %p132;
+
+BB32_141:  // common exit of the value-first path: store the result
+       st.global.f64   [%rd1], %fd98;
+
+BB32_142:  // reached by every thread, including those that skipped the work above
+       bar.sync        0;  // barrier 0 before returning; no memory is read after it in this kernel
+       ret;
+
+BB32_64:
+       mov.f64         %fd93, %fd24;
+       bra.uni         BB32_68;
+
+BB32_134:
+       mov.f64         %fd97, %fd57;
+       bra.uni         BB32_138;
+
+BB32_67:  // scalar-first pow with an infinite exponent: high word is infinity or zero depending on |base| > 1 and the exponent sign; 1.0 when base == -1
+       setp.gt.f64     %p63, %fd18, 0d3FF0000000000000;
+       selp.b32        %r49, 2146435072, 0, %p63;
+       xor.b32         %r50, %r49, 2146435072;
+       setp.lt.s32     %p64, %r3, 0;
+       selp.b32        %r51, %r50, %r49, %p64;
+       setp.eq.f64     %p65, %fd68, 0dBFF0000000000000;
+       selp.b32        %r52, 1072693248, %r51, %p65;
+       mov.u32         %r53, 0;
+       mov.b64         %fd93, {%r53, %r52};
+       bra.uni         BB32_68;
+
+BB32_137:  // value-first pow with an infinite exponent: high word is infinity or zero depending on |base| > 1 and the exponent sign; 1.0 when base == -1
+       setp.gt.f64     %p127, %fd51, 0d3FF0000000000000;
+       selp.b32        %r87, 2146435072, 0, %p127;
+       xor.b32         %r88, %r87, 2146435072;
+       setp.lt.s32     %p128, %r5, 0;
+       selp.b32        %r89, %r88, %r87, %p128;
+       setp.eq.f64     %p129, %fd1, 0dBFF0000000000000;
+       selp.b32        %r90, 1072693248, %r89, %p129;
+       mov.u32         %r91, 0;
+       mov.b64         %fd97, {%r91, %r90};
+       bra.uni         BB32_138;
+}
+
+       // .globl       sparse_dense_matrix_scalar_op_f
+.visible .entry sparse_dense_matrix_scalar_op_f(
+       .param .u64 sparse_dense_matrix_scalar_op_f_param_0,
+       .param .u64 sparse_dense_matrix_scalar_op_f_param_1,
+       .param .u64 sparse_dense_matrix_scalar_op_f_param_2,
+       .param .f64 sparse_dense_matrix_scalar_op_f_param_3,
+       .param .u64 sparse_dense_matrix_scalar_op_f_param_4,
+       .param .u32 sparse_dense_matrix_scalar_op_f_param_5,
+       .param .u32 sparse_dense_matrix_scalar_op_f_param_6,
+       .param .u32 sparse_dense_matrix_scalar_op_f_param_7,
+       .param .u32 sparse_dense_matrix_scalar_op_f_param_8
+)
+{
+       .reg .pred      %p<139>;
+       .reg .f32       %f<267>;
+       .reg .b32       %r<62>;
+       .reg .f64       %fd<2>;
+       .reg .b64       %rd<23>;
+
+
+       ld.param.u64    %rd2, [sparse_dense_matrix_scalar_op_f_param_0];
+       ld.param.u64    %rd3, [sparse_dense_matrix_scalar_op_f_param_1];
+       ld.param.u64    %rd4, [sparse_dense_matrix_scalar_op_f_param_2];
+       ld.param.f64    %fd1, [sparse_dense_matrix_scalar_op_f_param_3];
+       ld.param.u64    %rd5, [sparse_dense_matrix_scalar_op_f_param_4];
+       ld.param.u32    %r5, [sparse_dense_matrix_scalar_op_f_param_5];
+       ld.param.u32    %r2, [sparse_dense_matrix_scalar_op_f_param_6];
+       ld.param.u32    %r3, [sparse_dense_matrix_scalar_op_f_param_7];
+       ld.param.u32    %r4, [sparse_dense_matrix_scalar_op_f_param_8];
+       cvt.rn.f32.f64  %f1, %fd1;
+       mov.u32         %r6, %ntid.x;
+       mov.u32         %r7, %ctaid.x;
+       mov.u32         %r8, %tid.x;
+       mad.lo.s32      %r1, %r6, %r7, %r8;
+       setp.ge.s32     %p3, %r1, %r5;
+       @%p3 bra        BB33_126;
+
+       cvta.to.global.u64      %rd6, %rd5;
+       cvta.to.global.u64      %rd7, %rd4;
+       mul.wide.s32    %rd8, %r1, 4;
+       add.s64         %rd9, %rd7, %rd8;
+       ld.global.f32   %f2, [%rd9];
+       cvta.to.global.u64      %rd10, %rd2;
+       add.s64         %rd11, %rd10, %rd8;
+       ld.global.u32   %r9, [%rd11];
+       cvta.to.global.u64      %rd12, %rd3;
+       add.s64         %rd13, %rd12, %rd8;
+       ld.global.u32   %r10, [%rd13];
+       mad.lo.s32      %r11, %r9, %r2, %r10;
+       mul.wide.s32    %rd14, %r11, 4;
+       add.s64         %rd1, %rd6, %rd14;
+       setp.eq.s32     %p4, %r4, 0;
+       @%p4 bra        BB33_64;
+
+       mov.f32         %f262, 0f7F7FFFFF;
+       setp.gt.s32     %p5, %r3, 8;
+       @%p5 bra        BB33_19;
+
+       setp.gt.s32     %p19, %r3, 3;
+       @%p19 bra       BB33_11;
+
+       setp.gt.s32     %p26, %r3, 1;
+       @%p26 bra       BB33_8;
+
+       setp.eq.s32     %p29, %r3, 0;
+       @%p29 bra       BB33_62;
+       bra.uni         BB33_6;
+
+BB33_62:
+       add.f32         %f262, %f1, %f2;
+       bra.uni         BB33_63;
+
+BB33_64:
+       mov.f32         %f266, 0f7F7FFFFF;
+       setp.gt.s32     %p72, %r3, 8;
+       @%p72 bra       BB33_81;
+
+       setp.gt.s32     %p86, %r3, 3;
+       @%p86 bra       BB33_73;
+
+       setp.gt.s32     %p93, %r3, 1;
+       @%p93 bra       BB33_70;
+
+       setp.eq.s32     %p96, %r3, 0;
+       @%p96 bra       BB33_124;
+       bra.uni         BB33_68;
+
+BB33_124:
+       add.f32         %f266, %f1, %f2;
+       bra.uni         BB33_125;
+
+BB33_19:
+       setp.gt.s32     %p6, %r3, 13;
+       @%p6 bra        BB33_28;
+
+       setp.gt.s32     %p13, %r3, 10;
+       @%p13 bra       BB33_24;
+
+       setp.eq.s32     %p17, %r3, 9;
+       @%p17 bra       BB33_44;
+       bra.uni         BB33_22;
+
+BB33_44:
+       setp.eq.f32     %p40, %f1, %f2;
+       selp.f32        %f262, 0f3F800000, 0f00000000, %p40;
+       bra.uni         BB33_63;
+
+BB33_81:
+       setp.gt.s32     %p73, %r3, 13;
+       @%p73 bra       BB33_90;
+
+       setp.gt.s32     %p80, %r3, 10;
+       @%p80 bra       BB33_86;
+
+       setp.eq.s32     %p84, %r3, 9;
+       @%p84 bra       BB33_106;
+       bra.uni         BB33_84;
+
+BB33_106:
+       setp.eq.f32     %p107, %f2, %f1;
+       selp.f32        %f266, 0f3F800000, 0f00000000, %p107;
+       bra.uni         BB33_125;
+
+BB33_11:
+       setp.gt.s32     %p20, %r3, 5;
+       @%p20 bra       BB33_15;
+
+       setp.eq.s32     %p24, %r3, 4;
+       @%p24 bra       BB33_47;
+       bra.uni         BB33_13;
+
+BB33_47:
+       mul.f32         %f88, %f2, 0f3F000000;
+       cvt.rzi.f32.f32 %f89, %f88;
+       fma.rn.f32      %f90, %f89, 0fC0000000, %f2;
+       abs.f32         %f19, %f90;
+       abs.f32         %f20, %f1;
+       setp.lt.f32     %p45, %f20, 0f00800000;
+       mul.f32         %f91, %f20, 0f4B800000;
+       selp.f32        %f92, 0fC3170000, 0fC2FE0000, %p45;
+       selp.f32        %f93, %f91, %f20, %p45;
+       mov.b32          %r18, %f93;
+       and.b32         %r19, %r18, 8388607;
+       or.b32          %r20, %r19, 1065353216;
+       mov.b32          %f94, %r20;
+       shr.u32         %r21, %r18, 23;
+       cvt.rn.f32.u32  %f95, %r21;
+       add.f32         %f96, %f92, %f95;
+       setp.gt.f32     %p46, %f94, 0f3FB504F3;
+       mul.f32         %f97, %f94, 0f3F000000;
+       add.f32         %f98, %f96, 0f3F800000;
+       selp.f32        %f99, %f97, %f94, %p46;
+       selp.f32        %f100, %f98, %f96, %p46;
+       add.f32         %f101, %f99, 0fBF800000;
+       add.f32         %f87, %f99, 0f3F800000;
+       // inline asm
+       rcp.approx.ftz.f32 %f86,%f87;
+       // inline asm
+       add.f32         %f102, %f101, %f101;
+       mul.f32         %f103, %f86, %f102;
+       mul.f32         %f104, %f103, %f103;
+       mov.f32         %f105, 0f3C4CAF63;
+       mov.f32         %f106, 0f3B18F0FE;
+       fma.rn.f32      %f107, %f106, %f104, %f105;
+       mov.f32         %f108, 0f3DAAAABD;
+       fma.rn.f32      %f109, %f107, %f104, %f108;
+       mul.rn.f32      %f110, %f109, %f104;
+       mul.rn.f32      %f111, %f110, %f103;
+       sub.f32         %f112, %f101, %f103;
+       neg.f32         %f113, %f103;
+       add.f32         %f114, %f112, %f112;
+       fma.rn.f32      %f115, %f113, %f101, %f114;
+       mul.rn.f32      %f116, %f86, %f115;
+       add.f32         %f117, %f111, %f103;
+       sub.f32         %f118, %f103, %f117;
+       add.f32         %f119, %f111, %f118;
+       add.f32         %f120, %f116, %f119;
+       add.f32         %f121, %f117, %f120;
+       sub.f32         %f122, %f117, %f121;
+       add.f32         %f123, %f120, %f122;
+       mov.f32         %f124, 0f3F317200;
+       mul.rn.f32      %f125, %f100, %f124;
+       mov.f32         %f126, 0f35BFBE8E;
+       mul.rn.f32      %f127, %f100, %f126;
+       add.f32         %f128, %f125, %f121;
+       sub.f32         %f129, %f125, %f128;
+       add.f32         %f130, %f121, %f129;
+       add.f32         %f131, %f123, %f130;
+       add.f32         %f132, %f127, %f131;
+       add.f32         %f133, %f128, %f132;
+       sub.f32         %f134, %f128, %f133;
+       add.f32         %f135, %f132, %f134;
+       abs.f32         %f21, %f2;
+       setp.gt.f32     %p47, %f21, 0f77F684DF;
+       mul.f32         %f136, %f2, 0f39000000;
+       selp.f32        %f137, %f136, %f2, %p47;
+       mul.rn.f32      %f138, %f137, %f133;
+       neg.f32         %f139, %f138;
+       fma.rn.f32      %f140, %f137, %f133, %f139;
+       fma.rn.f32      %f141, %f137, %f135, %f140;
+       mov.f32         %f142, 0f00000000;
+       fma.rn.f32      %f143, %f142, %f133, %f141;
+       add.rn.f32      %f144, %f138, %f143;
+       neg.f32         %f145, %f144;
+       add.rn.f32      %f146, %f138, %f145;
+       add.rn.f32      %f147, %f146, %f143;
+       mov.b32          %r22, %f144;
+       setp.eq.s32     %p48, %r22, 1118925336;
+       add.s32         %r23, %r22, -1;
+       mov.b32          %f148, %r23;
+       add.f32         %f149, %f147, 0f37000000;
+       selp.f32        %f150, %f148, %f144, %p48;
+       selp.f32        %f22, %f149, %f147, %p48;
+       mul.f32         %f151, %f150, 0f3FB8AA3B;
+       cvt.rzi.f32.f32 %f152, %f151;
+       mov.f32         %f153, 0fBF317200;
+       fma.rn.f32      %f154, %f152, %f153, %f150;
+       mov.f32         %f155, 0fB5BFBE8E;
+       fma.rn.f32      %f156, %f152, %f155, %f154;
+       mul.f32         %f157, %f156, 0f3FB8AA3B;
+       ex2.approx.ftz.f32      %f158, %f157;
+       add.f32         %f159, %f152, 0f00000000;
+       ex2.approx.f32  %f160, %f159;
+       mul.f32         %f161, %f158, %f160;
+       setp.lt.f32     %p49, %f150, 0fC2D20000;
+       selp.f32        %f162, 0f00000000, %f161, %p49;
+       setp.gt.f32     %p50, %f150, 0f42D20000;
+       selp.f32        %f259, 0f7F800000, %f162, %p50;
+       setp.eq.f32     %p51, %f259, 0f7F800000;
+       @%p51 bra       BB33_49;
+
+       fma.rn.f32      %f259, %f259, %f22, %f259;
+
+BB33_49:
+       setp.lt.f32     %p52, %f1, 0f00000000;
+       setp.eq.f32     %p53, %f19, 0f3F800000;
+       and.pred        %p1, %p52, %p53;
+       mov.b32          %r24, %f259;
+       xor.b32         %r25, %r24, -2147483648;
+       mov.b32          %f163, %r25;
+       selp.f32        %f261, %f163, %f259, %p1;
+       setp.eq.f32     %p54, %f1, 0f00000000;
+       @%p54 bra       BB33_52;
+       bra.uni         BB33_50;
+
+BB33_52:
+       add.f32         %f165, %f1, %f1;
+       mov.b32          %r26, %f165;
+       selp.b32        %r27, %r26, 0, %p53;
+       or.b32          %r28, %r27, 2139095040;
+       setp.lt.f32     %p58, %f2, 0f00000000;
+       selp.b32        %r29, %r28, %r27, %p58;
+       mov.b32          %f261, %r29;
+       bra.uni         BB33_53;
+
+BB33_28:
+       setp.gt.s32     %p7, %r3, 15;
+       @%p7 bra        BB33_32;
+
+       setp.eq.s32     %p11, %r3, 14;
+       @%p11 bra       BB33_41;
+       bra.uni         BB33_30;
+
+BB33_41:
+       cvt.rni.s64.f32 %rd15, %f1;
+       cvt.u32.u64     %r12, %rd15;
+       cvt.rni.s64.f32 %rd16, %f2;
+       cvt.u32.u64     %r13, %rd16;
+       or.b32          %r14, %r13, %r12;
+       setp.eq.s32     %p37, %r14, 0;
+       selp.f32        %f262, 0f00000000, 0f3F800000, %p37;
+       bra.uni         BB33_63;
+
+BB33_73:
+       setp.gt.s32     %p87, %r3, 5;
+       @%p87 bra       BB33_77;
+
+       setp.eq.s32     %p91, %r3, 4;
+       @%p91 bra       BB33_109;
+       bra.uni         BB33_75;
+
+BB33_109:
+       mul.f32         %f179, %f1, 0f3F000000;
+       cvt.rzi.f32.f32 %f180, %f179;
+       fma.rn.f32      %f181, %f180, 0fC0000000, %f1;
+       abs.f32         %f56, %f181;
+       abs.f32         %f57, %f2;
+       setp.lt.f32     %p112, %f57, 0f00800000;
+       mul.f32         %f182, %f57, 0f4B800000;
+       selp.f32        %f183, 0fC3170000, 0fC2FE0000, %p112;
+       selp.f32        %f184, %f182, %f57, %p112;
+       mov.b32          %r43, %f184;
+       and.b32         %r44, %r43, 8388607;
+       or.b32          %r45, %r44, 1065353216;
+       mov.b32          %f185, %r45;
+       shr.u32         %r46, %r43, 23;
+       cvt.rn.f32.u32  %f186, %r46;
+       add.f32         %f187, %f183, %f186;
+       setp.gt.f32     %p113, %f185, 0f3FB504F3;
+       mul.f32         %f188, %f185, 0f3F000000;
+       add.f32         %f189, %f187, 0f3F800000;
+       selp.f32        %f190, %f188, %f185, %p113;
+       selp.f32        %f191, %f189, %f187, %p113;
+       add.f32         %f192, %f190, 0fBF800000;
+       add.f32         %f178, %f190, 0f3F800000;
+       // inline asm
+       rcp.approx.ftz.f32 %f177,%f178;
+       // inline asm
+       add.f32         %f193, %f192, %f192;
+       mul.f32         %f194, %f177, %f193;
+       mul.f32         %f195, %f194, %f194;
+       mov.f32         %f196, 0f3C4CAF63;
+       mov.f32         %f197, 0f3B18F0FE;
+       fma.rn.f32      %f198, %f197, %f195, %f196;
+       mov.f32         %f199, 0f3DAAAABD;
+       fma.rn.f32      %f200, %f198, %f195, %f199;
+       mul.rn.f32      %f201, %f200, %f195;
+       mul.rn.f32      %f202, %f201, %f194;
+       sub.f32         %f203, %f192, %f194;
+       neg.f32         %f204, %f194;
+       add.f32         %f205, %f203, %f203;
+       fma.rn.f32      %f206, %f204, %f192, %f205;
+       mul.rn.f32      %f207, %f177, %f206;
+       add.f32         %f208, %f202, %f194;
+       sub.f32         %f209, %f194, %f208;
+       add.f32         %f210, %f202, %f209;
+       add.f32         %f211, %f207, %f210;
+       add.f32         %f212, %f208, %f211;
+       sub.f32         %f213, %f208, %f212;
+       add.f32         %f214, %f211, %f213;
+       mov.f32         %f215, 0f3F317200;
+       mul.rn.f32      %f216, %f191, %f215;
+       mov.f32         %f217, 0f35BFBE8E;
+       mul.rn.f32      %f218, %f191, %f217;
+       add.f32         %f219, %f216, %f212;
+       sub.f32         %f220, %f216, %f219;
+       add.f32         %f221, %f212, %f220;
+       add.f32         %f222, %f214, %f221;
+       add.f32         %f223, %f218, %f222;
+       add.f32         %f224, %f219, %f223;
+       sub.f32         %f225, %f219, %f224;
+       add.f32         %f226, %f223, %f225;
+       abs.f32         %f58, %f1;
+       setp.gt.f32     %p114, %f58, 0f77F684DF;
+       mul.f32         %f227, %f1, 0f39000000;
+       selp.f32        %f228, %f227, %f1, %p114;
+       mul.rn.f32      %f229, %f228, %f224;
+       neg.f32         %f230, %f229;
+       fma.rn.f32      %f231, %f228, %f224, %f230;
+       fma.rn.f32      %f232, %f228, %f226, %f231;
+       mov.f32         %f233, 0f00000000;
+       fma.rn.f32      %f234, %f233, %f224, %f232;
+       add.rn.f32      %f235, %f229, %f234;
+       neg.f32         %f236, %f235;
+       add.rn.f32      %f237, %f229, %f236;
+       add.rn.f32      %f238, %f237, %f234;
+       mov.b32          %r47, %f235;
+       setp.eq.s32     %p115, %r47, 1118925336;
+       add.s32         %r48, %r47, -1;
+       mov.b32          %f239, %r48;
+       add.f32         %f240, %f238, 0f37000000;
+       selp.f32        %f241, %f239, %f235, %p115;
+       selp.f32        %f59, %f240, %f238, %p115;
+       mul.f32         %f242, %f241, 0f3FB8AA3B;
+       cvt.rzi.f32.f32 %f243, %f242;
+       mov.f32         %f244, 0fBF317200;
+       fma.rn.f32      %f245, %f243, %f244, %f241;
+       mov.f32         %f246, 0fB5BFBE8E;
+       fma.rn.f32      %f247, %f243, %f246, %f245;
+       mul.f32         %f248, %f247, 0f3FB8AA3B;
+       ex2.approx.ftz.f32      %f249, %f248;
+       add.f32         %f250, %f243, 0f00000000;
+       ex2.approx.f32  %f251, %f250;
+       mul.f32         %f252, %f249, %f251;
+       setp.lt.f32     %p116, %f241, 0fC2D20000;
+       selp.f32        %f253, 0f00000000, %f252, %p116;
+       setp.gt.f32     %p117, %f241, 0f42D20000;
+       selp.f32        %f263, 0f7F800000, %f253, %p117;
+       setp.eq.f32     %p118, %f263, 0f7F800000;
+       @%p118 bra      BB33_111;
+
+       fma.rn.f32      %f263, %f263, %f59, %f263;
+
+BB33_111:
+       setp.lt.f32     %p119, %f2, 0f00000000;
+       setp.eq.f32     %p120, %f56, 0f3F800000;
+       and.pred        %p2, %p119, %p120;
+       mov.b32          %r49, %f263;
+       xor.b32         %r50, %r49, -2147483648;
+       mov.b32          %f254, %r50;
+       selp.f32        %f265, %f254, %f263, %p2;
+       setp.eq.f32     %p121, %f2, 0f00000000;
+       @%p121 bra      BB33_114;
+       bra.uni         BB33_112;
+
+BB33_114:
+       add.f32         %f256, %f2, %f2;
+       mov.b32          %r51, %f256;
+       selp.b32        %r52, %r51, 0, %p120;
+       or.b32          %r53, %r52, 2139095040;
+       setp.lt.f32     %p125, %f1, 0f00000000;
+       selp.b32        %r54, %r53, %r52, %p125;
+       mov.b32          %f265, %r54;
+       bra.uni         BB33_115;
+
+BB33_90:
+       setp.gt.s32     %p74, %r3, 15;
+       @%p74 bra       BB33_94;
+
+       setp.eq.s32     %p78, %r3, 14;
+       @%p78 bra       BB33_103;
+       bra.uni         BB33_92;
+
+BB33_103:
+       cvt.rni.s64.f32 %rd19, %f2;
+       cvt.u32.u64     %r37, %rd19;
+       cvt.rni.s64.f32 %rd20, %f1;
+       cvt.u32.u64     %r38, %rd20;
+       or.b32          %r39, %r38, %r37;
+       setp.eq.s32     %p104, %r39, 0;
+       selp.f32        %f266, 0f00000000, 0f3F800000, %p104;
+       bra.uni         BB33_125;
+
+BB33_8:
+       setp.eq.s32     %p27, %r3, 2;
+       @%p27 bra       BB33_61;
+       bra.uni         BB33_9;
+
+BB33_61:
+       mul.f32         %f262, %f1, %f2;
+       bra.uni         BB33_63;
+
+BB33_24:
+       setp.eq.s32     %p14, %r3, 11;
+       @%p14 bra       BB33_43;
+
+       setp.eq.s32     %p15, %r3, 12;
+       @%p15 bra       BB33_42;
+       bra.uni         BB33_26;
+
+BB33_42:
+       max.f32         %f262, %f1, %f2;
+       bra.uni         BB33_63;
+
+BB33_15:
+       setp.eq.s32     %p21, %r3, 6;
+       @%p21 bra       BB33_46;
+
+       setp.eq.s32     %p22, %r3, 7;
+       @%p22 bra       BB33_45;
+       bra.uni         BB33_17;
+
+BB33_45:
+       setp.gt.f32     %p42, %f1, %f2;
+       selp.f32        %f262, 0f3F800000, 0f00000000, %p42;
+       bra.uni         BB33_63;
+
+BB33_32:
+       setp.eq.s32     %p8, %r3, 16;
+       @%p8 bra        BB33_40;
+
+       setp.eq.s32     %p9, %r3, 17;
+       @%p9 bra        BB33_37;
+       bra.uni         BB33_34;
+
+BB33_37:
+       setp.eq.f32     %p32, %f2, 0f00000000;
+       setp.eq.f32     %p33, %f2, 0f80000000;
+       or.pred         %p34, %p32, %p33;
+       mov.f32         %f262, 0f7FC00000;
+       @%p34 bra       BB33_63;
+
+       div.rn.f32      %f262, %f1, %f2;
+       abs.f32         %f80, %f262;
+       setp.geu.f32    %p35, %f80, 0f7F800000;
+       @%p35 bra       BB33_63;
+
+       cvt.rmi.f32.f32 %f81, %f262;
+       mul.f32         %f82, %f2, %f81;
+       sub.f32         %f262, %f1, %f82;
+       bra.uni         BB33_63;
+
+BB33_70:
+       setp.eq.s32     %p94, %r3, 2;
+       @%p94 bra       BB33_123;
+       bra.uni         BB33_71;
+
+BB33_123:
+       mul.f32         %f266, %f1, %f2;
+       bra.uni         BB33_125;
+
+BB33_86:
+       setp.eq.s32     %p81, %r3, 11;
+       @%p81 bra       BB33_105;
+
+       setp.eq.s32     %p82, %r3, 12;
+       @%p82 bra       BB33_104;
+       bra.uni         BB33_88;
+
+BB33_104:
+       max.f32         %f266, %f2, %f1;
+       bra.uni         BB33_125;
+
+BB33_77:
+       setp.eq.s32     %p88, %r3, 6;
+       @%p88 bra       BB33_108;
+
+       setp.eq.s32     %p89, %r3, 7;
+       @%p89 bra       BB33_107;
+       bra.uni         BB33_79;
+
+BB33_107:
+       setp.gt.f32     %p109, %f2, %f1;
+       selp.f32        %f266, 0f3F800000, 0f00000000, %p109;
+       bra.uni         BB33_125;
+
+BB33_94:
+       setp.eq.s32     %p75, %r3, 16;
+       @%p75 bra       BB33_102;
+
+       setp.eq.s32     %p76, %r3, 17;
+       @%p76 bra       BB33_99;
+       bra.uni         BB33_96;
+
+BB33_99:
+       setp.eq.f32     %p99, %f1, 0f00000000;
+       setp.eq.f32     %p100, %f1, 0f80000000;
+       or.pred         %p101, %p99, %p100;
+       mov.f32         %f266, 0f7FC00000;
+       @%p101 bra      BB33_125;
+
+       div.rn.f32      %f266, %f2, %f1;
+       abs.f32         %f171, %f266;
+       setp.geu.f32    %p102, %f171, 0f7F800000;
+       @%p102 bra      BB33_125;
+
+       cvt.rmi.f32.f32 %f172, %f266;
+       mul.f32         %f173, %f1, %f172;
+       sub.f32         %f266, %f2, %f173;
+       bra.uni         BB33_125;
+
+BB33_6:
+       setp.eq.s32     %p30, %r3, 1;
+       @%p30 bra       BB33_7;
+       bra.uni         BB33_63;
+
+BB33_7:
+       sub.f32         %f262, %f1, %f2;
+       bra.uni         BB33_63;
+
+BB33_22:
+       setp.eq.s32     %p18, %r3, 10;
+       @%p18 bra       BB33_23;
+       bra.uni         BB33_63;
+
+BB33_23:
+       setp.neu.f32    %p39, %f1, %f2;
+       selp.f32        %f262, 0f3F800000, 0f00000000, %p39;
+       bra.uni         BB33_63;
+
+BB33_13:
+       setp.eq.s32     %p25, %r3, 5;
+       @%p25 bra       BB33_14;
+       bra.uni         BB33_63;
+
+BB33_14:
+       setp.lt.f32     %p44, %f1, %f2;
+       selp.f32        %f262, 0f3F800000, 0f00000000, %p44;
+       bra.uni         BB33_63;
+
+BB33_30:
+       setp.eq.s32     %p12, %r3, 15;
+       @%p12 bra       BB33_31;
+       bra.uni         BB33_63;
+
+BB33_31:
+       mul.f32         %f84, %f1, %f2;
+       mov.f32         %f85, 0f3F800000;
+       sub.f32         %f262, %f85, %f84;
+       bra.uni         BB33_63;
+
+BB33_9:
+       setp.eq.s32     %p28, %r3, 3;
+       @%p28 bra       BB33_10;
+       bra.uni         BB33_63;
+
+BB33_10:
+       div.rn.f32      %f262, %f1, %f2;
+       bra.uni         BB33_63;
+
+BB33_43:
+       min.f32         %f262, %f1, %f2;
+       bra.uni         BB33_63;
+
+BB33_26:
+       setp.eq.s32     %p16, %r3, 13;
+       @%p16 bra       BB33_27;
+       bra.uni         BB33_63;
+
+BB33_27:
+       cvt.rni.s64.f32 %rd17, %f1;
+       cvt.u32.u64     %r15, %rd17;
+       cvt.rni.s64.f32 %rd18, %f2;
+       cvt.u32.u64     %r16, %rd18;
+       and.b32         %r17, %r16, %r15;
+       setp.eq.s32     %p38, %r17, 0;
+       selp.f32        %f262, 0f00000000, 0f3F800000, %p38;
+       bra.uni         BB33_63;
+
+BB33_46:
+       setp.gtu.f32    %p43, %f1, %f2;
+       selp.f32        %f262, 0f00000000, 0f3F800000, %p43;
+       bra.uni         BB33_63;
+
+BB33_17:
+       setp.eq.s32     %p23, %r3, 8;
+       @%p23 bra       BB33_18;
+       bra.uni         BB33_63;
+
+BB33_18:
+       setp.ltu.f32    %p41, %f1, %f2;
+       selp.f32        %f262, 0f00000000, 0f3F800000, %p41;
+       bra.uni         BB33_63;
+
+BB33_40:
+       setp.neu.f32    %p36, %f1, 0f00000000;
+       sub.f32         %f83, %f1, %f2;
+       selp.f32        %f262, %f83, 0f00000000, %p36;
+       bra.uni         BB33_63;
+
+BB33_34:
+       setp.ne.s32     %p10, %r3, 18;
+       @%p10 bra       BB33_63;
+
+       div.rn.f32      %f262, %f1, %f2;
+       abs.f32         %f78, %f262;
+       setp.geu.f32    %p31, %f78, 0f7F800000;
+       @%p31 bra       BB33_63;
+
+       cvt.rmi.f32.f32 %f262, %f262;
+       bra.uni         BB33_63;
+
+BB33_68:
+       setp.eq.s32     %p97, %r3, 1;
+       @%p97 bra       BB33_69;
+       bra.uni         BB33_125;
+
+BB33_69:
+       sub.f32         %f266, %f2, %f1;
+       bra.uni         BB33_125;
+
+BB33_84:
+       setp.eq.s32     %p85, %r3, 10;
+       @%p85 bra       BB33_85;
+       bra.uni         BB33_125;
+
+BB33_85:
+       setp.neu.f32    %p106, %f2, %f1;
+       selp.f32        %f266, 0f3F800000, 0f00000000, %p106;
+       bra.uni         BB33_125;
+
+BB33_75:
+       setp.eq.s32     %p92, %r3, 5;
+       @%p92 bra       BB33_76;
+       bra.uni         BB33_125;
+
+BB33_76:
+       setp.lt.f32     %p111, %f2, %f1;
+       selp.f32        %f266, 0f3F800000, 0f00000000, %p111;
+       bra.uni         BB33_125;
+
+BB33_92:
+       setp.eq.s32     %p79, %r3, 15;
+       @%p79 bra       BB33_93;
+       bra.uni         BB33_125;
+
+BB33_93:
+       mul.f32         %f175, %f1, %f2;
+       mov.f32         %f176, 0f3F800000;
+       sub.f32         %f266, %f176, %f175;
+       bra.uni         BB33_125;
+
+BB33_71:
+       setp.eq.s32     %p95, %r3, 3;
+       @%p95 bra       BB33_72;
+       bra.uni         BB33_125;
+
+BB33_72:
+       div.rn.f32      %f266, %f2, %f1;
+       bra.uni         BB33_125;
+
+BB33_105:
+       min.f32         %f266, %f2, %f1;
+       bra.uni         BB33_125;
+
+BB33_88:
+       setp.eq.s32     %p83, %r3, 13;
+       @%p83 bra       BB33_89;
+       bra.uni         BB33_125;
+
+BB33_89:
+       cvt.rni.s64.f32 %rd21, %f2;
+       cvt.u32.u64     %r40, %rd21;
+       cvt.rni.s64.f32 %rd22, %f1;
+       cvt.u32.u64     %r41, %rd22;
+       and.b32         %r42, %r41, %r40;
+       setp.eq.s32     %p105, %r42, 0;
+       selp.f32        %f266, 0f00000000, 0f3F800000, %p105;
+       bra.uni         BB33_125;
+
+BB33_108:
+       setp.gtu.f32    %p110, %f2, %f1;
+       selp.f32        %f266, 0f00000000, 0f3F800000, %p110;
+       bra.uni         BB33_125;
+
+BB33_79:
+       setp.eq.s32     %p90, %r3, 8;
+       @%p90 bra       BB33_80;
+       bra.uni         BB33_125;
+
+BB33_80:
+       setp.ltu.f32    %p108, %f2, %f1;
+       selp.f32        %f266, 0f00000000, 0f3F800000, %p108;
+       bra.uni         BB33_125;
+
+BB33_102:
+       setp.neu.f32    %p103, %f2, 0f00000000;
+       sub.f32         %f174, %f2, %f1;
+       selp.f32        %f266, %f174, 0f00000000, %p103;
+       bra.uni         BB33_125;
+
+BB33_96:
+       setp.ne.s32     %p77, %r3, 18;
+       @%p77 bra       BB33_125;
+
+       div.rn.f32      %f266, %f2, %f1;
+       abs.f32         %f169, %f266;
+       setp.geu.f32    %p98, %f169, 0f7F800000;
+       @%p98 bra       BB33_125;
+
+       cvt.rmi.f32.f32 %f266, %f266;
+       bra.uni         BB33_125;
+
+BB33_50:
+       setp.geu.f32    %p55, %f1, 0f00000000;
+       @%p55 bra       BB33_53;
+
+       cvt.rzi.f32.f32 %f164, %f2;
+       setp.neu.f32    %p56, %f164, %f2;
+       selp.f32        %f261, 0f7FFFFFFF, %f261, %p56;
+
+BB33_53:
+       add.f32         %f166, %f20, %f21;
+       mov.b32          %r30, %f166;
+       setp.lt.s32     %p59, %r30, 2139095040;
+       @%p59 bra       BB33_60;
+
+       setp.gtu.f32    %p60, %f20, 0f7F800000;
+       setp.gtu.f32    %p61, %f21, 0f7F800000;
+       or.pred         %p62, %p60, %p61;
+       @%p62 bra       BB33_59;
+       bra.uni         BB33_55;
+
+BB33_59:
+       add.f32         %f261, %f1, %f2;
+       bra.uni         BB33_60;
+
+BB33_55:
+       setp.eq.f32     %p63, %f21, 0f7F800000;
+       @%p63 bra       BB33_58;
+       bra.uni         BB33_56;
+
+BB33_58:
+       setp.gt.f32     %p66, %f20, 0f3F800000;
+       selp.b32        %r34, 2139095040, 0, %p66;
+       xor.b32         %r35, %r34, 2139095040;
+       setp.lt.f32     %p67, %f2, 0f00000000;
+       selp.b32        %r36, %r35, %r34, %p67;
+       mov.b32          %f167, %r36;
+       setp.eq.f32     %p68, %f1, 0fBF800000;
+       selp.f32        %f261, 0f3F800000, %f167, %p68;
+       bra.uni         BB33_60;
+
+BB33_112:
+       setp.geu.f32    %p122, %f2, 0f00000000;
+       @%p122 bra      BB33_115;
+
+       cvt.rzi.f32.f32 %f255, %f1;
+       setp.neu.f32    %p123, %f255, %f1;
+       selp.f32        %f265, 0f7FFFFFFF, %f265, %p123;
+
+BB33_115:
+       add.f32         %f257, %f57, %f58;
+       mov.b32          %r55, %f257;
+       setp.lt.s32     %p126, %r55, 2139095040;
+       @%p126 bra      BB33_122;
+
+       setp.gtu.f32    %p127, %f57, 0f7F800000;
+       setp.gtu.f32    %p128, %f58, 0f7F800000;
+       or.pred         %p129, %p127, %p128;
+       @%p129 bra      BB33_121;
+       bra.uni         BB33_117;
+
+BB33_121:
+       add.f32         %f265, %f1, %f2;
+       bra.uni         BB33_122;
+
+BB33_117:
+       setp.eq.f32     %p130, %f58, 0f7F800000;
+       @%p130 bra      BB33_120;
+       bra.uni         BB33_118;
+
+BB33_120:
+       setp.gt.f32     %p133, %f57, 0f3F800000;
+       selp.b32        %r59, 2139095040, 0, %p133;
+       xor.b32         %r60, %r59, 2139095040;
+       setp.lt.f32     %p134, %f1, 0f00000000;
+       selp.b32        %r61, %r60, %r59, %p134;
+       mov.b32          %f258, %r61;
+       setp.eq.f32     %p135, %f2, 0fBF800000;
+       selp.f32        %f265, 0f3F800000, %f258, %p135;
+       bra.uni         BB33_122;
+
+BB33_56:
+       setp.neu.f32    %p64, %f20, 0f7F800000;
+       @%p64 bra       BB33_60;
+
+       setp.ltu.f32    %p65, %f2, 0f00000000;
+       selp.b32        %r31, 0, 2139095040, %p65;
+       or.b32          %r32, %r31, -2147483648;
+       selp.b32        %r33, %r32, %r31, %p1;
+       mov.b32          %f261, %r33;
+
+BB33_60:
+       setp.eq.f32     %p69, %f2, 0f00000000;
+       setp.eq.f32     %p70, %f1, 0f3F800000;
+       or.pred         %p71, %p70, %p69;
+       selp.f32        %f262, 0f3F800000, %f261, %p71;
+
+BB33_63:
+       st.global.f32   [%rd1], %f262;
+       bra.uni         BB33_126;
+
+BB33_118:
+       setp.neu.f32    %p131, %f57, 0f7F800000;
+       @%p131 bra      BB33_122;
+
+       setp.ltu.f32    %p132, %f1, 0f00000000;
+       selp.b32        %r56, 0, 2139095040, %p132;
+       or.b32          %r57, %r56, -2147483648;
+       selp.b32        %r58, %r57, %r56, %p2;
+       mov.b32          %f265, %r58;
+
+BB33_122:
+       setp.eq.f32     %p136, %f1, 0f00000000;
+       setp.eq.f32     %p137, %f2, 0f3F800000;
+       or.pred         %p138, %p137, %p136;
+       selp.f32        %f266, 0f3F800000, %f265, %p138;
+
+BB33_125:
+       st.global.f32   [%rd1], %f266;
+
+BB33_126:
+       bar.sync        0;
+       ret;
+}
+
        // .globl       fill_d
 .visible .entry fill_d(
        .param .u64 fill_d_param_0,
@@ -4616,14 +6349,14 @@ BB31_126:
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r4, %r3, %r5;
        setp.ge.s32     %p1, %r1, %r2;
-       @%p1 bra        BB32_2;
+       @%p1 bra        BB34_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        st.global.f64   [%rd4], %fd1;
 
-BB32_2:
+BB34_2:
        ret;
 }
 
@@ -4649,7 +6382,7 @@ BB32_2:
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r4, %r3, %r5;
        setp.ge.s32     %p1, %r1, %r2;
-       @%p1 bra        BB33_2;
+       @%p1 bra        BB35_2;
 
        cvt.rn.f32.f64  %f1, %fd1;
        cvta.to.global.u64      %rd2, %rd1;
@@ -4657,7 +6390,7 @@ BB32_2:
        add.s64         %rd4, %rd2, %rd3;
        st.global.f32   [%rd4], %f1;
 
-BB33_2:
+BB35_2:
        ret;
 }
 
@@ -4697,10 +6430,10 @@ BB33_2:
        setp.lt.s32     %p1, %r1, %r7;
        setp.lt.s32     %p2, %r2, %r4;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB34_2;
-       bra.uni         BB34_1;
+       @!%p3 bra       BB36_2;
+       bra.uni         BB36_1;
 
-BB34_1:
+BB36_1:
        cvta.to.global.u64      %rd5, %rd2;
        mad.lo.s32      %r13, %r1, %r4, %r2;
        mul.wide.s32    %rd6, %r13, 8;
@@ -4711,14 +6444,14 @@ BB34_1:
        add.s64         %rd9, %rd1, %rd8;
        st.global.f64   [%rd9], %fd1;
 
-BB34_2:
+BB36_2:
        setp.lt.s32     %p4, %r1, %r5;
        setp.lt.s32     %p5, %r2, %r6;
        and.pred        %p6, %p4, %p5;
-       @!%p6 bra       BB34_4;
-       bra.uni         BB34_3;
+       @!%p6 bra       BB36_4;
+       bra.uni         BB36_3;
 
-BB34_3:
+BB36_3:
        cvta.to.global.u64      %rd10, %rd3;
        mad.lo.s32      %r15, %r1, %r6, %r2;
        mul.wide.s32    %rd11, %r15, 8;
@@ -4730,7 +6463,7 @@ BB34_3:
        add.s64         %rd14, %rd1, %rd13;
        st.global.f64   [%rd14], %fd2;
 
-BB34_4:
+BB36_4:
        ret;
 }
 
@@ -4770,10 +6503,10 @@ BB34_4:
        setp.lt.s32     %p1, %r1, %r7;
        setp.lt.s32     %p2, %r2, %r4;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB35_2;
-       bra.uni         BB35_1;
+       @!%p3 bra       BB37_2;
+       bra.uni         BB37_1;
 
-BB35_1:
+BB37_1:
        cvta.to.global.u64      %rd5, %rd2;
        mad.lo.s32      %r13, %r1, %r4, %r2;
        mul.wide.s32    %rd6, %r13, 4;
@@ -4784,14 +6517,14 @@ BB35_1:
        add.s64         %rd9, %rd1, %rd8;
        st.global.f32   [%rd9], %f1;
 
-BB35_2:
+BB37_2:
        setp.lt.s32     %p4, %r1, %r5;
        setp.lt.s32     %p5, %r2, %r6;
        and.pred        %p6, %p4, %p5;
-       @!%p6 bra       BB35_4;
-       bra.uni         BB35_3;
+       @!%p6 bra       BB37_4;
+       bra.uni         BB37_3;
 
-BB35_3:
+BB37_3:
        cvta.to.global.u64      %rd10, %rd3;
        mad.lo.s32      %r15, %r1, %r6, %r2;
        mul.wide.s32    %rd11, %r15, 4;
@@ -4803,7 +6536,7 @@ BB35_3:
        add.s64         %rd14, %rd1, %rd13;
        st.global.f32   [%rd14], %f2;
 
-BB35_4:
+BB37_4:
        ret;
 }
 
@@ -4842,10 +6575,10 @@ BB35_4:
        setp.lt.s32     %p1, %r1, %r3;
        setp.lt.s32     %p2, %r2, %r4;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB36_2;
-       bra.uni         BB36_1;
+       @!%p3 bra       BB38_2;
+       bra.uni         BB38_1;
 
-BB36_1:
+BB38_1:
        cvta.to.global.u64      %rd5, %rd2;
        mad.lo.s32      %r12, %r1, %r4, %r2;
        mul.wide.s32    %rd6, %r12, 8;
@@ -4854,14 +6587,14 @@ BB36_1:
        add.s64         %rd8, %rd1, %rd6;
        st.global.f64   [%rd8], %fd1;
 
-BB36_2:
+BB38_2:
        setp.lt.s32     %p4, %r1, %r5;
        setp.lt.s32     %p5, %r2, %r6;
        and.pred        %p6, %p4, %p5;
-       @!%p6 bra       BB36_4;
-       bra.uni         BB36_3;
+       @!%p6 bra       BB38_4;
+       bra.uni         BB38_3;
 
-BB36_3:
+BB38_3:
        cvta.to.global.u64      %rd9, %rd3;
        mad.lo.s32      %r13, %r1, %r6, %r2;
        mul.wide.s32    %rd10, %r13, 8;
@@ -4873,7 +6606,7 @@ BB36_3:
        add.s64         %rd13, %rd1, %rd12;
        st.global.f64   [%rd13], %fd2;
 
-BB36_4:
+BB38_4:
        ret;
 }
 
@@ -4912,10 +6645,10 @@ BB36_4:
        setp.lt.s32     %p1, %r1, %r3;
        setp.lt.s32     %p2, %r2, %r4;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB37_2;
-       bra.uni         BB37_1;
+       @!%p3 bra       BB39_2;
+       bra.uni         BB39_1;
 
-BB37_1:
+BB39_1:
        cvta.to.global.u64      %rd5, %rd2;
        mad.lo.s32      %r12, %r1, %r4, %r2;
        mul.wide.s32    %rd6, %r12, 4;
@@ -4924,14 +6657,14 @@ BB37_1:
        add.s64         %rd8, %rd1, %rd6;
        st.global.f32   [%rd8], %f1;
 
-BB37_2:
+BB39_2:
        setp.lt.s32     %p4, %r1, %r5;
        setp.lt.s32     %p5, %r2, %r6;
        and.pred        %p6, %p4, %p5;
-       @!%p6 bra       BB37_4;
-       bra.uni         BB37_3;
+       @!%p6 bra       BB39_4;
+       bra.uni         BB39_3;
 
-BB37_3:
+BB39_3:
        cvta.to.global.u64      %rd9, %rd3;
        mad.lo.s32      %r13, %r1, %r6, %r2;
        mul.wide.s32    %rd10, %r13, 4;
@@ -4943,7 +6676,7 @@ BB37_3:
        add.s64         %rd13, %rd1, %rd12;
        st.global.f32   [%rd13], %f2;
 
-BB37_4:
+BB39_4:
        ret;
 }
 
@@ -4970,9 +6703,9 @@ BB37_4:
        mad.lo.s32      %r35, %r9, %r10, %r7;
        mov.f64         %fd44, 0d0000000000000000;
        setp.ge.u32     %p1, %r35, %r6;
-       @%p1 bra        BB38_4;
+       @%p1 bra        BB40_4;
 
-BB38_1:
+BB40_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.u32    %rd4, %r35, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -4980,135 +6713,135 @@ BB38_1:
        add.f64         %fd44, %fd44, %fd30;
        add.s32         %r3, %r35, %r10;
        setp.ge.u32     %p2, %r3, %r6;
-       @%p2 bra        BB38_3;
+       @%p2 bra        BB40_3;
 
        mul.wide.u32    %rd7, %r3, 8;
        add.s64         %rd8, %rd3, %rd7;
        ld.global.f64   %fd31, [%rd8];
        add.f64         %fd44, %fd44, %fd31;
 
-BB38_3:
+BB40_3:
        shl.b32         %r13, %r10, 1;
        mov.u32         %r14, %nctaid.x;
        mad.lo.s32      %r35, %r13, %r14, %r35;
        setp.lt.u32     %p3, %r35, %r6;
-       @%p3 bra        BB38_1;
+       @%p3 bra        BB40_1;
 
-BB38_4:
+BB40_4:
        shl.b32         %r16, %r7, 3;
        mov.u32         %r17, my_sdata;
        add.s32         %r5, %r17, %r16;
        st.shared.f64   [%r5], %fd44;
        bar.sync        0;
        setp.lt.u32     %p4, %r10, 1024;
-       @%p4 bra        BB38_8;
+       @%p4 bra        BB40_8;
 
        setp.gt.u32     %p5, %r7, 511;
-       @%p5 bra        BB38_7;
+       @%p5 bra        BB40_7;
 
        ld.shared.f64   %fd32, [%r5+4096];
        add.f64         %fd44, %fd44, %fd32;
        st.shared.f64   [%r5], %fd44;
 
-BB38_7:
+BB40_7:
        bar.sync        0;
 
-BB38_8:
+BB40_8:
        setp.lt.u32     %p6, %r10, 512;
-       @%p6 bra        BB38_12;
+       @%p6 bra        BB40_12;
 
        setp.gt.u32     %p7, %r7, 255;
-       @%p7 bra        BB38_11;
+       @%p7 bra        BB40_11;
 
        ld.shared.f64   %fd33, [%r5+2048];
        add.f64         %fd44, %fd44, %fd33;
        st.shared.f64   [%r5], %fd44;
 
-BB38_11:
+BB40_11:
        bar.sync        0;
 
-BB38_12:
+BB40_12:
        setp.lt.u32     %p8, %r10, 256;
-       @%p8 bra        BB38_16;
+       @%p8 bra        BB40_16;
 
        setp.gt.u32     %p9, %r7, 127;
-       @%p9 bra        BB38_15;
+       @%p9 bra        BB40_15;
 
        ld.shared.f64   %fd34, [%r5+1024];
        add.f64         %fd44, %fd44, %fd34;
        st.shared.f64   [%r5], %fd44;
 
-BB38_15:
+BB40_15:
        bar.sync        0;
 
-BB38_16:
+BB40_16:
        setp.lt.u32     %p10, %r10, 128;
-       @%p10 bra       BB38_20;
+       @%p10 bra       BB40_20;
 
        setp.gt.u32     %p11, %r7, 63;
-       @%p11 bra       BB38_19;
+       @%p11 bra       BB40_19;
 
        ld.shared.f64   %fd35, [%r5+512];
        add.f64         %fd44, %fd44, %fd35;
        st.shared.f64   [%r5], %fd44;
 
-BB38_19:
+BB40_19:
        bar.sync        0;
 
-BB38_20:
+BB40_20:
        setp.gt.u32     %p12, %r7, 31;
-       @%p12 bra       BB38_33;
+       @%p12 bra       BB40_33;
 
        setp.lt.u32     %p13, %r10, 64;
-       @%p13 bra       BB38_23;
+       @%p13 bra       BB40_23;
 
        ld.volatile.shared.f64  %fd36, [%r5+256];
        add.f64         %fd44, %fd44, %fd36;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB38_23:
+BB40_23:
        setp.lt.u32     %p14, %r10, 32;
-       @%p14 bra       BB38_25;
+       @%p14 bra       BB40_25;
 
        ld.volatile.shared.f64  %fd37, [%r5+128];
        add.f64         %fd44, %fd44, %fd37;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB38_25:
+BB40_25:
        setp.lt.u32     %p15, %r10, 16;
-       @%p15 bra       BB38_27;
+       @%p15 bra       BB40_27;
 
        ld.volatile.shared.f64  %fd38, [%r5+64];
        add.f64         %fd44, %fd44, %fd38;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB38_27:
+BB40_27:
        setp.lt.u32     %p16, %r10, 8;
-       @%p16 bra       BB38_29;
+       @%p16 bra       BB40_29;
 
        ld.volatile.shared.f64  %fd39, [%r5+32];
        add.f64         %fd44, %fd44, %fd39;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB38_29:
+BB40_29:
        setp.lt.u32     %p17, %r10, 4;
-       @%p17 bra       BB38_31;
+       @%p17 bra       BB40_31;
 
        ld.volatile.shared.f64  %fd40, [%r5+16];
        add.f64         %fd44, %fd44, %fd40;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB38_31:
+BB40_31:
        setp.lt.u32     %p18, %r10, 2;
-       @%p18 bra       BB38_33;
+       @%p18 bra       BB40_33;
 
        ld.volatile.shared.f64  %fd41, [%r5+8];
        add.f64         %fd42, %fd44, %fd41;
        st.volatile.shared.f64  [%r5], %fd42;
 
-BB38_33:
+BB40_33:
        setp.ne.s32     %p19, %r7, 0;
-       @%p19 bra       BB38_35;
+       @%p19 bra       BB40_35;
 
        ld.shared.f64   %fd43, [my_sdata];
        cvta.to.global.u64      %rd9, %rd2;
@@ -5116,7 +6849,7 @@ BB38_33:
        add.s64         %rd11, %rd9, %rd10;
        st.global.f64   [%rd11], %fd43;
 
-BB38_35:
+BB40_35:
        ret;
 }
 
@@ -5143,9 +6876,9 @@ BB38_35:
        mad.lo.s32      %r35, %r9, %r10, %r7;
        mov.f32         %f44, 0f00000000;
        setp.ge.u32     %p1, %r35, %r6;
-       @%p1 bra        BB39_4;
+       @%p1 bra        BB41_4;
 
-BB39_1:
+BB41_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.u32    %rd4, %r35, 4;
        add.s64         %rd5, %rd3, %rd4;
@@ -5153,135 +6886,135 @@ BB39_1:
        add.f32         %f44, %f44, %f30;
        add.s32         %r3, %r35, %r10;
        setp.ge.u32     %p2, %r3, %r6;
-       @%p2 bra        BB39_3;
+       @%p2 bra        BB41_3;
 
        mul.wide.u32    %rd7, %r3, 4;
        add.s64         %rd8, %rd3, %rd7;
        ld.global.f32   %f31, [%rd8];
        add.f32         %f44, %f44, %f31;
 
-BB39_3:
+BB41_3:
        shl.b32         %r13, %r10, 1;
        mov.u32         %r14, %nctaid.x;
        mad.lo.s32      %r35, %r13, %r14, %r35;
        setp.lt.u32     %p3, %r35, %r6;
-       @%p3 bra        BB39_1;
+       @%p3 bra        BB41_1;
 
-BB39_4:
+BB41_4:
        shl.b32         %r16, %r7, 2;
        mov.u32         %r17, my_sdata;
        add.s32         %r5, %r17, %r16;
        st.shared.f32   [%r5], %f44;
        bar.sync        0;
        setp.lt.u32     %p4, %r10, 1024;
-       @%p4 bra        BB39_8;
+       @%p4 bra        BB41_8;
 
        setp.gt.u32     %p5, %r7, 511;
-       @%p5 bra        BB39_7;
+       @%p5 bra        BB41_7;
 
        ld.shared.f32   %f32, [%r5+2048];
        add.f32         %f44, %f44, %f32;
        st.shared.f32   [%r5], %f44;
 
-BB39_7:
+BB41_7:
        bar.sync        0;
 
-BB39_8:
+BB41_8:
        setp.lt.u32     %p6, %r10, 512;
-       @%p6 bra        BB39_12;
+       @%p6 bra        BB41_12;
 
        setp.gt.u32     %p7, %r7, 255;
-       @%p7 bra        BB39_11;
+       @%p7 bra        BB41_11;
 
        ld.shared.f32   %f33, [%r5+1024];
        add.f32         %f44, %f44, %f33;
        st.shared.f32   [%r5], %f44;
 
-BB39_11:
+BB41_11:
        bar.sync        0;
 
-BB39_12:
+BB41_12:
        setp.lt.u32     %p8, %r10, 256;
-       @%p8 bra        BB39_16;
+       @%p8 bra        BB41_16;
 
        setp.gt.u32     %p9, %r7, 127;
-       @%p9 bra        BB39_15;
+       @%p9 bra        BB41_15;
 
        ld.shared.f32   %f34, [%r5+512];
        add.f32         %f44, %f44, %f34;
        st.shared.f32   [%r5], %f44;
 
-BB39_15:
+BB41_15:
        bar.sync        0;
 
-BB39_16:
+BB41_16:
        setp.lt.u32     %p10, %r10, 128;
-       @%p10 bra       BB39_20;
+       @%p10 bra       BB41_20;
 
        setp.gt.u32     %p11, %r7, 63;
-       @%p11 bra       BB39_19;
+       @%p11 bra       BB41_19;
 
        ld.shared.f32   %f35, [%r5+256];
        add.f32         %f44, %f44, %f35;
        st.shared.f32   [%r5], %f44;
 
-BB39_19:
+BB41_19:
        bar.sync        0;
 
-BB39_20:
+BB41_20:
        setp.gt.u32     %p12, %r7, 31;
-       @%p12 bra       BB39_33;
+       @%p12 bra       BB41_33;
 
        setp.lt.u32     %p13, %r10, 64;
-       @%p13 bra       BB39_23;
+       @%p13 bra       BB41_23;
 
        ld.volatile.shared.f32  %f36, [%r5+128];
        add.f32         %f44, %f44, %f36;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB39_23:
+BB41_23:
        setp.lt.u32     %p14, %r10, 32;
-       @%p14 bra       BB39_25;
+       @%p14 bra       BB41_25;
 
        ld.volatile.shared.f32  %f37, [%r5+64];
        add.f32         %f44, %f44, %f37;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB39_25:
+BB41_25:
        setp.lt.u32     %p15, %r10, 16;
-       @%p15 bra       BB39_27;
+       @%p15 bra       BB41_27;
 
        ld.volatile.shared.f32  %f38, [%r5+32];
        add.f32         %f44, %f44, %f38;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB39_27:
+BB41_27:
        setp.lt.u32     %p16, %r10, 8;
-       @%p16 bra       BB39_29;
+       @%p16 bra       BB41_29;
 
        ld.volatile.shared.f32  %f39, [%r5+16];
        add.f32         %f44, %f44, %f39;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB39_29:
+BB41_29:
        setp.lt.u32     %p17, %r10, 4;
-       @%p17 bra       BB39_31;
+       @%p17 bra       BB41_31;
 
        ld.volatile.shared.f32  %f40, [%r5+8];
        add.f32         %f44, %f44, %f40;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB39_31:
+BB41_31:
        setp.lt.u32     %p18, %r10, 2;
-       @%p18 bra       BB39_33;
+       @%p18 bra       BB41_33;
 
        ld.volatile.shared.f32  %f41, [%r5+4];
        add.f32         %f42, %f44, %f41;
        st.volatile.shared.f32  [%r5], %f42;
 
-BB39_33:
+BB41_33:
        setp.ne.s32     %p19, %r7, 0;
-       @%p19 bra       BB39_35;
+       @%p19 bra       BB41_35;
 
        ld.shared.f32   %f43, [my_sdata];
        cvta.to.global.u64      %rd9, %rd2;
@@ -5289,7 +7022,7 @@ BB39_33:
        add.s64         %rd11, %rd9, %rd10;
        st.global.f32   [%rd11], %f43;
 
-BB39_35:
+BB41_35:
        ret;
 }
 
@@ -5313,16 +7046,16 @@ BB39_35:
        ld.param.u32    %r4, [reduce_row_sum_d_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB40_35;
+       @%p1 bra        BB42_35;
 
        mov.u32         %r71, %tid.x;
        mov.f64         %fd6, 0d0000000000000000;
        setp.ge.u32     %p2, %r71, %r4;
-       @%p2 bra        BB40_4;
+       @%p2 bra        BB42_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB40_3:
+BB42_3:
        mad.lo.s32      %r8, %r6, %r4, %r71;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -5331,9 +7064,9 @@ BB40_3:
        mov.u32         %r9, %ntid.x;
        add.s32         %r71, %r9, %r71;
        setp.lt.u32     %p3, %r71, %r4;
-       @%p3 bra        BB40_3;
+       @%p3 bra        BB42_3;
 
-BB40_4:
+BB42_4:
        mov.u32         %r10, %tid.x;
        shl.b32         %r11, %r10, 3;
        mov.u32         %r12, my_sdata;
@@ -5342,114 +7075,114 @@ BB40_4:
        bar.sync        0;
        mov.u32         %r14, %ntid.x;
        setp.lt.u32     %p4, %r14, 1024;
-       @%p4 bra        BB40_8;
+       @%p4 bra        BB42_8;
 
        setp.gt.u32     %p5, %r10, 511;
-       @%p5 bra        BB40_7;
+       @%p5 bra        BB42_7;
 
        ld.shared.f64   %fd29, [%r13+4096];
        add.f64         %fd6, %fd6, %fd29;
        st.shared.f64   [%r13], %fd6;
 
-BB40_7:
+BB42_7:
        bar.sync        0;
 
-BB40_8:
+BB42_8:
        setp.lt.u32     %p6, %r14, 512;
-       @%p6 bra        BB40_12;
+       @%p6 bra        BB42_12;
 
        setp.gt.u32     %p7, %r10, 255;
-       @%p7 bra        BB40_11;
+       @%p7 bra        BB42_11;
 
        ld.shared.f64   %fd30, [%r13+2048];
        add.f64         %fd6, %fd6, %fd30;
        st.shared.f64   [%r13], %fd6;
 
-BB40_11:
+BB42_11:
        bar.sync        0;
 
-BB40_12:
+BB42_12:
        setp.lt.u32     %p8, %r14, 256;
-       @%p8 bra        BB40_16;
+       @%p8 bra        BB42_16;
 
        setp.gt.u32     %p9, %r10, 127;
-       @%p9 bra        BB40_15;
+       @%p9 bra        BB42_15;
 
        ld.shared.f64   %fd31, [%r13+1024];
        add.f64         %fd6, %fd6, %fd31;
        st.shared.f64   [%r13], %fd6;
 
-BB40_15:
+BB42_15:
        bar.sync        0;
 
-BB40_16:
+BB42_16:
        setp.lt.u32     %p10, %r14, 128;
-       @%p10 bra       BB40_20;
+       @%p10 bra       BB42_20;
 
        setp.gt.u32     %p11, %r10, 63;
-       @%p11 bra       BB40_19;
+       @%p11 bra       BB42_19;
 
        ld.shared.f64   %fd32, [%r13+512];
        add.f64         %fd6, %fd6, %fd32;
        st.shared.f64   [%r13], %fd6;
 
-BB40_19:
+BB42_19:
        bar.sync        0;
 
-BB40_20:
+BB42_20:
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB40_33;
+       @%p12 bra       BB42_33;
 
        setp.lt.u32     %p13, %r14, 64;
-       @%p13 bra       BB40_23;
+       @%p13 bra       BB42_23;
 
        ld.volatile.shared.f64  %fd33, [%r13+256];
        add.f64         %fd6, %fd6, %fd33;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB40_23:
+BB42_23:
        setp.lt.u32     %p14, %r14, 32;
-       @%p14 bra       BB40_25;
+       @%p14 bra       BB42_25;
 
        ld.volatile.shared.f64  %fd34, [%r13+128];
        add.f64         %fd6, %fd6, %fd34;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB40_25:
+BB42_25:
        setp.lt.u32     %p15, %r14, 16;
-       @%p15 bra       BB40_27;
+       @%p15 bra       BB42_27;
 
        ld.volatile.shared.f64  %fd35, [%r13+64];
        add.f64         %fd6, %fd6, %fd35;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB40_27:
+BB42_27:
        setp.lt.u32     %p16, %r14, 8;
-       @%p16 bra       BB40_29;
+       @%p16 bra       BB42_29;
 
        ld.volatile.shared.f64  %fd36, [%r13+32];
        add.f64         %fd6, %fd6, %fd36;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB40_29:
+BB42_29:
        setp.lt.u32     %p17, %r14, 4;
-       @%p17 bra       BB40_31;
+       @%p17 bra       BB42_31;
 
        ld.volatile.shared.f64  %fd37, [%r13+16];
        add.f64         %fd6, %fd6, %fd37;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB40_31:
+BB42_31:
        setp.lt.u32     %p18, %r14, 2;
-       @%p18 bra       BB40_33;
+       @%p18 bra       BB42_33;
 
        ld.volatile.shared.f64  %fd38, [%r13+8];
        add.f64         %fd39, %fd6, %fd38;
        st.volatile.shared.f64  [%r13], %fd39;
 
-BB40_33:
+BB42_33:
        setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB40_35;
+       @%p19 bra       BB42_35;
 
        ld.shared.f64   %fd40, [my_sdata];
        cvta.to.global.u64      %rd6, %rd2;
@@ -5457,7 +7190,7 @@ BB40_33:
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd40;
 
-BB40_35:
+BB42_35:
        ret;
 }
 
@@ -5481,16 +7214,16 @@ BB40_35:
        ld.param.u32    %r4, [reduce_row_sum_f_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB41_35;
+       @%p1 bra        BB43_35;
 
        mov.u32         %r71, %tid.x;
        mov.f32         %f6, 0f00000000;
        setp.ge.u32     %p2, %r71, %r4;
-       @%p2 bra        BB41_4;
+       @%p2 bra        BB43_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB41_3:
+BB43_3:
        mad.lo.s32      %r8, %r6, %r4, %r71;
        mul.wide.u32    %rd4, %r8, 4;
        add.s64         %rd5, %rd3, %rd4;
@@ -5499,9 +7232,9 @@ BB41_3:
        mov.u32         %r9, %ntid.x;
        add.s32         %r71, %r9, %r71;
        setp.lt.u32     %p3, %r71, %r4;
-       @%p3 bra        BB41_3;
+       @%p3 bra        BB43_3;
 
-BB41_4:
+BB43_4:
        mov.u32         %r10, %tid.x;
        shl.b32         %r11, %r10, 2;
        mov.u32         %r12, my_sdata;
@@ -5510,114 +7243,114 @@ BB41_4:
        bar.sync        0;
        mov.u32         %r14, %ntid.x;
        setp.lt.u32     %p4, %r14, 1024;
-       @%p4 bra        BB41_8;
+       @%p4 bra        BB43_8;
 
        setp.gt.u32     %p5, %r10, 511;
-       @%p5 bra        BB41_7;
+       @%p5 bra        BB43_7;
 
        ld.shared.f32   %f29, [%r13+2048];
        add.f32         %f6, %f6, %f29;
        st.shared.f32   [%r13], %f6;
 
-BB41_7:
+BB43_7:
        bar.sync        0;
 
-BB41_8:
+BB43_8:
        setp.lt.u32     %p6, %r14, 512;
-       @%p6 bra        BB41_12;
+       @%p6 bra        BB43_12;
 
        setp.gt.u32     %p7, %r10, 255;
-       @%p7 bra        BB41_11;
+       @%p7 bra        BB43_11;
 
        ld.shared.f32   %f30, [%r13+1024];
        add.f32         %f6, %f6, %f30;
        st.shared.f32   [%r13], %f6;
 
-BB41_11:
+BB43_11:
        bar.sync        0;
 
-BB41_12:
+BB43_12:
        setp.lt.u32     %p8, %r14, 256;
-       @%p8 bra        BB41_16;
+       @%p8 bra        BB43_16;
 
        setp.gt.u32     %p9, %r10, 127;
-       @%p9 bra        BB41_15;
+       @%p9 bra        BB43_15;
 
        ld.shared.f32   %f31, [%r13+512];
        add.f32         %f6, %f6, %f31;
        st.shared.f32   [%r13], %f6;
 
-BB41_15:
+BB43_15:
        bar.sync        0;
 
-BB41_16:
+BB43_16:
        setp.lt.u32     %p10, %r14, 128;
-       @%p10 bra       BB41_20;
+       @%p10 bra       BB43_20;
 
        setp.gt.u32     %p11, %r10, 63;
-       @%p11 bra       BB41_19;
+       @%p11 bra       BB43_19;
 
        ld.shared.f32   %f32, [%r13+256];
        add.f32         %f6, %f6, %f32;
        st.shared.f32   [%r13], %f6;
 
-BB41_19:
+BB43_19:
        bar.sync        0;
 
-BB41_20:
+BB43_20:
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB41_33;
+       @%p12 bra       BB43_33;
 
        setp.lt.u32     %p13, %r14, 64;
-       @%p13 bra       BB41_23;
+       @%p13 bra       BB43_23;
 
        ld.volatile.shared.f32  %f33, [%r13+128];
        add.f32         %f6, %f6, %f33;
        st.volatile.shared.f32  [%r13], %f6;
 
-BB41_23:
+BB43_23:
        setp.lt.u32     %p14, %r14, 32;
-       @%p14 bra       BB41_25;
+       @%p14 bra       BB43_25;
 
        ld.volatile.shared.f32  %f34, [%r13+64];
        add.f32         %f6, %f6, %f34;
        st.volatile.shared.f32  [%r13], %f6;
 
-BB41_25:
+BB43_25:
        setp.lt.u32     %p15, %r14, 16;
-       @%p15 bra       BB41_27;
+       @%p15 bra       BB43_27;
 
        ld.volatile.shared.f32  %f35, [%r13+32];
        add.f32         %f6, %f6, %f35;
        st.volatile.shared.f32  [%r13], %f6;
 
-BB41_27:
+BB43_27:
        setp.lt.u32     %p16, %r14, 8;
-       @%p16 bra       BB41_29;
+       @%p16 bra       BB43_29;
 
        ld.volatile.shared.f32  %f36, [%r13+16];
        add.f32         %f6, %f6, %f36;
        st.volatile.shared.f32  [%r13], %f6;
 
-BB41_29:
+BB43_29:
        setp.lt.u32     %p17, %r14, 4;
-       @%p17 bra       BB41_31;
+       @%p17 bra       BB43_31;
 
        ld.volatile.shared.f32  %f37, [%r13+8];
        add.f32         %f6, %f6, %f37;
        st.volatile.shared.f32  [%r13], %f6;
 
-BB41_31:
+BB43_31:
        setp.lt.u32     %p18, %r14, 2;
-       @%p18 bra       BB41_33;
+       @%p18 bra       BB43_33;
 
        ld.volatile.shared.f32  %f38, [%r13+4];
        add.f32         %f39, %f6, %f38;
        st.volatile.shared.f32  [%r13], %f39;
 
-BB41_33:
+BB43_33:
        setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB41_35;
+       @%p19 bra       BB43_35;
 
        ld.shared.f32   %f40, [my_sdata];
        cvta.to.global.u64      %rd6, %rd2;
@@ -5625,7 +7358,7 @@ BB41_33:
        add.s64         %rd8, %rd6, %rd7;
        st.global.f32   [%rd8], %f40;
 
-BB41_35:
+BB43_35:
        ret;
 }
 
@@ -5652,32 +7385,32 @@ BB41_35:
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB42_5;
+       @%p1 bra        BB44_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;
        mov.f64         %fd8, 0d0000000000000000;
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB42_4;
+       @%p2 bra        BB44_4;
 
        mov.u32         %r10, %r1;
 
-BB42_3:
+BB44_3:
        mul.wide.u32    %rd4, %r10, 8;
        add.s64         %rd5, %rd1, %rd4;
        ld.global.f64   %fd6, [%rd5];
        add.f64         %fd8, %fd8, %fd6;
        add.s32         %r10, %r10, %r6;
        setp.lt.u32     %p3, %r10, %r2;
-       @%p3 bra        BB42_3;
+       @%p3 bra        BB44_3;
 
-BB42_4:
+BB44_4:
        cvta.to.global.u64      %rd6, %rd3;
        mul.wide.u32    %rd7, %r1, 8;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd8;
 
-BB42_5:
+BB44_5:
        ret;
 }
 
@@ -5704,32 +7437,32 @@ BB42_5:
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB43_5;
+       @%p1 bra        BB45_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;
        mov.f32         %f8, 0f00000000;
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB43_4;
+       @%p2 bra        BB45_4;
 
        mov.u32         %r10, %r1;
 
-BB43_3:
+BB45_3:
        mul.wide.u32    %rd4, %r10, 4;
        add.s64         %rd5, %rd1, %rd4;
        ld.global.f32   %f6, [%rd5];
        add.f32         %f8, %f8, %f6;
        add.s32         %r10, %r10, %r6;
        setp.lt.u32     %p3, %r10, %r2;
-       @%p3 bra        BB43_3;
+       @%p3 bra        BB45_3;
 
-BB43_4:
+BB45_4:
        cvta.to.global.u64      %rd6, %rd3;
        mul.wide.u32    %rd7, %r1, 4;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f32   [%rd8], %f8;
 
-BB43_5:
+BB45_5:
        ret;
 }
 
@@ -5756,9 +7489,9 @@ BB43_5:
        mad.lo.s32      %r35, %r9, %r10, %r7;
        mov.f64         %fd44, 0dFFEFFFFFFFFFFFFF;
        setp.ge.u32     %p1, %r35, %r6;
-       @%p1 bra        BB44_4;
+       @%p1 bra        BB46_4;
 
-BB44_1:
+BB46_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.u32    %rd4, %r35, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -5766,135 +7499,135 @@ BB44_1:
        max.f64         %fd44, %fd44, %fd30;
        add.s32         %r3, %r35, %r10;
        setp.ge.u32     %p2, %r3, %r6;
-       @%p2 bra        BB44_3;
+       @%p2 bra        BB46_3;
 
        mul.wide.u32    %rd7, %r3, 8;
        add.s64         %rd8, %rd3, %rd7;
        ld.global.f64   %fd31, [%rd8];
        max.f64         %fd44, %fd44, %fd31;
 
-BB44_3:
+BB46_3:
        shl.b32         %r13, %r10, 1;
        mov.u32         %r14, %nctaid.x;
        mad.lo.s32      %r35, %r13, %r14, %r35;
        setp.lt.u32     %p3, %r35, %r6;
-       @%p3 bra        BB44_1;
+       @%p3 bra        BB46_1;
 
-BB44_4:
+BB46_4:
        shl.b32         %r16, %r7, 3;
        mov.u32         %r17, my_sdata;
        add.s32         %r5, %r17, %r16;
        st.shared.f64   [%r5], %fd44;
        bar.sync        0;
        setp.lt.u32     %p4, %r10, 1024;
-       @%p4 bra        BB44_8;
+       @%p4 bra        BB46_8;
 
        setp.gt.u32     %p5, %r7, 511;
-       @%p5 bra        BB44_7;
+       @%p5 bra        BB46_7;
 
        ld.shared.f64   %fd32, [%r5+4096];
        max.f64         %fd44, %fd44, %fd32;
        st.shared.f64   [%r5], %fd44;
 
-BB44_7:
+BB46_7:
        bar.sync        0;
 
-BB44_8:
+BB46_8:
        setp.lt.u32     %p6, %r10, 512;
-       @%p6 bra        BB44_12;
+       @%p6 bra        BB46_12;
 
        setp.gt.u32     %p7, %r7, 255;
-       @%p7 bra        BB44_11;
+       @%p7 bra        BB46_11;
 
        ld.shared.f64   %fd33, [%r5+2048];
        max.f64         %fd44, %fd44, %fd33;
        st.shared.f64   [%r5], %fd44;
 
-BB44_11:
+BB46_11:
        bar.sync        0;
 
-BB44_12:
+BB46_12:
        setp.lt.u32     %p8, %r10, 256;
-       @%p8 bra        BB44_16;
+       @%p8 bra        BB46_16;
 
        setp.gt.u32     %p9, %r7, 127;
-       @%p9 bra        BB44_15;
+       @%p9 bra        BB46_15;
 
        ld.shared.f64   %fd34, [%r5+1024];
        max.f64         %fd44, %fd44, %fd34;
        st.shared.f64   [%r5], %fd44;
 
-BB44_15:
+BB46_15:
        bar.sync        0;
 
-BB44_16:
+BB46_16:
        setp.lt.u32     %p10, %r10, 128;
-       @%p10 bra       BB44_20;
+       @%p10 bra       BB46_20;
 
        setp.gt.u32     %p11, %r7, 63;
-       @%p11 bra       BB44_19;
+       @%p11 bra       BB46_19;
 
        ld.shared.f64   %fd35, [%r5+512];
        max.f64         %fd44, %fd44, %fd35;
        st.shared.f64   [%r5], %fd44;
 
-BB44_19:
+BB46_19:
        bar.sync        0;
 
-BB44_20:
+BB46_20:
        setp.gt.u32     %p12, %r7, 31;
-       @%p12 bra       BB44_33;
+       @%p12 bra       BB46_33;
 
        setp.lt.u32     %p13, %r10, 64;
-       @%p13 bra       BB44_23;
+       @%p13 bra       BB46_23;
 
        ld.volatile.shared.f64  %fd36, [%r5+256];
        max.f64         %fd44, %fd44, %fd36;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB44_23:
+BB46_23:
        setp.lt.u32     %p14, %r10, 32;
-       @%p14 bra       BB44_25;
+       @%p14 bra       BB46_25;
 
        ld.volatile.shared.f64  %fd37, [%r5+128];
        max.f64         %fd44, %fd44, %fd37;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB44_25:
+BB46_25:
        setp.lt.u32     %p15, %r10, 16;
-       @%p15 bra       BB44_27;
+       @%p15 bra       BB46_27;
 
        ld.volatile.shared.f64  %fd38, [%r5+64];
        max.f64         %fd44, %fd44, %fd38;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB44_27:
+BB46_27:
        setp.lt.u32     %p16, %r10, 8;
-       @%p16 bra       BB44_29;
+       @%p16 bra       BB46_29;
 
        ld.volatile.shared.f64  %fd39, [%r5+32];
        max.f64         %fd44, %fd44, %fd39;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB44_29:
+BB46_29:
        setp.lt.u32     %p17, %r10, 4;
-       @%p17 bra       BB44_31;
+       @%p17 bra       BB46_31;
 
        ld.volatile.shared.f64  %fd40, [%r5+16];
        max.f64         %fd44, %fd44, %fd40;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB44_31:
+BB46_31:
        setp.lt.u32     %p18, %r10, 2;
-       @%p18 bra       BB44_33;
+       @%p18 bra       BB46_33;
 
        ld.volatile.shared.f64  %fd41, [%r5+8];
        max.f64         %fd42, %fd44, %fd41;
        st.volatile.shared.f64  [%r5], %fd42;
 
-BB44_33:
+BB46_33:
        setp.ne.s32     %p19, %r7, 0;
-       @%p19 bra       BB44_35;
+       @%p19 bra       BB46_35;
 
        ld.shared.f64   %fd43, [my_sdata];
        cvta.to.global.u64      %rd9, %rd2;
@@ -5902,7 +7635,7 @@ BB44_33:
        add.s64         %rd11, %rd9, %rd10;
        st.global.f64   [%rd11], %fd43;
 
-BB44_35:
+BB46_35:
        ret;
 }
 
@@ -5929,9 +7662,9 @@ BB44_35:
        mad.lo.s32      %r35, %r9, %r10, %r7;
        mov.f32         %f44, 0fFF7FFFFF;
        setp.ge.u32     %p1, %r35, %r6;
-       @%p1 bra        BB45_4;
+       @%p1 bra        BB47_4;
 
-BB45_1:
+BB47_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.u32    %rd4, %r35, 4;
        add.s64         %rd5, %rd3, %rd4;
@@ -5939,135 +7672,135 @@ BB45_1:
        max.f32         %f44, %f44, %f30;
        add.s32         %r3, %r35, %r10;
        setp.ge.u32     %p2, %r3, %r6;
-       @%p2 bra        BB45_3;
+       @%p2 bra        BB47_3;
 
        mul.wide.u32    %rd7, %r3, 4;
        add.s64         %rd8, %rd3, %rd7;
        ld.global.f32   %f31, [%rd8];
        max.f32         %f44, %f44, %f31;
 
-BB45_3:
+BB47_3:
        shl.b32         %r13, %r10, 1;
        mov.u32         %r14, %nctaid.x;
        mad.lo.s32      %r35, %r13, %r14, %r35;
        setp.lt.u32     %p3, %r35, %r6;
-       @%p3 bra        BB45_1;
+       @%p3 bra        BB47_1;
 
-BB45_4:
+BB47_4:
        shl.b32         %r16, %r7, 2;
        mov.u32         %r17, my_sdata;
        add.s32         %r5, %r17, %r16;
        st.shared.f32   [%r5], %f44;
        bar.sync        0;
        setp.lt.u32     %p4, %r10, 1024;
-       @%p4 bra        BB45_8;
+       @%p4 bra        BB47_8;
 
        setp.gt.u32     %p5, %r7, 511;
-       @%p5 bra        BB45_7;
+       @%p5 bra        BB47_7;
 
        ld.shared.f32   %f32, [%r5+2048];
        max.f32         %f44, %f44, %f32;
        st.shared.f32   [%r5], %f44;
 
-BB45_7:
+BB47_7:
        bar.sync        0;
 
-BB45_8:
+BB47_8:
        setp.lt.u32     %p6, %r10, 512;
-       @%p6 bra        BB45_12;
+       @%p6 bra        BB47_12;
 
        setp.gt.u32     %p7, %r7, 255;
-       @%p7 bra        BB45_11;
+       @%p7 bra        BB47_11;
 
        ld.shared.f32   %f33, [%r5+1024];
        max.f32         %f44, %f44, %f33;
        st.shared.f32   [%r5], %f44;
 
-BB45_11:
+BB47_11:
        bar.sync        0;
 
-BB45_12:
+BB47_12:
        setp.lt.u32     %p8, %r10, 256;
-       @%p8 bra        BB45_16;
+       @%p8 bra        BB47_16;
 
        setp.gt.u32     %p9, %r7, 127;
-       @%p9 bra        BB45_15;
+       @%p9 bra        BB47_15;
 
        ld.shared.f32   %f34, [%r5+512];
        max.f32         %f44, %f44, %f34;
        st.shared.f32   [%r5], %f44;
 
-BB45_15:
+BB47_15:
        bar.sync        0;
 
-BB45_16:
+BB47_16:
        setp.lt.u32     %p10, %r10, 128;
-       @%p10 bra       BB45_20;
+       @%p10 bra       BB47_20;
 
        setp.gt.u32     %p11, %r7, 63;
-       @%p11 bra       BB45_19;
+       @%p11 bra       BB47_19;
 
        ld.shared.f32   %f35, [%r5+256];
        max.f32         %f44, %f44, %f35;
        st.shared.f32   [%r5], %f44;
 
-BB45_19:
+BB47_19:
        bar.sync        0;
 
-BB45_20:
+BB47_20:
        setp.gt.u32     %p12, %r7, 31;
-       @%p12 bra       BB45_33;
+       @%p12 bra       BB47_33;
 
        setp.lt.u32     %p13, %r10, 64;
-       @%p13 bra       BB45_23;
+       @%p13 bra       BB47_23;
 
        ld.volatile.shared.f32  %f36, [%r5+128];
        max.f32         %f44, %f44, %f36;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB45_23:
+BB47_23:
        setp.lt.u32     %p14, %r10, 32;
-       @%p14 bra       BB45_25;
+       @%p14 bra       BB47_25;
 
        ld.volatile.shared.f32  %f37, [%r5+64];
        max.f32         %f44, %f44, %f37;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB45_25:
+BB47_25:
        setp.lt.u32     %p15, %r10, 16;
-       @%p15 bra       BB45_27;
+       @%p15 bra       BB47_27;
 
        ld.volatile.shared.f32  %f38, [%r5+32];
        max.f32         %f44, %f44, %f38;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB45_27:
+BB47_27:
        setp.lt.u32     %p16, %r10, 8;
-       @%p16 bra       BB45_29;
+       @%p16 bra       BB47_29;
 
        ld.volatile.shared.f32  %f39, [%r5+16];
        max.f32         %f44, %f44, %f39;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB45_29:
+BB47_29:
        setp.lt.u32     %p17, %r10, 4;
-       @%p17 bra       BB45_31;
+       @%p17 bra       BB47_31;
 
        ld.volatile.shared.f32  %f40, [%r5+8];
        max.f32         %f44, %f44, %f40;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB45_31:
+BB47_31:
        setp.lt.u32     %p18, %r10, 2;
-       @%p18 bra       BB45_33;
+       @%p18 bra       BB47_33;
 
        ld.volatile.shared.f32  %f41, [%r5+4];
        max.f32         %f42, %f44, %f41;
        st.volatile.shared.f32  [%r5], %f42;
 
-BB45_33:
+BB47_33:
        setp.ne.s32     %p19, %r7, 0;
-       @%p19 bra       BB45_35;
+       @%p19 bra       BB47_35;
 
        ld.shared.f32   %f43, [my_sdata];
        cvta.to.global.u64      %rd9, %rd2;
@@ -6075,7 +7808,7 @@ BB45_33:
        add.s64         %rd11, %rd9, %rd10;
        st.global.f32   [%rd11], %f43;
 
-BB45_35:
+BB47_35:
        ret;
 }
 
@@ -6099,16 +7832,16 @@ BB45_35:
        ld.param.u32    %r4, [reduce_row_max_d_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB46_35;
+       @%p1 bra        BB48_35;
 
        mov.u32         %r71, %tid.x;
        mov.f64         %fd6, 0dFFEFFFFFFFFFFFFF;
        setp.ge.u32     %p2, %r71, %r4;
-       @%p2 bra        BB46_4;
+       @%p2 bra        BB48_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB46_3:
+BB48_3:
        mad.lo.s32      %r8, %r6, %r4, %r71;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -6117,9 +7850,9 @@ BB46_3:
        mov.u32         %r9, %ntid.x;
        add.s32         %r71, %r9, %r71;
        setp.lt.u32     %p3, %r71, %r4;
-       @%p3 bra        BB46_3;
+       @%p3 bra        BB48_3;
 
-BB46_4:
+BB48_4:
        mov.u32         %r10, %tid.x;
        shl.b32         %r11, %r10, 3;
        mov.u32         %r12, my_sdata;
@@ -6128,114 +7861,114 @@ BB46_4:
        bar.sync        0;
        mov.u32         %r14, %ntid.x;
        setp.lt.u32     %p4, %r14, 1024;
-       @%p4 bra        BB46_8;
+       @%p4 bra        BB48_8;
 
        setp.gt.u32     %p5, %r10, 511;
-       @%p5 bra        BB46_7;
+       @%p5 bra        BB48_7;
 
        ld.shared.f64   %fd29, [%r13+4096];
        max.f64         %fd6, %fd6, %fd29;
        st.shared.f64   [%r13], %fd6;
 
-BB46_7:
+BB48_7:
        bar.sync        0;
 
-BB46_8:
+BB48_8:
        setp.lt.u32     %p6, %r14, 512;
-       @%p6 bra        BB46_12;
+       @%p6 bra        BB48_12;
 
        setp.gt.u32     %p7, %r10, 255;
-       @%p7 bra        BB46_11;
+       @%p7 bra        BB48_11;
 
        ld.shared.f64   %fd30, [%r13+2048];
        max.f64         %fd6, %fd6, %fd30;
        st.shared.f64   [%r13], %fd6;
 
-BB46_11:
+BB48_11:
        bar.sync        0;
 
-BB46_12:
+BB48_12:
        setp.lt.u32     %p8, %r14, 256;
-       @%p8 bra        BB46_16;
+       @%p8 bra        BB48_16;
 
        setp.gt.u32     %p9, %r10, 127;
-       @%p9 bra        BB46_15;
+       @%p9 bra        BB48_15;
 
        ld.shared.f64   %fd31, [%r13+1024];
        max.f64         %fd6, %fd6, %fd31;
        st.shared.f64   [%r13], %fd6;
 
-BB46_15:
+BB48_15:
        bar.sync        0;
 
-BB46_16:
+BB48_16:
        setp.lt.u32     %p10, %r14, 128;
-       @%p10 bra       BB46_20;
+       @%p10 bra       BB48_20;
 
        setp.gt.u32     %p11, %r10, 63;
-       @%p11 bra       BB46_19;
+       @%p11 bra       BB48_19;
 
        ld.shared.f64   %fd32, [%r13+512];
        max.f64         %fd6, %fd6, %fd32;
        st.shared.f64   [%r13], %fd6;
 
-BB46_19:
+BB48_19:
        bar.sync        0;
 
-BB46_20:
+BB48_20:
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB46_33;
+       @%p12 bra       BB48_33;
 
        setp.lt.u32     %p13, %r14, 64;
-       @%p13 bra       BB46_23;
+       @%p13 bra       BB48_23;
 
        ld.volatile.shared.f64  %fd33, [%r13+256];
        max.f64         %fd6, %fd6, %fd33;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB46_23:
+BB48_23:
        setp.lt.u32     %p14, %r14, 32;
-       @%p14 bra       BB46_25;
+       @%p14 bra       BB48_25;
 
        ld.volatile.shared.f64  %fd34, [%r13+128];
        max.f64         %fd6, %fd6, %fd34;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB46_25:
+BB48_25:
        setp.lt.u32     %p15, %r14, 16;
-       @%p15 bra       BB46_27;
+       @%p15 bra       BB48_27;
 
        ld.volatile.shared.f64  %fd35, [%r13+64];
        max.f64         %fd6, %fd6, %fd35;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB46_27:
+BB48_27:
        setp.lt.u32     %p16, %r14, 8;
-       @%p16 bra       BB46_29;
+       @%p16 bra       BB48_29;
 
        ld.volatile.shared.f64  %fd36, [%r13+32];
        max.f64         %fd6, %fd6, %fd36;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB46_29:
+BB48_29:
        setp.lt.u32     %p17, %r14, 4;
-       @%p17 bra       BB46_31;
+       @%p17 bra       BB48_31;
 
        ld.volatile.shared.f64  %fd37, [%r13+16];
        max.f64         %fd6, %fd6, %fd37;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB46_31:
+BB48_31:
        setp.lt.u32     %p18, %r14, 2;
-       @%p18 bra       BB46_33;
+       @%p18 bra       BB48_33;
 
        ld.volatile.shared.f64  %fd38, [%r13+8];
        max.f64         %fd39, %fd6, %fd38;
        st.volatile.shared.f64  [%r13], %fd39;
 
-BB46_33:
+BB48_33:
        setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB46_35;
+       @%p19 bra       BB48_35;
 
        ld.shared.f64   %fd40, [my_sdata];
        cvta.to.global.u64      %rd6, %rd2;
@@ -6243,7 +7976,7 @@ BB46_33:
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd40;
 
-BB46_35:
+BB48_35:
        ret;
 }
 
@@ -6267,16 +8000,16 @@ BB46_35:
        ld.param.u32    %r4, [reduce_row_max_f_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB47_35;
+       @%p1 bra        BB49_35;
 
        mov.u32         %r71, %tid.x;
        mov.f32         %f6, 0fFF7FFFFF;
        setp.ge.u32     %p2, %r71, %r4;
-       @%p2 bra        BB47_4;
+       @%p2 bra        BB49_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB47_3:
+BB49_3:
        mad.lo.s32      %r8, %r6, %r4, %r71;
        mul.wide.u32    %rd4, %r8, 4;
        add.s64         %rd5, %rd3, %rd4;
@@ -6285,9 +8018,9 @@ BB47_3:
        mov.u32         %r9, %ntid.x;
        add.s32         %r71, %r9, %r71;
        setp.lt.u32     %p3, %r71, %r4;
-       @%p3 bra        BB47_3;
+       @%p3 bra        BB49_3;
 
-BB47_4:
+BB49_4:
        mov.u32         %r10, %tid.x;
        shl.b32         %r11, %r10, 2;
        mov.u32         %r12, my_sdata;
@@ -6296,114 +8029,114 @@ BB47_4:
        bar.sync        0;
        mov.u32         %r14, %ntid.x;
        setp.lt.u32     %p4, %r14, 1024;
-       @%p4 bra        BB47_8;
+       @%p4 bra        BB49_8;
 
        setp.gt.u32     %p5, %r10, 511;
-       @%p5 bra        BB47_7;
+       @%p5 bra        BB49_7;
 
        ld.shared.f32   %f29, [%r13+2048];
        max.f32         %f6, %f6, %f29;
        st.shared.f32   [%r13], %f6;
 
-BB47_7:
+BB49_7:
        bar.sync        0;
 
-BB47_8:
+BB49_8:
        setp.lt.u32     %p6, %r14, 512;
-       @%p6 bra        BB47_12;
+       @%p6 bra        BB49_12;
 
        setp.gt.u32     %p7, %r10, 255;
-       @%p7 bra        BB47_11;
+       @%p7 bra        BB49_11;
 
        ld.shared.f32   %f30, [%r13+1024];
        max.f32         %f6, %f6, %f30;
        st.shared.f32   [%r13], %f6;
 
-BB47_11:
+BB49_11:
        bar.sync        0;
 
-BB47_12:
+BB49_12:
        setp.lt.u32     %p8, %r14, 256;
-       @%p8 bra        BB47_16;
+       @%p8 bra        BB49_16;
 
        setp.gt.u32     %p9, %r10, 127;
-       @%p9 bra        BB47_15;
+       @%p9 bra        BB49_15;
 
        ld.shared.f32   %f31, [%r13+512];
        max.f32         %f6, %f6, %f31;
        st.shared.f32   [%r13], %f6;
 
-BB47_15:
+BB49_15:
        bar.sync        0;
 
-BB47_16:
+BB49_16:
        setp.lt.u32     %p10, %r14, 128;
-       @%p10 bra       BB47_20;
+       @%p10 bra       BB49_20;
 
        setp.gt.u32     %p11, %r10, 63;
-       @%p11 bra       BB47_19;
+       @%p11 bra       BB49_19;
 
        ld.shared.f32   %f32, [%r13+256];
        max.f32         %f6, %f6, %f32;
        st.shared.f32   [%r13], %f6;
 
-BB47_19:
+BB49_19:
        bar.sync        0;
 
-BB47_20:
+BB49_20:
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB47_33;
+       @%p12 bra       BB49_33;
 
        setp.lt.u32     %p13, %r14, 64;
-       @%p13 bra       BB47_23;
+       @%p13 bra       BB49_23;
 
        ld.volatile.shared.f32  %f33, [%r13+128];
        max.f32         %f6, %f6, %f33;
        st.volatile.shared.f32  [%r13], %f6;
 
-BB47_23:
+BB49_23:
        setp.lt.u32     %p14, %r14, 32;
-       @%p14 bra       BB47_25;
+       @%p14 bra       BB49_25;
 
        ld.volatile.shared.f32  %f34, [%r13+64];
        max.f32         %f6, %f6, %f34;
        st.volatile.shared.f32  [%r13], %f6;
 
-BB47_25:
+BB49_25:
        setp.lt.u32     %p15, %r14, 16;
-       @%p15 bra       BB47_27;
+       @%p15 bra       BB49_27;
 
        ld.volatile.shared.f32  %f35, [%r13+32];
        max.f32         %f6, %f6, %f35;
        st.volatile.shared.f32  [%r13], %f6;
 
-BB47_27:
+BB49_27:
        setp.lt.u32     %p16, %r14, 8;
-       @%p16 bra       BB47_29;
+       @%p16 bra       BB49_29;
 
        ld.volatile.shared.f32  %f36, [%r13+16];
        max.f32         %f6, %f6, %f36;
        st.volatile.shared.f32  [%r13], %f6;
 
-BB47_29:
+BB49_29:
        setp.lt.u32     %p17, %r14, 4;
-       @%p17 bra       BB47_31;
+       @%p17 bra       BB49_31;
 
        ld.volatile.shared.f32  %f37, [%r13+8];
        max.f32         %f6, %f6, %f37;
        st.volatile.shared.f32  [%r13], %f6;
 
-BB47_31:
+BB49_31:
        setp.lt.u32     %p18, %r14, 2;
-       @%p18 bra       BB47_33;
+       @%p18 bra       BB49_33;
 
        ld.volatile.shared.f32  %f38, [%r13+4];
        max.f32         %f39, %f6, %f38;
        st.volatile.shared.f32  [%r13], %f39;
 
-BB47_33:
+BB49_33:
        setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB47_35;
+       @%p19 bra       BB49_35;
 
        ld.shared.f32   %f40, [my_sdata];
        cvta.to.global.u64      %rd6, %rd2;
@@ -6411,7 +8144,7 @@ BB47_33:
        add.s64         %rd8, %rd6, %rd7;
        st.global.f32   [%rd8], %f40;
 
-BB47_35:
+BB49_35:
        ret;
 }
 
@@ -6438,32 +8171,32 @@ BB47_35:
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB48_5;
+       @%p1 bra        BB50_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;
        mov.f64         %fd8, 0dFFEFFFFFFFFFFFFF;
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB48_4;
+       @%p2 bra        BB50_4;
 
        mov.u32         %r10, %r1;
 
-BB48_3:
+BB50_3:
        mul.wide.u32    %rd4, %r10, 8;
        add.s64         %rd5, %rd1, %rd4;
        ld.global.f64   %fd6, [%rd5];
        max.f64         %fd8, %fd8, %fd6;
        add.s32         %r10, %r10, %r6;
        setp.lt.u32     %p3, %r10, %r2;
-       @%p3 bra        BB48_3;
+       @%p3 bra        BB50_3;
 
-BB48_4:
+BB50_4:
        cvta.to.global.u64      %rd6, %rd3;
        mul.wide.u32    %rd7, %r1, 8;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd8;
 
-BB48_5:
+BB50_5:
        ret;
 }
 
@@ -6490,32 +8223,32 @@ BB48_5:
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB49_5;
+       @%p1 bra        BB51_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;
        mov.f32         %f8, 0fFF7FFFFF;
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB49_4;
+       @%p2 bra        BB51_4;
 
        mov.u32         %r10, %r1;
 
-BB49_3:
+BB51_3:
        mul.wide.u32    %rd4, %r10, 4;
        add.s64         %rd5, %rd1, %rd4;
        ld.global.f32   %f6, [%rd5];
        max.f32         %f8, %f8, %f6;
        add.s32         %r10, %r10, %r6;
        setp.lt.u32     %p3, %r10, %r2;
-       @%p3 bra        BB49_3;
+       @%p3 bra        BB51_3;
 
-BB49_4:
+BB51_4:
        cvta.to.global.u64      %rd6, %rd3;
        mul.wide.u32    %rd7, %r1, 4;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f32   [%rd8], %f8;
 
-BB49_5:
+BB51_5:
        ret;
 }
 
@@ -6542,9 +8275,9 @@ BB49_5:
        mad.lo.s32      %r35, %r9, %r10, %r7;
        mov.f64         %fd44, 0d7FEFFFFFFFFFFFFF;
        setp.ge.u32     %p1, %r35, %r6;
-       @%p1 bra        BB50_4;
+       @%p1 bra        BB52_4;
 
-BB50_1:
+BB52_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.u32    %rd4, %r35, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -6552,135 +8285,135 @@ BB50_1:
        min.f64         %fd44, %fd44, %fd30;
        add.s32         %r3, %r35, %r10;
        setp.ge.u32     %p2, %r3, %r6;
-       @%p2 bra        BB50_3;
+       @%p2 bra        BB52_3;
 
        mul.wide.u32    %rd7, %r3, 8;
        add.s64         %rd8, %rd3, %rd7;
        ld.global.f64   %fd31, [%rd8];
        min.f64         %fd44, %fd44, %fd31;
 
-BB50_3:
+BB52_3:
        shl.b32         %r13, %r10, 1;
        mov.u32         %r14, %nctaid.x;
        mad.lo.s32      %r35, %r13, %r14, %r35;
        setp.lt.u32     %p3, %r35, %r6;
-       @%p3 bra        BB50_1;
+       @%p3 bra        BB52_1;
 
-BB50_4:
+BB52_4:
        shl.b32         %r16, %r7, 3;
        mov.u32         %r17, my_sdata;
        add.s32         %r5, %r17, %r16;
        st.shared.f64   [%r5], %fd44;
        bar.sync        0;
        setp.lt.u32     %p4, %r10, 1024;
-       @%p4 bra        BB50_8;
+       @%p4 bra        BB52_8;
 
        setp.gt.u32     %p5, %r7, 511;
-       @%p5 bra        BB50_7;
+       @%p5 bra        BB52_7;
 
        ld.shared.f64   %fd32, [%r5+4096];
        min.f64         %fd44, %fd44, %fd32;
        st.shared.f64   [%r5], %fd44;
 
-BB50_7:
+BB52_7:
        bar.sync        0;
 
-BB50_8:
+BB52_8:
        setp.lt.u32     %p6, %r10, 512;
-       @%p6 bra        BB50_12;
+       @%p6 bra        BB52_12;
 
        setp.gt.u32     %p7, %r7, 255;
-       @%p7 bra        BB50_11;
+       @%p7 bra        BB52_11;
 
        ld.shared.f64   %fd33, [%r5+2048];
        min.f64         %fd44, %fd44, %fd33;
        st.shared.f64   [%r5], %fd44;
 
-BB50_11:
+BB52_11:
        bar.sync        0;
 
-BB50_12:
+BB52_12:
        setp.lt.u32     %p8, %r10, 256;
-       @%p8 bra        BB50_16;
+       @%p8 bra        BB52_16;
 
        setp.gt.u32     %p9, %r7, 127;
-       @%p9 bra        BB50_15;
+       @%p9 bra        BB52_15;
 
        ld.shared.f64   %fd34, [%r5+1024];
        min.f64         %fd44, %fd44, %fd34;
        st.shared.f64   [%r5], %fd44;
 
-BB50_15:
+BB52_15:
        bar.sync        0;
 
-BB50_16:
+BB52_16:
        setp.lt.u32     %p10, %r10, 128;
-       @%p10 bra       BB50_20;
+       @%p10 bra       BB52_20;
 
        setp.gt.u32     %p11, %r7, 63;
-       @%p11 bra       BB50_19;
+       @%p11 bra       BB52_19;
 
        ld.shared.f64   %fd35, [%r5+512];
        min.f64         %fd44, %fd44, %fd35;
        st.shared.f64   [%r5], %fd44;
 
-BB50_19:
+BB52_19:
        bar.sync        0;
 
-BB50_20:
+BB52_20:
        setp.gt.u32     %p12, %r7, 31;
-       @%p12 bra       BB50_33;
+       @%p12 bra       BB52_33;
 
        setp.lt.u32     %p13, %r10, 64;
-       @%p13 bra       BB50_23;
+       @%p13 bra       BB52_23;
 
        ld.volatile.shared.f64  %fd36, [%r5+256];
        min.f64         %fd44, %fd44, %fd36;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB50_23:
+BB52_23:
        setp.lt.u32     %p14, %r10, 32;
-       @%p14 bra       BB50_25;
+       @%p14 bra       BB52_25;
 
        ld.volatile.shared.f64  %fd37, [%r5+128];
        min.f64         %fd44, %fd44, %fd37;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB50_25:
+BB52_25:
        setp.lt.u32     %p15, %r10, 16;
-       @%p15 bra       BB50_27;
+       @%p15 bra       BB52_27;
 
        ld.volatile.shared.f64  %fd38, [%r5+64];
        min.f64         %fd44, %fd44, %fd38;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB50_27:
+BB52_27:
        setp.lt.u32     %p16, %r10, 8;
-       @%p16 bra       BB50_29;
+       @%p16 bra       BB52_29;
 
        ld.volatile.shared.f64  %fd39, [%r5+32];
        min.f64         %fd44, %fd44, %fd39;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB50_29:
+BB52_29:
        setp.lt.u32     %p17, %r10, 4;
-       @%p17 bra       BB50_31;
+       @%p17 bra       BB52_31;
 
        ld.volatile.shared.f64  %fd40, [%r5+16];
        min.f64         %fd44, %fd44, %fd40;
        st.volatile.shared.f64  [%r5], %fd44;
 
-BB50_31:
+BB52_31:
        setp.lt.u32     %p18, %r10, 2;
-       @%p18 bra       BB50_33;
+       @%p18 bra       BB52_33;
 
        ld.volatile.shared.f64  %fd41, [%r5+8];
        min.f64         %fd42, %fd44, %fd41;
        st.volatile.shared.f64  [%r5], %fd42;
 
-BB50_33:
+BB52_33:
        setp.ne.s32     %p19, %r7, 0;
-       @%p19 bra       BB50_35;
+       @%p19 bra       BB52_35;
 
        ld.shared.f64   %fd43, [my_sdata];
        cvta.to.global.u64      %rd9, %rd2;
@@ -6688,7 +8421,7 @@ BB50_33:
        add.s64         %rd11, %rd9, %rd10;
        st.global.f64   [%rd11], %fd43;
 
-BB50_35:
+BB52_35:
        ret;
 }
 
@@ -6715,9 +8448,9 @@ BB50_35:
        mad.lo.s32      %r35, %r9, %r10, %r7;
        mov.f32         %f44, 0f7F7FFFFF;
        setp.ge.u32     %p1, %r35, %r6;
-       @%p1 bra        BB51_4;
+       @%p1 bra        BB53_4;
 
-BB51_1:
+BB53_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.u32    %rd4, %r35, 4;
        add.s64         %rd5, %rd3, %rd4;
@@ -6725,135 +8458,135 @@ BB51_1:
        min.f32         %f44, %f44, %f30;
        add.s32         %r3, %r35, %r10;
        setp.ge.u32     %p2, %r3, %r6;
-       @%p2 bra        BB51_3;
+       @%p2 bra        BB53_3;
 
        mul.wide.u32    %rd7, %r3, 4;
        add.s64         %rd8, %rd3, %rd7;
        ld.global.f32   %f31, [%rd8];
        min.f32         %f44, %f44, %f31;
 
-BB51_3:
+BB53_3:
        shl.b32         %r13, %r10, 1;
        mov.u32         %r14, %nctaid.x;
        mad.lo.s32      %r35, %r13, %r14, %r35;
        setp.lt.u32     %p3, %r35, %r6;
-       @%p3 bra        BB51_1;
+       @%p3 bra        BB53_1;
 
-BB51_4:
+BB53_4:
        shl.b32         %r16, %r7, 2;
        mov.u32         %r17, my_sdata;
        add.s32         %r5, %r17, %r16;
        st.shared.f32   [%r5], %f44;
        bar.sync        0;
        setp.lt.u32     %p4, %r10, 1024;
-       @%p4 bra        BB51_8;
+       @%p4 bra        BB53_8;
 
        setp.gt.u32     %p5, %r7, 511;
-       @%p5 bra        BB51_7;
+       @%p5 bra        BB53_7;
 
        ld.shared.f32   %f32, [%r5+2048];
        min.f32         %f44, %f44, %f32;
        st.shared.f32   [%r5], %f44;
 
-BB51_7:
+BB53_7:
        bar.sync        0;
 
-BB51_8:
+BB53_8:
        setp.lt.u32     %p6, %r10, 512;
-       @%p6 bra        BB51_12;
+       @%p6 bra        BB53_12;
 
        setp.gt.u32     %p7, %r7, 255;
-       @%p7 bra        BB51_11;
+       @%p7 bra        BB53_11;
 
        ld.shared.f32   %f33, [%r5+1024];
        min.f32         %f44, %f44, %f33;
        st.shared.f32   [%r5], %f44;
 
-BB51_11:
+BB53_11:
        bar.sync        0;
 
-BB51_12:
+BB53_12:
        setp.lt.u32     %p8, %r10, 256;
-       @%p8 bra        BB51_16;
+       @%p8 bra        BB53_16;
 
        setp.gt.u32     %p9, %r7, 127;
-       @%p9 bra        BB51_15;
+       @%p9 bra        BB53_15;
 
        ld.shared.f32   %f34, [%r5+512];
        min.f32         %f44, %f44, %f34;
        st.shared.f32   [%r5], %f44;
 
-BB51_15:
+BB53_15:
        bar.sync        0;
 
-BB51_16:
+BB53_16:
        setp.lt.u32     %p10, %r10, 128;
-       @%p10 bra       BB51_20;
+       @%p10 bra       BB53_20;
 
        setp.gt.u32     %p11, %r7, 63;
-       @%p11 bra       BB51_19;
+       @%p11 bra       BB53_19;
 
        ld.shared.f32   %f35, [%r5+256];
        min.f32         %f44, %f44, %f35;
        st.shared.f32   [%r5], %f44;
 
-BB51_19:
+BB53_19:
        bar.sync        0;
 
-BB51_20:
+BB53_20:
        setp.gt.u32     %p12, %r7, 31;
-       @%p12 bra       BB51_33;
+       @%p12 bra       BB53_33;
 
        setp.lt.u32     %p13, %r10, 64;
-       @%p13 bra       BB51_23;
+       @%p13 bra       BB53_23;
 
        ld.volatile.shared.f32  %f36, [%r5+128];
        min.f32         %f44, %f44, %f36;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB51_23:
+BB53_23:
        setp.lt.u32     %p14, %r10, 32;
-       @%p14 bra       BB51_25;
+       @%p14 bra       BB53_25;
 
        ld.volatile.shared.f32  %f37, [%r5+64];
        min.f32         %f44, %f44, %f37;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB51_25:
+BB53_25:
        setp.lt.u32     %p15, %r10, 16;
-       @%p15 bra       BB51_27;
+       @%p15 bra       BB53_27;
 
        ld.volatile.shared.f32  %f38, [%r5+32];
        min.f32         %f44, %f44, %f38;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB51_27:
+BB53_27:
        setp.lt.u32     %p16, %r10, 8;
-       @%p16 bra       BB51_29;
+       @%p16 bra       BB53_29;
 
        ld.volatile.shared.f32  %f39, [%r5+16];
        min.f32         %f44, %f44, %f39;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB51_29:
+BB53_29:
        setp.lt.u32     %p17, %r10, 4;
-       @%p17 bra       BB51_31;
+       @%p17 bra       BB53_31;
 
        ld.volatile.shared.f32  %f40, [%r5+8];
        min.f32         %f44, %f44, %f40;
        st.volatile.shared.f32  [%r5], %f44;
 
-BB51_31:
+BB53_31:
        setp.lt.u32     %p18, %r10, 2;
-       @%p18 bra       BB51_33;
+       @%p18 bra       BB53_33;
 
        ld.volatile.shared.f32  %f41, [%r5+4];
        min.f32         %f42, %f44, %f41;
        st.volatile.shared.f32  [%r5], %f42;
 
-BB51_33:
+BB53_33:
        setp.ne.s32     %p19, %r7, 0;
-       @%p19 bra       BB51_35;
+       @%p19 bra       BB53_35;
 
        ld.shared.f32   %f43, [my_sdata];
        cvta.to.global.u64      %rd9, %rd2;
@@ -6861,7 +8594,7 @@ BB51_33:
        add.s64         %rd11, %rd9, %rd10;
        st.global.f32   [%rd11], %f43;
 
-BB51_35:
+BB53_35:
        ret;
 }
 
@@ -6885,16 +8618,16 @@ BB51_35:
        ld.param.u32    %r4, [reduce_row_min_d_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB52_35;
+       @%p1 bra        BB54_35;
 
        mov.u32         %r71, %tid.x;
        mov.f64         %fd6, 0d7FEFFFFFFFFFFFFF;
        setp.ge.u32     %p2, %r71, %r4;
-       @%p2 bra        BB52_4;
+       @%p2 bra        BB54_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB52_3:
+BB54_3:
        mad.lo.s32      %r8, %r6, %r4, %r71;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -6903,9 +8636,9 @@ BB52_3:
        mov.u32         %r9, %ntid.x;
        add.s32         %r71, %r9, %r71;
        setp.lt.u32     %p3, %r71, %r4;
-       @%p3 bra        BB52_3;
+       @%p3 bra        BB54_3;
 
-BB52_4:
+BB54_4:
        mov.u32         %r10, %tid.x;
        shl.b32         %r11, %r10, 3;
        mov.u32         %r12, my_sdata;
@@ -6914,114 +8647,114 @@ BB52_4:
        bar.sync        0;
        mov.u32         %r14, %ntid.x;
        setp.lt.u32     %p4, %r14, 1024;
-       @%p4 bra        BB52_8;
+       @%p4 bra        BB54_8;
 
        setp.gt.u32     %p5, %r10, 511;
-       @%p5 bra        BB52_7;
+       @%p5 bra        BB54_7;
 
        ld.shared.f64   %fd29, [%r13+4096];
        min.f64         %fd6, %fd6, %fd29;
        st.shared.f64   [%r13], %fd6;
 
-BB52_7:
+BB54_7:
        bar.sync        0;
 
-BB52_8:
+BB54_8:
        setp.lt.u32     %p6, %r14, 512;
-       @%p6 bra        BB52_12;
+       @%p6 bra        BB54_12;
 
        setp.gt.u32     %p7, %r10, 255;
-       @%p7 bra        BB52_11;
+       @%p7 bra        BB54_11;
 
        ld.shared.f64   %fd30, [%r13+2048];
        min.f64         %fd6, %fd6, %fd30;
        st.shared.f64   [%r13], %fd6;
 
-BB52_11:
+BB54_11:
        bar.sync        0;
 
-BB52_12:
+BB54_12:
        setp.lt.u32     %p8, %r14, 256;
-       @%p8 bra        BB52_16;
+       @%p8 bra        BB54_16;
 
        setp.gt.u32     %p9, %r10, 127;
-       @%p9 bra        BB52_15;
+       @%p9 bra        BB54_15;
 
        ld.shared.f64   %fd31, [%r13+1024];
        min.f64         %fd6, %fd6, %fd31;
        st.shared.f64   [%r13], %fd6;
 
-BB52_15:
+BB54_15:
        bar.sync        0;
 
-BB52_16:
+BB54_16:
        setp.lt.u32     %p10, %r14, 128;
-       @%p10 bra       BB52_20;
+       @%p10 bra       BB54_20;
 
        setp.gt.u32     %p11, %r10, 63;
-       @%p11 bra       BB52_19;
+       @%p11 bra       BB54_19;
 
        ld.shared.f64   %fd32, [%r13+512];
        min.f64         %fd6, %fd6, %fd32;
        st.shared.f64   [%r13], %fd6;
 
-BB52_19:
+BB54_19:
        bar.sync        0;
 
-BB52_20:
+BB54_20:
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB52_33;
+       @%p12 bra       BB54_33;
 
        setp.lt.u32     %p13, %r14, 64;
-       @%p13 bra       BB52_23;
+       @%p13 bra       BB54_23;
 
        ld.volatile.shared.f64  %fd33, [%r13+256];
        min.f64         %fd6, %fd6, %fd33;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB52_23:
+BB54_23:
        setp.lt.u32     %p14, %r14, 32;
-       @%p14 bra       BB52_25;
+       @%p14 bra       BB54_25;
 
        ld.volatile.shared.f64  %fd34, [%r13+128];
        min.f64         %fd6, %fd6, %fd34;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB52_25:
+BB54_25:
        setp.lt.u32     %p15, %r14, 16;
-       @%p15 bra       BB52_27;
+       @%p15 bra       BB54_27;
 
        ld.volatile.shared.f64  %fd35, [%r13+64];
        min.f64         %fd6, %fd6, %fd35;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB52_27:
+BB54_27:
        setp.lt.u32     %p16, %r14, 8;
-       @%p16 bra       BB52_29;
+       @%p16 bra       BB54_29;
 
        ld.volatile.shared.f64  %fd36, [%r13+32];
        min.f64         %fd6, %fd6, %fd36;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB52_29:
+BB54_29:
        setp.lt.u32     %p17, %r14, 4;
-       @%p17 bra       BB52_31;
+       @%p17 bra       BB54_31;
 
        ld.volatile.shared.f64  %fd37, [%r13+16];
        min.f64         %fd6, %fd6, %fd37;
        st.volatile.shared.f64  [%r13], %fd6;
 
-BB52_31:
+BB54_31:
        setp.lt.u32     %p18, %r14, 2;
-       @%p18 bra       BB52_33;
+       @%p18 bra       BB54_33;
 
        ld.volatile.shared.f64  %fd38, [%r13+8];
        min.f64         %fd39, %fd6, %fd38;
        st.volatile.shared.f64  [%r13], %fd39;
 
-BB52_33:
+BB54_33:
        setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB52_35;
+       @%p19 bra       BB54_35;
 
        ld.shared.f64   %fd40, [my_sdata];
        cvta.to.global.u64      %rd6, %rd2;
@@ -7029,7 +8762,7 @@ BB52_33:
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd40;
 
-BB52_35:
+BB54_35:
        ret;
 }
 
@@ -7053,16 +8786,16 @@ BB52_35:
        ld.param.u32    %r4, [reduce_row_min_f_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB53_35;
+       @%p1 bra        BB55_35;
 
        mov.u32         %r71, %tid.x;
        mov.f32         %f6, 0f7F7FFFFF;
        setp.ge.u32     %p2, %r71, %r4;
-       @%p2 bra        BB53_4;
+       @%p2 bra        BB55_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB53_3:
+BB55_3:
        mad.lo.s32      %r8, %r6, %r4, %r71;
        mul.wide.u32    %rd4, %r8, 4;
        add.s64         %rd5, %rd3, %rd4;
@@ -7071,9 +8804,9 @@ BB53_3:
        mov.u32         %r9, %ntid.x;
        add.s32         %r71, %r9, %r71;
        setp.lt.u32     %p3, %r71, %r4;
-       @%p3 bra        BB53_3;
+       @%p3 bra        BB55_3;
 
-BB53_4:
+BB55_4:
        mov.u32         %r10, %tid.x;
        shl.b32         %r11, %r10, 2;
        mov.u32         %r12, my_sdata;
@@ -7082,114 +8815,114 @@ BB53_4:
        bar.sync        0;
        mov.u32         %r14, %ntid.x;
        setp.lt.u32     %p4, %r14, 1024;
-       @%p4 bra        BB53_8;
+       @%p4 bra        BB55_8;
 
        setp.gt.u32     %p5, %r10, 511;
-       @%p5 bra        BB53_7;
+       @%p5 bra        BB55_7;
 
        ld.shared.f32   %f29, [%r13+2048];
        min.f32         %f6, %f6, %f29;
        st.shared.f32   [%r13], %f6;
 
-BB53_7:
+BB55_7:
        bar.sync        0;
 
-BB53_8:
+BB55_8:
        setp.lt.u32     %p6, %r14, 512;
-       @%p6 bra        BB53_12;
+       @%p6 bra        BB55_12;
 
        setp.gt.u32     %p7, %r10, 255;
-       @%p7 bra        BB53_11;
+       @%p7 bra        BB55_11;
 
        ld.shared.f32   %f30, [%r13+1024];
        min.f32         %f6, %f6, %f30;
        st.shared.f32   [%r13], %f6;
 
-BB53_11:
+BB55_11:
        bar.sync        0;
 
-BB53_12:
+BB55_12:
        setp.lt.u32     %p8, %r14, 256;
-       @%p8 bra        BB53_16;
+       @%p8 bra        BB55_16;
 
        setp.gt.u32     %p9, %r10, 127;
-       @%p9 bra        BB53_15;
+       @%p9 bra        BB55_15;
 
        ld.shared.f32   %f31, [%r13+512];
        min.f32         %f6, %f6, %f31;
        st.shared.f32   [%r13], %f6;
 
-BB53_15:
+BB55_15:
        bar.sync        0;
 
-BB53_16:
+BB55_16:
        setp.lt.u32     %p10, %r14, 128;
-       @%p10 bra       BB53_20;
+       @%p10 bra       BB55_20;
 
        setp.gt.u32     %p11, %r10, 63;
-       @%p11 bra       BB53_19;
+       @%p11 bra       BB55_19;
 
        ld.shared.f32   %f32, [%r13+256

<TRUNCATED>

Reply via email to