http://git-wip-us.apache.org/repos/asf/systemml/blob/8e3c6f8b/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index c990f27..5f72887 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -103,6 +103,694 @@ BB1_2:
        ret;
 }
 
+       // .globl       sparse_dense_im2col_d  -- f64 kernel: one thread per input element; copies that element's value to a strided set of positions in the dense output
+.visible .entry sparse_dense_im2col_d(
+       .param .u64 sparse_dense_im2col_d_param_0,  // pointer: f64 input values, read at [tid]
+       .param .u64 sparse_dense_im2col_d_param_1,  // pointer: 32-bit offsets scanned to find n (presumably CSR row pointers -- confirm)
+       .param .u64 sparse_dense_im2col_d_param_2,  // pointer: 32-bit index per element, read at [tid] (presumably CSR column indices -- confirm)
+       .param .u64 sparse_dense_im2col_d_param_3,  // pointer: f64 output, written with st.global.f64
+       .param .u32 sparse_dense_im2col_d_param_4,  // element count: threads with tid >= this value return immediately
+       .param .u32 sparse_dense_im2col_d_param_5,  // never loaded by this kernel body
+       .param .u32 sparse_dense_im2col_d_param_6,  // never loaded by this kernel body
+       .param .u32 sparse_dense_im2col_d_param_7,  // divisor: c = idx / param_7, remainder is split further
+       .param .u32 sparse_dense_im2col_d_param_8,  // divisor: h = rem / param_8, w = rem % param_8
+       .param .u32 sparse_dense_im2col_d_param_9,  // r is limited to at most param_9 - 1
+       .param .u32 sparse_dense_im2col_d_param_10,  // s is limited to at most param_10 - 1; also multiplies r in the output index
+       .param .u32 sparse_dense_im2col_d_param_11,  // multiplied by param_16 in the lower bound of r
+       .param .u32 sparse_dense_im2col_d_param_12,  // multiplied by param_17 in the lower bound of s; also multiplies p in the output index
+       .param .u32 sparse_dense_im2col_d_param_13,  // multiplies n in the output index
+       .param .u32 sparse_dense_im2col_d_param_14,  // multiplies c in the output index
+       .param .u32 sparse_dense_im2col_d_param_15,  // multiplies (c*param_14 + r*param_10 + s) in the output index
+       .param .u32 sparse_dense_im2col_d_param_16,  // step of the r loop and divisor producing p (presumably a vertical stride -- confirm)
+       .param .u32 sparse_dense_im2col_d_param_17,  // step of the s loop and divisor producing q (presumably a horizontal stride -- confirm)
+       .param .u32 sparse_dense_im2col_d_param_18,  // added to h (presumably vertical padding -- confirm)
+       .param .u32 sparse_dense_im2col_d_param_19  // added to w (presumably horizontal padding -- confirm)
+)  // labels used in these notes: tid = global thread index, n = row found by the scan, (c,h,w) = decomposed element index, (r,s) = loop counters, (p,q) = derived quotients
+{  // NOTE(review): the labels follow common im2col notation by assumption only; confirm against the CUDA source this PTX was generated from
+       .reg .pred      %p<13>;
+       .reg .b32       %r<74>;
+       .reg .f64       %fd<2>;
+       .reg .b64       %rd<18>;
+
+
+       ld.param.u64    %rd4, [sparse_dense_im2col_d_param_0];
+       ld.param.u64    %rd5, [sparse_dense_im2col_d_param_1];
+       ld.param.u64    %rd6, [sparse_dense_im2col_d_param_2];
+       ld.param.u64    %rd7, [sparse_dense_im2col_d_param_3];
+       ld.param.u32    %r35, [sparse_dense_im2col_d_param_4];  // params 5 and 6 are skipped: nothing below reads them
+       ld.param.u32    %r22, [sparse_dense_im2col_d_param_7];
+       ld.param.u32    %r23, [sparse_dense_im2col_d_param_8];
+       ld.param.u32    %r24, [sparse_dense_im2col_d_param_9];
+       ld.param.u32    %r25, [sparse_dense_im2col_d_param_10];
+       ld.param.u32    %r26, [sparse_dense_im2col_d_param_11];
+       ld.param.u32    %r27, [sparse_dense_im2col_d_param_12];
+       ld.param.u32    %r28, [sparse_dense_im2col_d_param_13];
+       ld.param.u32    %r29, [sparse_dense_im2col_d_param_14];
+       ld.param.u32    %r30, [sparse_dense_im2col_d_param_15];
+       ld.param.u32    %r31, [sparse_dense_im2col_d_param_16];
+       ld.param.u32    %r32, [sparse_dense_im2col_d_param_17];
+       ld.param.u32    %r33, [sparse_dense_im2col_d_param_18];
+       ld.param.u32    %r34, [sparse_dense_im2col_d_param_19];
+       mov.u32         %r36, %ntid.x;
+       mov.u32         %r37, %ctaid.x;
+       mov.u32         %r38, %tid.x;
+       mad.lo.s32      %r1, %r36, %r37, %r38;  // tid = ntid.x * ctaid.x + tid.x
+       setp.ge.s32     %p1, %r1, %r35;
+       @%p1 bra        BB2_11;  // out-of-range threads do nothing
+
+       cvta.to.global.u64      %rd1, %rd6;
+       cvta.to.global.u64      %rd2, %rd5;
+       cvta.to.global.u64      %rd8, %rd4;
+       cvt.s64.s32     %rd3, %r1;
+       mul.wide.s32    %rd9, %r1, 8;  // byte offset of element tid (8-byte f64)
+       add.s64         %rd10, %rd8, %rd9;
+       ld.global.f64   %fd1, [%rd10];  // value = param_0[tid]; this is the only value the thread ever stores
+       mov.u32         %r67, 0;  // scan counter starts at 0
+
+BB2_2:  // linear scan: find the first n with param_1[n+1] > tid
+       mov.u32         %r2, %r67;  // %r2 = candidate n
+       add.s32         %r67, %r2, 1;
+       mul.wide.s32    %rd11, %r67, 4;
+       add.s64         %rd12, %rd2, %rd11;
+       ld.global.u32   %r40, [%rd12];  // param_1[n+1]
+       setp.le.s32     %p2, %r40, %r1;
+       @%p2 bra        BB2_2;  // no upper bound on n here: the scan ends only when an entry greater than tid is read
+
+       shl.b64         %rd13, %rd3, 2;  // byte offset of element tid (4-byte index)
+       add.s64         %rd14, %rd1, %rd13;
+       ld.global.u32   %r41, [%rd14];  // idx = param_2[tid]
+       div.s32         %r4, %r41, %r22;  // c = idx / param_7
+       rem.s32         %r42, %r41, %r22;
+       div.s32         %r43, %r42, %r23;  // h = (idx % param_7) / param_8
+       rem.s32         %r44, %r42, %r23;  // w = (idx % param_7) % param_8
+       add.s32         %r5, %r43, %r33;  // %r5 = h + param_18
+       mul.lo.s32      %r45, %r31, %r26;
+       mov.u32         %r46, 1;
+       sub.s32         %r47, %r46, %r45;
+       add.s32         %r48, %r47, %r5;
+       mov.u32         %r49, 0;
+       max.s32         %r70, %r49, %r48;  // r lower bound = max(0, h + param_18 - param_16*param_11 + 1)
+       add.s32         %r50, %r24, -1;
+       min.s32         %r7, %r50, %r5;  // r upper bound = min(param_9 - 1, h + param_18)
+       add.s32         %r8, %r44, %r34;  // %r8 = w + param_19
+       mul.lo.s32      %r51, %r32, %r27;
+       sub.s32         %r52, %r46, %r51;
+       add.s32         %r53, %r52, %r8;
+       max.s32         %r73, %r49, %r53;  // s lower bound = max(0, w + param_19 - param_17*param_12 + 1)
+       add.s32         %r54, %r25, -1;
+       min.s32         %r10, %r54, %r8;  // s upper bound = min(param_10 - 1, w + param_19)
+
+BB2_4:  // advance r one at a time until (h + param_18 - r) is a multiple of param_16, or r passes its upper bound
+       mov.u32         %r69, %r70;  // %r69 = r being tested; holds the first usable r on exit
+       sub.s32         %r55, %r5, %r69;
+       rem.s32         %r56, %r55, %r31;
+       setp.ne.s32     %p3, %r56, 0;
+       setp.le.s32     %p4, %r69, %r7;
+       and.pred        %p5, %p3, %p4;
+       add.s32         %r70, %r69, 1;
+       @%p5 bra        BB2_4;
+
+BB2_5:  // same search for s against param_17; %r13 holds the first usable s on exit
+       mov.u32         %r13, %r73;
+       sub.s32         %r57, %r8, %r13;
+       rem.s32         %r58, %r57, %r32;
+       setp.ne.s32     %p6, %r58, 0;
+       setp.le.s32     %p7, %r13, %r10;
+       and.pred        %p8, %p6, %p7;
+       add.s32         %r73, %r13, 1;
+       @%p8 bra        BB2_5;
+
+       setp.gt.s32     %p9, %r69, %r7;
+       @%p9 bra        BB2_11;  // no r in range: nothing to write
+
+       mul.lo.s32      %r15, %r2, %r28;  // n * param_13
+       mul.lo.s32      %r16, %r4, %r29;  // c * param_14
+       cvta.to.global.u64      %rd15, %rd7;
+
+BB2_8:  // outer loop: r advances by param_16 while r <= r upper bound
+       sub.s32         %r59, %r5, %r69;
+       div.s32         %r60, %r59, %r31;  // p = (h + param_18 - r) / param_16
+       mad.lo.s32      %r18, %r60, %r27, %r15;  // p*param_12 + n*param_13
+       setp.gt.s32     %p10, %r13, %r10;
+       mov.u32         %r72, %r13;  // inner counter restarts from the first usable s on every outer iteration
+       @%p10 bra       BB2_10;  // no s in range: skip the inner loop
+
+BB2_9:  // inner loop: s advances by param_17 while s <= s upper bound
+       mov.u32         %r19, %r72;
+       sub.s32         %r61, %r8, %r19;
+       div.s32         %r62, %r61, %r32;  // q = (w + param_19 - s) / param_17
+       mad.lo.s32      %r63, %r69, %r25, %r16;  // r*param_10 + c*param_14
+       add.s32         %r64, %r63, %r19;  // ... + s
+       mad.lo.s32      %r65, %r64, %r30, %r18;  // (...)*param_15 + p*param_12 + n*param_13
+       add.s32         %r66, %r65, %r62;  // ... + q = output element index (32-bit arithmetic)
+       mul.wide.s32    %rd16, %r66, 8;
+       add.s64         %rd17, %rd15, %rd16;
+       st.global.f64   [%rd17], %fd1;  // param_3[index] = value
+       add.s32         %r20, %r19, %r32;
+       setp.le.s32     %p11, %r20, %r10;
+       mov.u32         %r72, %r20;
+       @%p11 bra       BB2_9;
+
+BB2_10:
+       add.s32         %r69, %r69, %r31;  // r += param_16
+       setp.le.s32     %p12, %r69, %r7;
+       @%p12 bra       BB2_8;
+
+BB2_11:
+       ret;
+}
+
+       // .globl       sparse_dense_im2col_f  -- f32 kernel: one thread per input element; copies that element's value to a strided set of positions in the dense output
+.visible .entry sparse_dense_im2col_f(
+       .param .u64 sparse_dense_im2col_f_param_0,  // pointer: f32 input values, read at [tid]
+       .param .u64 sparse_dense_im2col_f_param_1,  // pointer: 32-bit offsets scanned to find n (presumably CSR row pointers -- confirm)
+       .param .u64 sparse_dense_im2col_f_param_2,  // pointer: 32-bit index per element, read at [tid] (presumably CSR column indices -- confirm)
+       .param .u64 sparse_dense_im2col_f_param_3,  // pointer: f32 output, written with st.global.f32
+       .param .u32 sparse_dense_im2col_f_param_4,  // element count: threads with tid >= this value return immediately
+       .param .u32 sparse_dense_im2col_f_param_5,  // never loaded by this kernel body
+       .param .u32 sparse_dense_im2col_f_param_6,  // never loaded by this kernel body
+       .param .u32 sparse_dense_im2col_f_param_7,  // divisor: c = idx / param_7, remainder is split further
+       .param .u32 sparse_dense_im2col_f_param_8,  // divisor: h = rem / param_8, w = rem % param_8
+       .param .u32 sparse_dense_im2col_f_param_9,  // r is limited to at most param_9 - 1
+       .param .u32 sparse_dense_im2col_f_param_10,  // s is limited to at most param_10 - 1; also multiplies r in the output index
+       .param .u32 sparse_dense_im2col_f_param_11,  // multiplied by param_16 in the lower bound of r
+       .param .u32 sparse_dense_im2col_f_param_12,  // multiplied by param_17 in the lower bound of s; also multiplies p in the output index
+       .param .u32 sparse_dense_im2col_f_param_13,  // multiplies n in the output index
+       .param .u32 sparse_dense_im2col_f_param_14,  // multiplies c in the output index
+       .param .u32 sparse_dense_im2col_f_param_15,  // multiplies (c*param_14 + r*param_10 + s) in the output index
+       .param .u32 sparse_dense_im2col_f_param_16,  // step of the r loop and divisor producing p (presumably a vertical stride -- confirm)
+       .param .u32 sparse_dense_im2col_f_param_17,  // step of the s loop and divisor producing q (presumably a horizontal stride -- confirm)
+       .param .u32 sparse_dense_im2col_f_param_18,  // added to h (presumably vertical padding -- confirm)
+       .param .u32 sparse_dense_im2col_f_param_19  // added to w (presumably horizontal padding -- confirm)
+)  // labels used in these notes: tid = global thread index, n = row found by the scan, (c,h,w) = decomposed element index, (r,s) = loop counters, (p,q) = derived quotients
+{  // NOTE(review): the labels follow common im2col notation by assumption only; confirm against the CUDA source this PTX was generated from
+       .reg .pred      %p<13>;
+       .reg .f32       %f<2>;
+       .reg .b32       %r<74>;
+       .reg .b64       %rd<18>;
+
+
+       ld.param.u64    %rd4, [sparse_dense_im2col_f_param_0];
+       ld.param.u64    %rd5, [sparse_dense_im2col_f_param_1];
+       ld.param.u64    %rd6, [sparse_dense_im2col_f_param_2];
+       ld.param.u64    %rd7, [sparse_dense_im2col_f_param_3];
+       ld.param.u32    %r35, [sparse_dense_im2col_f_param_4];  // params 5 and 6 are skipped: nothing below reads them
+       ld.param.u32    %r22, [sparse_dense_im2col_f_param_7];
+       ld.param.u32    %r23, [sparse_dense_im2col_f_param_8];
+       ld.param.u32    %r24, [sparse_dense_im2col_f_param_9];
+       ld.param.u32    %r25, [sparse_dense_im2col_f_param_10];
+       ld.param.u32    %r26, [sparse_dense_im2col_f_param_11];
+       ld.param.u32    %r27, [sparse_dense_im2col_f_param_12];
+       ld.param.u32    %r28, [sparse_dense_im2col_f_param_13];
+       ld.param.u32    %r29, [sparse_dense_im2col_f_param_14];
+       ld.param.u32    %r30, [sparse_dense_im2col_f_param_15];
+       ld.param.u32    %r31, [sparse_dense_im2col_f_param_16];
+       ld.param.u32    %r32, [sparse_dense_im2col_f_param_17];
+       ld.param.u32    %r33, [sparse_dense_im2col_f_param_18];
+       ld.param.u32    %r34, [sparse_dense_im2col_f_param_19];
+       mov.u32         %r36, %ntid.x;
+       mov.u32         %r37, %ctaid.x;
+       mov.u32         %r38, %tid.x;
+       mad.lo.s32      %r1, %r36, %r37, %r38;  // tid = ntid.x * ctaid.x + tid.x
+       setp.ge.s32     %p1, %r1, %r35;
+       @%p1 bra        BB3_11;  // out-of-range threads do nothing
+
+       cvta.to.global.u64      %rd1, %rd6;
+       cvta.to.global.u64      %rd2, %rd5;
+       cvta.to.global.u64      %rd8, %rd4;
+       cvt.s64.s32     %rd3, %r1;
+       mul.wide.s32    %rd9, %r1, 4;  // byte offset of element tid (4-byte f32)
+       add.s64         %rd10, %rd8, %rd9;
+       ld.global.f32   %f1, [%rd10];  // value = param_0[tid]; this is the only value the thread ever stores
+       mov.u32         %r67, 0;  // scan counter starts at 0
+
+BB3_2:  // linear scan: find the first n with param_1[n+1] > tid
+       mov.u32         %r2, %r67;  // %r2 = candidate n
+       add.s32         %r67, %r2, 1;
+       mul.wide.s32    %rd11, %r67, 4;
+       add.s64         %rd12, %rd2, %rd11;
+       ld.global.u32   %r40, [%rd12];  // param_1[n+1]
+       setp.le.s32     %p2, %r40, %r1;
+       @%p2 bra        BB3_2;  // no upper bound on n here: the scan ends only when an entry greater than tid is read
+
+       shl.b64         %rd13, %rd3, 2;  // byte offset of element tid (4-byte index)
+       add.s64         %rd14, %rd1, %rd13;
+       ld.global.u32   %r41, [%rd14];  // idx = param_2[tid]
+       div.s32         %r4, %r41, %r22;  // c = idx / param_7
+       rem.s32         %r42, %r41, %r22;
+       div.s32         %r43, %r42, %r23;  // h = (idx % param_7) / param_8
+       rem.s32         %r44, %r42, %r23;  // w = (idx % param_7) % param_8
+       add.s32         %r5, %r43, %r33;  // %r5 = h + param_18
+       mul.lo.s32      %r45, %r31, %r26;
+       mov.u32         %r46, 1;
+       sub.s32         %r47, %r46, %r45;
+       add.s32         %r48, %r47, %r5;
+       mov.u32         %r49, 0;
+       max.s32         %r70, %r49, %r48;  // r lower bound = max(0, h + param_18 - param_16*param_11 + 1)
+       add.s32         %r50, %r24, -1;
+       min.s32         %r7, %r50, %r5;  // r upper bound = min(param_9 - 1, h + param_18)
+       add.s32         %r8, %r44, %r34;  // %r8 = w + param_19
+       mul.lo.s32      %r51, %r32, %r27;
+       sub.s32         %r52, %r46, %r51;
+       add.s32         %r53, %r52, %r8;
+       max.s32         %r73, %r49, %r53;  // s lower bound = max(0, w + param_19 - param_17*param_12 + 1)
+       add.s32         %r54, %r25, -1;
+       min.s32         %r10, %r54, %r8;  // s upper bound = min(param_10 - 1, w + param_19)
+
+BB3_4:  // advance r one at a time until (h + param_18 - r) is a multiple of param_16, or r passes its upper bound
+       mov.u32         %r69, %r70;  // %r69 = r being tested; holds the first usable r on exit
+       sub.s32         %r55, %r5, %r69;
+       rem.s32         %r56, %r55, %r31;
+       setp.ne.s32     %p3, %r56, 0;
+       setp.le.s32     %p4, %r69, %r7;
+       and.pred        %p5, %p3, %p4;
+       add.s32         %r70, %r69, 1;
+       @%p5 bra        BB3_4;
+
+BB3_5:  // same search for s against param_17; %r13 holds the first usable s on exit
+       mov.u32         %r13, %r73;
+       sub.s32         %r57, %r8, %r13;
+       rem.s32         %r58, %r57, %r32;
+       setp.ne.s32     %p6, %r58, 0;
+       setp.le.s32     %p7, %r13, %r10;
+       and.pred        %p8, %p6, %p7;
+       add.s32         %r73, %r13, 1;
+       @%p8 bra        BB3_5;
+
+       setp.gt.s32     %p9, %r69, %r7;
+       @%p9 bra        BB3_11;  // no r in range: nothing to write
+
+       mul.lo.s32      %r15, %r2, %r28;  // n * param_13
+       mul.lo.s32      %r16, %r4, %r29;  // c * param_14
+       cvta.to.global.u64      %rd15, %rd7;
+
+BB3_8:  // outer loop: r advances by param_16 while r <= r upper bound
+       sub.s32         %r59, %r5, %r69;
+       div.s32         %r60, %r59, %r31;  // p = (h + param_18 - r) / param_16
+       mad.lo.s32      %r18, %r60, %r27, %r15;  // p*param_12 + n*param_13
+       setp.gt.s32     %p10, %r13, %r10;
+       mov.u32         %r72, %r13;  // inner counter restarts from the first usable s on every outer iteration
+       @%p10 bra       BB3_10;  // no s in range: skip the inner loop
+
+BB3_9:  // inner loop: s advances by param_17 while s <= s upper bound
+       mov.u32         %r19, %r72;
+       sub.s32         %r61, %r8, %r19;
+       div.s32         %r62, %r61, %r32;  // q = (w + param_19 - s) / param_17
+       mad.lo.s32      %r63, %r69, %r25, %r16;  // r*param_10 + c*param_14
+       add.s32         %r64, %r63, %r19;  // ... + s
+       mad.lo.s32      %r65, %r64, %r30, %r18;  // (...)*param_15 + p*param_12 + n*param_13
+       add.s32         %r66, %r65, %r62;  // ... + q = output element index (32-bit arithmetic)
+       mul.wide.s32    %rd16, %r66, 4;
+       add.s64         %rd17, %rd15, %rd16;
+       st.global.f32   [%rd17], %f1;  // param_3[index] = value
+       add.s32         %r20, %r19, %r32;
+       setp.le.s32     %p11, %r20, %r10;
+       mov.u32         %r72, %r20;
+       @%p11 bra       BB3_9;
+
+BB3_10:
+       add.s32         %r69, %r69, %r31;  // r += param_16
+       setp.le.s32     %p12, %r69, %r7;
+       @%p12 bra       BB3_8;
+
+BB3_11:
+       ret;
+}
+
+       // .globl       dense_dense_im2col_d  -- f64 kernel: one thread per input element; copies input[tid] to a strided set of positions in the output
+.visible .entry dense_dense_im2col_d(
+       .param .u64 dense_dense_im2col_d_param_0,  // pointer: f64 input, read at [tid]
+       .param .u64 dense_dense_im2col_d_param_1,  // pointer: f64 output, written with st.global.f64
+       .param .u32 dense_dense_im2col_d_param_2,  // element count: threads with tid >= this value return immediately
+       .param .u32 dense_dense_im2col_d_param_3,  // divisor: n = tid / param_3, remainder is split further
+       .param .u32 dense_dense_im2col_d_param_4,  // divisor: c = rem / param_4
+       .param .u32 dense_dense_im2col_d_param_5,  // divisor: h = rem / param_5, w = rem % param_5
+       .param .u32 dense_dense_im2col_d_param_6,  // r is limited to at most param_6 - 1
+       .param .u32 dense_dense_im2col_d_param_7,  // s is limited to at most param_7 - 1; also multiplies r in the output index
+       .param .u32 dense_dense_im2col_d_param_8,  // multiplied by param_13 in the lower bound of r
+       .param .u32 dense_dense_im2col_d_param_9,  // multiplied by param_14 in the lower bound of s; also multiplies p in the output index
+       .param .u32 dense_dense_im2col_d_param_10,  // multiplies n in the output index
+       .param .u32 dense_dense_im2col_d_param_11,  // multiplies c in the output index
+       .param .u32 dense_dense_im2col_d_param_12,  // multiplies (c*param_11 + r*param_7 + s) in the output index
+       .param .u32 dense_dense_im2col_d_param_13,  // step of the r loop and divisor producing p (presumably a vertical stride -- confirm)
+       .param .u32 dense_dense_im2col_d_param_14,  // step of the s loop and divisor producing q (presumably a horizontal stride -- confirm)
+       .param .u32 dense_dense_im2col_d_param_15,  // added to h (presumably vertical padding -- confirm)
+       .param .u32 dense_dense_im2col_d_param_16  // added to w (presumably horizontal padding -- confirm)
+)  // labels used in these notes: tid = global thread index, (n,c,h,w) = decomposed tid, (r,s) = loop counters, (p,q) = derived quotients
+{  // NOTE(review): the labels follow common im2col notation by assumption only; confirm against the CUDA source this PTX was generated from
+       .reg .pred      %p<12>;
+       .reg .b32       %r<71>;
+       .reg .f64       %fd<2>;
+       .reg .b64       %rd<9>;
+
+
+       ld.param.u64    %rd1, [dense_dense_im2col_d_param_0];
+       ld.param.u64    %rd2, [dense_dense_im2col_d_param_1];
+       ld.param.u32    %r35, [dense_dense_im2col_d_param_2];
+       ld.param.u32    %r21, [dense_dense_im2col_d_param_3];
+       ld.param.u32    %r22, [dense_dense_im2col_d_param_4];
+       ld.param.u32    %r23, [dense_dense_im2col_d_param_5];
+       ld.param.u32    %r24, [dense_dense_im2col_d_param_6];
+       ld.param.u32    %r25, [dense_dense_im2col_d_param_7];
+       ld.param.u32    %r26, [dense_dense_im2col_d_param_8];
+       ld.param.u32    %r27, [dense_dense_im2col_d_param_9];
+       ld.param.u32    %r28, [dense_dense_im2col_d_param_10];
+       ld.param.u32    %r29, [dense_dense_im2col_d_param_11];
+       ld.param.u32    %r30, [dense_dense_im2col_d_param_12];
+       ld.param.u32    %r31, [dense_dense_im2col_d_param_13];
+       ld.param.u32    %r32, [dense_dense_im2col_d_param_14];
+       ld.param.u32    %r33, [dense_dense_im2col_d_param_15];
+       ld.param.u32    %r34, [dense_dense_im2col_d_param_16];
+       mov.u32         %r36, %ctaid.x;
+       mov.u32         %r37, %ntid.x;
+       mov.u32         %r38, %tid.x;
+       mad.lo.s32      %r1, %r37, %r36, %r38;  // tid = ntid.x * ctaid.x + tid.x
+       setp.ge.s32     %p1, %r1, %r35;
+       @%p1 bra        BB4_9;  // out-of-range threads do nothing
+
+       cvta.to.global.u64      %rd3, %rd1;
+       mul.wide.s32    %rd4, %r1, 8;  // byte offset of element tid (8-byte f64)
+       add.s64         %rd5, %rd3, %rd4;
+       ld.global.f64   %fd1, [%rd5];  // value = param_0[tid]; this is the only value the thread ever stores
+       div.s32         %r2, %r1, %r21;  // n = tid / param_3
+       rem.s32         %r39, %r1, %r21;
+       div.s32         %r3, %r39, %r22;  // c = (tid % param_3) / param_4
+       rem.s32         %r40, %r39, %r22;
+       div.s32         %r41, %r40, %r23;  // h = remainder / param_5
+       rem.s32         %r42, %r40, %r23;  // w = remainder % param_5
+       add.s32         %r4, %r41, %r33;  // %r4 = h + param_15
+       mul.lo.s32      %r43, %r31, %r26;
+       mov.u32         %r44, 1;
+       sub.s32         %r45, %r44, %r43;
+       add.s32         %r46, %r45, %r4;
+       mov.u32         %r47, 0;
+       max.s32         %r67, %r47, %r46;  // r lower bound = max(0, h + param_15 - param_13*param_8 + 1)
+       add.s32         %r48, %r24, -1;
+       min.s32         %r6, %r48, %r4;  // r upper bound = min(param_6 - 1, h + param_15)
+       add.s32         %r7, %r42, %r34;  // %r7 = w + param_16
+       mul.lo.s32      %r49, %r32, %r27;
+       sub.s32         %r50, %r44, %r49;
+       add.s32         %r51, %r50, %r7;
+       max.s32         %r70, %r47, %r51;  // s lower bound = max(0, w + param_16 - param_14*param_9 + 1)
+       add.s32         %r52, %r25, -1;
+       min.s32         %r9, %r52, %r7;  // s upper bound = min(param_7 - 1, w + param_16)
+
+BB4_2:  // advance r one at a time until (h + param_15 - r) is a multiple of param_13, or r passes its upper bound
+       mov.u32         %r66, %r67;  // %r66 = r being tested; holds the first usable r on exit
+       sub.s32         %r53, %r4, %r66;
+       rem.s32         %r54, %r53, %r31;
+       setp.ne.s32     %p2, %r54, 0;
+       setp.le.s32     %p3, %r66, %r6;
+       and.pred        %p4, %p2, %p3;
+       add.s32         %r67, %r66, 1;
+       @%p4 bra        BB4_2;
+
+BB4_3:  // same search for s against param_14; %r12 holds the first usable s on exit
+       mov.u32         %r12, %r70;
+       sub.s32         %r55, %r7, %r12;
+       rem.s32         %r56, %r55, %r32;
+       setp.ne.s32     %p5, %r56, 0;
+       setp.le.s32     %p6, %r12, %r9;
+       and.pred        %p7, %p5, %p6;
+       add.s32         %r70, %r12, 1;
+       @%p7 bra        BB4_3;
+
+       setp.gt.s32     %p8, %r66, %r6;
+       @%p8 bra        BB4_9;  // no r in range: nothing to write
+
+       mul.lo.s32      %r14, %r2, %r28;  // n * param_10
+       mul.lo.s32      %r15, %r3, %r29;  // c * param_11
+       cvta.to.global.u64      %rd6, %rd2;
+
+BB4_6:  // outer loop: r advances by param_13 while r <= r upper bound
+       sub.s32         %r57, %r4, %r66;
+       div.s32         %r58, %r57, %r31;  // p = (h + param_15 - r) / param_13
+       mad.lo.s32      %r17, %r58, %r27, %r14;  // p*param_9 + n*param_10
+       setp.gt.s32     %p9, %r12, %r9;
+       mov.u32         %r69, %r12;  // inner counter restarts from the first usable s on every outer iteration
+       @%p9 bra        BB4_8;  // no s in range: skip the inner loop
+
+BB4_7:  // inner loop: s advances by param_14 while s <= s upper bound
+       mov.u32         %r18, %r69;
+       sub.s32         %r59, %r7, %r18;
+       div.s32         %r60, %r59, %r32;  // q = (w + param_16 - s) / param_14
+       mad.lo.s32      %r61, %r66, %r25, %r15;  // r*param_7 + c*param_11
+       add.s32         %r62, %r61, %r18;  // ... + s
+       mad.lo.s32      %r63, %r62, %r30, %r17;  // (...)*param_12 + p*param_9 + n*param_10
+       add.s32         %r64, %r63, %r60;  // ... + q = output element index (32-bit arithmetic)
+       mul.wide.s32    %rd7, %r64, 8;
+       add.s64         %rd8, %rd6, %rd7;
+       st.global.f64   [%rd8], %fd1;  // param_1[index] = value
+       add.s32         %r19, %r18, %r32;
+       setp.le.s32     %p10, %r19, %r9;
+       mov.u32         %r69, %r19;
+       @%p10 bra       BB4_7;
+
+BB4_8:
+       add.s32         %r66, %r66, %r31;  // r += param_13
+       setp.le.s32     %p11, %r66, %r6;
+       @%p11 bra       BB4_6;
+
+BB4_9:
+       ret;
+}
+
+       // .globl       dense_dense_im2col_f  -- f32 kernel: one thread per input element; copies input[tid] to a strided set of positions in the output
+.visible .entry dense_dense_im2col_f(
+       .param .u64 dense_dense_im2col_f_param_0,  // pointer: f32 input, read at [tid]
+       .param .u64 dense_dense_im2col_f_param_1,  // pointer: f32 output, written with st.global.f32
+       .param .u32 dense_dense_im2col_f_param_2,  // element count: threads with tid >= this value return immediately
+       .param .u32 dense_dense_im2col_f_param_3,  // divisor: n = tid / param_3, remainder is split further
+       .param .u32 dense_dense_im2col_f_param_4,  // divisor: c = rem / param_4
+       .param .u32 dense_dense_im2col_f_param_5,  // divisor: h = rem / param_5, w = rem % param_5
+       .param .u32 dense_dense_im2col_f_param_6,  // r is limited to at most param_6 - 1
+       .param .u32 dense_dense_im2col_f_param_7,  // s is limited to at most param_7 - 1; also multiplies r in the output index
+       .param .u32 dense_dense_im2col_f_param_8,  // multiplied by param_13 in the lower bound of r
+       .param .u32 dense_dense_im2col_f_param_9,  // multiplied by param_14 in the lower bound of s; also multiplies p in the output index
+       .param .u32 dense_dense_im2col_f_param_10,  // multiplies n in the output index
+       .param .u32 dense_dense_im2col_f_param_11,  // multiplies c in the output index
+       .param .u32 dense_dense_im2col_f_param_12,  // multiplies (c*param_11 + r*param_7 + s) in the output index
+       .param .u32 dense_dense_im2col_f_param_13,  // step of the r loop and divisor producing p (presumably a vertical stride -- confirm)
+       .param .u32 dense_dense_im2col_f_param_14,  // step of the s loop and divisor producing q (presumably a horizontal stride -- confirm)
+       .param .u32 dense_dense_im2col_f_param_15,  // added to h (presumably vertical padding -- confirm)
+       .param .u32 dense_dense_im2col_f_param_16  // added to w (presumably horizontal padding -- confirm)
+)  // labels used in these notes: tid = global thread index, (n,c,h,w) = decomposed tid, (r,s) = loop counters, (p,q) = derived quotients
+{  // NOTE(review): the labels follow common im2col notation by assumption only; confirm against the CUDA source this PTX was generated from
+       .reg .pred      %p<12>;
+       .reg .f32       %f<2>;
+       .reg .b32       %r<71>;
+       .reg .b64       %rd<9>;
+
+
+       ld.param.u64    %rd1, [dense_dense_im2col_f_param_0];
+       ld.param.u64    %rd2, [dense_dense_im2col_f_param_1];
+       ld.param.u32    %r35, [dense_dense_im2col_f_param_2];
+       ld.param.u32    %r21, [dense_dense_im2col_f_param_3];
+       ld.param.u32    %r22, [dense_dense_im2col_f_param_4];
+       ld.param.u32    %r23, [dense_dense_im2col_f_param_5];
+       ld.param.u32    %r24, [dense_dense_im2col_f_param_6];
+       ld.param.u32    %r25, [dense_dense_im2col_f_param_7];
+       ld.param.u32    %r26, [dense_dense_im2col_f_param_8];
+       ld.param.u32    %r27, [dense_dense_im2col_f_param_9];
+       ld.param.u32    %r28, [dense_dense_im2col_f_param_10];
+       ld.param.u32    %r29, [dense_dense_im2col_f_param_11];
+       ld.param.u32    %r30, [dense_dense_im2col_f_param_12];
+       ld.param.u32    %r31, [dense_dense_im2col_f_param_13];
+       ld.param.u32    %r32, [dense_dense_im2col_f_param_14];
+       ld.param.u32    %r33, [dense_dense_im2col_f_param_15];
+       ld.param.u32    %r34, [dense_dense_im2col_f_param_16];
+       mov.u32         %r36, %ctaid.x;
+       mov.u32         %r37, %ntid.x;
+       mov.u32         %r38, %tid.x;
+       mad.lo.s32      %r1, %r37, %r36, %r38;  // tid = ntid.x * ctaid.x + tid.x
+       setp.ge.s32     %p1, %r1, %r35;
+       @%p1 bra        BB5_9;  // out-of-range threads do nothing
+
+       cvta.to.global.u64      %rd3, %rd1;
+       mul.wide.s32    %rd4, %r1, 4;  // byte offset of element tid (4-byte f32)
+       add.s64         %rd5, %rd3, %rd4;
+       ld.global.f32   %f1, [%rd5];  // value = param_0[tid]; this is the only value the thread ever stores
+       div.s32         %r2, %r1, %r21;  // n = tid / param_3
+       rem.s32         %r39, %r1, %r21;
+       div.s32         %r3, %r39, %r22;  // c = (tid % param_3) / param_4
+       rem.s32         %r40, %r39, %r22;
+       div.s32         %r41, %r40, %r23;  // h = remainder / param_5
+       rem.s32         %r42, %r40, %r23;  // w = remainder % param_5
+       add.s32         %r4, %r41, %r33;  // %r4 = h + param_15
+       mul.lo.s32      %r43, %r31, %r26;
+       mov.u32         %r44, 1;
+       sub.s32         %r45, %r44, %r43;
+       add.s32         %r46, %r45, %r4;
+       mov.u32         %r47, 0;
+       max.s32         %r67, %r47, %r46;  // r lower bound = max(0, h + param_15 - param_13*param_8 + 1)
+       add.s32         %r48, %r24, -1;
+       min.s32         %r6, %r48, %r4;  // r upper bound = min(param_6 - 1, h + param_15)
+       add.s32         %r7, %r42, %r34;  // %r7 = w + param_16
+       mul.lo.s32      %r49, %r32, %r27;
+       sub.s32         %r50, %r44, %r49;
+       add.s32         %r51, %r50, %r7;
+       max.s32         %r70, %r47, %r51;  // s lower bound = max(0, w + param_16 - param_14*param_9 + 1)
+       add.s32         %r52, %r25, -1;
+       min.s32         %r9, %r52, %r7;  // s upper bound = min(param_7 - 1, w + param_16)
+
+BB5_2:  // advance r one at a time until (h + param_15 - r) is a multiple of param_13, or r passes its upper bound
+       mov.u32         %r66, %r67;  // %r66 = r being tested; holds the first usable r on exit
+       sub.s32         %r53, %r4, %r66;
+       rem.s32         %r54, %r53, %r31;
+       setp.ne.s32     %p2, %r54, 0;
+       setp.le.s32     %p3, %r66, %r6;
+       and.pred        %p4, %p2, %p3;
+       add.s32         %r67, %r66, 1;
+       @%p4 bra        BB5_2;
+
+BB5_3:  // same search for s against param_14; %r12 holds the first usable s on exit
+       mov.u32         %r12, %r70;
+       sub.s32         %r55, %r7, %r12;
+       rem.s32         %r56, %r55, %r32;
+       setp.ne.s32     %p5, %r56, 0;
+       setp.le.s32     %p6, %r12, %r9;
+       and.pred        %p7, %p5, %p6;
+       add.s32         %r70, %r12, 1;
+       @%p7 bra        BB5_3;
+
+       setp.gt.s32     %p8, %r66, %r6;
+       @%p8 bra        BB5_9;  // no r in range: nothing to write
+
+       mul.lo.s32      %r14, %r2, %r28;  // n * param_10
+       mul.lo.s32      %r15, %r3, %r29;  // c * param_11
+       cvta.to.global.u64      %rd6, %rd2;
+
+BB5_6:  // outer loop: r advances by param_13 while r <= r upper bound
+       sub.s32         %r57, %r4, %r66;
+       div.s32         %r58, %r57, %r31;  // p = (h + param_15 - r) / param_13
+       mad.lo.s32      %r17, %r58, %r27, %r14;  // p*param_9 + n*param_10
+       setp.gt.s32     %p9, %r12, %r9;
+       mov.u32         %r69, %r12;  // inner counter restarts from the first usable s on every outer iteration
+       @%p9 bra        BB5_8;  // no s in range: skip the inner loop
+
+BB5_7:  // inner loop: s advances by param_14 while s <= s upper bound
+       mov.u32         %r18, %r69;
+       sub.s32         %r59, %r7, %r18;
+       div.s32         %r60, %r59, %r32;  // q = (w + param_16 - s) / param_14
+       mad.lo.s32      %r61, %r66, %r25, %r15;  // r*param_7 + c*param_11
+       add.s32         %r62, %r61, %r18;  // ... + s
+       mad.lo.s32      %r63, %r62, %r30, %r17;  // (...)*param_12 + p*param_9 + n*param_10
+       add.s32         %r64, %r63, %r60;  // ... + q = output element index (32-bit arithmetic)
+       mul.wide.s32    %rd7, %r64, 4;
+       add.s64         %rd8, %rd6, %rd7;
+       st.global.f32   [%rd8], %f1;  // param_1[index] = value
+       add.s32         %r19, %r18, %r32;
+       setp.le.s32     %p10, %r19, %r9;
+       mov.u32         %r69, %r19;
+       @%p10 bra       BB5_7;
+
+BB5_8:
+       add.s32         %r66, %r66, %r31;  // r += param_13
+       setp.le.s32     %p11, %r66, %r6;
+       @%p11 bra       BB5_6;
+
+BB5_9:
+       ret;
+}
+
+       // .globl       reorg_knpq_d  -- f64 kernel: each thread copies input[tid] to a permuted position in the output
+.visible .entry reorg_knpq_d(
+       .param .u64 reorg_knpq_d_param_0,  // pointer: f64 input, read at [tid]
+       .param .u64 reorg_knpq_d_param_1,  // pointer: f64 output
+       .param .u32 reorg_knpq_d_param_2,  // element count: threads with tid >= this value return immediately
+       .param .u32 reorg_knpq_d_param_3,  // splits tid into outer = tid / param_3 and rem = tid % param_3
+       .param .u32 reorg_knpq_d_param_4,  // multiplies (rem / param_5) in the output index
+       .param .u32 reorg_knpq_d_param_5  // splits rem into rem / param_5 and rem % param_5; also multiplies outer in the output index
+)  // NOTE(review): the kernel name suggests a K,N,P,Q layout reorder, but the dimension names are not visible in this PTX -- confirm against the CUDA source
+{
+       .reg .pred      %p<2>;
+       .reg .b32       %r<16>;
+       .reg .f64       %fd<2>;
+       .reg .b64       %rd<9>;
+
+
+       ld.param.u64    %rd1, [reorg_knpq_d_param_0];
+       ld.param.u64    %rd2, [reorg_knpq_d_param_1];
+       ld.param.u32    %r5, [reorg_knpq_d_param_2];
+       ld.param.u32    %r2, [reorg_knpq_d_param_3];
+       ld.param.u32    %r3, [reorg_knpq_d_param_4];
+       ld.param.u32    %r4, [reorg_knpq_d_param_5];
+       mov.u32         %r6, %ctaid.x;
+       mov.u32         %r7, %ntid.x;
+       mov.u32         %r8, %tid.x;
+       mad.lo.s32      %r1, %r7, %r6, %r8;  // tid = ntid.x * ctaid.x + tid.x
+       setp.ge.s32     %p1, %r1, %r5;
+       @%p1 bra        BB6_2;  // out-of-range threads do nothing
+
+       cvta.to.global.u64      %rd3, %rd1;
+       rem.s32         %r9, %r1, %r2;  // rem = tid % param_3
+       div.s32         %r10, %r9, %r4;  // mid = rem / param_5
+       rem.s32         %r11, %r9, %r4;  // inner = rem % param_5
+       mul.wide.s32    %rd4, %r1, 8;  // byte offset of element tid (8-byte f64)
+       add.s64         %rd5, %rd3, %rd4;
+       ld.global.f64   %fd1, [%rd5];  // value = param_0[tid]
+       div.s32         %r12, %r1, %r2;  // outer = tid / param_3
+       mul.lo.s32      %r13, %r12, %r4;  // outer * param_5
+       mad.lo.s32      %r14, %r10, %r3, %r13;  // mid*param_4 + outer*param_5
+       add.s32         %r15, %r14, %r11;  // output index = mid*param_4 + outer*param_5 + inner
+       cvta.to.global.u64      %rd6, %rd2;
+       mul.wide.s32    %rd7, %r15, 8;
+       add.s64         %rd8, %rd6, %rd7;
+       st.global.f64   [%rd8], %fd1;  // param_1[output index] = value
+
+BB6_2:
+       ret;
+}
+
+       // .globl       reorg_knpq_f  -- f32 kernel: each thread copies input[tid] to a permuted position in the output
+.visible .entry reorg_knpq_f(
+       .param .u64 reorg_knpq_f_param_0,  // pointer: f32 input, read at [tid]
+       .param .u64 reorg_knpq_f_param_1,  // pointer: f32 output
+       .param .u32 reorg_knpq_f_param_2,  // element count: threads with tid >= this value return immediately
+       .param .u32 reorg_knpq_f_param_3,  // splits tid into outer = tid / param_3 and rem = tid % param_3
+       .param .u32 reorg_knpq_f_param_4,  // multiplies (rem / param_5) in the output index
+       .param .u32 reorg_knpq_f_param_5  // splits rem into rem / param_5 and rem % param_5; also multiplies outer in the output index
+)  // NOTE(review): the kernel name suggests a K,N,P,Q layout reorder, but the dimension names are not visible in this PTX -- confirm against the CUDA source
+{
+       .reg .pred      %p<2>;
+       .reg .f32       %f<2>;
+       .reg .b32       %r<16>;
+       .reg .b64       %rd<9>;
+
+
+       ld.param.u64    %rd1, [reorg_knpq_f_param_0];
+       ld.param.u64    %rd2, [reorg_knpq_f_param_1];
+       ld.param.u32    %r5, [reorg_knpq_f_param_2];
+       ld.param.u32    %r2, [reorg_knpq_f_param_3];
+       ld.param.u32    %r3, [reorg_knpq_f_param_4];
+       ld.param.u32    %r4, [reorg_knpq_f_param_5];
+       mov.u32         %r6, %ctaid.x;
+       mov.u32         %r7, %ntid.x;
+       mov.u32         %r8, %tid.x;
+       mad.lo.s32      %r1, %r7, %r6, %r8;  // tid = ntid.x * ctaid.x + tid.x
+       setp.ge.s32     %p1, %r1, %r5;
+       @%p1 bra        BB7_2;  // out-of-range threads do nothing
+
+       cvta.to.global.u64      %rd3, %rd1;
+       rem.s32         %r9, %r1, %r2;  // rem = tid % param_3
+       div.s32         %r10, %r9, %r4;  // mid = rem / param_5
+       rem.s32         %r11, %r9, %r4;  // inner = rem % param_5
+       mul.wide.s32    %rd4, %r1, 4;  // byte offset of element tid (4-byte f32)
+       add.s64         %rd5, %rd3, %rd4;
+       ld.global.f32   %f1, [%rd5];  // value = param_0[tid]
+       div.s32         %r12, %r1, %r2;  // outer = tid / param_3
+       mul.lo.s32      %r13, %r12, %r4;  // outer * param_5
+       mad.lo.s32      %r14, %r10, %r3, %r13;  // mid*param_4 + outer*param_5
+       add.s32         %r15, %r14, %r11;  // output index = mid*param_4 + outer*param_5 + inner
+       cvta.to.global.u64      %rd6, %rd2;
+       mul.wide.s32    %rd7, %r15, 4;
+       add.s64         %rd8, %rd6, %rd7;
+       st.global.f32   [%rd8], %f1;  // param_1[output index] = value
+
+BB7_2:
+       ret;
+}
+
        // .globl       slice_sparse_dense_row_d
 .visible .entry slice_sparse_dense_row_d(
        .param .u64 slice_sparse_dense_row_d_param_0,
@@ -137,7 +825,7 @@ BB1_2:
        mad.lo.s32      %r1, %r17, %r18, %r19;
        add.s32         %r2, %r1, %r15;
        setp.gt.s32     %p1, %r2, %r16;
-       @%p1 bra        BB2_6;
+       @%p1 bra        BB8_6;
 
        cvta.to.global.u64      %rd13, %rd10;
        mul.wide.s32    %rd14, %r2, 4;
@@ -145,7 +833,7 @@ BB1_2:
        ld.global.u32   %r23, [%rd1];
        ld.global.u32   %r22, [%rd1+4];
        setp.ge.s32     %p2, %r23, %r22;
-       @%p2 bra        BB2_6;
+       @%p2 bra        BB8_6;
 
        cvta.to.global.u64      %rd2, %rd12;
        cvta.to.global.u64      %rd15, %rd9;
@@ -157,12 +845,12 @@ BB1_2:
        mul.wide.s32    %rd18, %r23, 4;
        add.s64         %rd21, %rd16, %rd18;
 
-BB2_3:
+BB8_3:
        ld.global.u32   %r8, [%rd21];
        setp.lt.s32     %p3, %r8, %r12;
        setp.gt.s32     %p4, %r8, %r13;
        or.pred         %p5, %p3, %p4;
-       @%p5 bra        BB2_5;
+       @%p5 bra        BB8_5;
 
        ld.global.f64   %fd1, [%rd22];
        add.s32         %r21, %r5, %r8;
@@ -171,14 +859,14 @@ BB2_3:
        st.global.f64   [%rd20], %fd1;
        ld.global.u32   %r22, [%rd1+4];
 
-BB2_5:
+BB8_5:
        add.s64         %rd22, %rd22, 8;
        add.s64         %rd21, %rd21, 4;
        add.s32         %r23, %r23, 1;
        setp.lt.s32     %p6, %r23, %r22;
-       @%p6 bra        BB2_3;
+       @%p6 bra        BB8_3;
 
-BB2_6:
+BB8_6:
        ret;
 }
 
@@ -216,7 +904,7 @@ BB2_6:
        mad.lo.s32      %r1, %r17, %r18, %r19;
        add.s32         %r2, %r1, %r15;
        setp.gt.s32     %p1, %r2, %r16;
-       @%p1 bra        BB3_6;
+       @%p1 bra        BB9_6;
 
        cvta.to.global.u64      %rd13, %rd10;
        mul.wide.s32    %rd14, %r2, 4;
@@ -224,7 +912,7 @@ BB2_6:
        ld.global.u32   %r23, [%rd1];
        ld.global.u32   %r22, [%rd1+4];
        setp.ge.s32     %p2, %r23, %r22;
-       @%p2 bra        BB3_6;
+       @%p2 bra        BB9_6;
 
        cvta.to.global.u64      %rd2, %rd12;
        cvta.to.global.u64      %rd15, %rd9;
@@ -235,12 +923,12 @@ BB2_6:
        add.s64         %rd21, %rd15, %rd17;
        add.s64         %rd20, %rd16, %rd17;
 
-BB3_3:
+BB9_3:
        ld.global.u32   %r8, [%rd20];
        setp.lt.s32     %p3, %r8, %r12;
        setp.gt.s32     %p4, %r8, %r13;
        or.pred         %p5, %p3, %p4;
-       @%p5 bra        BB3_5;
+       @%p5 bra        BB9_5;
 
        ld.global.f32   %f1, [%rd21];
        add.s32         %r21, %r5, %r8;
@@ -249,14 +937,14 @@ BB3_3:
        st.global.f32   [%rd19], %f1;
        ld.global.u32   %r22, [%rd1+4];
 
-BB3_5:
+BB9_5:
        add.s64         %rd21, %rd21, 4;
        add.s64         %rd20, %rd20, 4;
        add.s32         %r23, %r23, 1;
        setp.lt.s32     %p6, %r23, %r22;
-       @%p6 bra        BB3_3;
+       @%p6 bra        BB9_3;
 
-BB3_6:
+BB9_6:
        ret;
 }
 
@@ -301,7 +989,7 @@ BB3_6:
        add.s64         %rd12, %rd1, %rd11;
        ld.global.u32   %r15, [%rd12+4];
        setp.ge.s32     %p1, %r1, %r15;
-       @%p1 bra        BB4_5;
+       @%p1 bra        BB10_5;
 
        cvta.to.global.u64      %rd2, %rd7;
        cvta.to.global.u64      %rd3, %rd5;
@@ -313,11 +1001,11 @@ BB3_6:
        setp.lt.s32     %p2, %r2, %r6;
        setp.gt.s32     %p3, %r2, %r7;
        or.pred         %p4, %p2, %p3;
-       @%p4 bra        BB4_5;
+       @%p4 bra        BB10_5;
 
        mov.u32         %r21, %r5;
 
-BB4_3:
+BB10_3:
        mov.u32         %r3, %r21;
        add.s32         %r4, %r3, 1;
        mul.wide.s32    %rd16, %r4, 4;
@@ -325,7 +1013,7 @@ BB4_3:
        ld.global.u32   %r16, [%rd17];
        setp.le.s32     %p5, %r16, %r1;
        mov.u32         %r21, %r4;
-       @%p5 bra        BB4_3;
+       @%p5 bra        BB10_3;
 
        shl.b64         %rd18, %rd4, 3;
        add.s64         %rd19, %rd3, %rd18;
@@ -338,7 +1026,7 @@ BB4_3:
        add.s64         %rd21, %rd2, %rd20;
        st.global.f64   [%rd21], %fd1;
 
-BB4_5:
+BB10_5:
        ret;
 }
 
@@ -383,7 +1071,7 @@ BB4_5:
        add.s64         %rd12, %rd1, %rd11;
        ld.global.u32   %r15, [%rd12+4];
        setp.ge.s32     %p1, %r1, %r15;
-       @%p1 bra        BB5_5;
+       @%p1 bra        BB11_5;
 
        cvta.to.global.u64      %rd2, %rd7;
        cvta.to.global.u64      %rd3, %rd5;
@@ -395,11 +1083,11 @@ BB4_5:
        setp.lt.s32     %p2, %r2, %r6;
        setp.gt.s32     %p3, %r2, %r7;
        or.pred         %p4, %p2, %p3;
-       @%p4 bra        BB5_5;
+       @%p4 bra        BB11_5;
 
        mov.u32         %r21, %r5;
 
-BB5_3:
+BB11_3:
        mov.u32         %r3, %r21;
        add.s32         %r4, %r3, 1;
        mul.wide.s32    %rd16, %r4, 4;
@@ -407,7 +1095,7 @@ BB5_3:
        ld.global.u32   %r16, [%rd17];
        setp.le.s32     %p5, %r16, %r1;
        mov.u32         %r21, %r4;
-       @%p5 bra        BB5_3;
+       @%p5 bra        BB11_3;
 
        shl.b64         %rd18, %rd4, 2;
        add.s64         %rd19, %rd3, %rd18;
@@ -420,7 +1108,7 @@ BB5_3:
        add.s64         %rd21, %rd2, %rd20;
        st.global.f32   [%rd21], %f1;
 
-BB5_5:
+BB11_5:
        ret;
 }
 
@@ -458,10 +1146,10 @@ BB5_5:
        setp.lt.s32     %p1, %r2, %r7;
        setp.gt.s32     %p2, %r6, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB6_2;
-       bra.uni         BB6_1;
+       @!%p3 bra       BB12_2;
+       bra.uni         BB12_1;
 
-BB6_1:
+BB12_1:
        rem.s32         %r11, %r1, %r6;
        cvta.to.global.u64      %rd3, %rd1;
        add.s32         %r12, %r2, %r3;
@@ -475,7 +1163,7 @@ BB6_1:
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd1;
 
-BB6_2:
+BB12_2:
        ret;
 }
 
@@ -513,10 +1201,10 @@ BB6_2:
        setp.lt.s32     %p1, %r2, %r7;
        setp.gt.s32     %p2, %r6, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB7_2;
-       bra.uni         BB7_1;
+       @!%p3 bra       BB13_2;
+       bra.uni         BB13_1;
 
-BB7_1:
+BB13_1:
        rem.s32         %r11, %r1, %r6;
        cvta.to.global.u64      %rd3, %rd1;
        add.s32         %r12, %r2, %r3;
@@ -530,7 +1218,7 @@ BB7_1:
        add.s64         %rd8, %rd6, %rd7;
        st.global.f32   [%rd8], %f1;
 
-BB7_2:
+BB13_2:
        ret;
 }
 
@@ -560,10 +1248,10 @@ BB7_2:
        setp.gt.s32     %p1, %r9, %r8;
        setp.lt.s32     %p2, %r2, %r4;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB8_2;
-       bra.uni         BB8_1;
+       @!%p3 bra       BB14_2;
+       bra.uni         BB14_1;
 
-BB8_1:
+BB14_1:
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
@@ -572,7 +1260,7 @@ BB8_1:
        add.s64         %rd6, %rd2, %rd5;
        st.global.f64   [%rd6], %fd1;
 
-BB8_2:
+BB14_2:
        ret;
 }
 
@@ -602,10 +1290,10 @@ BB8_2:
        setp.gt.s32     %p1, %r9, %r8;
        setp.lt.s32     %p2, %r2, %r4;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB9_2;
-       bra.uni         BB9_1;
+       @!%p3 bra       BB15_2;
+       bra.uni         BB15_1;
 
-BB9_1:
+BB15_1:
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 4;
        add.s64         %rd4, %rd2, %rd3;
@@ -614,7 +1302,7 @@ BB9_1:
        add.s64         %rd6, %rd2, %rd5;
        st.global.f32   [%rd6], %f1;
 
-BB9_2:
+BB15_2:
        ret;
 }
 
@@ -644,10 +1332,10 @@ BB9_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB10_2;
-       bra.uni         BB10_1;
+       @!%p3 bra       BB16_2;
+       bra.uni         BB16_1;
 
-BB10_1:
+BB16_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.s32    %rd4, %r1, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -658,7 +1346,7 @@ BB10_1:
        add.s64         %rd7, %rd6, %rd4;
        st.global.f64   [%rd7], %fd3;
 
-BB10_2:
+BB16_2:
        ret;
 }
 
@@ -689,10 +1377,10 @@ BB10_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB11_2;
-       bra.uni         BB11_1;
+       @!%p3 bra       BB17_2;
+       bra.uni         BB17_1;
 
-BB11_1:
+BB17_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.s32    %rd4, %r1, 4;
        add.s64         %rd5, %rd3, %rd4;
@@ -705,7 +1393,7 @@ BB11_1:
        add.s64         %rd7, %rd6, %rd4;
        st.global.f32   [%rd7], %f2;
 
-BB11_2:
+BB17_2:
        ret;
 }
 
@@ -737,10 +1425,10 @@ BB11_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB12_4;
-       bra.uni         BB12_1;
+       @!%p3 bra       BB18_4;
+       bra.uni         BB18_1;
 
-BB12_1:
+BB18_1:
        cvta.to.global.u64      %rd5, %rd2;
        cvt.s64.s32     %rd1, %r1;
        mul.wide.s32    %rd6, %r1, 8;
@@ -748,20 +1436,20 @@ BB12_1:
        ld.global.f64   %fd4, [%rd7];
        mov.f64         %fd5, 0d0000000000000000;
        setp.leu.f64    %p4, %fd4, 0d0000000000000000;
-       @%p4 bra        BB12_3;
+       @%p4 bra        BB18_3;
 
        cvta.to.global.u64      %rd8, %rd3;
        shl.b64         %rd9, %rd1, 3;
        add.s64         %rd10, %rd8, %rd9;
        ld.global.f64   %fd5, [%rd10];
 
-BB12_3:
+BB18_3:
        cvta.to.global.u64      %rd11, %rd4;
        shl.b64         %rd12, %rd1, 3;
        add.s64         %rd13, %rd11, %rd12;
        st.global.f64   [%rd13], %fd5;
 
-BB12_4:
+BB18_4:
        ret;
 }
 
@@ -793,10 +1481,10 @@ BB12_4:
        setp.lt.s32     %p1, %r7, %r2;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB13_4;
-       bra.uni         BB13_1;
+       @!%p3 bra       BB19_4;
+       bra.uni         BB19_1;
 
-BB13_1:
+BB19_1:
        cvta.to.global.u64      %rd5, %rd2;
        cvt.s64.s32     %rd1, %r1;
        mul.wide.s32    %rd6, %r1, 4;
@@ -804,20 +1492,20 @@ BB13_1:
        ld.global.f32   %f4, [%rd7];
        mov.f32         %f5, 0f00000000;
        setp.leu.f32    %p4, %f4, 0f00000000;
-       @%p4 bra        BB13_3;
+       @%p4 bra        BB19_3;
 
        cvta.to.global.u64      %rd8, %rd3;
        shl.b64         %rd9, %rd1, 2;
        add.s64         %rd10, %rd8, %rd9;
        ld.global.f32   %f5, [%rd10];
 
-BB13_3:
+BB19_3:
        cvta.to.global.u64      %rd11, %rd4;
        shl.b64         %rd12, %rd1, 2;
        add.s64         %rd13, %rd11, %rd12;
        st.global.f32   [%rd13], %f5;
 
-BB13_4:
+BB19_4:
        ret;
 }
 
@@ -847,10 +1535,10 @@ BB13_4:
        setp.lt.s32     %p1, %r7, %r2;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB14_2;
-       bra.uni         BB14_1;
+       @!%p3 bra       BB20_2;
+       bra.uni         BB20_1;
 
-BB14_1:
+BB20_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.s32    %rd4, %r1, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -861,7 +1549,7 @@ BB14_1:
        add.f64         %fd3, %fd2, %fd1;
        st.global.f64   [%rd7], %fd3;
 
-BB14_2:
+BB20_2:
        ret;
 }
 
@@ -891,10 +1579,10 @@ BB14_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB15_2;
-       bra.uni         BB15_1;
+       @!%p3 bra       BB21_2;
+       bra.uni         BB21_1;
 
-BB15_1:
+BB21_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.s32    %rd4, %r1, 4;
        add.s64         %rd5, %rd3, %rd4;
@@ -905,7 +1593,7 @@ BB15_1:
        add.f32         %f3, %f2, %f1;
        st.global.f32   [%rd7], %f3;
 
-BB15_2:
+BB21_2:
        ret;
 }
 
@@ -939,10 +1627,10 @@ BB15_2:
        setp.lt.s32     %p1, %r8, %r4;
        setp.gt.s32     %p2, %r2, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB16_2;
-       bra.uni         BB16_1;
+       @!%p3 bra       BB22_2;
+       bra.uni         BB22_1;
 
-BB16_1:
+BB22_1:
        rem.s32         %r9, %r1, %r2;
        cvta.to.global.u64      %rd4, %rd1;
        mul.wide.s32    %rd5, %r1, 8;
@@ -958,7 +1646,7 @@ BB16_1:
        add.s64         %rd11, %rd10, %rd5;
        st.global.f64   [%rd11], %fd3;
 
-BB16_2:
+BB22_2:
        ret;
 }
 
@@ -992,10 +1680,10 @@ BB16_2:
        setp.lt.s32     %p1, %r8, %r4;
        setp.gt.s32     %p2, %r2, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB17_2;
-       bra.uni         BB17_1;
+       @!%p3 bra       BB23_2;
+       bra.uni         BB23_1;
 
-BB17_1:
+BB23_1:
        rem.s32         %r9, %r1, %r2;
        cvta.to.global.u64      %rd4, %rd1;
        mul.wide.s32    %rd5, %r1, 4;
@@ -1011,7 +1699,7 @@ BB17_1:
        add.s64         %rd11, %rd10, %rd5;
        st.global.f32   [%rd11], %f3;
 
-BB17_2:
+BB23_2:
        ret;
 }
 
@@ -1050,10 +1738,10 @@ BB17_2:
        setp.lt.s32     %p1, %r1, %r5;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB18_4;
-       bra.uni         BB18_1;
+       @!%p3 bra       BB24_4;
+       bra.uni         BB24_1;
 
-BB18_1:
+BB24_1:
        cvta.to.global.u64      %rd6, %rd4;
        mad.lo.s32      %r10, %r1, %r3, %r2;
        cvta.to.global.u64      %rd7, %rd3;
@@ -1062,25 +1750,25 @@ BB18_1:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd2, %rd6, %rd8;
        setp.eq.s32     %p4, %r4, 1;
-       @%p4 bra        BB18_3;
-       bra.uni         BB18_2;
+       @%p4 bra        BB24_3;
+       bra.uni         BB24_2;
 
-BB18_3:
+BB24_3:
        mul.wide.s32    %rd12, %r2, 8;
        add.s64         %rd13, %rd1, %rd12;
        ld.global.f64   %fd5, [%rd13];
        fma.rn.f64      %fd6, %fd5, %fd2, %fd1;
        st.global.f64   [%rd2], %fd6;
-       bra.uni         BB18_4;
+       bra.uni         BB24_4;
 
-BB18_2:
+BB24_2:
        mul.wide.s32    %rd10, %r1, 8;
        add.s64         %rd11, %rd1, %rd10;
        ld.global.f64   %fd3, [%rd11];
        fma.rn.f64      %fd4, %fd3, %fd2, %fd1;
        st.global.f64   [%rd2], %fd4;
 
-BB18_4:
+BB24_4:
        ret;
 }
 
@@ -1120,10 +1808,10 @@ BB18_4:
        setp.lt.s32     %p1, %r1, %r5;
        setp.gt.s32     %p2, %r3, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB19_4;
-       bra.uni         BB19_1;
+       @!%p3 bra       BB25_4;
+       bra.uni         BB25_1;
 
-BB19_1:
+BB25_1:
        cvta.to.global.u64      %rd6, %rd4;
        mad.lo.s32      %r10, %r1, %r3, %r2;
        cvta.to.global.u64      %rd7, %rd3;
@@ -1133,10 +1821,10 @@ BB19_1:
        cvt.f64.f32     %fd1, %f1;
        add.s64         %rd2, %rd6, %rd8;
        setp.eq.s32     %p4, %r4, 1;
-       @%p4 bra        BB19_3;
-       bra.uni         BB19_2;
+       @%p4 bra        BB25_3;
+       bra.uni         BB25_2;
 
-BB19_3:
+BB25_3:
        mul.wide.s32    %rd12, %r2, 4;
        add.s64         %rd13, %rd1, %rd12;
        ld.global.f32   %f4, [%rd13];
@@ -1144,9 +1832,9 @@ BB19_3:
        fma.rn.f64      %fd6, %fd5, %fd2, %fd1;
        cvt.rn.f32.f64  %f5, %fd6;
        st.global.f32   [%rd2], %f5;
-       bra.uni         BB19_4;
+       bra.uni         BB25_4;
 
-BB19_2:
+BB25_2:
        mul.wide.s32    %rd10, %r1, 4;
        add.s64         %rd11, %rd1, %rd10;
        ld.global.f32   %f2, [%rd11];
@@ -1155,7 +1843,7 @@ BB19_2:
        cvt.rn.f32.f64  %f3, %fd4;
        st.global.f32   [%rd2], %f3;
 
-BB19_4:
+BB25_4:
        ret;
 }
 
@@ -1189,10 +1877,10 @@ BB19_4:
        setp.lt.s32     %p1, %r8, %r4;
        setp.gt.s32     %p2, %r2, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB20_2;
-       bra.uni         BB20_1;
+       @!%p3 bra       BB26_2;
+       bra.uni         BB26_1;
 
-BB20_1:
+BB26_1:
        rem.s32         %r9, %r1, %r2;
        cvta.to.global.u64      %rd4, %rd1;
        mul.wide.s32    %rd5, %r1, 8;
@@ -1208,7 +1896,7 @@ BB20_1:
        add.s64         %rd11, %rd10, %rd5;
        st.global.f64   [%rd11], %fd3;
 
-BB20_2:
+BB26_2:
        ret;
 }
 
@@ -1242,10 +1930,10 @@ BB20_2:
        setp.lt.s32     %p1, %r8, %r4;
        setp.gt.s32     %p2, %r2, -1;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB21_2;
-       bra.uni         BB21_1;
+       @!%p3 bra       BB27_2;
+       bra.uni         BB27_1;
 
-BB21_1:
+BB27_1:
        rem.s32         %r9, %r1, %r2;
        cvta.to.global.u64      %rd4, %rd1;
        mul.wide.s32    %rd5, %r1, 4;
@@ -1261,7 +1949,7 @@ BB21_1:
        add.s64         %rd11, %rd10, %rd5;
        st.global.f32   [%rd11], %f3;
 
-BB21_2:
+BB27_2:
        ret;
 }
 
@@ -1299,10 +1987,10 @@ BB21_2:
        setp.lt.s32     %p2, %r2, %r10;
        setp.gt.s32     %p3, %r6, -1;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB22_65;
-       bra.uni         BB22_1;
+       @!%p4 bra       BB28_65;
+       bra.uni         BB28_1;
 
-BB22_1:
+BB28_1:
        rem.s32         %r14, %r1, %r6;
        cvta.to.global.u64      %rd5, %rd2;
        mad.lo.s32      %r3, %r2, %r6, %r14;
@@ -1323,47 +2011,47 @@ BB22_1:
        ld.global.f64   %fd2, [%rd10];
        mov.f64         %fd54, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p9, %r9, 8;
-       @%p9 bra        BB22_18;
+       @%p9 bra        BB28_18;
 
        setp.gt.s32     %p23, %r9, 3;
-       @%p23 bra       BB22_10;
+       @%p23 bra       BB28_10;
 
        setp.gt.s32     %p30, %r9, 1;
-       @%p30 bra       BB22_7;
+       @%p30 bra       BB28_7;
 
        setp.eq.s32     %p33, %r9, 0;
-       @%p33 bra       BB22_63;
-       bra.uni         BB22_5;
+       @%p33 bra       BB28_63;
+       bra.uni         BB28_5;
 
-BB22_63:
+BB28_63:
        add.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_18:
+BB28_18:
        setp.gt.s32     %p10, %r9, 13;
-       @%p10 bra       BB22_27;
+       @%p10 bra       BB28_27;
 
        setp.gt.s32     %p17, %r9, 10;
-       @%p17 bra       BB22_23;
+       @%p17 bra       BB28_23;
 
        setp.eq.s32     %p21, %r9, 9;
-       @%p21 bra       BB22_45;
-       bra.uni         BB22_21;
+       @%p21 bra       BB28_45;
+       bra.uni         BB28_21;
 
-BB22_45:
+BB28_45:
        setp.eq.f64     %p50, %fd1, %fd2;
        selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_10:
+BB28_10:
        setp.gt.s32     %p24, %r9, 5;
-       @%p24 bra       BB22_14;
+       @%p24 bra       BB28_14;
 
        setp.eq.s32     %p28, %r9, 4;
-       @%p28 bra       BB22_48;
-       bra.uni         BB22_12;
+       @%p28 bra       BB28_48;
+       bra.uni         BB28_12;
 
-BB22_48:
+BB28_48:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r4}, %fd1;
@@ -1399,10 +2087,10 @@ BB22_48:
        }// Callseq End 0
        setp.lt.s32     %p56, %r4, 0;
        and.pred        %p1, %p56, %p55;
-       @!%p1 bra       BB22_50;
-       bra.uni         BB22_49;
+       @!%p1 bra       BB28_50;
+       bra.uni         BB28_49;
 
-BB22_49:
+BB28_49:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r33}, %fd53;
@@ -1414,30 +2102,30 @@ BB22_49:
        }
        mov.b64         %fd53, {%r35, %r34};
 
-BB22_50:
+BB28_50:
        mov.f64         %fd52, %fd53;
        setp.eq.f64     %p57, %fd1, 0d0000000000000000;
-       @%p57 bra       BB22_53;
-       bra.uni         BB22_51;
+       @%p57 bra       BB28_53;
+       bra.uni         BB28_51;
 
-BB22_53:
+BB28_53:
        selp.b32        %r36, %r4, 0, %p55;
        or.b32          %r37, %r36, 2146435072;
        setp.lt.s32     %p61, %r5, 0;
        selp.b32        %r38, %r37, %r36, %p61;
        mov.u32         %r39, 0;
        mov.b64         %fd52, {%r39, %r38};
-       bra.uni         BB22_54;
+       bra.uni         BB28_54;
 
-BB22_27:
+BB28_27:
        setp.gt.s32     %p11, %r9, 15;
-       @%p11 bra       BB22_31;
+       @%p11 bra       BB28_31;
 
        setp.eq.s32     %p15, %r9, 14;
-       @%p15 bra       BB22_42;
-       bra.uni         BB22_29;
+       @%p15 bra       BB28_42;
+       bra.uni         BB28_29;
 
-BB22_42:
+BB28_42:
        cvt.rni.s64.f64 %rd11, %fd1;
        cvt.rni.s64.f64 %rd12, %fd2;
        cvt.u32.u64     %r25, %rd11;
@@ -1445,61 +2133,61 @@ BB22_42:
        or.b32          %r27, %r26, %r25;
        setp.eq.s32     %p47, %r27, 0;
        selp.f64        %fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_7:
+BB28_7:
        setp.eq.s32     %p31, %r9, 2;
-       @%p31 bra       BB22_62;
-       bra.uni         BB22_8;
+       @%p31 bra       BB28_62;
+       bra.uni         BB28_8;
 
-BB22_62:
+BB28_62:
        mul.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_23:
+BB28_23:
        setp.eq.s32     %p18, %r9, 11;
-       @%p18 bra       BB22_44;
+       @%p18 bra       BB28_44;
 
        setp.eq.s32     %p19, %r9, 12;
-       @%p19 bra       BB22_43;
-       bra.uni         BB22_25;
+       @%p19 bra       BB28_43;
+       bra.uni         BB28_25;
 
-BB22_43:
+BB28_43:
        max.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_14:
+BB28_14:
        setp.eq.s32     %p25, %r9, 6;
-       @%p25 bra       BB22_47;
+       @%p25 bra       BB28_47;
 
        setp.eq.s32     %p26, %r9, 7;
-       @%p26 bra       BB22_46;
-       bra.uni         BB22_16;
+       @%p26 bra       BB28_46;
+       bra.uni         BB28_16;
 
-BB22_46:
+BB28_46:
        setp.gt.f64     %p52, %fd1, %fd2;
        selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_31:
+BB28_31:
        setp.eq.s32     %p12, %r9, 16;
-       @%p12 bra       BB22_41;
+       @%p12 bra       BB28_41;
 
        setp.eq.s32     %p13, %r9, 17;
-       @%p13 bra       BB22_37;
-       bra.uni         BB22_33;
+       @%p13 bra       BB28_37;
+       bra.uni         BB28_33;
 
-BB22_37:
+BB28_37:
        setp.eq.f64     %p39, %fd2, 0d0000000000000000;
        setp.eq.f64     %p40, %fd2, 0d8000000000000000;
        or.pred         %p41, %p39, %p40;
        mov.f64         %fd54, 0d7FF8000000000000;
-       @%p41 bra       BB22_64;
+       @%p41 bra       BB28_64;
 
        div.rn.f64      %fd54, %fd1, %fd2;
        abs.f64         %fd39, %fd54;
        setp.gtu.f64    %p42, %fd39, 0d7FF0000000000000;
-       @%p42 bra       BB22_64;
+       @%p42 bra       BB28_64;
 
        {
        .reg .b32 %temp; 
@@ -1513,74 +2201,74 @@ BB22_37:
        setp.ne.s32     %p43, %r24, 2146435072;
        setp.ne.s32     %p44, %r22, 0;
        or.pred         %p45, %p43, %p44;
-       @!%p45 bra      BB22_64;
-       bra.uni         BB22_40;
+       @!%p45 bra      BB28_64;
+       bra.uni         BB28_40;
 
-BB22_40:
+BB28_40:
        cvt.rmi.f64.f64 %fd40, %fd54;
        mul.f64         %fd41, %fd2, %fd40;
        sub.f64         %fd54, %fd1, %fd41;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_5:
+BB28_5:
        setp.eq.s32     %p34, %r9, 1;
-       @%p34 bra       BB22_6;
-       bra.uni         BB22_64;
+       @%p34 bra       BB28_6;
+       bra.uni         BB28_64;
 
-BB22_6:
+BB28_6:
        sub.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_21:
+BB28_21:
        setp.eq.s32     %p22, %r9, 10;
-       @%p22 bra       BB22_22;
-       bra.uni         BB22_64;
+       @%p22 bra       BB28_22;
+       bra.uni         BB28_64;
 
-BB22_22:
+BB28_22:
        setp.neu.f64    %p49, %fd1, %fd2;
        selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_12:
+BB28_12:
        setp.eq.s32     %p29, %r9, 5;
-       @%p29 bra       BB22_13;
-       bra.uni         BB22_64;
+       @%p29 bra       BB28_13;
+       bra.uni         BB28_64;
 
-BB22_13:
+BB28_13:
        setp.lt.f64     %p54, %fd1, %fd2;
        selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_29:
+BB28_29:
        setp.eq.s32     %p16, %r9, 15;
-       @%p16 bra       BB22_30;
-       bra.uni         BB22_64;
+       @%p16 bra       BB28_30;
+       bra.uni         BB28_64;
 
-BB22_30:
+BB28_30:
        mul.f64         %fd43, %fd1, %fd2;
        mov.f64         %fd44, 0d3FF0000000000000;
        sub.f64         %fd54, %fd44, %fd43;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_8:
+BB28_8:
        setp.eq.s32     %p32, %r9, 3;
-       @%p32 bra       BB22_9;
-       bra.uni         BB22_64;
+       @%p32 bra       BB28_9;
+       bra.uni         BB28_64;
 
-BB22_9:
+BB28_9:
        div.rn.f64      %fd54, %fd1, %fd2;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_44:
+BB28_44:
        min.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_25:
+BB28_25:
        setp.eq.s32     %p20, %r9, 13;
-       @%p20 bra       BB22_26;
-       bra.uni         BB22_64;
+       @%p20 bra       BB28_26;
+       bra.uni         BB28_64;
 
-BB22_26:
+BB28_26:
        cvt.rni.s64.f64 %rd13, %fd1;
        cvt.rni.s64.f64 %rd14, %fd2;
        cvt.u32.u64     %r28, %rd13;
@@ -1588,37 +2276,37 @@ BB22_26:
        and.b32         %r30, %r29, %r28;
        setp.eq.s32     %p48, %r30, 0;
        selp.f64        %fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_47:
+BB28_47:
        setp.le.f64     %p53, %fd1, %fd2;
        selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_16:
+BB28_16:
        setp.eq.s32     %p27, %r9, 8;
-       @%p27 bra       BB22_17;
-       bra.uni         BB22_64;
+       @%p27 bra       BB28_17;
+       bra.uni         BB28_64;
 
-BB22_17:
+BB28_17:
        setp.ge.f64     %p51, %fd1, %fd2;
        selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_41:
+BB28_41:
        setp.neu.f64    %p46, %fd1, 0d0000000000000000;
        sub.f64         %fd42, %fd1, %fd2;
        selp.f64        %fd54, %fd42, 0d0000000000000000, %p46;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_33:
+BB28_33:
        setp.ne.s32     %p14, %r9, 18;
-       @%p14 bra       BB22_64;
+       @%p14 bra       BB28_64;
 
        div.rn.f64      %fd54, %fd1, %fd2;
        abs.f64         %fd37, %fd54;
        setp.gtu.f64    %p35, %fd37, 0d7FF0000000000000;
-       @%p35 bra       BB22_64;
+       @%p35 bra       BB28_64;
 
        {
        .reg .b32 %temp; 
@@ -1632,22 +2320,22 @@ BB22_33:
        setp.ne.s32     %p36, %r21, 2146435072;
        setp.ne.s32     %p37, %r19, 0;
        or.pred         %p38, %p36, %p37;
-       @!%p38 bra      BB22_64;
-       bra.uni         BB22_36;
+       @!%p38 bra      BB28_64;
+       bra.uni         BB28_36;
 
-BB22_36:
+BB28_36:
        cvt.rmi.f64.f64 %fd54, %fd54;
-       bra.uni         BB22_64;
+       bra.uni         BB28_64;
 
-BB22_51:
+BB28_51:
        setp.gt.s32     %p58, %r4, -1;
-       @%p58 bra       BB22_54;
+       @%p58 bra       BB28_54;
 
        cvt.rzi.f64.f64 %fd45, %fd2;
        setp.neu.f64    %p59, %fd45, %fd2;
        selp.f64        %fd52, 0dFFF8000000000000, %fd52, %p59;
 
-BB22_54:
+BB28_54:
        mov.f64         %fd25, %fd52;
        add.f64         %fd26, %fd1, %fd2;
        {
@@ -1657,17 +2345,17 @@ BB22_54:
        and.b32         %r41, %r40, 2146435072;
        setp.ne.s32     %p62, %r41, 2146435072;
        mov.f64         %fd51, %fd25;
-       @%p62 bra       BB22_61;
+       @%p62 bra       BB28_61;
 
        setp.gtu.f64    %p63, %fd19, 0d7FF0000000000000;
        mov.f64         %fd51, %fd26;
-       @%p63 bra       BB22_61;
+       @%p63 bra       BB28_61;
 
        abs.f64         %fd46, %fd2;
        setp.gtu.f64    %p64, %fd46, 0d7FF0000000000000;
        mov.f64         %fd50, %fd26;
        mov.f64         %fd51, %fd50;
-       @%p64 bra       BB22_61;
+       @%p64 bra       BB28_61;
 
        {
        .reg .b32 %temp; 
@@ -1677,10 +2365,10 @@ BB22_54:
        setp.eq.s32     %p65, %r43, 2146435072;
        setp.eq.s32     %p66, %r42, 0;
        and.pred        %p67, %p65, %p66;
-       @%p67 bra       BB22_60;
-       bra.uni         BB22_58;
+       @%p67 bra       BB28_60;
+       bra.uni         BB28_58;
 
-BB22_60:
+BB28_60:
        setp.gt.f64     %p71, %fd19, 0d3FF0000000000000;
        selp.b32        %r51, 2146435072, 0, %p71;
        xor.b32         %r52, %r51, 2146435072;
@@ -1690,9 +2378,9 @@ BB22_60:
        selp.b32        %r54, 1072693248, %r53, %p73;
        mov.u32         %r55, 0;
        mov.b64         %fd51, {%r55, %r54};
-       bra.uni         BB22_61;
+       bra.uni         BB28_61;
 
-BB22_58:
+BB28_58:
        {
        .reg .b32 %temp; 
        mov.b64         {%r44, %temp}, %fd1;
@@ -1702,10 +2390,10 @@ BB22_58:
        setp.eq.s32     %p69, %r44, 0;
        and.pred        %p70, %p68, %p69;
        mov.f64         %fd51, %fd25;
-       @!%p70 bra      BB22_61;
-       bra.uni         BB22_59;
+       @!%p70 bra      BB28_61;
+       bra.uni         BB28_59;
 
-BB22_59:
+BB28_59:
        shr.s32         %r46, %r5, 31;
        and.b32         %r47, %r46, -2146435072;
        selp.b32        %r48, -1048576, 2146435072, %p1;
@@ -1713,20 +2401,20 @@ BB22_59:
        mov.u32         %r50, 0;
        mov.b64         %fd51, {%r50, %r49};
 
-BB22_61:
+BB28_61:
        setp.eq.f64     %p74, %fd2, 0d0000000000000000;
        setp.eq.f64     %p75, %fd1, 0d3FF0000000000000;
        or.pred         %p76, %p75, %p74;
        selp.f64        %fd54, 0d3FF0000000000000, %fd51, %p76;
 
-BB22_64:
+BB28_64:
        cvta.to.global.u64      %rd16, %rd4;
        mul.wide.s32    %rd17, %r3, 8;
        add.s64         %rd18, %rd16, %rd17;
        st.global.f64   [%rd18], %fd54;
        bar.sync        0;
 
-BB22_65:
+BB28_65:
        ret;
 }
 
@@ -1764,10 +2452,10 @@ BB22_65:
        setp.lt.s32     %p2, %r2, %r8;
        setp.gt.s32     %p3, %r4, -1;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB23_63;
-       bra.uni         BB23_1;
+       @!%p4 bra       BB29_63;
+       bra.uni         BB29_1;
 
-BB23_1:
+BB29_1:
        rem.s32         %r12, %r1, %r4;
        cvta.to.global.u64      %rd4, %rd1;
        mad.lo.s32      %r3, %r2, %r4, %r12;
@@ -1788,47 +2476,47 @@ BB23_1:
        ld.global.f32   %f2, [%rd9];
        mov.f32         %f133, 0f7F7FFFFF;
        setp.gt.s32     %p9, %r7, 8;
-       @%p9 bra        BB23_18;
+       @%p9 bra        BB29_18;
 
        setp.gt.s32     %p23, %r7, 3;
-       @%p23 bra       BB23_10;
+       @%p23 bra       BB29_10;
 
        setp.gt.s32     %p30, %r7, 1;
-       @%p30 bra       BB23_7;
+       @%p30 bra       BB29_7;
 
        setp.eq.s32     %p33, %r7, 0;
-       @%p33 bra       BB23_61;
-       bra.uni         BB23_5;
+       @%p33 bra       BB29_61;
+       bra.uni         BB29_5;
 
-BB23_61:
+BB29_61:
        add.f32         %f133, %f1, %f2;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_18:
+BB29_18:
        setp.gt.s32     %p10, %r7, 13;
-       @%p10 bra       BB23_27;
+       @%p10 bra       BB29_27;
 
        setp.gt.s32     %p17, %r7, 10;
-       @%p17 bra       BB23_23;
+       @%p17 bra       BB29_23;
 
        setp.eq.s32     %p21, %r7, 9;
-       @%p21 bra       BB23_43;
-       bra.uni         BB23_21;
+       @%p21 bra       BB29_43;
+       bra.uni         BB29_21;
 
-BB23_43:
+BB29_43:
        setp.eq.f32     %p44, %f1, %f2;
        selp.f32        %f133, 0f3F800000, 0f00000000, %p44;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_10:
+BB29_10:
        setp.gt.s32     %p24, %r7, 5;
-       @%p24 bra       BB23_14;
+       @%p24 bra       BB29_14;
 
        setp.eq.s32     %p28, %r7, 4;
-       @%p28 bra       BB23_46;
-       bra.uni         BB23_12;
+       @%p28 bra       BB29_46;
+       bra.uni         BB29_12;
 
-BB23_46:
+BB29_46:
        mul.f32         %f53, %f2, 0f3F000000;
        cvt.rzi.f32.f32 %f54, %f53;
        fma.rn.f32      %f55, %f54, 0fC0000000, %f2;
@@ -1928,11 +2616,11 @@ BB23_46:
        setp.gt.f32     %p54, %f115, 0f42D20000;
        selp.f32        %f131, 0f7F800000, %f125, %p54;
        setp.eq.f32     %p55, %f131, 0f7F800000;
-       @%p55 bra       BB23_48;
+       @%p55 bra       BB29_48;
 
        fma.rn.f32      %f131, %f131, %f22, %f131;
 
-BB23_48:
+BB29_48:
        setp.lt.f32     %p56, %f1, 0f00000000;
        setp.eq.f32     %p57, %f19, 0f3F800000;
        and.pred        %p1, %p56, %p57;
@@ -1941,10 +2629,10 @@ BB23_48:
        mov.b32          %f126, %r30;
        selp.f32        %f132, %f126, %f131, %p1;
        setp.eq.f32     %p58, %f1, 0f00000000;
-       @%p58 bra       BB23_51;
-       bra.uni         BB23_49;
+       @%p58 bra       BB29_51;
+       bra.uni         BB29_49;
 
-BB23_51:
+BB29_51:
        add.f32         %f128, %f1, %f1;
        mov.b32          %r31, %f128;
        selp.b32        %r32, %r31, 0, %p57;
@@ -1952,17 +2640,17 @@ BB23_51:
        setp.lt.f32     %p62, %f2, 0f00000000;
        selp.b32        %r34, %r33, %r32, %p62;
        mov.b32          %f132, %r34;
-       bra.uni         BB23_52;
+       bra.uni         BB29_52;
 
-BB23_27:
+BB29_27:
        setp.gt.s32     %p11, %r7, 15;
-       @%p11 bra       BB23_31;
+       @%p11 bra       BB29_31;
 
        setp.eq.s32     %p15, %r7, 14;
-       @%p15 bra       BB23_40;
-       bra.uni         BB23_29;
+       @%p15 bra       BB29_40;
+       bra.uni         BB29_29;
 
-BB23_40:
+BB29_40:
        cvt.rni.s64.f32 %rd10, %f1;
        cvt.rni.s64.f32 %rd11, %f2;
        cvt.u32.u64     %r17, %rd10;
@@ -1970,126 +2658,126 @@ BB23_40:
        or.b32          %r19, %r18, %r17;
        setp.eq.s32     %p41, %r19, 0;
        selp.f32        %f133, 0f00000000, 0f3F800000, %p41;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_7:
+BB29_7:
        setp.eq.s32     %p31, %r7, 2;
-       @%p31 bra       BB23_60;
-       bra.uni         BB23_8;
+       @%p31 bra       BB29_60;
+       bra.uni         BB29_8;
 
-BB23_60:
+BB29_60:
        mul.f32         %f133, %f1, %f2;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_23:
+BB29_23:
        setp.eq.s32     %p18, %r7, 11;
-       @%p18 bra       BB23_42;
+       @%p18 bra       BB29_42;
 
        setp.eq.s32     %p19, %r7, 12;
-       @%p19 bra       BB23_41;
-       bra.uni         BB23_25;
+       @%p19 bra       BB29_41;
+       bra.uni         BB29_25;
 
-BB23_41:
+BB29_41:
        max.f32         %f133, %f1, %f2;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_14:
+BB29_14:
        setp.eq.s32     %p25, %r7, 6;
-       @%p25 bra       BB23_45;
+       @%p25 bra       BB29_45;
 
        setp.eq.s32     %p26, %r7, 7;
-       @%p26 bra       BB23_44;
-       bra.uni         BB23_16;
+       @%p26 bra       BB29_44;
+       bra.uni         BB29_16;
 
-BB23_44:
+BB29_44:
        setp.gt.f32     %p46, %f1, %f2;
        selp.f32        %f133, 0f3F800000, 0f00000000, %p46;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_31:
+BB29_31:
        setp.eq.s32     %p12, %r7, 16;
-       @%p12 bra       BB23_39;
+       @%p12 bra       BB29_39;
 
        setp.eq.s32     %p13, %r7, 17;
-       @%p13 bra       BB23_36;
-       bra.uni         BB23_33;
+       @%p13 bra       BB29_36;
+       bra.uni         BB29_33;
 
-BB23_36:
+BB29_36:
        setp.eq.f32     %p36, %f2, 0f00000000;
        setp.eq.f32     %p37, %f2, 0f80000000;
        or.pred         %p38, %p36, %p37;
        mov.f32         %f133, 0f7FC00000;
-       @%p38 bra       BB23_62;
+       @%p38 bra       BB29_62;
 
        div.rn.f32      %f133, %f1, %f2;
        abs.f32         %f43, %f133;
        setp.geu.f32    %p39, %f43, 0f7F800000;
-       @%p39 bra       BB23_62;
+       @%p39 bra       BB29_62;
 
        cvt.rmi.f32.f32 %f44, %f133;
        mul.f32         %f45, %f2, %f44;
        sub.f32         %f133, %f1, %f45;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_5:
+BB29_5:
        setp.eq.s32     %p34, %r7, 1;
-       @%p34 bra       BB23_6;
-       bra.uni         BB23_62;
+       @%p34 bra       BB29_6;
+       bra.uni         BB29_62;
 
-BB23_6:
+BB29_6:
        sub.f32         %f133, %f1, %f2;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_21:
+BB29_21:
        setp.eq.s32     %p22, %r7, 10;
-       @%p22 bra       BB23_22;
-       bra.uni         BB23_62;
+       @%p22 bra       BB29_22;
+       bra.uni         BB29_62;
 
-BB23_22:
+BB29_22:
        setp.neu.f32    %p43, %f1, %f2;
        selp.f32        %f133, 0f3F800000, 0f00000000, %p43;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_12:
+BB29_12:
        setp.eq.s32     %p29, %r7, 5;
-       @%p29 bra       BB23_13;
-       bra.uni         BB23_62;
+       @%p29 bra       BB29_13;
+       bra.uni         BB29_62;
 
-BB23_13:
+BB29_13:
        setp.lt.f32     %p48, %f1, %f2;
        selp.f32        %f133, 0f3F800000, 0f00000000, %p48;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_29:
+BB29_29:
        setp.eq.s32     %p16, %r7, 15;
-       @%p16 bra       BB23_30;
-       bra.uni         BB23_62;
+       @%p16 bra       BB29_30;
+       bra.uni         BB29_62;
 
-BB23_30:
+BB29_30:
        mul.f32         %f47, %f1, %f2;
        mov.f32         %f48, 0f3F800000;
        sub.f32         %f133, %f48, %f47;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_8:
+BB29_8:
        setp.eq.s32     %p32, %r7, 3;
-       @%p32 bra       BB23_9;
-       bra.uni         BB23_62;
+       @%p32 bra       BB29_9;
+       bra.uni         BB29_62;
 
-BB23_9:
+BB29_9:
        div.rn.f32      %f133, %f1, %f2;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_42:
+BB29_42:
        min.f32         %f133, %f1, %f2;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_25:
+BB29_25:
        setp.eq.s32     %p20, %r7, 13;
-       @%p20 bra       BB23_26;
-       bra.uni         BB23_62;
+       @%p20 bra       BB29_26;
+       bra.uni         BB29_62;
 
-BB23_26:
+BB29_26:
        cvt.rni.s64.f32 %rd12, %f1;
        cvt.rni.s64.f32 %rd13, %f2;
        cvt.u32.u64     %r20, %rd12;
@@ -2097,71 +2785,71 @@ BB23_26:
        and.b32         %r22, %r21, %r20;
        setp.eq.s32     %p42, %r22, 0;
        selp.f32        %f133, 0f00000000, 0f3F800000, %p42;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_45:
+BB29_45:
        setp.le.f32     %p47, %f1, %f2;
        selp.f32        %f133, 0f3F800000, 0f00000000, %p47;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_16:
+BB29_16:
        setp.eq.s32     %p27, %r7, 8;
-       @%p27 bra       BB23_17;
-       bra.uni         BB23_62;
+       @%p27 bra       BB29_17;
+       bra.uni         BB29_62;
 
-BB23_17:
+BB29_17:
        setp.ge.f32     %p45, %f1, %f2;
        selp.f32        %f133, 0f3F800000, 0f00000000, %p45;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_39:
+BB29_39:
        setp.neu.f32    %p40, %f1, 0f00000000;
        sub.f32         %f46, %f1, %f2;
        selp.f32        %f133, %f46, 0f00000000, %p40;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_33:
+BB29_33:
        setp.ne.s32     %p14, %r7, 18;
-       @%p14 bra       BB23_62;
+       @%p14 bra       BB29_62;
 
        div.rn.f32      %f133, %f1, %f2;
        abs.f32         %f41, %f133;
        setp.geu.f32    %p35, %f41, 0f7F800000;
-       @%p35 bra       BB23_62;
+       @%p35 bra       BB29_62;
 
        cvt.rmi.f32.f32 %f133, %f133;
-       bra.uni         BB23_62;
+       bra.uni         BB29_62;
 
-BB23_49:
+BB29_49:
        setp.geu.f32    %p59, %f1, 0f00000000;
-       @%p59 bra       BB23_52;
+       @%p59 bra       BB29_52;
 
        cvt.rzi.f32.f32 %f127, %f2;
        setp.neu.f32    %p60, %f127, %f2;
        selp.f32        %f132, 0f7FFFFFFF, %f132, %p60;
 
-BB23_52:
+BB29_52:
        add.f32         %f129, %f20, %f21;
        mov.b32          %r35, %f129;
        setp.lt.s32     %p63, %r35, 2139095040;
-       @%p63 bra       BB23_59;
+       @%p63 bra       BB29_59;
 
        setp.gtu.f32    %p64, %f20, 0f7F800000;
        setp.gtu.f32    %p65, %f21, 0f7F800000;
        or.pred         %p66, %p64, %p65;
-       @%p66 bra       BB23_58;
-       bra.uni         BB23_54;
+       @%p66 bra       BB29_58;
+       bra.uni         BB29_54;
 
-BB23_58:
+BB29_58:
        add.f32         %f132, %f1, %f2;
-       bra.uni         BB23_59;
+       bra.uni         BB29_59;
 
-BB23_54:
+BB29_54:
        setp.eq.f32     %p67, %f21, 0f7F800000;
-       @%p67 bra       BB23_57;
-       bra.uni         BB23_55;
+       @%p67 bra       BB29_57;
+       bra.uni         BB29_55;
 
-BB23_57:
+BB29_57:
        setp.gt.f32     %p70, %f20, 0f3F800000;
        selp.b32        %r39, 2139095040, 0, %p70;
        xor.b32         %r40, %r39, 2139095040;
@@ -2170,11 +2858,11 @@ BB23_57:
        mov.b32          %f130, %r41;
        setp.eq.f32     %p72, %f1, 0fBF800000;
        selp.f32        %f132, 0f3F800000, %f130, %p72;
-       bra.uni         BB23_59;
+       bra.uni         BB29_59;
 
-BB23_55:
+BB29_55:
        setp.neu.f32    %p68, %f20, 0f7F800000;
-       @%p68 bra       BB23_59;
+       @%p68 bra       BB29_59;
 
        setp.ge.f32     %p69, %f2, 0f00000000;
        selp.b32        %r36, 2139095040, 0, %p69;
@@ -2182,20 +2870,20 @@ BB23_55:
        selp.b32        %r38, %r37, %r36, %p1;
        mov.b32          %f132, %r38;
 
-BB23_59:
+BB29_59:
        setp.eq.f32     %p73, %f2, 0f00000000;
        setp.eq.f32     %p74, %f1, 0f3F800000;
        or.pred         %p75, %p74, %p73;
        selp.f32        %f133, 0f3F800000, %f132, %p75;
 
-BB23_62:
+BB29_62:
        cvta.to.global.u64      %rd14, %rd3;
        mul.wide.s32    %rd15, %r3, 4;
        add.s64         %rd16, %rd14, %rd15;
        st.global.f32   [%rd16], %f133;
        bar.sync        0;
 
-BB23_63:
+BB29_63:
        ret;
 }
 
@@ -2226,7 +2914,7 @@ BB23_63:
        mov.u32         %r11, %tid.x;
        mad.lo.s32      %r1, %r9, %r10, %r11;
        setp.ge.s32     %p3, %r1, %r8;
-       @%p3 bra        BB24_130;
+       @%p3 bra        BB30_130;
 
        cvta.to.global.u64      %rd6, %rd5;
        cvta.to.global.u64      %rd7, %rd4;
@@ -2235,86 +2923,86 @@ BB23_63:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd1, %rd6, %rd8;
        setp.eq.s32     %p4, %r7, 0;
-       @%p4 bra        BB24_66;
+       @%p4 bra        BB30_66;
 
        mov.f64         %fd98, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p5, %r6, 8;
-       @%p5 bra        BB24_19;
+       @%p5 bra        BB30_19;
 
        setp.gt.s32     %p19, %r6, 3;
-       @%p19 bra       BB24_11;
+       @%p19 bra       BB30_11;
 
        setp.gt.s32     %p26, %r6, 1;
-       @%p26 bra       BB24_8;
+       @%p26 bra       BB30_8;
 
        setp.eq.s32     %p29, %r6, 0;
-       @%p29 bra       BB24_64;
-       bra.uni         BB24_6;
+       @%p29 bra       BB30_64;
+       bra.uni         BB30_6;
 
-BB24_64:
+BB30_64:
        add.f64         %fd98, %fd1, %fd68;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_66:
+BB30_66:
        mov.f64         %fd106, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p73, %r6, 8;
-       @%p73 bra       BB24_83;
+       @%p73 bra       BB30_83;
 
        setp.gt.s32     %p87, %r6, 3;
-       @%p87 bra       BB24_75;
+       @%p87 bra       BB30_75;
 
        setp.gt.s32     %p94, %r6, 1;
-       @%p94 bra       BB24_72;
+       @%p94 bra       BB30_72;
 
        setp.eq.s32     %p97, %r6, 0;
-       @%p97 bra       BB24_128;
-       bra.uni         BB24_70;
+       @%p97 bra       BB30_128;
+       bra.uni         BB30_70;
 
-BB24_128:
+BB30_128:
        add.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_19:
+BB30_19:
        setp.gt.s32     %p6, %r6, 13;
-       @%p6 bra        BB24_28;
+       @%p6 bra        BB30_28;
 
        setp.gt.s32     %p13, %r6, 10;
-       @%p13 bra       BB24_24;
+       @%p13 bra       BB30_24;
 
        setp.eq.s32     %p17, %r6, 9;
-       @%p17 bra       BB24_46;
-       bra.uni         BB24_22;
+       @%p17 bra       BB30_46;
+       bra.uni         BB30_22;
 
-BB24_46:
+BB30_46:
        setp.eq.f64     %p46, %fd1, %fd68;
        selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_83:
+BB30_83:
        setp.gt.s32     %p74, %r6, 13;
-       @%p74 bra       BB24_92;
+       @%p74 bra       BB30_92;
 
        setp.gt.s32     %p81, %r6, 10;
-       @%p81 bra       BB24_88;
+       @%p81 bra       BB30_88;
 
        setp.eq.s32     %p85, %r6, 9;
-       @%p85 bra       BB24_110;
-       bra.uni         BB24_86;
+       @%p85 bra       BB30_110;
+       bra.uni         BB30_86;
 
-BB24_110:
+BB30_110:
        setp.eq.f64     %p114, %fd1, %fd68;
        selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_11:
+BB30_11:
        setp.gt.s32     %p20, %r6, 5;
-       @%p20 bra       BB24_15;
+       @%p20 bra       BB30_15;
 
        setp.eq.s32     %p24, %r6, 4;
-       @%p24 bra       BB24_49;
-       bra.uni         BB24_13;
+       @%p24 bra       BB30_49;
+       bra.uni         BB30_13;
 
-BB24_49:
+BB30_49:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r2}, %fd68;
@@ -2350,10 +3038,10 @@ BB24_49:
        }// Callseq End 1
        setp.lt.s32     %p52, %r2, 0;
        and.pred        %p1, %p52, %p51;
-       @!%p1 bra       BB24_51;
-       bra.uni         BB24_50;
+       @!%p1 bra       BB30_51;
+       bra.uni         BB30_50;
 
-BB24_50:
+BB30_50:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r26}, %fd97;
@@ -2365,30 +3053,30 @@ BB24_50:
        }
        mov.b64         %fd97, {%r28, %r27};
 
-BB24_51:
+BB30_51:
        mov.f64         %fd96, %fd97;
        setp.eq.f64     %p53, %fd68, 0d0000000000000000;
-       @%p53 bra       BB24_54;
-       bra.uni         BB24_52;
+       @%p53 bra       BB30_54;
+       bra.uni         BB30_52;
 
-BB24_54:
+BB30_54:
        selp.b32        %r29, %r2, 0, %p51;
        or.b32          %r30, %r29, 2146435072;
        setp.lt.s32     %p57, %r3, 0;
        selp.b32        %r31, %r30, %r29, %p57;
        mov.u32         %r32, 0;
        mov.b64         %fd96, {%r32, %r31};
-       bra.uni         BB24_55;
+       bra.uni         BB30_55;
 
-BB24_28:
+BB30_28:
        setp.gt.s32     %p7, %r6, 15;
-       @%p7 bra        BB24_32;
+       @%p7 bra        BB30_32;
 
        setp.eq.s32     %p11, %r6, 14;
-       @%p11 bra       BB24_43;
-       bra.uni         BB24_30;
+       @%p11 bra       BB30_43;
+       bra.uni         BB30_30;
 
-BB24_43:
+BB30_43:
        cvt.rni.s64.f64 %rd10, %fd68;
        cvt.rni.s64.f64 %rd11, %fd1;
        cvt.u32.u64     %r18, %rd10;
@@ -2396,17 +3084,17 @@ BB24_43:
        or.b32          %r20, %r19, %r18;
        setp.eq.s32     %p43, %r20, 0;
        selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_75:
+BB30_75:
        setp.gt.s32     %p88, %r6, 5;
-       @%p88 bra       BB24_79;
+       @%p88 bra       BB30_79;
 
        setp.eq.s32     %p92, %r6, 4;
-       @%p92 bra       BB24_113;
-       bra.uni         BB24_77;
+       @%p92 bra       BB30_113;
+       bra.uni         BB30_77;
 
-BB24_113:
+BB30_113:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r4}, %fd1;
@@ -2442,10 +3130,10 @@ BB24_113:
        }// Callseq End 2
        setp.lt.s32     %p120, %r4, 0;
        and.pred        %p2, %p120, %p119;
-       @!%p2 bra       BB24_115;
-       bra.uni         BB24_114;
+       @!%p2 bra       BB30_115;
+       bra.uni         BB30_114;
 
-BB24_114:
+BB30_114:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r63}, %fd105;
@@ -2457,30 +3145,30 @@ BB24_114:
        }
        mov.b64         %fd105, {%r65, %r64};
 
-BB24_115:
+BB30_115:
        mov.f64         %fd104, %fd105;
        setp.eq.f64     %p121, %fd1, 0d0000000000000000;
-       @%p121 bra      BB24_118;
-       bra.uni         BB24_116;
+       @%p121 bra      BB30_118;
+       bra.uni         BB30_116;
 
-BB24_118:
+BB30_118:
        selp.b32        %r66, %r4, 0, %p119;
        or.b32          %r67, %r66, 2146435072;
        setp.lt.s32     %p125, %r5, 0;
        selp.b32        %r68, %r67, %r66, %p125;
        mov.u32         %r69, 0;
        mov.b64         %fd104, {%r69, %r68};
-       bra.uni         BB24_119;
+       bra.uni         BB30_119;
 
-BB24_92:
+BB30_92:
        setp.gt.s32     %p75, %r6, 15;
-       @%p75 bra       BB24_96;
+       @%p75 bra       BB30_96;
 
        setp.eq.s32     %p79, %r6, 14;
-       @%p79 bra       BB24_107;
-       bra.uni         BB24_94;
+       @%p79 bra       BB30_107;
+       bra.uni         BB30_94;
 
-BB24_107:
+BB30_107:
        cvt.rni.s64.f64 %rd15, %fd1;
        cvt.rni.s64.f64 %rd16, %fd68;
        cvt.u32.u64     %r55, %rd15;
@@ -2488,61 +3176,61 @@ BB24_107:
        or.b32          %r57, %r56, %r55;
        setp.eq.s32     %p111, %r57, 0;
        selp.f64        %fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_8:
+BB30_8:
        setp.eq.s32     %p27, %r6, 2;
-       @%p27 bra       BB24_63;
-       bra.uni         BB24_9;
+       @%p27 bra       BB30_63;
+       bra.uni         BB30_9;
 
-BB24_63:
+BB30_63:
        mul.f64         %fd98, %fd1, %fd68;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_24:
+BB30_24:
        setp.eq.s32     %p14, %r6, 11;
-       @%p14 bra       BB24_45;
+       @%p14 bra       BB30_45;
 
        setp.eq.s32     %p15, %r6, 12;
-       @%p15 bra       BB24_44;
-       bra.uni         BB24_26;
+       @%p15 bra       BB30_44;
+       bra.uni         BB30_26;
 
-BB24_44:
+BB30_44:
        max.f64         %fd98, %fd68, %fd1;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_15:
+BB30_15:
        setp.eq.s32     %p21, %r6, 6;
-       @%p21 bra       BB24_48;
+       @%p21 bra       BB30_48;
 
        setp.eq.s32     %p22, %r6, 7;
-       @%p22 bra       BB24_47;
-       bra.uni         BB24_17;
+       @%p22 bra       BB30_47;
+       bra.uni         BB30_17;
 
-BB24_47:
+BB30_47:
        setp.lt.f64     %p48, %fd1, %fd68;
        selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_32:
+BB30_32:
        setp.eq.s32     %p8, %r6, 16;
-       @%p8 bra        BB24_42;
+       @%p8 bra        BB30_42;
 
        setp.eq.s32     %p9, %r6, 17;
-       @%p9 bra        BB24_38;
-       bra.uni         BB24_34;
+       @%p9 bra        BB30_38;
+       bra.uni         BB30_34;
 
-BB24_38:
+BB30_38:
        setp.eq.f64     %p35, %fd1, 0d0000000000000000;
        setp.eq.f64     %p36, %fd1, 0d8000000000000000;
        or.pred         %p37, %p35, %p36;
        mov.f64         %fd98, 0d7FF8000000000000;
-       @%p37 bra       BB24_65;
+       @%p37 bra       BB30_65;
 
        div.rn.f64      %fd98, %fd68, %fd1;
        abs.f64         %fd72, %fd98;
        setp.gtu.f64    %p38, %fd72, 0d7FF0000000000000;
-       @%p38 bra       BB24_65;
+       @%p38 bra       BB30_65;
 
        {
        .reg .b32 %temp; 
@@ -2556,68 +3244,68 @@ BB24_38:
        setp.ne.s32     %p39, %r17, 2146435072;
        setp.ne.s32     %p40, %r15, 0;
        or.pred         %p41, %p39, %p40;
-       @!%p41 bra      BB24_65;
-       bra.uni         BB24_41;
+       @!%p41 bra      BB30_65;
+       bra.uni         BB30_41;
 
-BB24_41:
+BB30_41:
        cvt.rmi.f64.f64 %fd73, %fd98;
        mul.f64         %fd74, %fd1, %fd73;
        sub.f64         %fd98, %fd68, %fd74;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_72:
+BB30_72:
        setp.eq.s32     %p95, %r6, 2;
-       @%p95 bra       BB24_127;
-       bra.uni         BB24_73;
+       @%p95 bra       BB30_127;
+       bra.uni         BB30_73;
 
-BB24_127:
+BB30_127:
        mul.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_88:
+BB30_88:
        setp.eq.s32     %p82, %r6, 11;
-       @%p82 bra       BB24_109;
+       @%p82 bra       BB30_109;
 
        setp.eq.s32     %p83, %r6, 12;
-       @%p83 bra       BB24_108;
-       bra.uni         BB24_90;
+       @%p83 bra       BB30_108;
+       bra.uni         BB30_90;
 
-BB24_108:
+BB30_108:
        max.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_79:
+BB30_79:
        setp.eq.s32     %p89, %r6, 6;
-       @%p89 bra       BB24_112;
+       @%p89 bra       BB30_112;
 
        setp.eq.s32     %p90, %r6, 7;
-       @%p90 bra       BB24_111;
-       bra.uni         BB24_81;
+       @%p90 bra       BB30_111;
+       bra.uni         BB30_81;
 
-BB24_111:
+BB30_111:
        setp.gt.f64     %p116, %fd1, %fd68;
        selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_96:
+BB30_96:
        setp.eq.s32     %p76, %r6, 16;
-       @%p76 bra       BB24_106;
+       @%p76 bra       BB30_106;
 
        setp.eq.s32     %p77, %r6, 17;
-       @%p77 bra       BB24_102;
-       bra.uni         BB24_98;
+       @%p77 bra       BB30_102;
+       bra.uni         BB30_98;
 
-BB24_102:
+BB30_102:
        setp.eq.f64     %p103, %fd68, 0d0000000000000000;
        setp.eq.f64     %p104, %fd68, 0d8000000000000000;
        or.pred         %p105, %p103, %p104;
        mov.f64         %fd106, 0d7FF8000000000000;
-       @%p105 bra      BB24_129;
+       @%p105 bra      BB30_129;
 
        div.rn.f64      %fd106, %fd1, %fd68;
        abs.f64         %fd83, %fd106;
        setp.gtu.f64    %p106, %fd83, 0d7FF0000000000000;
-       @%p106 bra      BB24_129;
+       @%p106 bra      BB30_129;
 
        {
        .reg .b32 %temp; 
@@ -2631,74 +3319,74 @@ BB24_102:
        setp.ne.s32     %p107, %r54, 2146435072;
        setp.ne.s32     %p108, %r52, 0;
        or.pred         %p109, %p107, %p108;
-       @!%p109 bra     BB24_129;
-       bra.uni         BB24_105;
+       @!%p109 bra     BB30_129;
+       bra.uni         BB30_105;
 
-BB24_105:
+BB30_105:
        cvt.rmi.f64.f64 %fd84, %fd106;
        mul.f64         %fd85, %fd84, %fd68;
        sub.f64         %fd106, %fd1, %fd85;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_6:
+BB30_6:
        setp.eq.s32     %p30, %r6, 1;
-       @%p30 bra       BB24_7;
-       bra.uni         BB24_65;
+       @%p30 bra       BB30_7;
+       bra.uni         BB30_65;
 
-BB24_7:
+BB30_7:
        sub.f64         %fd98, %fd68, %fd1;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_22:
+BB30_22:
        setp.eq.s32     %p18, %r6, 10;
-       @%p18 bra       BB24_23;
-       bra.uni         BB24_65;
+       @%p18 bra       BB30_23;
+       bra.uni         BB30_65;
 
-BB24_23:
+BB30_23:
        setp.neu.f64    %p45, %fd1, %fd68;
        selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_13:
+BB30_13:
        setp.eq.s32     %p25, %r6, 5;
-       @%p25 bra       BB24_14;
-       bra.uni         BB24_65;
+       @%p25 bra       BB30_14;
+       bra.uni         BB30_65;
 
-BB24_14:
+BB30_14:
        setp.gt.f64     %p50, %fd1, %fd68;
        selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_30:
+BB30_30:
        setp.eq.s32     %p12, %r6, 15;
-       @%p12 bra       BB24_31;
-       bra.uni         BB24_65;
+       @%p12 bra       BB30_31;
+       bra.uni         BB30_65;
 
-BB24_31:
+BB30_31:
        mul.f64         %fd76, %fd1, %fd68;
        mov.f64         %fd77, 0d3FF0000000000000;
        sub.f64         %fd98, %fd77, %fd76;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_9:
+BB30_9:
        setp.eq.s32     %p28, %r6, 3;
-       @%p28 bra       BB24_10;
-       bra.uni         BB24_65;
+       @%p28 bra       BB30_10;
+       bra.uni         BB30_65;
 
-BB24_10:
+BB30_10:
        div.rn.f64      %fd98, %fd68, %fd1;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_45:
+BB30_45:
        min.f64         %fd98, %fd68, %fd1;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_26:
+BB30_26:
        setp.eq.s32     %p16, %r6, 13;
-       @%p16 bra       BB24_27;
-       bra.uni         BB24_65;
+       @%p16 bra       BB30_27;
+       bra.uni         BB30_65;
 
-BB24_27:
+BB30_27:
        cvt.rni.s64.f64 %rd12, %fd68;
        cvt.rni.s64.f64 %rd13, %fd1;
        cvt.u32.u64     %r21, %rd12;
@@ -2706,37 +3394,37 @@ BB24_27:
        and.b32         %r23, %r22, %r21;
        setp.eq.s32     %p44, %r23, 0;
        selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_48:
+BB30_48:
        setp.ge.f64     %p49, %fd1, %fd68;
        selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p49;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_17:
+BB30_17:
        setp.eq.s32     %p23, %r6, 8;
-       @%p23 bra       BB24_18;
-       bra.uni         BB24_65;
+       @%p23 bra       BB30_18;
+       bra.uni         BB30_65;
 
-BB24_18:
+BB30_18:
        setp.le.f64     %p47, %fd1, %fd68;
        selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p47;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_42:
+BB30_42:
        setp.neu.f64    %p42, %fd68, 0d0000000000000000;
        sub.f64         %fd75, %fd68, %fd1;
        selp.f64        %fd98, %fd75, 0d0000000000000000, %p42;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_34:
+BB30_34:
        setp.ne.s32     %p10, %r6, 18;
-       @%p10 bra       BB24_65;
+       @%p10 bra       BB30_65;
 
        div.rn.f64      %fd98, %fd68, %fd1;
        abs.f64         %fd70, %fd98;
        setp.gtu.f64    %p31, %fd70, 0d7FF0000000000000;
-       @%p31 bra       BB24_65;
+       @%p31 bra       BB30_65;
 
        {
        .reg .b32 %temp; 
@@ -2750,72 +3438,72 @@ BB24_34:
        setp.ne.s32     %p32, %r14, 2146435072;
        setp.ne.s32     %p33, %r12, 0;
        or.pred         %p34, %p32, %p33;
-       @!%p34 bra      BB24_65;
-       bra.uni         BB24_37;
+       @!%p34 bra      BB30_65;
+       bra.uni         BB30_37;
 
-BB24_37:
+BB30_37:
        cvt.rmi.f64.f64 %fd98, %fd98;
-       bra.uni         BB24_65;
+       bra.uni         BB30_65;
 
-BB24_70:
+BB30_70:
        setp.eq.s32     %p98, %r6, 1;
-       @%p98 bra       BB24_71;
-       bra.uni         BB24_129;
+       @%p98 bra       BB30_71;
+       bra.uni         BB30_129;
 
-BB24_71:
+BB30_71:
        sub.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_86:
+BB30_86:
        setp.eq.s32     %p86, %r6, 10;
-       @%p86 bra       BB24_87;
-       bra.uni         BB24_129;
+       @%p86 bra       BB30_87;
+       bra.uni         BB30_129;
 
-BB24_87:
+BB30_87:
        setp.neu.f64    %p113, %fd1, %fd68;
        selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p113;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_77:
+BB30_77:
        setp.eq.s32     %p93, %r6, 5;
-       @%p93 bra       BB24_78;
-       bra.uni         BB24_129;
+       @%p93 bra       BB30_78;
+       bra.uni         BB30_129;
 
-BB24_78:
+BB30_78:
        setp.lt.f64     %p118, %fd1, %fd68;
        selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p118;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_94:
+BB30_94:
        setp.eq.s32     %p80, %r6, 15;
-       @%p80 bra       BB24_95;
-       bra.uni         BB24_129;
+       @%p80 bra       BB30_95;
+       bra.uni         BB30_129;
 
-BB24_95:
+BB30_95:
        mul.f64         %fd87, %fd1, %fd68;
        mov.f64         %fd88, 0d3FF0000000000000;
        sub.f64         %fd106, %fd88, %fd87;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_73:
+BB30_73:
        setp.eq.s32     %p96, %r6, 3;
-       @%p96 bra       BB24_74;
-       bra.uni         BB24_129;
+       @%p96 bra       BB30_74;
+       bra.uni         BB30_129;
 
-BB24_74:
+BB30_74:
        div.rn.f64      %fd106, %fd1, %fd68;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_109:
+BB30_109:
        min.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_90:
+BB30_90:
        setp.eq.s32     %p84, %r6, 13;
-       @%p84 bra       BB24_91;
-       bra.uni         BB24_129;
+       @%p84 bra       BB30_91;
+       bra.uni         BB30_129;
 
-BB24_91:
+BB30_91:
        cvt.rni.s64.f64 %rd17, %fd1;
        cvt.rni.s64.f64 %rd18, %fd68;
        cvt.u32.u64     %r58, %rd17;
@@ -2823,37 +3511,37 @@ BB24_91:
        and.b32         %r60, %r59, %r58;
        setp.eq.s32     %p112, %r60, 0;
        selp.f64        %fd106, 0d0000000000000000, 0d3FF0000000000000, %p112;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_112:
+BB30_112:
        setp.le.f64     %p117, %fd1, %fd68;
        selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p117;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_81:
+BB30_81:
        setp.eq.s32     %p91, %r6, 8;
-       @%p91 bra       BB24_82;
-       bra.uni         BB24_129;
+       @%p91 bra       BB30_82;
+       bra.uni         BB30_129;
 
-BB24_82:
+BB30_82:
        setp.ge.f64     %p115, %fd1, %fd68;
        selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p115;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_106:
+BB30_106:
        setp.neu.f64    %p110, %fd1, 0d0000000000000000;
        sub.f64         %fd86, %fd1, %fd68;
        selp.f64        %fd106, %fd86, 0d0000000000000000, %p110;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_98:
+BB30_98:
        setp.ne.s32     %p78, %r6, 18;
-       @%p78 bra       BB24_129;
+       @%p78 bra       BB30_129;
 
        div.rn.f64      %fd106, %fd1, %fd68;
        abs.f64         %fd81, %fd106;
        setp.gtu.f64    %p99, %fd81, 0d7FF0000000000000;
-       @%p99 bra       BB24_129;
+       @%p99 bra       BB30_129;
 
        {
        .reg .b32 %temp; 
@@ -2867,22 +3555,22 @@ BB24_98:
        setp.ne.s32     %p100, %r51, 2146435072;
        setp.ne.s32     %p101, %r49, 0;
        or.pred         %p102, %p100, %p101;
-       @!%p102 bra     BB24_129;
-       bra.uni         BB24_101;
+       @!%p102 bra     BB30_129;
+       bra.uni         BB30_101;
 
-BB24_101:
+BB30_101:
        cvt.rmi.f64.f64 %fd106, %fd106;
-       bra.uni         BB24_129;
+       bra.uni         BB30_129;
 
-BB24_52:
+BB30_52:
        setp.gt.s32     %p54, %r2, -1;
-       @%p54 bra       BB24_55;
+       @%p54 bra       BB30_55;
 
        cvt.rzi.f64.f64 %fd78, %fd1;
        setp.neu.f64    %p55, %fd78, %fd1;
        selp.f64        %fd96, 0dFFF8000000000000, %fd96, %p55;
 
-BB24_55:
+BB30_55:
        mov.f64         %fd24, %fd96;
        add.f64         %fd25, %fd1, %fd68;
        {
@@ -2892,17 +3580,17 @@ BB24_55:
        and.b32         %r34, %r33, 2146435072;
        setp.ne.s32     %p58, %r34, 2146435072;
        mov.f64         %fd95, %fd24;
-       @%p58 bra       BB24_62;
+       @%p58 bra       BB30_62;
 
        setp.gtu.f64    %p59, %fd18, 0d7FF0000000000000;
        mov.f64         %fd95, %fd25;
-       @%p59 bra       BB24_62;
+       @%p59 bra       BB30_62;
 
        abs.f64         %fd79, %fd1;
        setp.gtu.f64    %p60, %fd79, 0d7FF0000000000000;
        mov.f64         %fd94, %fd25;
        mov.f64         %fd95, %fd94;
-       @%p60 bra       BB24_62;
+       @%p60 bra       BB30_62;
 
        {
        .reg .b32 %temp; 
@@ -2912,10 +3600,10 @@ BB24_55:
        setp.eq.s32     %p61, %r36, 2146435072;
        setp.eq.s32     %p62, %r35, 0;
        and.pred        %p63, %p61, %p62;
-       @%p63 bra       BB24_61;
-       bra.uni         BB24_59;
+       @%p63 bra       BB30_61;
+       bra.uni         BB30_59;
 
-BB24_61:
+BB30_61:
        setp.gt.f64     %p67, %fd18, 0d3FF0000000000000;
        selp.b32        %r44, 2146435072, 0, %p67;
        xor.b32         %r45, %r44, 2146435072;
@@ -2925,17 +3613,17 @@ BB24_61:
        selp.b32        %r47, 1072693248, %r46, %p69;
        mov.u32         %r48, 0;
        mov.b64         %fd95, {%r48, %r47};
-       bra.uni         BB24_62;
+       bra.uni         BB30_62;
 
-BB24_116:
+BB30_116:
        setp.gt.s32     %p122, %r4, -1;
-       @%p122 bra      BB24_119;
+       @%p122 bra      BB30_119;
 
        cvt.rzi.f64.f64 %fd89, %fd68;
        setp.neu.f64    %p123, %fd89, %fd68;
        selp.f64        %fd104, 0dFFF8000000000000, %fd104, %p123;
 
-BB24_119:
+BB30_119:
        mov.f64         %fd57, %fd104;
        add.f64         %fd58, %fd1, %fd68;
        {
@@ -2945,17 +3633,17 @@ BB24_119:
        and.b32         %r71, %r70, 2146435072;
        setp.ne.s32     %p126, %r71, 2146435072;
        mov.f64         %fd103, %fd57;
-       @%p126 bra      BB24_126;
+       @%p126 bra      BB30_126;
 
        setp.gtu.f64    %p127, %fd51, 0d7FF0000000000000;
        mov.f64         %fd103, %fd58;
-       @%p127 bra      BB24_126;
+       @%p127 bra      BB30_126;
 
        abs.f64         %fd90, %fd68;
        setp.gtu.f64    %p128, %fd90, 0d7FF0000000000000;
        mov.f64         %fd102, %fd58;
        mov.f64         %fd103, %fd102;
-       @%p128 bra      BB24_126;
+       @%p128 bra      BB30_126;
 
        {
        .reg .b32 %temp; 
@@ -2965,10 +3653,10 @@ BB24_119:
        setp.eq.s32     %p129, %r73, 2146435072;
        setp.eq.s32     %p130, %r72, 0;
        and.pred        %p131, %p129, %p130;
-       @%p131 bra      BB24_125;
-       bra.uni         BB24_123;
+       @%p131 bra      BB30_125;
+       bra.uni         BB30_123;
 
-BB24_125:
+BB30_125:
        setp.gt.f64     %p135, %fd51, 0d3FF0000000000000;
        selp.b32        %r81, 2146435072, 0, %p135;
        xor.b32         %r82, %r81, 2146435072;
@@ -2978,9 +3666,9 @@ BB24_125:
        selp.b32        %r84, 1072693248, %r83, %p137;
        mov.u32         %r85, 0;
        mov.b64         %fd103, {%r85, %r84};
-       bra.uni         BB24_126;
+       bra.uni         BB30_126;
 
-BB24_59:
+BB30_59:
        {
        .reg .b32 %temp; 
        mov.b64         {%r37, %temp}, %fd68;
@@ -2990,10 +3678,10 @@ BB24_59:
        setp.eq.s32     %p65, %r37, 0;
        and.pred        %p66, %p64, %p65;
        mov.f64         %fd95, %fd24;
-       @!%p66 bra      BB24_62;
-       bra.uni         BB24_60;
+       @!%p66 bra      BB30_62;
+       bra.uni         BB30_60;
 
-BB24_60:
+BB30_60:
        shr.s32         %r39, %r3, 31;
        and.b32         %r40, %r39, -2146435072;
        selp.b32        %r41, -1048576, 2146435072, %p1;
@@ -3001,17 +3689,17 @@ BB24_60:
        mov.u32         %r43, 0;
        mov.b64         %fd95, {%r43, %r42};
 
-BB24_62:
+BB30_62:
        setp.eq.f64     %p70, %fd1, 0d0000000000000000;
        setp.eq.f64     %p71, %fd68, 0d3FF0000000000000;
        or.pred         %p72, %p71, %p70;
        selp.f64        %fd98, 0d3FF0000000000000, %fd95, %p72;
 
-BB24_65:
+BB30_65:
        st.global.f64   [%rd1], %fd98;
-       bra.uni         BB24_130;
+       bra.uni         BB30_130;
 
-BB24_123:
+BB30_123:
        {
        .reg .b32 %temp; 
        mov.b64         {%r74, %temp}, %fd1;
@@ -3021,10 +3709,10 @@ BB24_123:
        setp.eq.s32     %p133, %r74, 0;
        and.pred        %p134, %p132, %p133;
        mov.f64         %fd103, %fd57;
-       @!%p134 bra     BB24_126;
-       bra.uni         BB24_124;
+       @!%p134 bra     BB30_126;
+       bra.uni         BB30_124;
 
-BB24_124:
+BB30_124:
        shr.s32         %r76, %r5, 31;
        and.b32         %r77, %r76, -2146435072;
        selp.b32        %r78, -1048576, 2146435072, %p2;
@@ -3032,16 +3720,16 @@ BB24_124:
        mov.u32         %r80, 0;
        mov.b64         %fd103, {%r80, %r79};
 
-BB24_126:
+BB30_126:
        setp.eq.f64     %p138, %fd68, 0d0000000000000000;
        setp.eq.f64     %p139, %fd1, 0d3FF0000000000000;
        or.pred         %p140, %p139, %p138;
        selp.f64        %fd106, 0d3FF0000000000000, %fd103, %p140;
 
-BB24_129:
+BB30_129:
        st.global.f64   [%rd1], %fd106;
 
-BB24_130:
+BB30_130:
        bar.sync        0;
        ret;
 }
@@ -3075,7 +3763,7 @@ BB24_130:
        mov.u32         %r7, %tid.x;
        mad.lo.s32      %r1, %r5, %r6, %r7;
        setp.ge.s32     %p3, %r1, %r4;
-       @%p3 bra        BB25_126;
+       @%p3 bra        BB31_126;
 
        cvta.to.global.u64      %rd4, %rd3;
        cvta.to.global.u64      %rd5, %rd2;
@@ -3084,86 +3772,86 @@ BB24_130:
        ld.global.f32   %f2, [%rd7];
        add.s64         %rd1, %rd4, %rd6;
        setp.eq.s32     %p4, %r3, 0;
-       @%p4 bra        BB25_64;
+       @%p4 bra        BB31_64;
 
        mov.f32         %f261, 0f7F7FFFFF;
        setp.gt.s32     %p5, %r2, 8;
-       @%p5 bra        BB25_19;
+       @%p5 bra        BB31_19;
 
        setp.gt.s32     %p19, %r2, 3;
-       @%p19 bra       BB25_11;
+       @%p19 bra       BB31_11;
 
        setp.gt.s32     %p26, %r2, 1;
-       @%p26 bra       BB25_8;
+       @%p26 bra       BB31_8;
 
        setp.eq.s32     %p29, %r2, 0;
-       @%p29 bra       BB25_62;
-       bra.uni         BB25_6;
+       @%p29 bra       BB31_62;
+       bra.uni         BB31_6;
 
-BB25_62:
+BB31_62:
        add.f32         %f261, %f1, %f2;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_64:
+BB31_64:
        mov.f32         %f264, 0f7F7FFFFF;
        setp.gt.s32     %p72, %r2, 8;
-       @%p72 bra       BB25_81;
+       @%p72 bra       BB31_81;
 
        setp.gt.s32     %p86, %r2, 3;
-       @%p86 bra       BB25_73;
+       @%p86 bra       BB31_73;
 
        setp.gt.s32     %p93, %r2, 1;
-       @%p93 bra       BB25_70;
+       @%p93 bra       BB31_70;
 
        setp.eq.s32     %p96, %r2, 0;
-       @%p96 bra       BB25_124;
-       bra.uni         BB25_68;
+       @%p96 bra       BB31_124;
+       bra.uni         BB31_68;
 
-BB25_124:
+BB31_124:
        add.f32         %f264, %f1, %f2;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_19:
+BB31_19:
        setp.gt.s32     %p6, %r2, 13;
-       @%p6 bra        BB25_28;
+       @%p6 bra        BB31_28;
 
        setp.gt.s32     %p13, %r2, 10;
-       @%p13 bra       BB25_24;
+       @%p13 bra       BB31_24;
 
        setp.eq.s32     %p17, %r2, 9;
-       @%p17 bra       BB25_44;
-       bra.uni         BB25_22;
+       @%p17 bra       BB31_44;
+       bra.uni         BB31_22;
 
-BB25_44:
+BB31_44:
        setp.eq.f32     %p40, %f1, %f2;
        selp.f32        %f261, 0f3F800000, 0f00000000, %p40;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_81:
+BB31_81:
        setp.gt.s32     %p73, %r2, 13;
-       @%p73 bra       BB25_90;
+       @%p73 bra       BB31_90;
 
        setp.gt.s32     %p80, %r2, 10;
-       @%p80 bra       BB25_86;
+       @%p80 bra       BB31_86;
 
        setp.eq.s32     %p84, %r2, 9;
-       @%p84 bra       BB25_106;
-       bra.uni         BB25_84;
+       @%p84 bra       BB31_106;
+       bra.uni         BB31_84;
 
-BB25_106:
+BB31_106:
        setp.eq.f32     %p107, %f2, %f1;
        selp.f32        %f264, 0f3F800000, 0f00000000, %p107;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_11:
+BB31_11:
        setp.gt.s32     %p20, %r2, 5;
-       @%p20 bra       BB25_15;
+       @%p20 bra       BB31_15;
 
        setp.eq.s32     %p24, %r2, 4;
-       @%p24 bra       BB25_47;
-       bra.uni         BB25_13;
+       @%p24 bra       BB31_47;
+       bra.uni         BB31_13;
 
-BB25_47:
+BB31_47:
        mul.f32         %f90, %f2, 0f3F000000;
        cvt.rzi.f32.f32 %f91, %f90;
        fma.rn.f32      %f92, %f91, 0fC0000000, %f2;
@@ -3263,11 +3951,11 @@ BB25_47:
        setp.gt.f32     %p50, %f152, 0f42D20000;
        selp.f32        %f259, 0f7F800000, %f162, %p50;
        setp.eq.f32     %p51, %f259, 0f7F800000;
-       @%p51 bra       BB25_49;
+       @%p51 bra       BB31_49;
 
        fma.rn.f32      %f259, %f259, %f22, %f259;
 
-BB25_49:
+BB31_49:
        setp.lt.f32     %p52, %f1, 0f00000000;
        setp.eq.f32     %p53, %f19, 0f3F800000;
        and.pred        %p1, %p52, %p53;
@@ -3276,10 +3964,10 @@ BB25_49:
        mov.b32          %f163, %r21;
        selp.f32        %f260, %f163, %f259, %p1;
        setp.eq.f32     %p54, %f1, 0f00000000;
-       @%p54 bra       BB25_52;
-       bra.uni         BB25_50;
+       @%p54 bra       BB31_52;
+       bra.uni         BB31_50;
 
-BB25_52:
+BB31_52:
        add.f32         %f165, %f1, %f1;
        mov.b32          %r22, %f165;
        selp.b32        %r23, %r22, 0, %p53;
@@ -3287,17 +3975,17 @@ BB25_52:
        setp.lt.f32     %p58, %f2, 0f00000000;
        selp.b32        %r25, %r24, %r23, %p58;
        mov.b32          %f260, %r25;
-       bra.uni         BB25_53;
+       bra.uni         BB31_53;
 
-BB25_28:
+BB31_28:
        setp.gt.s32     %p7, %r2, 15;
-       @%p7 bra        BB25_32;
+       @%p7 bra        BB31_32;
 
        setp.eq.s32     %p11, %r2, 14;
-       @%p11 bra       BB25_41;
-       bra.uni         BB25_30;
+       @%p11 bra       BB31_41;
+       bra.uni         BB31_30;
 
-BB25_41:
+BB31_41:
        cvt.rni.s64.f32 %rd8, %f1;
        cvt.rni.s64.f32 %rd9, %f2;
        cvt.u32.u64     %r8, %rd8;
@@ -3305,17 +3993,17 @@ BB25_41:
        or.b32          %r10, %r9, %r8;
        setp.eq.s32     %p37, %r10, 0;
        selp.f32        %f261, 0f00000000, 0f3F800000, %p37;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_73:
+BB31_73:
        setp.gt.s32     %p87, %r2, 5;
-       @%p87 bra       BB25_77;
+       @%p87 bra       BB31_77;
 
        setp.eq.s32     %p91, %r2, 4;
-       @%p91 bra       BB25_109;
-       bra.uni         BB25_75;
+       @%p91 bra       BB31_109;
+       bra.uni         BB31_75;
 
-BB25_109:
+BB31_109:
        mul.f32         %f181, %f1, 0f3F000000;
        cvt.rzi.f32.f32 %f182, %f181;
        fma.rn.f32      %f183, %f182, 0fC0000000, %f1;
@@ -3415,11 +4103,11 @@ BB25_109:
        setp.gt.f32     %p117, %f243, 0f42D20000;
        selp.f32        %f262, 0f7F800000, %f253, %p117;
        setp.eq.f32     %p118, %f262, 0f7F800000;
-       @%p118 bra      BB25_111;
+       @%p118 bra      BB31_111;
 
        fma.rn.f32      %f262, %f262, %f59, %f262;
 
-BB25_111:
+BB31_111:
        setp.lt.f32     %p119, %f2, 0f00000000;
        setp.eq.f32     %p120, %f56, 0f3F800000;
        and.pred        %p2, %p119, %p120;
@@ -3428,10 +4116,10 @@ BB25_111:
        mov.b32          %f254, %r46;
        selp.f32        %f263, %f254, %f262, %p2;
        setp.eq.f32     %p121, %f2, 0f00000000;
-       @%p121 bra      BB25_114;
-       bra.uni         BB25_112;
+       @%p121 bra      BB31_114;
+       bra.uni         BB31_112;
 
-BB25_114:
+BB31_114:
        add.f32         %f256, %f2, %f2;
        mov.b32          %r47, %f256;
        selp.b32        %r48, %r47, 0, %p120;
@@ -3439,17 +4127,17 @@ BB25_114:
        setp.lt.f32     %p125, %f1, 0f00000000;
        selp.b32        %r50, %r49, %r48, %p125;
        mov.b32          %f263, %r50;
-       bra.uni         BB25_115;
+       bra.uni         BB31_115;
 
-BB25_90:
+BB31_90:
        setp.gt.s32     %p74, %r2, 15;
-       @%p74 bra       BB25_94;
+       @%p74 bra       BB31_94;
 
        setp.eq.s32     %p78, %r2, 14;
-       @%p78 bra       BB25_103;
-       bra.uni         BB25_92;
+       @%p78 bra       BB31_103;
+       bra.uni         BB31_92;
 
-BB25_103:
+BB31_103:
        cvt.rni.s64.f32 %rd12, %f2;
        cvt.rni.s64.f32 %rd13, %f1;
        cvt.u32.u64     %r33, %rd12;
@@ -3457,185 +4145,185 @@ BB25_103:
        or.b32          %r35, %r34, %r33;
        setp.eq.s32     %p104, %r35, 0;
        selp.f32        %f264, 0f00000000, 0f3F800000, %p104;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_8:
+BB31_8:
        setp.eq.s32     %p27, %r2, 2;
-       @%p27 bra       BB25_61;
-       bra.uni         BB25_9;
+       @%p27 bra       BB31_61;
+       bra.uni         BB31_9;
 
-BB25_61:
+BB31_61:
        mul.f32         %f261, %f1, %f2;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_24:
+BB31_24:
        setp.eq.s32     %p14, %r2, 11;
-       @%p14 bra       BB25_43;
+       @%p14 bra       BB31_43;
 
        setp.eq.s32     %p15, %r2, 12;
-       @%p15 bra       BB25_42;
-       bra.uni         BB25_26;
+       @%p15 bra       BB31_42;
+       bra.uni         BB31_26;
 
-BB25_42:
+BB31_42:
        max.f32         %f261, %f1, %f2;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_15:
+BB31_15:
        setp.eq.s32     %p21, %r2, 6;
-       @%p21 bra       BB25_46;
+       @%p21 bra       BB31_46;
 
        setp.eq.s32     %p22, %r2, 7;
-       @%p22 bra       BB25_45;
-       bra.uni         BB25_17;
+       @%p22 bra       BB31_45;
+       bra.uni         BB31_17;
 
-BB25_45:
+BB31_45:
        setp.gt.f32     %p42, %f1, %f2;
        selp.f32        %f261, 0f3F800000, 0f00000000, %p42;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_32:
+BB31_32:
        setp.eq.s32     %p8, %r2, 16;
-       @%p8 bra        BB25_40;
+       @%p8 bra        BB31_40;
 
        setp.eq.s32     %p9, %r2, 17;
-       @%p9 bra        BB25_37;
-       bra.uni         BB25_34;
+       @%p9 bra        BB31_37;
+       bra.uni         BB31_34;
 
-BB25_37:
+BB31_37:
        setp.eq.f32     %p32, %f2, 0f00000000;
        setp.eq.f32     %p33, %f2, 0f80000000;
        or.pred         %p34, %p32, %p33;
        mov.f32         %f261, 0f7FC00000;
-       @%p34 bra       BB25_63;
+       @%p34 bra       BB31_63;
 
        div.rn.f32      %f261, %f1, %f2;
        abs.f32         %f80, %f261;
        setp.geu.f32    %p35, %f80, 0f7F800000;
-       @%p35 bra       BB25_63;
+       @%p35 bra       BB31_63;
 
        cvt.rmi.f32.f32 %f81, %f261;
        mul.f32         %f82, %f2, %f81;
        sub.f32         %f261, %f1, %f82;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_70:
+BB31_70:
        setp.eq.s32     %p94, %r2, 2;
-       @%p94 bra       BB25_123;
-       bra.uni         BB25_71;
+       @%p94 bra       BB31_123;
+       bra.uni         BB31_71;
 
-BB25_123:
+BB31_123:
        mul.f32         %f264, %f1, %f2;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_86:
+BB31_86:
        setp.eq.s32     %p81, %r2, 11;
-       @%p81 bra       BB25_105;
+       @%p81 bra       BB31_105;
 
        setp.eq.s32     %p82, %r2, 12;
-       @%p82 bra       BB25_104;
-       bra.uni         BB25_88;
+       @%p82 bra       BB31_104;
+       bra.uni         BB31_88;
 
-BB25_104:
+BB31_104:
        max.f32         %f264, %f2, %f1;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_77:
+BB31_77:
        setp.eq.s32     %p88, %r2, 6;
-       @%p88 bra       BB25_108;
+       @%p88 bra       BB31_108;
 
        setp.eq.s32     %p89, %r2, 7;
-       @%p89 bra       BB25_107;
-       bra.uni         BB25_79;
+       @%p89 bra       BB31_107;
+       bra.uni         BB31_79;
 
-BB25_107:
+BB31_107:
        setp.gt.f32     %p109, %f2, %f1;
        selp.f32        %f264, 0f3F800000, 0f00000000, %p109;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_94:
+BB31_94:
        setp.eq.s32     %p75, %r2, 16;
-       @%p75 bra       BB25_102;
+       @%p75 bra       BB31_102;
 
        setp.eq.s32     %p76, %r2, 17;
-       @%p76 bra       BB25_99;
-       bra.uni         BB25_96;
+       @%p76 bra       BB31_99;
+       bra.uni         BB31_96;
 
-BB25_99:
+BB31_99:
        setp.eq.f32     %p99, %f1, 0f00000000;
        setp.eq.f32     %p100, %f1, 0f80000000;
        or.pred         %p101, %p99, %p100;
        mov.f32         %f264, 0f7FC00000;
-       @%p101 bra      BB25_125;
+       @%p101 bra      BB31_125;
 
        div.rn.f32      %f264, %f2, %f1;
        abs.f32         %f171, %f264;
        setp.geu.f32    %p102, %f171, 0f7F800000;
-       @%p102 bra      BB25_125;
+       @%p102 bra      BB31_125;
 
        cvt.rmi.f32.f32 %f172, %f264;
        mul.f32         %f173, %f1, %f172;
        sub.f32         %f264, %f2, %f173;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_6:
+BB31_6:
        setp.eq.s32     %p30, %r2, 1;
-       @%p30 bra       BB25_7;
-       bra.uni         BB25_63;
+       @%p30 bra       BB31_7;
+       bra.uni         BB31_63;
 
-BB25_7:
+BB31_7:
        sub.f32         %f261, %f1, %f2;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_22:
+BB31_22:
        setp.eq.s32     %p18, %r2, 10;
-       @%p18 bra       BB25_23;
-       bra.uni         BB25_63;
+       @%p18 bra       BB31_23;
+       bra.uni         BB31_63;
 
-BB25_23:
+BB31_23:
        setp.neu.f32    %p39, %f1, %f2;
        selp.f32        %f261, 0f3F800000, 0f00000000, %p39;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_13:
+BB31_13:
        setp.eq.s32     %p25, %r2, 5;
-       @%p25 bra       BB25_14;
-       bra.uni         BB25_63;
+       @%p25 bra       BB31_14;
+       bra.uni         BB31_63;
 
-BB25_14:
+BB31_14:
        setp.lt.f32     %p44, %f1, %f2;
        selp.f32        %f261, 0f3F800000, 0f00000000, %p44;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_30:
+BB31_30:
        setp.eq.s32     %p12, %r2, 15;
-       @%p12 bra       BB25_31;
-       bra.uni         BB25_63;
+       @%p12 bra       BB31_31;
+       bra.uni         BB31_63;
 
-BB25_31:
+BB31_31:
        mul.f32         %f84, %f1, %f2;
        mov.f32         %f85, 0f3F800000;
        sub.f32         %f261, %f85, %f84;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_9:
+BB31_9:
        setp.eq.s32     %p28, %r2, 3;
-       @%p28 bra       BB25_10;
-       bra.uni         BB25_63;
+       @%p28 bra       BB31_10;
+       bra.uni         BB31_63;
 
-BB25_10:
+BB31_10:
        div.rn.f32      %f261, %f1, %f2;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_43:
+BB31_43:
        min.f32         %f261, %f1, %f2;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_26:
+BB31_26:
        setp.eq.s32     %p16, %r2, 13;
-       @%p16 bra       BB25_27;
-       bra.uni         BB25_63;
+       @%p16 bra       BB31_27;
+       bra.uni         BB31_63;
 
-BB25_27:
+BB31_27:
        cvt.rni.s64.f32 %rd10, %f1;
        cvt.rni.s64.f32 %rd11, %f2;
        cvt.u32.u64     %r11, %rd10;
@@ -3643,100 +4331,100 @@ BB25_27:
        and.b32         %r13, %r12, %r11;
        setp.eq.s32     %p38, %r13, 0;
        selp.f32        %f261, 0f00000000, 0f3F800000, %p38;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_46:
+BB31_46:
        setp.le.f32     %p43, %f1, %f2;
        selp.f32        %f261, 0f3F800000, 0f00000000, %p43;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_17:
+BB31_17:
        setp.eq.s32     %p23, %r2, 8;
-       @%p23 bra       BB25_18;
-       bra.uni         BB25_63;
+       @%p23 bra       BB31_18;
+       bra.uni         BB31_63;
 
-BB25_18:
+BB31_18:
        setp.ge.f32     %p41, %f1, %f2;
        selp.f32        %f261, 0f3F800000, 0f00000000, %p41;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_40:
+BB31_40:
        setp.neu.f32    %p36, %f1, 0f00000000;
        sub.f32         %f83, %f1, %f2;
        selp.f32        %f261, %f83, 0f00000000, %p36;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_34:
+BB31_34:
        setp.ne.s32     %p10, %r2, 18;
-       @%p10 bra       BB25_63;
+       @%p10 bra       BB31_63;
 
        div.rn.f32      %f261, %f1, %f2;
        abs.f32         %f78, %f261;
        setp.geu.f32    %p31, %f78, 0f7F800000;
-       @%p31 bra       BB25_63;
+       @%p31 bra       BB31_63;
 
        cvt.rmi.f32.f32 %f261, %f261;
-       bra.uni         BB25_63;
+       bra.uni         BB31_63;
 
-BB25_68:
+BB31_68:
        setp.eq.s32     %p97, %r2, 1;
-       @%p97 bra       BB25_69;
-       bra.uni         BB25_125;
+       @%p97 bra       BB31_69;
+       bra.uni         BB31_125;
 
-BB25_69:
+BB31_69:
        sub.f32         %f264, %f2, %f1;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_84:
+BB31_84:
        setp.eq.s32     %p85, %r2, 10;
-       @%p85 bra       BB25_85;
-       bra.uni         BB25_125;
+       @%p85 bra       BB31_85;
+       bra.uni         BB31_125;
 
-BB25_85:
+BB31_85:
        setp.neu.f32    %p106, %f2, %f1;
        selp.f32        %f264, 0f3F800000, 0f00000000, %p106;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_75:
+BB31_75:
        setp.eq.s32     %p92, %r2, 5;
-       @%p92 bra       BB25_76;
-       bra.uni         BB25_125;
+       @%p92 bra       BB31_76;
+       bra.uni         BB31_125;
 
-BB25_76:
+BB31_76:
        setp.lt.f32     %p111, %f2, %f1;
        selp.f32        %f264, 0f3F800000, 0f00000000, %p111;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_92:
+BB31_92:
        setp.eq.s32     %p79, %r2, 15;
-       @%p79 bra       BB25_93;
-       bra.uni         BB25_125;
+       @%p79 bra       BB31_93;
+       bra.uni         BB31_125;
 
-BB25_93:
+BB31_93:
        mul.f32         %f175, %f1, %f2;
        mov.f32         %f176, 0f3F800000;
        sub.f32         %f264, %f176, %f175;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_71:
+BB31_71:
        setp.eq.s32     %p95, %r2, 3;
-       @%p95 bra       BB25_72;
-       bra.uni         BB25_125;
+       @%p95 bra       BB31_72;
+       bra.uni         BB31_125;
 
-BB25_72:
+BB31_72:
        div.rn.f32      %f264, %f2, %f1;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_105:
+BB31_105:
        min.f32         %f264, %f2, %f1;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_88:
+BB31_88:
        setp.eq.s32     %p83, %r2, 13;
-       @%p83 bra       BB25_89;
-       bra.uni         BB25_125;
+       @%p83 bra       BB31_89;
+       bra.uni         BB31_125;
 
-BB25_89:
+BB31_89:
        cvt.rni.s64.f32 %rd14, %f2;
        cvt.rni.s64.f32 %rd15, %f1;
        cvt.u32.u64     %r36, %rd14;
@@ -3744,71 +4432,71 @@ BB25_89:
        and.b32         %r38, %r37, %r36;
        setp.eq.s32     %p105, %r38, 0;
        selp.f32        %f264, 0f00000000, 0f3F800000, %p105;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_108:
+BB31_108:
        setp.le.f32     %p110, %f2, %f1;
        selp.f32        %f264, 0f3F800000, 0f00000000, %p110;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_79:
+BB31_79:
        setp.eq.s32     %p90, %r2, 8;
-       @%p90 bra       BB25_80;
-       bra.uni         BB25_125;
+       @%p90 bra       BB31_80;
+       bra.uni         BB31_125;
 
-BB25_80:
+BB31_80:
        setp.ge.f32     %p108, %f2, %f1;
        selp.f32        %f264, 0f3F800000, 0f00000000, %p108;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_102:
+BB31_102:
        setp.neu.f32    %p103, %f2, 0f00000000;
        sub.f32         %f174, %f2, %f1;
        selp.f32        %f264, %f174, 0f00000000, %p103;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_96:
+BB31_96:
        setp.ne.s32     %p77, %r2, 18;
-       @%p77 bra       BB25_125;
+       @%p77 bra       BB31_125;
 
        div.rn.f32      %f264, %f2, %f1;
        abs.f32         %f169, %f264;
        setp.geu.f32    %p98, %f169, 0f7F800000;
-       @%p98 bra       BB25_125;
+       @%p98 bra       BB31_125;
 
        cvt.rmi.f32.f32 %f264, %f264;
-       bra.uni         BB25_125;
+       bra.uni         BB31_125;
 
-BB25_50:
+BB31_50:
        setp.geu.f32    %p55, %f1, 0f00000000;
-       @%p55 bra       BB25_53;
+       @%p55 bra       BB31_53;
 
        cvt.rzi.f32.f32 %f164, %f2;
        setp.neu.f32    %p56, %f164, %f2;
        selp.f32        %f260, 0f7FFFFFFF, %f260, %p56;
 
-BB25_53:
+BB31_53:
        add.f32         %f166, %f20, %f21;
        mov.b32          %r26, %f166;
        setp.lt.s32     %p59, %r26, 2139095040;
-       @%p59 bra       BB25_60;
+       @%p59 bra       BB31_60;
 
        setp.gtu.f32    %p60, %f20, 0f7F800000;
        setp.gtu.f32    %p61, %f21, 0f7F800000;
        or.pred         %p62, %p60, %p61;
-       @%p62 bra       BB25_59;
-       bra.uni         BB25_55;
+       @%p62 bra       BB31_59;
+       bra.uni         BB31_55;
 
-BB25_59:
+BB31_59:
        add.f32         %f260, %f1, %f2;
-       bra.uni         BB25_60;
+       bra.uni         BB31_60;
 
-BB25_55:
+BB31_55:
        setp.eq.f32     %p63, %f21, 0f7F800000;
-       @%p63 bra       BB25_58;
-       bra.uni         BB25_56;
+       @%p63 bra       BB31_58;
+       bra.uni         BB31_56;
 
-BB25_58:
+BB31_58:
        setp.gt.f32     %p66, %f20, 0f3F800000;
        selp.b32        %r30, 2139095040, 0, %p66;
        xor.b32         %r31, %r30, 2139095040;
@@ -3817,38 +4505,38 @@ BB25_58:
        mov.b32          %f167, %r32;
        setp.eq.f32     %p68, %f1, 0fBF800000;
        selp.f32        %f260, 0f3F800000, %f167, %p68;
-       bra.uni         BB25_60;
+       bra.uni         BB31_60;
 
-BB25_112:
+BB31_112:
        setp.geu.f32    %p122, %f2, 0f00000000;
-       @%p122 bra      BB25_115;
+       @%p122 bra      BB31_115;
 
        cvt.rzi.f32.f32 %f255, %f1;
        setp.neu.f32    %p123, %f255, %f1;
        selp.f32        %f263, 0f7FFFFFFF, %f263, %p123;
 
-BB25_115:
+BB31_115:
        add.f32         %f257, %f57, %f58;
        mov.b32          %r51, %f257;
        setp.lt.s32     %p126, %r51, 2139095040;
-       @%p126 bra      BB25_122;
+       @%p126 bra      BB31_122;
 
        setp.gtu.f32    %p127, %f57, 0f7F800000;
        setp.gtu.f32    %p128, %f58, 0f7F800000;
        or.pred         %p129, %p127, %p128;
-       @%p129 bra      BB25_121;
-       bra.uni         BB25_117;
+       @%p129 bra      BB31_121;
+       bra.uni         BB31_117;
 
-BB25_121:
+BB31_121:
        add.f32         %f263, %f1, %f2;
-       bra.uni         BB25_122;
+       bra.uni         BB31_122;
 
-BB25_117:
+BB31_117:
        setp.eq.f32     %p130, %f58, 0f7F800000;
-       @%p130 bra      BB25_120;
-       bra.uni         BB25_118;
+       @%p130 bra      BB31_120;
+       bra.uni         BB31_118;
 
-BB25_120:
+BB31_120:
        setp.gt.f32     %p133, %f57, 0f3F800000;
        selp.b32        %r55, 2139095040, 0, %p133;
        xor.b32         %r56, %r55, 2139095040;
@@ -3857,11 +4545,11 @@ BB25_120:
        mov.b32          %f258, %r57;
        setp.eq.f32     %p135, %f2, 0fBF800000;
        selp.f32        %f263, 0f3F800000, %f258, %p135;
-       bra.uni         BB25_122;
+       bra.uni         BB31_122;
 
-BB25_56:
+BB31_56:
        setp.neu.f32    %p64, %f20, 0f7F800000;
-       @%p64 bra       BB25_60;
+       @%p64 bra       BB31_60;
 
        setp.ge.f32     %p65, %f2, 0f00000000;
        selp.b32        %r27, 2139095040, 0, %p65;
@@ -3869,19 +4557,19 @@ BB25_56:
        selp.b32        %r29, %r28, %r27, %p1;
        mov.b32          %f260, %r29;
 
-BB25_60:
+BB31_60:
        setp.eq.f32     %p69, %f2, 0f00000000;
        setp.eq.f32     %p70, %f1, 0f3F800000;
        or.pred         %p71, %p70, %p69;
        selp.f32        %f261, 0f3F800000, %f260, %p71;
 
-BB25_63:
+BB31_63:
        st.global.f32   [%rd1], %f261;
-       bra.uni         BB25_126;
+       bra.uni         BB31_126;
 
-BB25_118:
+BB31_118:
        setp.neu.f32    %p131, %f57, 0f7F800000;
-       @%p131 bra      BB25_122;
+       @%p131 bra      BB31_122;
 
        setp.ge.f32     %p132, %f1, 0f00000000;
        selp.b32        %r52, 2139095040, 0, %p132;
@@ -3889,16 +4577,16 @@ BB25_118:
        selp.b32        %r54, %r53, %r52, %p2;
        mov.b32          %f263, %r54;
 
-BB25_122:
+BB31_122:
        setp.eq.f32     %p136, %f1, 0f00000000;
        setp.eq.f32     %p137, %f2, 0f3F800000;
        or.pred         %p138, %p137, %p136;
        selp.f32        %f264, 0f3F800000, %f263, %p138;
 
-BB25_125:
+BB31_125:
        st.global.f32   [%rd1], %f264;
 
-BB25_126:
+BB31_126:
        bar.sync        0;
        ret;
 }
@@ -3924,14 +4612,14 @@ BB25_126:
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r4, %r3, %r5;
        setp.ge.s32     %p1, %r1, %r2;
-       @%p1 bra        BB26_2;
+       @%p1 bra        BB32_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 

<TRUNCATED>

Reply via email to