http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx 
b/src/main/cpp/kernels/SystemML.ptx
index 5f72887..1865e18 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,12 +1,12 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21124049
-// Cuda compilation tools, release 8.0, V8.0.44
+// Compiler Build ID: CL-22781540
+// Cuda compilation tools, release 9.0, V9.0.176
 // Based on LLVM 3.4svn
 //
 
-.version 5.0
+.version 6.0
 .target sm_30
 .address_size 64
 
@@ -128,15 +128,15 @@ BB1_2:
 )
 {
        .reg .pred      %p<13>;
-       .reg .b32       %r<74>;
+       .reg .b32       %r<72>;
        .reg .f64       %fd<2>;
-       .reg .b64       %rd<18>;
+       .reg .b64       %rd<17>;
 
 
-       ld.param.u64    %rd4, [sparse_dense_im2col_d_param_0];
-       ld.param.u64    %rd5, [sparse_dense_im2col_d_param_1];
-       ld.param.u64    %rd6, [sparse_dense_im2col_d_param_2];
-       ld.param.u64    %rd7, [sparse_dense_im2col_d_param_3];
+       ld.param.u64    %rd3, [sparse_dense_im2col_d_param_0];
+       ld.param.u64    %rd4, [sparse_dense_im2col_d_param_1];
+       ld.param.u64    %rd5, [sparse_dense_im2col_d_param_2];
+       ld.param.u64    %rd6, [sparse_dense_im2col_d_param_3];
        ld.param.u32    %r35, [sparse_dense_im2col_d_param_4];
        ld.param.u32    %r22, [sparse_dense_im2col_d_param_7];
        ld.param.u32    %r23, [sparse_dense_im2col_d_param_8];
@@ -158,27 +158,26 @@ BB1_2:
        setp.ge.s32     %p1, %r1, %r35;
        @%p1 bra        BB2_11;
 
-       cvta.to.global.u64      %rd1, %rd6;
-       cvta.to.global.u64      %rd2, %rd5;
-       cvta.to.global.u64      %rd8, %rd4;
-       cvt.s64.s32     %rd3, %r1;
-       mul.wide.s32    %rd9, %r1, 8;
-       add.s64         %rd10, %rd8, %rd9;
-       ld.global.f64   %fd1, [%rd10];
+       cvta.to.global.u64      %rd1, %rd5;
+       cvta.to.global.u64      %rd2, %rd4;
+       cvta.to.global.u64      %rd7, %rd3;
+       mul.wide.s32    %rd8, %r1, 8;
+       add.s64         %rd9, %rd7, %rd8;
+       ld.global.f64   %fd1, [%rd9];
        mov.u32         %r67, 0;
 
 BB2_2:
        mov.u32         %r2, %r67;
        add.s32         %r67, %r2, 1;
-       mul.wide.s32    %rd11, %r67, 4;
-       add.s64         %rd12, %rd2, %rd11;
-       ld.global.u32   %r40, [%rd12];
+       mul.wide.s32    %rd10, %r67, 4;
+       add.s64         %rd11, %rd2, %rd10;
+       ld.global.u32   %r40, [%rd11];
        setp.le.s32     %p2, %r40, %r1;
        @%p2 bra        BB2_2;
 
-       shl.b64         %rd13, %rd3, 2;
-       add.s64         %rd14, %rd1, %rd13;
-       ld.global.u32   %r41, [%rd14];
+       mul.wide.s32    %rd12, %r1, 4;
+       add.s64         %rd13, %rd1, %rd12;
+       ld.global.u32   %r41, [%rd13];
        div.s32         %r4, %r41, %r22;
        rem.s32         %r42, %r41, %r22;
        div.s32         %r43, %r42, %r23;
@@ -189,71 +188,69 @@ BB2_2:
        sub.s32         %r47, %r46, %r45;
        add.s32         %r48, %r47, %r5;
        mov.u32         %r49, 0;
-       max.s32         %r70, %r49, %r48;
+       max.s32         %r68, %r49, %r48;
        add.s32         %r50, %r24, -1;
        min.s32         %r7, %r50, %r5;
        add.s32         %r8, %r44, %r34;
        mul.lo.s32      %r51, %r32, %r27;
        sub.s32         %r52, %r46, %r51;
        add.s32         %r53, %r52, %r8;
-       max.s32         %r73, %r49, %r53;
+       max.s32         %r69, %r49, %r53;
        add.s32         %r54, %r25, -1;
        min.s32         %r10, %r54, %r8;
 
 BB2_4:
-       mov.u32         %r69, %r70;
-       sub.s32         %r55, %r5, %r69;
+       mov.u32         %r70, %r68;
+       sub.s32         %r55, %r5, %r70;
        rem.s32         %r56, %r55, %r31;
        setp.ne.s32     %p3, %r56, 0;
-       setp.le.s32     %p4, %r69, %r7;
-       and.pred        %p5, %p3, %p4;
-       add.s32         %r70, %r69, 1;
+       setp.le.s32     %p4, %r70, %r7;
+       and.pred        %p5, %p4, %p3;
+       add.s32         %r68, %r70, 1;
        @%p5 bra        BB2_4;
 
 BB2_5:
-       mov.u32         %r13, %r73;
+       mov.u32         %r13, %r69;
        sub.s32         %r57, %r8, %r13;
        rem.s32         %r58, %r57, %r32;
        setp.ne.s32     %p6, %r58, 0;
        setp.le.s32     %p7, %r13, %r10;
-       and.pred        %p8, %p6, %p7;
-       add.s32         %r73, %r13, 1;
+       and.pred        %p8, %p7, %p6;
+       add.s32         %r69, %r13, 1;
        @%p8 bra        BB2_5;
 
-       setp.gt.s32     %p9, %r69, %r7;
+       setp.gt.s32     %p9, %r70, %r7;
        @%p9 bra        BB2_11;
 
        mul.lo.s32      %r15, %r2, %r28;
        mul.lo.s32      %r16, %r4, %r29;
-       cvta.to.global.u64      %rd15, %rd7;
+       cvta.to.global.u64      %rd14, %rd6;
 
 BB2_8:
-       sub.s32         %r59, %r5, %r69;
+       sub.s32         %r59, %r5, %r70;
        div.s32         %r60, %r59, %r31;
        mad.lo.s32      %r18, %r60, %r27, %r15;
        setp.gt.s32     %p10, %r13, %r10;
-       mov.u32         %r72, %r13;
+       mov.u32         %r71, %r13;
        @%p10 bra       BB2_10;
 
 BB2_9:
-       mov.u32         %r19, %r72;
-       sub.s32         %r61, %r8, %r19;
+       sub.s32         %r61, %r8, %r71;
        div.s32         %r62, %r61, %r32;
-       mad.lo.s32      %r63, %r69, %r25, %r16;
-       add.s32         %r64, %r63, %r19;
+       mad.lo.s32      %r63, %r70, %r25, %r16;
+       add.s32         %r64, %r63, %r71;
        mad.lo.s32      %r65, %r64, %r30, %r18;
        add.s32         %r66, %r65, %r62;
-       mul.wide.s32    %rd16, %r66, 8;
-       add.s64         %rd17, %rd15, %rd16;
-       st.global.f64   [%rd17], %fd1;
-       add.s32         %r20, %r19, %r32;
-       setp.le.s32     %p11, %r20, %r10;
-       mov.u32         %r72, %r20;
+       mul.wide.s32    %rd15, %r66, 8;
+       add.s64         %rd16, %rd14, %rd15;
+       st.global.f64   [%rd16], %fd1;
+       add.s32         %r71, %r71, %r32;
+       setp.le.s32     %p11, %r71, %r10;
        @%p11 bra       BB2_9;
 
 BB2_10:
-       add.s32         %r69, %r69, %r31;
-       setp.le.s32     %p12, %r69, %r7;
+       add.s32         %r70, %r70, %r31;
+       setp.le.s32     %p12, %r70, %r7;
        @%p12 bra       BB2_8;
 
 BB2_11:
@@ -286,14 +283,14 @@ BB2_11:
 {
        .reg .pred      %p<13>;
        .reg .f32       %f<2>;
-       .reg .b32       %r<74>;
-       .reg .b64       %rd<18>;
+       .reg .b32       %r<72>;
+       .reg .b64       %rd<17>;
 
 
-       ld.param.u64    %rd4, [sparse_dense_im2col_f_param_0];
-       ld.param.u64    %rd5, [sparse_dense_im2col_f_param_1];
-       ld.param.u64    %rd6, [sparse_dense_im2col_f_param_2];
-       ld.param.u64    %rd7, [sparse_dense_im2col_f_param_3];
+       ld.param.u64    %rd3, [sparse_dense_im2col_f_param_0];
+       ld.param.u64    %rd4, [sparse_dense_im2col_f_param_1];
+       ld.param.u64    %rd5, [sparse_dense_im2col_f_param_2];
+       ld.param.u64    %rd6, [sparse_dense_im2col_f_param_3];
        ld.param.u32    %r35, [sparse_dense_im2col_f_param_4];
        ld.param.u32    %r22, [sparse_dense_im2col_f_param_7];
        ld.param.u32    %r23, [sparse_dense_im2col_f_param_8];
@@ -315,27 +312,25 @@ BB2_11:
        setp.ge.s32     %p1, %r1, %r35;
        @%p1 bra        BB3_11;
 
-       cvta.to.global.u64      %rd1, %rd6;
-       cvta.to.global.u64      %rd2, %rd5;
-       cvta.to.global.u64      %rd8, %rd4;
-       cvt.s64.s32     %rd3, %r1;
-       mul.wide.s32    %rd9, %r1, 4;
-       add.s64         %rd10, %rd8, %rd9;
-       ld.global.f32   %f1, [%rd10];
+       cvta.to.global.u64      %rd1, %rd5;
+       cvta.to.global.u64      %rd2, %rd4;
+       cvta.to.global.u64      %rd7, %rd3;
+       mul.wide.s32    %rd8, %r1, 4;
+       add.s64         %rd9, %rd7, %rd8;
+       ld.global.f32   %f1, [%rd9];
        mov.u32         %r67, 0;
 
 BB3_2:
        mov.u32         %r2, %r67;
        add.s32         %r67, %r2, 1;
-       mul.wide.s32    %rd11, %r67, 4;
-       add.s64         %rd12, %rd2, %rd11;
-       ld.global.u32   %r40, [%rd12];
+       mul.wide.s32    %rd10, %r67, 4;
+       add.s64         %rd11, %rd2, %rd10;
+       ld.global.u32   %r40, [%rd11];
        setp.le.s32     %p2, %r40, %r1;
        @%p2 bra        BB3_2;
 
-       shl.b64         %rd13, %rd3, 2;
-       add.s64         %rd14, %rd1, %rd13;
-       ld.global.u32   %r41, [%rd14];
+       add.s64         %rd13, %rd1, %rd8;
+       ld.global.u32   %r41, [%rd13];
        div.s32         %r4, %r41, %r22;
        rem.s32         %r42, %r41, %r22;
        div.s32         %r43, %r42, %r23;
@@ -346,71 +341,69 @@ BB3_2:
        sub.s32         %r47, %r46, %r45;
        add.s32         %r48, %r47, %r5;
        mov.u32         %r49, 0;
-       max.s32         %r70, %r49, %r48;
+       max.s32         %r68, %r49, %r48;
        add.s32         %r50, %r24, -1;
        min.s32         %r7, %r50, %r5;
        add.s32         %r8, %r44, %r34;
        mul.lo.s32      %r51, %r32, %r27;
        sub.s32         %r52, %r46, %r51;
        add.s32         %r53, %r52, %r8;
-       max.s32         %r73, %r49, %r53;
+       max.s32         %r69, %r49, %r53;
        add.s32         %r54, %r25, -1;
        min.s32         %r10, %r54, %r8;
 
 BB3_4:
-       mov.u32         %r69, %r70;
-       sub.s32         %r55, %r5, %r69;
+       mov.u32         %r70, %r68;
+       sub.s32         %r55, %r5, %r70;
        rem.s32         %r56, %r55, %r31;
        setp.ne.s32     %p3, %r56, 0;
-       setp.le.s32     %p4, %r69, %r7;
-       and.pred        %p5, %p3, %p4;
-       add.s32         %r70, %r69, 1;
+       setp.le.s32     %p4, %r70, %r7;
+       and.pred        %p5, %p4, %p3;
+       add.s32         %r68, %r70, 1;
        @%p5 bra        BB3_4;
 
 BB3_5:
-       mov.u32         %r13, %r73;
+       mov.u32         %r13, %r69;
        sub.s32         %r57, %r8, %r13;
        rem.s32         %r58, %r57, %r32;
        setp.ne.s32     %p6, %r58, 0;
        setp.le.s32     %p7, %r13, %r10;
-       and.pred        %p8, %p6, %p7;
-       add.s32         %r73, %r13, 1;
+       and.pred        %p8, %p7, %p6;
+       add.s32         %r69, %r13, 1;
        @%p8 bra        BB3_5;
 
-       setp.gt.s32     %p9, %r69, %r7;
+       setp.gt.s32     %p9, %r70, %r7;
        @%p9 bra        BB3_11;
 
        mul.lo.s32      %r15, %r2, %r28;
        mul.lo.s32      %r16, %r4, %r29;
-       cvta.to.global.u64      %rd15, %rd7;
+       cvta.to.global.u64      %rd14, %rd6;
 
 BB3_8:
-       sub.s32         %r59, %r5, %r69;
+       sub.s32         %r59, %r5, %r70;
        div.s32         %r60, %r59, %r31;
        mad.lo.s32      %r18, %r60, %r27, %r15;
        setp.gt.s32     %p10, %r13, %r10;
-       mov.u32         %r72, %r13;
+       mov.u32         %r71, %r13;
        @%p10 bra       BB3_10;
 
 BB3_9:
-       mov.u32         %r19, %r72;
-       sub.s32         %r61, %r8, %r19;
+       sub.s32         %r61, %r8, %r71;
        div.s32         %r62, %r61, %r32;
-       mad.lo.s32      %r63, %r69, %r25, %r16;
-       add.s32         %r64, %r63, %r19;
+       mad.lo.s32      %r63, %r70, %r25, %r16;
+       add.s32         %r64, %r63, %r71;
        mad.lo.s32      %r65, %r64, %r30, %r18;
        add.s32         %r66, %r65, %r62;
-       mul.wide.s32    %rd16, %r66, 4;
-       add.s64         %rd17, %rd15, %rd16;
-       st.global.f32   [%rd17], %f1;
-       add.s32         %r20, %r19, %r32;
-       setp.le.s32     %p11, %r20, %r10;
-       mov.u32         %r72, %r20;
+       mul.wide.s32    %rd15, %r66, 4;
+       add.s64         %rd16, %rd14, %rd15;
+       st.global.f32   [%rd16], %f1;
+       add.s32         %r71, %r71, %r32;
+       setp.le.s32     %p11, %r71, %r10;
        @%p11 bra       BB3_9;
 
 BB3_10:
-       add.s32         %r69, %r69, %r31;
-       setp.le.s32     %p12, %r69, %r7;
+       add.s32         %r70, %r70, %r31;
+       setp.le.s32     %p12, %r70, %r7;
        @%p12 bra       BB3_8;
 
 BB3_11:
@@ -439,7 +432,7 @@ BB3_11:
 )
 {
        .reg .pred      %p<12>;
-       .reg .b32       %r<71>;
+       .reg .b32       %r<69>;
        .reg .f64       %fd<2>;
        .reg .b64       %rd<9>;
 
@@ -484,38 +477,38 @@ BB3_11:
        sub.s32         %r45, %r44, %r43;
        add.s32         %r46, %r45, %r4;
        mov.u32         %r47, 0;
-       max.s32         %r67, %r47, %r46;
+       max.s32         %r65, %r47, %r46;
        add.s32         %r48, %r24, -1;
        min.s32         %r6, %r48, %r4;
        add.s32         %r7, %r42, %r34;
        mul.lo.s32      %r49, %r32, %r27;
        sub.s32         %r50, %r44, %r49;
        add.s32         %r51, %r50, %r7;
-       max.s32         %r70, %r47, %r51;
+       max.s32         %r66, %r47, %r51;
        add.s32         %r52, %r25, -1;
        min.s32         %r9, %r52, %r7;
 
 BB4_2:
-       mov.u32         %r66, %r67;
-       sub.s32         %r53, %r4, %r66;
+       mov.u32         %r67, %r65;
+       sub.s32         %r53, %r4, %r67;
        rem.s32         %r54, %r53, %r31;
        setp.ne.s32     %p2, %r54, 0;
-       setp.le.s32     %p3, %r66, %r6;
-       and.pred        %p4, %p2, %p3;
-       add.s32         %r67, %r66, 1;
+       setp.le.s32     %p3, %r67, %r6;
+       and.pred        %p4, %p3, %p2;
+       add.s32         %r65, %r67, 1;
        @%p4 bra        BB4_2;
 
 BB4_3:
-       mov.u32         %r12, %r70;
+       mov.u32         %r12, %r66;
        sub.s32         %r55, %r7, %r12;
        rem.s32         %r56, %r55, %r32;
        setp.ne.s32     %p5, %r56, 0;
        setp.le.s32     %p6, %r12, %r9;
-       and.pred        %p7, %p5, %p6;
-       add.s32         %r70, %r12, 1;
+       and.pred        %p7, %p6, %p5;
+       add.s32         %r66, %r12, 1;
        @%p7 bra        BB4_3;
 
-       setp.gt.s32     %p8, %r66, %r6;
+       setp.gt.s32     %p8, %r67, %r6;
        @%p8 bra        BB4_9;
 
        mul.lo.s32      %r14, %r2, %r28;
@@ -523,32 +516,30 @@ BB4_3:
        cvta.to.global.u64      %rd6, %rd2;
 
 BB4_6:
-       sub.s32         %r57, %r4, %r66;
+       sub.s32         %r57, %r4, %r67;
        div.s32         %r58, %r57, %r31;
        mad.lo.s32      %r17, %r58, %r27, %r14;
        setp.gt.s32     %p9, %r12, %r9;
-       mov.u32         %r69, %r12;
+       mov.u32         %r68, %r12;
        @%p9 bra        BB4_8;
 
 BB4_7:
-       mov.u32         %r18, %r69;
-       sub.s32         %r59, %r7, %r18;
+       sub.s32         %r59, %r7, %r68;
        div.s32         %r60, %r59, %r32;
-       mad.lo.s32      %r61, %r66, %r25, %r15;
-       add.s32         %r62, %r61, %r18;
+       mad.lo.s32      %r61, %r67, %r25, %r15;
+       add.s32         %r62, %r61, %r68;
        mad.lo.s32      %r63, %r62, %r30, %r17;
        add.s32         %r64, %r63, %r60;
        mul.wide.s32    %rd7, %r64, 8;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd1;
-       add.s32         %r19, %r18, %r32;
-       setp.le.s32     %p10, %r19, %r9;
-       mov.u32         %r69, %r19;
+       add.s32         %r68, %r68, %r32;
+       setp.le.s32     %p10, %r68, %r9;
        @%p10 bra       BB4_7;
 
 BB4_8:
-       add.s32         %r66, %r66, %r31;
-       setp.le.s32     %p11, %r66, %r6;
+       add.s32         %r67, %r67, %r31;
+       setp.le.s32     %p11, %r67, %r6;
        @%p11 bra       BB4_6;
 
 BB4_9:
@@ -578,7 +569,7 @@ BB4_9:
 {
        .reg .pred      %p<12>;
        .reg .f32       %f<2>;
-       .reg .b32       %r<71>;
+       .reg .b32       %r<69>;
        .reg .b64       %rd<9>;
 
 
@@ -622,38 +613,38 @@ BB4_9:
        sub.s32         %r45, %r44, %r43;
        add.s32         %r46, %r45, %r4;
        mov.u32         %r47, 0;
-       max.s32         %r67, %r47, %r46;
+       max.s32         %r65, %r47, %r46;
        add.s32         %r48, %r24, -1;
        min.s32         %r6, %r48, %r4;
        add.s32         %r7, %r42, %r34;
        mul.lo.s32      %r49, %r32, %r27;
        sub.s32         %r50, %r44, %r49;
        add.s32         %r51, %r50, %r7;
-       max.s32         %r70, %r47, %r51;
+       max.s32         %r66, %r47, %r51;
        add.s32         %r52, %r25, -1;
        min.s32         %r9, %r52, %r7;
 
 BB5_2:
-       mov.u32         %r66, %r67;
-       sub.s32         %r53, %r4, %r66;
+       mov.u32         %r67, %r65;
+       sub.s32         %r53, %r4, %r67;
        rem.s32         %r54, %r53, %r31;
        setp.ne.s32     %p2, %r54, 0;
-       setp.le.s32     %p3, %r66, %r6;
-       and.pred        %p4, %p2, %p3;
-       add.s32         %r67, %r66, 1;
+       setp.le.s32     %p3, %r67, %r6;
+       and.pred        %p4, %p3, %p2;
+       add.s32         %r65, %r67, 1;
        @%p4 bra        BB5_2;
 
 BB5_3:
-       mov.u32         %r12, %r70;
+       mov.u32         %r12, %r66;
        sub.s32         %r55, %r7, %r12;
        rem.s32         %r56, %r55, %r32;
        setp.ne.s32     %p5, %r56, 0;
        setp.le.s32     %p6, %r12, %r9;
-       and.pred        %p7, %p5, %p6;
-       add.s32         %r70, %r12, 1;
+       and.pred        %p7, %p6, %p5;
+       add.s32         %r66, %r12, 1;
        @%p7 bra        BB5_3;
 
-       setp.gt.s32     %p8, %r66, %r6;
+       setp.gt.s32     %p8, %r67, %r6;
        @%p8 bra        BB5_9;
 
        mul.lo.s32      %r14, %r2, %r28;
@@ -661,32 +652,30 @@ BB5_3:
        cvta.to.global.u64      %rd6, %rd2;
 
 BB5_6:
-       sub.s32         %r57, %r4, %r66;
+       sub.s32         %r57, %r4, %r67;
        div.s32         %r58, %r57, %r31;
        mad.lo.s32      %r17, %r58, %r27, %r14;
        setp.gt.s32     %p9, %r12, %r9;
-       mov.u32         %r69, %r12;
+       mov.u32         %r68, %r12;
        @%p9 bra        BB5_8;
 
 BB5_7:
-       mov.u32         %r18, %r69;
-       sub.s32         %r59, %r7, %r18;
+       sub.s32         %r59, %r7, %r68;
        div.s32         %r60, %r59, %r32;
-       mad.lo.s32      %r61, %r66, %r25, %r15;
-       add.s32         %r62, %r61, %r18;
+       mad.lo.s32      %r61, %r67, %r25, %r15;
+       add.s32         %r62, %r61, %r68;
        mad.lo.s32      %r63, %r62, %r30, %r17;
        add.s32         %r64, %r63, %r60;
        mul.wide.s32    %rd7, %r64, 4;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f32   [%rd8], %f1;
-       add.s32         %r19, %r18, %r32;
-       setp.le.s32     %p10, %r19, %r9;
-       mov.u32         %r69, %r19;
+       add.s32         %r68, %r68, %r32;
+       setp.le.s32     %p10, %r68, %r9;
        @%p10 bra       BB5_7;
 
 BB5_8:
-       add.s32         %r66, %r66, %r31;
-       setp.le.s32     %p11, %r66, %r6;
+       add.s32         %r67, %r67, %r31;
+       setp.le.s32     %p11, %r67, %r6;
        @%p11 bra       BB5_6;
 
 BB5_9:
@@ -805,7 +794,7 @@ BB7_2:
 )
 {
        .reg .pred      %p<7>;
-       .reg .b32       %r<24>;
+       .reg .b32       %r<25>;
        .reg .f64       %fd<2>;
        .reg .b64       %rd<23>;
 
@@ -831,8 +820,8 @@ BB7_2:
        mul.wide.s32    %rd14, %r2, 4;
        add.s64         %rd1, %rd13, %rd14;
        ld.global.u32   %r23, [%rd1];
-       ld.global.u32   %r22, [%rd1+4];
-       setp.ge.s32     %p2, %r23, %r22;
+       ld.global.u32   %r24, [%rd1+4];
+       setp.ge.s32     %p2, %r23, %r24;
        @%p2 bra        BB8_6;
 
        cvta.to.global.u64      %rd2, %rd12;
@@ -857,13 +846,13 @@ BB8_3:
        mul.wide.s32    %rd19, %r21, 8;
        add.s64         %rd20, %rd2, %rd19;
        st.global.f64   [%rd20], %fd1;
-       ld.global.u32   %r22, [%rd1+4];
+       ld.global.u32   %r24, [%rd1+4];
 
 BB8_5:
        add.s64         %rd22, %rd22, 8;
        add.s64         %rd21, %rd21, 4;
        add.s32         %r23, %r23, 1;
-       setp.lt.s32     %p6, %r23, %r22;
+       setp.lt.s32     %p6, %r23, %r24;
        @%p6 bra        BB8_3;
 
 BB8_6:
@@ -885,7 +874,7 @@ BB8_6:
 {
        .reg .pred      %p<7>;
        .reg .f32       %f<2>;
-       .reg .b32       %r<24>;
+       .reg .b32       %r<25>;
        .reg .b64       %rd<22>;
 
 
@@ -910,8 +899,8 @@ BB8_6:
        mul.wide.s32    %rd14, %r2, 4;
        add.s64         %rd1, %rd13, %rd14;
        ld.global.u32   %r23, [%rd1];
-       ld.global.u32   %r22, [%rd1+4];
-       setp.ge.s32     %p2, %r23, %r22;
+       ld.global.u32   %r24, [%rd1+4];
+       setp.ge.s32     %p2, %r23, %r24;
        @%p2 bra        BB9_6;
 
        cvta.to.global.u64      %rd2, %rd12;
@@ -935,13 +924,13 @@ BB9_3:
        mul.wide.s32    %rd18, %r21, 4;
        add.s64         %rd19, %rd2, %rd18;
        st.global.f32   [%rd19], %f1;
-       ld.global.u32   %r22, [%rd1+4];
+       ld.global.u32   %r24, [%rd1+4];
 
 BB9_5:
        add.s64         %rd21, %rd21, 4;
        add.s64         %rd20, %rd20, 4;
        add.s32         %r23, %r23, 1;
-       setp.lt.s32     %p6, %r23, %r22;
+       setp.lt.s32     %p6, %r23, %r24;
        @%p6 bra        BB9_3;
 
 BB9_6:
@@ -964,13 +953,13 @@ BB9_6:
        .reg .pred      %p<6>;
        .reg .b32       %r<22>;
        .reg .f64       %fd<2>;
-       .reg .b64       %rd<22>;
+       .reg .b64       %rd<21>;
 
 
-       ld.param.u64    %rd5, [slice_sparse_dense_nnz_d_param_0];
-       ld.param.u64    %rd8, [slice_sparse_dense_nnz_d_param_1];
-       ld.param.u64    %rd6, [slice_sparse_dense_nnz_d_param_2];
-       ld.param.u64    %rd7, [slice_sparse_dense_nnz_d_param_3];
+       ld.param.u64    %rd4, [slice_sparse_dense_nnz_d_param_0];
+       ld.param.u64    %rd7, [slice_sparse_dense_nnz_d_param_1];
+       ld.param.u64    %rd5, [slice_sparse_dense_nnz_d_param_2];
+       ld.param.u64    %rd6, [slice_sparse_dense_nnz_d_param_3];
        ld.param.u32    %r5, [slice_sparse_dense_nnz_d_param_4];
        ld.param.u32    %r9, [slice_sparse_dense_nnz_d_param_5];
        ld.param.u32    %r6, [slice_sparse_dense_nnz_d_param_6];
@@ -980,24 +969,23 @@ BB9_6:
        mov.u32         %r11, %ctaid.x;
        mov.u32         %r12, %tid.x;
        mad.lo.s32      %r13, %r10, %r11, %r12;
-       cvta.to.global.u64      %rd1, %rd8;
-       mul.wide.s32    %rd9, %r5, 4;
-       add.s64         %rd10, %rd1, %rd9;
-       ld.global.u32   %r14, [%rd10];
+       cvta.to.global.u64      %rd1, %rd7;
+       mul.wide.s32    %rd8, %r5, 4;
+       add.s64         %rd9, %rd1, %rd8;
+       ld.global.u32   %r14, [%rd9];
        add.s32         %r1, %r13, %r14;
-       mul.wide.s32    %rd11, %r9, 4;
-       add.s64         %rd12, %rd1, %rd11;
-       ld.global.u32   %r15, [%rd12+4];
+       mul.wide.s32    %rd10, %r9, 4;
+       add.s64         %rd11, %rd1, %rd10;
+       ld.global.u32   %r15, [%rd11+4];
        setp.ge.s32     %p1, %r1, %r15;
        @%p1 bra        BB10_5;
 
-       cvta.to.global.u64      %rd2, %rd7;
-       cvta.to.global.u64      %rd3, %rd5;
-       cvta.to.global.u64      %rd13, %rd6;
-       cvt.s64.s32     %rd4, %r1;
-       mul.wide.s32    %rd14, %r1, 4;
-       add.s64         %rd15, %rd13, %rd14;
-       ld.global.u32   %r2, [%rd15];
+       cvta.to.global.u64      %rd2, %rd6;
+       cvta.to.global.u64      %rd3, %rd4;
+       cvta.to.global.u64      %rd12, %rd5;
+       mul.wide.s32    %rd13, %r1, 4;
+       add.s64         %rd14, %rd12, %rd13;
+       ld.global.u32   %r2, [%rd14];
        setp.lt.s32     %p2, %r2, %r6;
        setp.gt.s32     %p3, %r2, %r7;
        or.pred         %p4, %p2, %p3;
@@ -1007,24 +995,23 @@ BB9_6:
 
 BB10_3:
        mov.u32         %r3, %r21;
-       add.s32         %r4, %r3, 1;
-       mul.wide.s32    %rd16, %r4, 4;
-       add.s64         %rd17, %rd1, %rd16;
-       ld.global.u32   %r16, [%rd17];
+       add.s32         %r21, %r3, 1;
+       mul.wide.s32    %rd15, %r21, 4;
+       add.s64         %rd16, %rd1, %rd15;
+       ld.global.u32   %r16, [%rd16];
        setp.le.s32     %p5, %r16, %r1;
-       mov.u32         %r21, %r4;
        @%p5 bra        BB10_3;
 
-       shl.b64         %rd18, %rd4, 3;
-       add.s64         %rd19, %rd3, %rd18;
-       ld.global.f64   %fd1, [%rd19];
+       mul.wide.s32    %rd17, %r1, 8;
+       add.s64         %rd18, %rd3, %rd17;
+       ld.global.f64   %fd1, [%rd18];
        sub.s32         %r17, %r3, %r5;
        mul.lo.s32      %r18, %r17, %r8;
        sub.s32         %r19, %r18, %r6;
        add.s32         %r20, %r19, %r2;
-       mul.wide.s32    %rd20, %r20, 8;
-       add.s64         %rd21, %rd2, %rd20;
-       st.global.f64   [%rd21], %fd1;
+       mul.wide.s32    %rd19, %r20, 8;
+       add.s64         %rd20, %rd2, %rd19;
+       st.global.f64   [%rd20], %fd1;
 
 BB10_5:
        ret;
@@ -1046,13 +1033,13 @@ BB10_5:
        .reg .pred      %p<6>;
        .reg .f32       %f<2>;
        .reg .b32       %r<22>;
-       .reg .b64       %rd<22>;
+       .reg .b64       %rd<21>;
 
 
-       ld.param.u64    %rd5, [slice_sparse_dense_nnz_f_param_0];
-       ld.param.u64    %rd8, [slice_sparse_dense_nnz_f_param_1];
-       ld.param.u64    %rd6, [slice_sparse_dense_nnz_f_param_2];
-       ld.param.u64    %rd7, [slice_sparse_dense_nnz_f_param_3];
+       ld.param.u64    %rd4, [slice_sparse_dense_nnz_f_param_0];
+       ld.param.u64    %rd7, [slice_sparse_dense_nnz_f_param_1];
+       ld.param.u64    %rd5, [slice_sparse_dense_nnz_f_param_2];
+       ld.param.u64    %rd6, [slice_sparse_dense_nnz_f_param_3];
        ld.param.u32    %r5, [slice_sparse_dense_nnz_f_param_4];
        ld.param.u32    %r9, [slice_sparse_dense_nnz_f_param_5];
        ld.param.u32    %r6, [slice_sparse_dense_nnz_f_param_6];
@@ -1062,24 +1049,23 @@ BB10_5:
        mov.u32         %r11, %ctaid.x;
        mov.u32         %r12, %tid.x;
        mad.lo.s32      %r13, %r10, %r11, %r12;
-       cvta.to.global.u64      %rd1, %rd8;
-       mul.wide.s32    %rd9, %r5, 4;
-       add.s64         %rd10, %rd1, %rd9;
-       ld.global.u32   %r14, [%rd10];
+       cvta.to.global.u64      %rd1, %rd7;
+       mul.wide.s32    %rd8, %r5, 4;
+       add.s64         %rd9, %rd1, %rd8;
+       ld.global.u32   %r14, [%rd9];
        add.s32         %r1, %r13, %r14;
-       mul.wide.s32    %rd11, %r9, 4;
-       add.s64         %rd12, %rd1, %rd11;
-       ld.global.u32   %r15, [%rd12+4];
+       mul.wide.s32    %rd10, %r9, 4;
+       add.s64         %rd11, %rd1, %rd10;
+       ld.global.u32   %r15, [%rd11+4];
        setp.ge.s32     %p1, %r1, %r15;
        @%p1 bra        BB11_5;
 
-       cvta.to.global.u64      %rd2, %rd7;
-       cvta.to.global.u64      %rd3, %rd5;
-       cvta.to.global.u64      %rd13, %rd6;
-       cvt.s64.s32     %rd4, %r1;
-       mul.wide.s32    %rd14, %r1, 4;
-       add.s64         %rd15, %rd13, %rd14;
-       ld.global.u32   %r2, [%rd15];
+       cvta.to.global.u64      %rd2, %rd6;
+       cvta.to.global.u64      %rd3, %rd4;
+       cvta.to.global.u64      %rd12, %rd5;
+       mul.wide.s32    %rd13, %r1, 4;
+       add.s64         %rd14, %rd12, %rd13;
+       ld.global.u32   %r2, [%rd14];
        setp.lt.s32     %p2, %r2, %r6;
        setp.gt.s32     %p3, %r2, %r7;
        or.pred         %p4, %p2, %p3;
@@ -1089,24 +1075,22 @@ BB10_5:
 
 BB11_3:
        mov.u32         %r3, %r21;
-       add.s32         %r4, %r3, 1;
-       mul.wide.s32    %rd16, %r4, 4;
-       add.s64         %rd17, %rd1, %rd16;
-       ld.global.u32   %r16, [%rd17];
+       add.s32         %r21, %r3, 1;
+       mul.wide.s32    %rd15, %r21, 4;
+       add.s64         %rd16, %rd1, %rd15;
+       ld.global.u32   %r16, [%rd16];
        setp.le.s32     %p5, %r16, %r1;
-       mov.u32         %r21, %r4;
        @%p5 bra        BB11_3;
 
-       shl.b64         %rd18, %rd4, 2;
-       add.s64         %rd19, %rd3, %rd18;
-       ld.global.f32   %f1, [%rd19];
+       add.s64         %rd18, %rd3, %rd13;
+       ld.global.f32   %f1, [%rd18];
        sub.s32         %r17, %r3, %r5;
        mul.lo.s32      %r18, %r17, %r8;
        sub.s32         %r19, %r18, %r6;
        add.s32         %r20, %r19, %r2;
-       mul.wide.s32    %rd20, %r20, 4;
-       add.s64         %rd21, %rd2, %rd20;
-       st.global.f32   [%rd21], %f1;
+       mul.wide.s32    %rd19, %r20, 4;
+       add.s64         %rd20, %rd2, %rd19;
+       st.global.f32   [%rd20], %f1;
 
 BB11_5:
        ret;
@@ -1409,12 +1393,12 @@ BB17_2:
        .reg .pred      %p<5>;
        .reg .b32       %r<8>;
        .reg .f64       %fd<6>;
-       .reg .b64       %rd<14>;
+       .reg .b64       %rd<13>;
 
 
-       ld.param.u64    %rd2, [relu_backward_d_param_0];
-       ld.param.u64    %rd3, [relu_backward_d_param_1];
-       ld.param.u64    %rd4, [relu_backward_d_param_2];
+       ld.param.u64    %rd1, [relu_backward_d_param_0];
+       ld.param.u64    %rd2, [relu_backward_d_param_1];
+       ld.param.u64    %rd3, [relu_backward_d_param_2];
        ld.param.u32    %r2, [relu_backward_d_param_3];
        ld.param.u32    %r3, [relu_backward_d_param_4];
        mov.u32         %r4, %ntid.x;
@@ -1429,25 +1413,22 @@ BB17_2:
        bra.uni         BB18_1;
 
 BB18_1:
-       cvta.to.global.u64      %rd5, %rd2;
-       cvt.s64.s32     %rd1, %r1;
-       mul.wide.s32    %rd6, %r1, 8;
-       add.s64         %rd7, %rd5, %rd6;
-       ld.global.f64   %fd4, [%rd7];
+       cvta.to.global.u64      %rd4, %rd1;
+       mul.wide.s32    %rd5, %r1, 8;
+       add.s64         %rd6, %rd4, %rd5;
+       ld.global.f64   %fd4, [%rd6];
        mov.f64         %fd5, 0d0000000000000000;
        setp.leu.f64    %p4, %fd4, 0d0000000000000000;
        @%p4 bra        BB18_3;
 
-       cvta.to.global.u64      %rd8, %rd3;
-       shl.b64         %rd9, %rd1, 3;
-       add.s64         %rd10, %rd8, %rd9;
-       ld.global.f64   %fd5, [%rd10];
+       cvta.to.global.u64      %rd7, %rd2;
+       add.s64         %rd9, %rd7, %rd5;
+       ld.global.f64   %fd5, [%rd9];
 
 BB18_3:
-       cvta.to.global.u64      %rd11, %rd4;
-       shl.b64         %rd12, %rd1, 3;
-       add.s64         %rd13, %rd11, %rd12;
-       st.global.f64   [%rd13], %fd5;
+       cvta.to.global.u64      %rd10, %rd3;
+       add.s64         %rd12, %rd10, %rd5;
+       st.global.f64   [%rd12], %fd5;
 
 BB18_4:
        ret;
@@ -1465,12 +1446,12 @@ BB18_4:
        .reg .pred      %p<5>;
        .reg .f32       %f<6>;
        .reg .b32       %r<8>;
-       .reg .b64       %rd<14>;
+       .reg .b64       %rd<13>;
 
 
-       ld.param.u64    %rd2, [relu_backward_f_param_0];
-       ld.param.u64    %rd3, [relu_backward_f_param_1];
-       ld.param.u64    %rd4, [relu_backward_f_param_2];
+       ld.param.u64    %rd1, [relu_backward_f_param_0];
+       ld.param.u64    %rd2, [relu_backward_f_param_1];
+       ld.param.u64    %rd3, [relu_backward_f_param_2];
        ld.param.u32    %r2, [relu_backward_f_param_3];
        ld.param.u32    %r3, [relu_backward_f_param_4];
        mov.u32         %r4, %ntid.x;
@@ -1485,25 +1466,22 @@ BB18_4:
        bra.uni         BB19_1;
 
 BB19_1:
-       cvta.to.global.u64      %rd5, %rd2;
-       cvt.s64.s32     %rd1, %r1;
-       mul.wide.s32    %rd6, %r1, 4;
-       add.s64         %rd7, %rd5, %rd6;
-       ld.global.f32   %f4, [%rd7];
+       cvta.to.global.u64      %rd4, %rd1;
+       mul.wide.s32    %rd5, %r1, 4;
+       add.s64         %rd6, %rd4, %rd5;
+       ld.global.f32   %f4, [%rd6];
        mov.f32         %f5, 0f00000000;
        setp.leu.f32    %p4, %f4, 0f00000000;
        @%p4 bra        BB19_3;
 
-       cvta.to.global.u64      %rd8, %rd3;
-       shl.b64         %rd9, %rd1, 2;
-       add.s64         %rd10, %rd8, %rd9;
-       ld.global.f32   %f5, [%rd10];
+       cvta.to.global.u64      %rd7, %rd2;
+       add.s64         %rd9, %rd7, %rd5;
+       ld.global.f32   %f5, [%rd9];
 
 BB19_3:
-       cvta.to.global.u64      %rd11, %rd4;
-       shl.b64         %rd12, %rd1, 2;
-       add.s64         %rd13, %rd11, %rd12;
-       st.global.f32   [%rd13], %f5;
+       cvta.to.global.u64      %rd10, %rd3;
+       add.s64         %rd12, %rd10, %rd5;
+       st.global.f32   [%rd12], %f5;
 
 BB19_4:
        ret;
@@ -1965,106 +1943,119 @@ BB27_2:
        .param .u32 matrix_matrix_cellwise_op_d_param_7
 )
 {
-       .reg .pred      %p<77>;
-       .reg .b32       %r<56>;
-       .reg .f64       %fd<55>;
+       .reg .pred      %p<73>;
+       .reg .b32       %r<61>;
+       .reg .f64       %fd<51>;
        .reg .b64       %rd<19>;
 
 
        ld.param.u64    %rd2, [matrix_matrix_cellwise_op_d_param_0];
        ld.param.u64    %rd3, [matrix_matrix_cellwise_op_d_param_1];
        ld.param.u64    %rd4, [matrix_matrix_cellwise_op_d_param_2];
-       ld.param.u32    %r10, [matrix_matrix_cellwise_op_d_param_3];
-       ld.param.u32    %r6, [matrix_matrix_cellwise_op_d_param_4];
-       ld.param.u32    %r7, [matrix_matrix_cellwise_op_d_param_5];
-       ld.param.u32    %r8, [matrix_matrix_cellwise_op_d_param_6];
-       ld.param.u32    %r9, [matrix_matrix_cellwise_op_d_param_7];
-       mov.u32         %r11, %ctaid.x;
-       mov.u32         %r12, %ntid.x;
-       mov.u32         %r13, %tid.x;
-       mad.lo.s32      %r1, %r12, %r11, %r13;
-       div.s32         %r2, %r1, %r6;
-       setp.lt.s32     %p2, %r2, %r10;
-       setp.gt.s32     %p3, %r6, -1;
+       ld.param.u32    %r14, [matrix_matrix_cellwise_op_d_param_3];
+       ld.param.u32    %r10, [matrix_matrix_cellwise_op_d_param_4];
+       ld.param.u32    %r11, [matrix_matrix_cellwise_op_d_param_5];
+       ld.param.u32    %r12, [matrix_matrix_cellwise_op_d_param_6];
+       ld.param.u32    %r13, [matrix_matrix_cellwise_op_d_param_7];
+       mov.u32         %r15, %ntid.x;
+       mov.u32         %r16, %ctaid.x;
+       mov.u32         %r17, %tid.x;
+       mad.lo.s32      %r18, %r15, %r16, %r17;
+       div.s32         %r60, %r18, %r10;
+       rem.s32         %r2, %r18, %r10;
+       setp.lt.s32     %p2, %r60, %r14;
+       setp.gt.s32     %p3, %r10, -1;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB28_65;
+       @!%p4 bra       BB28_77;
        bra.uni         BB28_1;
 
 BB28_1:
-       rem.s32         %r14, %r1, %r6;
-       cvta.to.global.u64      %rd5, %rd2;
-       mad.lo.s32      %r3, %r2, %r6, %r14;
-       setp.eq.s32     %p5, %r7, 2;
-       selp.b32        %r15, %r14, %r3, %p5;
-       setp.eq.s32     %p6, %r7, 1;
-       selp.b32        %r16, %r2, %r15, %p6;
-       setp.eq.s32     %p7, %r8, 2;
-       selp.b32        %r17, %r14, %r3, %p7;
-       setp.eq.s32     %p8, %r8, 1;
-       selp.b32        %r18, %r2, %r17, %p8;
-       mul.wide.s32    %rd6, %r16, 8;
-       add.s64         %rd7, %rd5, %rd6;
-       ld.global.f64   %fd1, [%rd7];
-       cvta.to.global.u64      %rd8, %rd3;
-       mul.wide.s32    %rd9, %r18, 8;
-       add.s64         %rd10, %rd8, %rd9;
+       mad.lo.s32      %r3, %r60, %r10, %r2;
+       setp.eq.s32     %p5, %r11, 1;
+       mov.u32         %r58, %r60;
+       @%p5 bra        BB28_4;
+
+       setp.ne.s32     %p6, %r11, 2;
+       mov.u32         %r58, %r3;
+       @%p6 bra        BB28_4;
+
+       mov.u32         %r58, %r2;
+
+BB28_4:
+       setp.eq.s32     %p7, %r12, 1;
+       @%p7 bra        BB28_7;
+
+       setp.ne.s32     %p8, %r12, 2;
+       mov.u32         %r60, %r3;
+       @%p8 bra        BB28_7;
+
+       mov.u32         %r60, %r2;
+
+BB28_7:
+       cvta.to.global.u64      %rd5, %rd3;
+       cvta.to.global.u64      %rd6, %rd2;
+       mul.wide.s32    %rd7, %r58, 8;
+       add.s64         %rd8, %rd6, %rd7;
+       ld.global.f64   %fd1, [%rd8];
+       mul.wide.s32    %rd9, %r60, 8;
+       add.s64         %rd10, %rd5, %rd9;
        ld.global.f64   %fd2, [%rd10];
-       mov.f64         %fd54, 0d7FEFFFFFFFFFFFFF;
-       setp.gt.s32     %p9, %r9, 8;
-       @%p9 bra        BB28_18;
+       mov.f64         %fd50, 0d7FEFFFFFFFFFFFFF;
+       setp.gt.s32     %p9, %r13, 8;
+       @%p9 bra        BB28_24;
 
-       setp.gt.s32     %p23, %r9, 3;
-       @%p23 bra       BB28_10;
+       setp.gt.s32     %p23, %r13, 3;
+       @%p23 bra       BB28_16;
 
-       setp.gt.s32     %p30, %r9, 1;
-       @%p30 bra       BB28_7;
+       setp.gt.s32     %p30, %r13, 1;
+       @%p30 bra       BB28_13;
 
-       setp.eq.s32     %p33, %r9, 0;
-       @%p33 bra       BB28_63;
-       bra.uni         BB28_5;
+       setp.eq.s32     %p33, %r13, 0;
+       @%p33 bra       BB28_75;
+       bra.uni         BB28_11;
 
-BB28_63:
-       add.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB28_64;
+BB28_75:
+       add.f64         %fd50, %fd1, %fd2;
+       bra.uni         BB28_76;
 
-BB28_18:
-       setp.gt.s32     %p10, %r9, 13;
-       @%p10 bra       BB28_27;
+BB28_24:
+       setp.gt.s32     %p10, %r13, 13;
+       @%p10 bra       BB28_33;
 
-       setp.gt.s32     %p17, %r9, 10;
-       @%p17 bra       BB28_23;
+       setp.gt.s32     %p17, %r13, 10;
+       @%p17 bra       BB28_29;
 
-       setp.eq.s32     %p21, %r9, 9;
-       @%p21 bra       BB28_45;
-       bra.uni         BB28_21;
+       setp.eq.s32     %p21, %r13, 9;
+       @%p21 bra       BB28_53;
+       bra.uni         BB28_27;
 
-BB28_45:
-       setp.eq.f64     %p50, %fd1, %fd2;
-       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
-       bra.uni         BB28_64;
+BB28_53:
+       setp.eq.f64     %p48, %fd1, %fd2;
+       selp.f64        %fd50, 0d3FF0000000000000, 0d0000000000000000, %p48;
+       bra.uni         BB28_76;
 
-BB28_10:
-       setp.gt.s32     %p24, %r9, 5;
-       @%p24 bra       BB28_14;
+BB28_16:
+       setp.gt.s32     %p24, %r13, 5;
+       @%p24 bra       BB28_20;
 
-       setp.eq.s32     %p28, %r9, 4;
-       @%p28 bra       BB28_48;
-       bra.uni         BB28_12;
+       setp.eq.s32     %p28, %r13, 4;
+       @%p28 bra       BB28_56;
+       bra.uni         BB28_18;
 
-BB28_48:
+BB28_56:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r4}, %fd1;
+       mov.b64         {%temp, %r8}, %fd1;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r5}, %fd2;
+       mov.b64         {%temp, %r9}, %fd2;
        }
-       bfe.u32         %r31, %r5, 20, 11;
+       bfe.u32         %r31, %r9, 20, 11;
        add.s32         %r32, %r31, -1012;
        mov.b64          %rd15, %fd2;
        shl.b64         %rd1, %rd15, %r32;
-       setp.eq.s64     %p55, %rd1, -9223372036854775808;
+       setp.eq.s64     %p53, %rd1, -9223372036854775808;
        abs.f64         %fd19, %fd1;
        // Callseq Start 0
        {
@@ -2081,341 +2072,343 @@ BB28_48:
        param0, 
        param1
        );
-       ld.param.f64    %fd53, [retval0+0];
+       ld.param.f64    %fd25, [retval0+0];
        
        //{
        }// Callseq End 0
-       setp.lt.s32     %p56, %r4, 0;
-       and.pred        %p1, %p56, %p55;
-       @!%p1 bra       BB28_50;
-       bra.uni         BB28_49;
+       setp.lt.s32     %p54, %r8, 0;
+       and.pred        %p1, %p54, %p53;
+       @!%p1 bra       BB28_58;
+       bra.uni         BB28_57;
 
-BB28_49:
+BB28_57:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r33}, %fd53;
+       mov.b64         {%temp, %r33}, %fd25;
        }
        xor.b32         %r34, %r33, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r35, %temp}, %fd53;
+       mov.b64         {%r35, %temp}, %fd25;
        }
-       mov.b64         %fd53, {%r35, %r34};
+       mov.b64         %fd25, {%r35, %r34};
 
-BB28_50:
-       mov.f64         %fd52, %fd53;
-       setp.eq.f64     %p57, %fd1, 0d0000000000000000;
-       @%p57 bra       BB28_53;
-       bra.uni         BB28_51;
+BB28_58:
+       setp.eq.f64     %p55, %fd1, 0d0000000000000000;
+       @%p55 bra       BB28_61;
+       bra.uni         BB28_59;
 
-BB28_53:
-       selp.b32        %r36, %r4, 0, %p55;
+BB28_61:
+       selp.b32        %r36, %r8, 0, %p53;
        or.b32          %r37, %r36, 2146435072;
-       setp.lt.s32     %p61, %r5, 0;
-       selp.b32        %r38, %r37, %r36, %p61;
+       setp.lt.s32     %p59, %r9, 0;
+       selp.b32        %r38, %r37, %r36, %p59;
        mov.u32         %r39, 0;
-       mov.b64         %fd52, {%r39, %r38};
-       bra.uni         BB28_54;
+       mov.b64         %fd25, {%r39, %r38};
+       bra.uni         BB28_62;
 
-BB28_27:
-       setp.gt.s32     %p11, %r9, 15;
-       @%p11 bra       BB28_31;
+BB28_33:
+       setp.gt.s32     %p11, %r13, 15;
+       @%p11 bra       BB28_37;
 
-       setp.eq.s32     %p15, %r9, 14;
-       @%p15 bra       BB28_42;
-       bra.uni         BB28_29;
+       setp.eq.s32     %p15, %r13, 14;
+       @%p15 bra       BB28_50;
+       bra.uni         BB28_35;
 
-BB28_42:
+BB28_50:
        cvt.rni.s64.f64 %rd11, %fd1;
-       cvt.rni.s64.f64 %rd12, %fd2;
        cvt.u32.u64     %r25, %rd11;
+       cvt.rni.s64.f64 %rd12, %fd2;
        cvt.u32.u64     %r26, %rd12;
        or.b32          %r27, %r26, %r25;
-       setp.eq.s32     %p47, %r27, 0;
-       selp.f64        %fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
-       bra.uni         BB28_64;
+       setp.eq.s32     %p45, %r27, 0;
+       selp.f64        %fd50, 0d0000000000000000, 0d3FF0000000000000, %p45;
+       bra.uni         BB28_76;
 
-BB28_7:
-       setp.eq.s32     %p31, %r9, 2;
-       @%p31 bra       BB28_62;
-       bra.uni         BB28_8;
+BB28_13:
+       setp.eq.s32     %p31, %r13, 2;
+       @%p31 bra       BB28_74;
+       bra.uni         BB28_14;
 
-BB28_62:
-       mul.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB28_64;
+BB28_74:
+       mul.f64         %fd50, %fd1, %fd2;
+       bra.uni         BB28_76;
 
-BB28_23:
-       setp.eq.s32     %p18, %r9, 11;
-       @%p18 bra       BB28_44;
+BB28_29:
+       setp.eq.s32     %p18, %r13, 11;
+       @%p18 bra       BB28_52;
 
-       setp.eq.s32     %p19, %r9, 12;
-       @%p19 bra       BB28_43;
-       bra.uni         BB28_25;
+       setp.eq.s32     %p19, %r13, 12;
+       @%p19 bra       BB28_51;
+       bra.uni         BB28_31;
 
-BB28_43:
-       max.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB28_64;
+BB28_51:
+       max.f64         %fd50, %fd1, %fd2;
+       bra.uni         BB28_76;
 
-BB28_14:
-       setp.eq.s32     %p25, %r9, 6;
-       @%p25 bra       BB28_47;
+BB28_20:
+       setp.eq.s32     %p25, %r13, 6;
+       @%p25 bra       BB28_55;
 
-       setp.eq.s32     %p26, %r9, 7;
-       @%p26 bra       BB28_46;
-       bra.uni         BB28_16;
+       setp.eq.s32     %p26, %r13, 7;
+       @%p26 bra       BB28_54;
+       bra.uni         BB28_22;
 
-BB28_46:
-       setp.gt.f64     %p52, %fd1, %fd2;
-       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
-       bra.uni         BB28_64;
+BB28_54:
+       setp.gt.f64     %p50, %fd1, %fd2;
+       selp.f64        %fd50, 0d3FF0000000000000, 0d0000000000000000, %p50;
+       bra.uni         BB28_76;
 
-BB28_31:
-       setp.eq.s32     %p12, %r9, 16;
-       @%p12 bra       BB28_41;
+BB28_37:
+       setp.eq.s32     %p12, %r13, 16;
+       @%p12 bra       BB28_49;
 
-       setp.eq.s32     %p13, %r9, 17;
-       @%p13 bra       BB28_37;
-       bra.uni         BB28_33;
+       setp.eq.s32     %p13, %r13, 17;
+       @%p13 bra       BB28_44;
+       bra.uni         BB28_39;
 
-BB28_37:
-       setp.eq.f64     %p39, %fd2, 0d0000000000000000;
-       setp.eq.f64     %p40, %fd2, 0d8000000000000000;
-       or.pred         %p41, %p39, %p40;
-       mov.f64         %fd54, 0d7FF8000000000000;
-       @%p41 bra       BB28_64;
+BB28_44:
+       setp.eq.f64     %p38, %fd2, 0d0000000000000000;
+       setp.eq.f64     %p39, %fd2, 0d8000000000000000;
+       or.pred         %p40, %p38, %p39;
+       mov.f64         %fd50, 0d7FF8000000000000;
+       @%p40 bra       BB28_76;
 
-       div.rn.f64      %fd54, %fd1, %fd2;
-       abs.f64         %fd39, %fd54;
-       setp.gtu.f64    %p42, %fd39, 0d7FF0000000000000;
-       @%p42 bra       BB28_64;
+       div.rn.f64      %fd50, %fd1, %fd2;
+       abs.f64         %fd39, %fd50;
+       setp.gtu.f64    %p41, %fd39, 0d7FF0000000000000;
+       @%p41 bra       BB28_76;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r22, %temp}, %fd54;
+       mov.b64         {%temp, %r22}, %fd50;
        }
+       and.b32         %r23, %r22, 2147483647;
+       setp.ne.s32     %p42, %r23, 2146435072;
+       @%p42 bra       BB28_48;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r23}, %fd54;
+       mov.b64         {%r24, %temp}, %fd50;
        }
-       and.b32         %r24, %r23, 2147483647;
-       setp.ne.s32     %p43, %r24, 2146435072;
-       setp.ne.s32     %p44, %r22, 0;
-       or.pred         %p45, %p43, %p44;
-       @!%p45 bra      BB28_64;
-       bra.uni         BB28_40;
+       setp.eq.s32     %p43, %r24, 0;
+       @%p43 bra       BB28_76;
 
-BB28_40:
-       cvt.rmi.f64.f64 %fd40, %fd54;
+BB28_48:
+       cvt.rmi.f64.f64 %fd40, %fd50;
        mul.f64         %fd41, %fd2, %fd40;
-       sub.f64         %fd54, %fd1, %fd41;
-       bra.uni         BB28_64;
+       sub.f64         %fd50, %fd1, %fd41;
+       bra.uni         BB28_76;
 
-BB28_5:
-       setp.eq.s32     %p34, %r9, 1;
-       @%p34 bra       BB28_6;
-       bra.uni         BB28_64;
+BB28_11:
+       setp.eq.s32     %p34, %r13, 1;
+       @%p34 bra       BB28_12;
+       bra.uni         BB28_76;
 
-BB28_6:
-       sub.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB28_64;
+BB28_12:
+       sub.f64         %fd50, %fd1, %fd2;
+       bra.uni         BB28_76;
 
-BB28_21:
-       setp.eq.s32     %p22, %r9, 10;
-       @%p22 bra       BB28_22;
-       bra.uni         BB28_64;
+BB28_27:
+       setp.eq.s32     %p22, %r13, 10;
+       @%p22 bra       BB28_28;
+       bra.uni         BB28_76;
 
-BB28_22:
-       setp.neu.f64    %p49, %fd1, %fd2;
-       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
-       bra.uni         BB28_64;
+BB28_28:
+       setp.neu.f64    %p47, %fd1, %fd2;
+       selp.f64        %fd50, 0d3FF0000000000000, 0d0000000000000000, %p47;
+       bra.uni         BB28_76;
 
-BB28_12:
-       setp.eq.s32     %p29, %r9, 5;
-       @%p29 bra       BB28_13;
-       bra.uni         BB28_64;
+BB28_18:
+       setp.eq.s32     %p29, %r13, 5;
+       @%p29 bra       BB28_19;
+       bra.uni         BB28_76;
 
-BB28_13:
-       setp.lt.f64     %p54, %fd1, %fd2;
-       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
-       bra.uni         BB28_64;
+BB28_19:
+       setp.lt.f64     %p52, %fd1, %fd2;
+       selp.f64        %fd50, 0d3FF0000000000000, 0d0000000000000000, %p52;
+       bra.uni         BB28_76;
 
-BB28_29:
-       setp.eq.s32     %p16, %r9, 15;
-       @%p16 bra       BB28_30;
-       bra.uni         BB28_64;
+BB28_35:
+       setp.eq.s32     %p16, %r13, 15;
+       @%p16 bra       BB28_36;
+       bra.uni         BB28_76;
 
-BB28_30:
+BB28_36:
        mul.f64         %fd43, %fd1, %fd2;
        mov.f64         %fd44, 0d3FF0000000000000;
-       sub.f64         %fd54, %fd44, %fd43;
-       bra.uni         BB28_64;
+       sub.f64         %fd50, %fd44, %fd43;
+       bra.uni         BB28_76;
 
-BB28_8:
-       setp.eq.s32     %p32, %r9, 3;
-       @%p32 bra       BB28_9;
-       bra.uni         BB28_64;
+BB28_14:
+       setp.eq.s32     %p32, %r13, 3;
+       @%p32 bra       BB28_15;
+       bra.uni         BB28_76;
 
-BB28_9:
-       div.rn.f64      %fd54, %fd1, %fd2;
-       bra.uni         BB28_64;
+BB28_15:
+       div.rn.f64      %fd50, %fd1, %fd2;
+       bra.uni         BB28_76;
 
-BB28_44:
-       min.f64         %fd54, %fd1, %fd2;
-       bra.uni         BB28_64;
+BB28_52:
+       min.f64         %fd50, %fd1, %fd2;
+       bra.uni         BB28_76;
 
-BB28_25:
-       setp.eq.s32     %p20, %r9, 13;
-       @%p20 bra       BB28_26;
-       bra.uni         BB28_64;
+BB28_31:
+       setp.eq.s32     %p20, %r13, 13;
+       @%p20 bra       BB28_32;
+       bra.uni         BB28_76;
 
-BB28_26:
+BB28_32:
        cvt.rni.s64.f64 %rd13, %fd1;
-       cvt.rni.s64.f64 %rd14, %fd2;
        cvt.u32.u64     %r28, %rd13;
+       cvt.rni.s64.f64 %rd14, %fd2;
        cvt.u32.u64     %r29, %rd14;
        and.b32         %r30, %r29, %r28;
-       setp.eq.s32     %p48, %r30, 0;
-       selp.f64        %fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
-       bra.uni         BB28_64;
+       setp.eq.s32     %p46, %r30, 0;
+       selp.f64        %fd50, 0d0000000000000000, 0d3FF0000000000000, %p46;
+       bra.uni         BB28_76;
 
-BB28_47:
-       setp.le.f64     %p53, %fd1, %fd2;
-       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
-       bra.uni         BB28_64;
+BB28_55:
+       setp.gtu.f64    %p51, %fd1, %fd2;
+       selp.f64        %fd50, 0d0000000000000000, 0d3FF0000000000000, %p51;
+       bra.uni         BB28_76;
 
-BB28_16:
-       setp.eq.s32     %p27, %r9, 8;
-       @%p27 bra       BB28_17;
-       bra.uni         BB28_64;
+BB28_22:
+       setp.eq.s32     %p27, %r13, 8;
+       @%p27 bra       BB28_23;
+       bra.uni         BB28_76;
 
-BB28_17:
-       setp.ge.f64     %p51, %fd1, %fd2;
-       selp.f64        %fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
-       bra.uni         BB28_64;
+BB28_23:
+       setp.ltu.f64    %p49, %fd1, %fd2;
+       selp.f64        %fd50, 0d0000000000000000, 0d3FF0000000000000, %p49;
+       bra.uni         BB28_76;
 
-BB28_41:
-       setp.neu.f64    %p46, %fd1, 0d0000000000000000;
+BB28_49:
+       setp.neu.f64    %p44, %fd1, 0d0000000000000000;
        sub.f64         %fd42, %fd1, %fd2;
-       selp.f64        %fd54, %fd42, 0d0000000000000000, %p46;
-       bra.uni         BB28_64;
+       selp.f64        %fd50, %fd42, 0d0000000000000000, %p44;
+       bra.uni         BB28_76;
 
-BB28_33:
-       setp.ne.s32     %p14, %r9, 18;
-       @%p14 bra       BB28_64;
+BB28_39:
+       setp.ne.s32     %p14, %r13, 18;
+       @%p14 bra       BB28_76;
 
-       div.rn.f64      %fd54, %fd1, %fd2;
-       abs.f64         %fd37, %fd54;
+       div.rn.f64      %fd50, %fd1, %fd2;
+       abs.f64         %fd37, %fd50;
        setp.gtu.f64    %p35, %fd37, 0d7FF0000000000000;
-       @%p35 bra       BB28_64;
+       @%p35 bra       BB28_76;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r19, %temp}, %fd54;
+       mov.b64         {%temp, %r19}, %fd50;
        }
+       and.b32         %r20, %r19, 2147483647;
+       setp.ne.s32     %p36, %r20, 2146435072;
+       @%p36 bra       BB28_43;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r20}, %fd54;
+       mov.b64         {%r21, %temp}, %fd50;
        }
-       and.b32         %r21, %r20, 2147483647;
-       setp.ne.s32     %p36, %r21, 2146435072;
-       setp.ne.s32     %p37, %r19, 0;
-       or.pred         %p38, %p36, %p37;
-       @!%p38 bra      BB28_64;
-       bra.uni         BB28_36;
+       setp.eq.s32     %p37, %r21, 0;
+       @%p37 bra       BB28_76;
 
-BB28_36:
-       cvt.rmi.f64.f64 %fd54, %fd54;
-       bra.uni         BB28_64;
+BB28_43:
+       cvt.rmi.f64.f64 %fd50, %fd50;
+       bra.uni         BB28_76;
 
-BB28_51:
-       setp.gt.s32     %p58, %r4, -1;
-       @%p58 bra       BB28_54;
+BB28_59:
+       setp.gt.s32     %p56, %r8, -1;
+       @%p56 bra       BB28_62;
 
        cvt.rzi.f64.f64 %fd45, %fd2;
-       setp.neu.f64    %p59, %fd45, %fd2;
-       selp.f64        %fd52, 0dFFF8000000000000, %fd52, %p59;
+       setp.neu.f64    %p57, %fd45, %fd2;
+       selp.f64        %fd25, 0dFFF8000000000000, %fd25, %p57;
 
-BB28_54:
-       mov.f64         %fd25, %fd52;
-       add.f64         %fd26, %fd1, %fd2;
+BB28_62:
+       add.f64         %fd49, %fd1, %fd2;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r40}, %fd26;
+       mov.b64         {%temp, %r40}, %fd49;
        }
        and.b32         %r41, %r40, 2146435072;
-       setp.ne.s32     %p62, %r41, 2146435072;
-       mov.f64         %fd51, %fd25;
-       @%p62 bra       BB28_61;
+       setp.ne.s32     %p60, %r41, 2146435072;
+       @%p60 bra       BB28_63;
 
-       setp.gtu.f64    %p63, %fd19, 0d7FF0000000000000;
-       mov.f64         %fd51, %fd26;
-       @%p63 bra       BB28_61;
+       setp.gtu.f64    %p61, %fd19, 0d7FF0000000000000;
+       @%p61 bra       BB28_73;
 
        abs.f64         %fd46, %fd2;
-       setp.gtu.f64    %p64, %fd46, 0d7FF0000000000000;
-       mov.f64         %fd50, %fd26;
-       mov.f64         %fd51, %fd50;
-       @%p64 bra       BB28_61;
+       setp.gtu.f64    %p62, %fd46, 0d7FF0000000000000;
+       @%p62 bra       BB28_73;
+
+       and.b32         %r42, %r9, 2147483647;
+       setp.ne.s32     %p63, %r42, 2146435072;
+       @%p63 bra       BB28_68;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r42, %temp}, %fd2;
-       }
-       and.b32         %r43, %r5, 2147483647;
-       setp.eq.s32     %p65, %r43, 2146435072;
-       setp.eq.s32     %p66, %r42, 0;
-       and.pred        %p67, %p65, %p66;
-       @%p67 bra       BB28_60;
-       bra.uni         BB28_58;
-
-BB28_60:
-       setp.gt.f64     %p71, %fd19, 0d3FF0000000000000;
-       selp.b32        %r51, 2146435072, 0, %p71;
-       xor.b32         %r52, %r51, 2146435072;
-       setp.lt.s32     %p72, %r5, 0;
-       selp.b32        %r53, %r52, %r51, %p72;
-       setp.eq.f64     %p73, %fd1, 0dBFF0000000000000;
-       selp.b32        %r54, 1072693248, %r53, %p73;
-       mov.u32         %r55, 0;
-       mov.b64         %fd51, {%r55, %r54};
-       bra.uni         BB28_61;
+       mov.b64         {%r43, %temp}, %fd2;
+       }
+       setp.eq.s32     %p64, %r43, 0;
+       @%p64 bra       BB28_72;
+
+BB28_68:
+       and.b32         %r44, %r8, 2147483647;
+       setp.ne.s32     %p65, %r44, 2146435072;
+       @%p65 bra       BB28_69;
 
-BB28_58:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r44, %temp}, %fd1;
-       }
-       and.b32         %r45, %r4, 2147483647;
-       setp.eq.s32     %p68, %r45, 2146435072;
-       setp.eq.s32     %p69, %r44, 0;
-       and.pred        %p70, %p68, %p69;
-       mov.f64         %fd51, %fd25;
-       @!%p70 bra      BB28_61;
-       bra.uni         BB28_59;
+       mov.b64         {%r45, %temp}, %fd1;
+       }
+       setp.ne.s32     %p66, %r45, 0;
+       mov.f64         %fd49, %fd25;
+       @%p66 bra       BB28_73;
 
-BB28_59:
-       shr.s32         %r46, %r5, 31;
+       shr.s32         %r46, %r9, 31;
        and.b32         %r47, %r46, -2146435072;
-       selp.b32        %r48, -1048576, 2146435072, %p1;
-       add.s32         %r49, %r48, %r47;
-       mov.u32         %r50, 0;
-       mov.b64         %fd51, {%r50, %r49};
+       add.s32         %r48, %r47, 2146435072;
+       or.b32          %r49, %r48, -2147483648;
+       selp.b32        %r50, %r49, %r48, %p1;
+       mov.u32         %r51, 0;
+       mov.b64         %fd49, {%r51, %r50};
+       bra.uni         BB28_73;
 
-BB28_61:
-       setp.eq.f64     %p74, %fd2, 0d0000000000000000;
-       setp.eq.f64     %p75, %fd1, 0d3FF0000000000000;
-       or.pred         %p76, %p75, %p74;
-       selp.f64        %fd54, 0d3FF0000000000000, %fd51, %p76;
+BB28_63:
+       mov.f64         %fd49, %fd25;
+
+BB28_73:
+       setp.eq.f64     %p70, %fd2, 0d0000000000000000;
+       setp.eq.f64     %p71, %fd1, 0d3FF0000000000000;
+       or.pred         %p72, %p71, %p70;
+       selp.f64        %fd50, 0d3FF0000000000000, %fd49, %p72;
 
-BB28_64:
+BB28_76:
        cvta.to.global.u64      %rd16, %rd4;
        mul.wide.s32    %rd17, %r3, 8;
        add.s64         %rd18, %rd16, %rd17;
-       st.global.f64   [%rd18], %fd54;
+       st.global.f64   [%rd18], %fd50;
        bar.sync        0;
 
-BB28_65:
+BB28_77:
        ret;
+
+BB28_69:
+       mov.f64         %fd49, %fd25;
+       bra.uni         BB28_73;
+
+BB28_72:
+       setp.gt.f64     %p67, %fd19, 0d3FF0000000000000;
+       selp.b32        %r52, 2146435072, 0, %p67;
+       xor.b32         %r53, %r52, 2146435072;
+       setp.lt.s32     %p68, %r9, 0;
+       selp.b32        %r54, %r53, %r52, %p68;
+       setp.eq.f64     %p69, %fd1, 0dBFF0000000000000;
+       selp.b32        %r55, 1072693248, %r54, %p69;
+       mov.u32         %r56, 0;
+       mov.b64         %fd49, {%r56, %r55};
+       bra.uni         BB28_73;
 }
 
        // .globl       matrix_matrix_cellwise_op_f
@@ -2431,425 +2424,436 @@ BB28_65:
 )
 {
        .reg .pred      %p<76>;
-       .reg .f32       %f<134>;
-       .reg .b32       %r<42>;
+       .reg .f32       %f<135>;
+       .reg .b32       %r<46>;
        .reg .b64       %rd<17>;
 
 
        ld.param.u64    %rd1, [matrix_matrix_cellwise_op_f_param_0];
        ld.param.u64    %rd2, [matrix_matrix_cellwise_op_f_param_1];
        ld.param.u64    %rd3, [matrix_matrix_cellwise_op_f_param_2];
-       ld.param.u32    %r8, [matrix_matrix_cellwise_op_f_param_3];
-       ld.param.u32    %r4, [matrix_matrix_cellwise_op_f_param_4];
-       ld.param.u32    %r5, [matrix_matrix_cellwise_op_f_param_5];
-       ld.param.u32    %r6, [matrix_matrix_cellwise_op_f_param_6];
-       ld.param.u32    %r7, [matrix_matrix_cellwise_op_f_param_7];
-       mov.u32         %r9, %ntid.x;
-       mov.u32         %r10, %ctaid.x;
-       mov.u32         %r11, %tid.x;
-       mad.lo.s32      %r1, %r9, %r10, %r11;
-       div.s32         %r2, %r1, %r4;
-       setp.lt.s32     %p2, %r2, %r8;
-       setp.gt.s32     %p3, %r4, -1;
+       ld.param.u32    %r12, [matrix_matrix_cellwise_op_f_param_3];
+       ld.param.u32    %r8, [matrix_matrix_cellwise_op_f_param_4];
+       ld.param.u32    %r9, [matrix_matrix_cellwise_op_f_param_5];
+       ld.param.u32    %r10, [matrix_matrix_cellwise_op_f_param_6];
+       ld.param.u32    %r11, [matrix_matrix_cellwise_op_f_param_7];
+       mov.u32         %r13, %ntid.x;
+       mov.u32         %r14, %ctaid.x;
+       mov.u32         %r15, %tid.x;
+       mad.lo.s32      %r16, %r13, %r14, %r15;
+       div.s32         %r45, %r16, %r8;
+       rem.s32         %r2, %r16, %r8;
+       setp.lt.s32     %p2, %r45, %r12;
+       setp.gt.s32     %p3, %r8, -1;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB29_63;
+       @!%p4 bra       BB29_69;
        bra.uni         BB29_1;
 
 BB29_1:
-       rem.s32         %r12, %r1, %r4;
-       cvta.to.global.u64      %rd4, %rd1;
-       mad.lo.s32      %r3, %r2, %r4, %r12;
-       setp.eq.s32     %p5, %r5, 2;
-       selp.b32        %r13, %r12, %r3, %p5;
-       setp.eq.s32     %p6, %r5, 1;
-       selp.b32        %r14, %r2, %r13, %p6;
-       setp.eq.s32     %p7, %r6, 2;
-       selp.b32        %r15, %r12, %r3, %p7;
-       setp.eq.s32     %p8, %r6, 1;
-       selp.b32        %r16, %r2, %r15, %p8;
-       mul.wide.s32    %rd5, %r14, 4;
-       add.s64         %rd6, %rd4, %rd5;
-       ld.global.f32   %f1, [%rd6];
-       cvta.to.global.u64      %rd7, %rd2;
-       mul.wide.s32    %rd8, %r16, 4;
-       add.s64         %rd9, %rd7, %rd8;
+       mad.lo.s32      %r3, %r45, %r8, %r2;
+       setp.eq.s32     %p5, %r9, 1;
+       mov.u32         %r43, %r45;
+       @%p5 bra        BB29_4;
+
+       setp.ne.s32     %p6, %r9, 2;
+       mov.u32         %r43, %r3;
+       @%p6 bra        BB29_4;
+
+       mov.u32         %r43, %r2;
+
+BB29_4:
+       setp.eq.s32     %p7, %r10, 1;
+       @%p7 bra        BB29_7;
+
+       setp.ne.s32     %p8, %r10, 2;
+       mov.u32         %r45, %r3;
+       @%p8 bra        BB29_7;
+
+       mov.u32         %r45, %r2;
+
+BB29_7:
+       cvta.to.global.u64      %rd4, %rd2;
+       cvta.to.global.u64      %rd5, %rd1;
+       mul.wide.s32    %rd6, %r43, 4;
+       add.s64         %rd7, %rd5, %rd6;
+       ld.global.f32   %f1, [%rd7];
+       mul.wide.s32    %rd8, %r45, 4;
+       add.s64         %rd9, %rd4, %rd8;
        ld.global.f32   %f2, [%rd9];
-       mov.f32         %f133, 0f7F7FFFFF;
-       setp.gt.s32     %p9, %r7, 8;
-       @%p9 bra        BB29_18;
+       mov.f32         %f134, 0f7F7FFFFF;
+       setp.gt.s32     %p9, %r11, 8;
+       @%p9 bra        BB29_24;
 
-       setp.gt.s32     %p23, %r7, 3;
-       @%p23 bra       BB29_10;
+       setp.gt.s32     %p23, %r11, 3;
+       @%p23 bra       BB29_16;
 
-       setp.gt.s32     %p30, %r7, 1;
-       @%p30 bra       BB29_7;
+       setp.gt.s32     %p30, %r11, 1;
+       @%p30 bra       BB29_13;
 
-       setp.eq.s32     %p33, %r7, 0;
-       @%p33 bra       BB29_61;
-       bra.uni         BB29_5;
+       setp.eq.s32     %p33, %r11, 0;
+       @%p33 bra       BB29_67;
+       bra.uni         BB29_11;
 
-BB29_61:
-       add.f32         %f133, %f1, %f2;
-       bra.uni         BB29_62;
+BB29_67:
+       add.f32         %f134, %f1, %f2;
+       bra.uni         BB29_68;
 
-BB29_18:
-       setp.gt.s32     %p10, %r7, 13;
-       @%p10 bra       BB29_27;
+BB29_24:
+       setp.gt.s32     %p10, %r11, 13;
+       @%p10 bra       BB29_33;
 
-       setp.gt.s32     %p17, %r7, 10;
-       @%p17 bra       BB29_23;
+       setp.gt.s32     %p17, %r11, 10;
+       @%p17 bra       BB29_29;
 
-       setp.eq.s32     %p21, %r7, 9;
-       @%p21 bra       BB29_43;
-       bra.uni         BB29_21;
+       setp.eq.s32     %p21, %r11, 9;
+       @%p21 bra       BB29_49;
+       bra.uni         BB29_27;
 
-BB29_43:
+BB29_49:
        setp.eq.f32     %p44, %f1, %f2;
-       selp.f32        %f133, 0f3F800000, 0f00000000, %p44;
-       bra.uni         BB29_62;
+       selp.f32        %f134, 0f3F800000, 0f00000000, %p44;
+       bra.uni         BB29_68;
 
-BB29_10:
-       setp.gt.s32     %p24, %r7, 5;
-       @%p24 bra       BB29_14;
+BB29_16:
+       setp.gt.s32     %p24, %r11, 5;
+       @%p24 bra       BB29_20;
 
-       setp.eq.s32     %p28, %r7, 4;
-       @%p28 bra       BB29_46;
-       bra.uni         BB29_12;
+       setp.eq.s32     %p28, %r11, 4;
+       @%p28 bra       BB29_52;
+       bra.uni         BB29_18;
 
-BB29_46:
-       mul.f32         %f53, %f2, 0f3F000000;
-       cvt.rzi.f32.f32 %f54, %f53;
-       fma.rn.f32      %f55, %f54, 0fC0000000, %f2;
-       abs.f32         %f19, %f55;
+BB29_52:
+       mul.f32         %f51, %f2, 0f3F000000;
+       cvt.rzi.f32.f32 %f52, %f51;
+       fma.rn.f32      %f53, %f52, 0fC0000000, %f2;
+       abs.f32         %f19, %f53;
        abs.f32         %f20, %f1;
        setp.lt.f32     %p49, %f20, 0f00800000;
-       mul.f32         %f56, %f20, 0f4B800000;
-       selp.f32        %f57, 0fC3170000, 0fC2FE0000, %p49;
-       selp.f32        %f58, %f56, %f20, %p49;
-       mov.b32          %r23, %f58;
+       mul.f32         %f54, %f20, 0f4B800000;
+       selp.f32        %f55, 0fC3170000, 0fC2FE0000, %p49;
+       selp.f32        %f56, %f54, %f20, %p49;
+       mov.b32          %r23, %f56;
        and.b32         %r24, %r23, 8388607;
        or.b32          %r25, %r24, 1065353216;
-       mov.b32          %f59, %r25;
+       mov.b32          %f57, %r25;
        shr.u32         %r26, %r23, 23;
-       cvt.rn.f32.u32  %f60, %r26;
-       add.f32         %f61, %f57, %f60;
-       setp.gt.f32     %p50, %f59, 0f3FB504F3;
-       mul.f32         %f62, %f59, 0f3F000000;
-       add.f32         %f63, %f61, 0f3F800000;
-       selp.f32        %f64, %f62, %f59, %p50;
-       selp.f32        %f65, %f63, %f61, %p50;
-       add.f32         %f66, %f64, 0fBF800000;
-       add.f32         %f50, %f64, 0f3F800000;
+       cvt.rn.f32.u32  %f58, %r26;
+       add.f32         %f59, %f55, %f58;
+       setp.gt.f32     %p50, %f57, 0f3FB504F3;
+       mul.f32         %f60, %f57, 0f3F000000;
+       add.f32         %f61, %f59, 0f3F800000;
+       selp.f32        %f62, %f60, %f57, %p50;
+       selp.f32        %f63, %f61, %f59, %p50;
+       add.f32         %f64, %f62, 0fBF800000;
+       add.f32         %f50, %f62, 0f3F800000;
        // inline asm
        rcp.approx.ftz.f32 %f49,%f50;
        // inline asm
-       add.f32         %f67, %f66, %f66;
-       mul.f32         %f68, %f49, %f67;
-       mul.f32         %f69, %f68, %f68;
-       mov.f32         %f70, 0f3C4CAF63;
-       mov.f32         %f71, 0f3B18F0FE;
-       fma.rn.f32      %f72, %f71, %f69, %f70;
-       mov.f32         %f73, 0f3DAAAABD;
-       fma.rn.f32      %f74, %f72, %f69, %f73;
-       mul.rn.f32      %f75, %f74, %f69;
-       mul.rn.f32      %f76, %f75, %f68;
-       sub.f32         %f77, %f66, %f68;
-       neg.f32         %f78, %f68;
-       add.f32         %f79, %f77, %f77;
-       fma.rn.f32      %f80, %f78, %f66, %f79;
-       mul.rn.f32      %f81, %f49, %f80;
-       add.f32         %f82, %f76, %f68;
-       sub.f32         %f83, %f68, %f82;
-       add.f32         %f84, %f76, %f83;
-       add.f32         %f85, %f81, %f84;
-       add.f32         %f86, %f82, %f85;
-       sub.f32         %f87, %f82, %f86;
-       add.f32         %f88, %f85, %f87;
-       mov.f32         %f89, 0f3F317200;
-       mul.rn.f32      %f90, %f65, %f89;
-       mov.f32         %f91, 0f35BFBE8E;
-       mul.rn.f32      %f92, %f65, %f91;
-       add.f32         %f93, %f90, %f86;
-       sub.f32         %f94, %f90, %f93;
-       add.f32         %f95, %f86, %f94;
-       add.f32         %f96, %f88, %f95;
-       add.f32         %f97, %f92, %f96;
-       add.f32         %f98, %f93, %f97;
-       sub.f32         %f99, %f93, %f98;
-       add.f32         %f100, %f97, %f99;
+       add.f32         %f65, %f64, %f64;
+       mul.f32         %f66, %f49, %f65;
+       mul.f32         %f67, %f66, %f66;
+       mov.f32         %f68, 0f3C4CAF63;
+       mov.f32         %f69, 0f3B18F0FE;
+       fma.rn.f32      %f70, %f69, %f67, %f68;
+       mov.f32         %f71, 0f3DAAAABD;
+       fma.rn.f32      %f72, %f70, %f67, %f71;
+       mul.rn.f32      %f73, %f72, %f67;
+       mul.rn.f32      %f74, %f73, %f66;
+       sub.f32         %f75, %f64, %f66;
+       neg.f32         %f76, %f66;
+       add.f32         %f77, %f75, %f75;
+       fma.rn.f32      %f78, %f76, %f64, %f77;
+       mul.rn.f32      %f79, %f49, %f78;
+       add.f32         %f80, %f74, %f66;
+       sub.f32         %f81, %f66, %f80;
+       add.f32         %f82, %f74, %f81;
+       add.f32         %f83, %f79, %f82;
+       add.f32         %f84, %f80, %f83;
+       sub.f32         %f85, %f80, %f84;
+       add.f32         %f86, %f83, %f85;
+       mov.f32         %f87, 0f3F317200;
+       mul.rn.f32      %f88, %f63, %f87;
+       mov.f32         %f89, 0f35BFBE8E;
+       mul.rn.f32      %f90, %f63, %f89;
+       add.f32         %f91, %f88, %f84;
+       sub.f32         %f92, %f88, %f91;
+       add.f32         %f93, %f84, %f92;
+       add.f32         %f94, %f86, %f93;
+       add.f32         %f95, %f90, %f94;
+       add.f32         %f96, %f91, %f95;
+       sub.f32         %f97, %f91, %f96;
+       add.f32         %f98, %f95, %f97;
        abs.f32         %f21, %f2;
        setp.gt.f32     %p51, %f21, 0f77F684DF;
-       mul.f32         %f101, %f2, 0f39000000;
-       selp.f32        %f102, %f101, %f2, %p51;
-       mul.rn.f32      %f103, %f102, %f98;
-       neg.f32         %f104, %f103;
-       fma.rn.f32      %f105, %f102, %f98, %f104;
-       fma.rn.f32      %f106, %f102, %f100, %f105;
-       mov.f32         %f107, 0f00000000;
-       fma.rn.f32      %f108, %f107, %f98, %f106;
-       add.rn.f32      %f109, %f103, %f108;
-       neg.f32         %f110, %f109;
-       add.rn.f32      %f111, %f103, %f110;
-       add.rn.f32      %f112, %f111, %f108;
-       mov.b32          %r27, %f109;
+       mul.f32         %f99, %f2, 0f39000000;
+       selp.f32        %f100, %f99, %f2, %p51;
+       mul.rn.f32      %f101, %f100, %f96;
+       neg.f32         %f102, %f101;
+       fma.rn.f32      %f103, %f100, %f96, %f102;
+       fma.rn.f32      %f104, %f100, %f98, %f103;
+       mov.f32         %f105, 0f00000000;
+       fma.rn.f32      %f106, %f105, %f96, %f104;
+       add.rn.f32      %f107, %f101, %f106;
+       neg.f32         %f108, %f107;
+       add.rn.f32      %f109, %f101, %f108;
+       add.rn.f32      %f110, %f109, %f106;
+       mov.b32          %r27, %f107;
        setp.eq.s32     %p52, %r27, 1118925336;
        add.s32         %r28, %r27, -1;
-       mov.b32          %f113, %r28;
-       add.f32         %f114, %f112, 0f37000000;
-       selp.f32        %f115, %f113, %f109, %p52;
-       selp.f32        %f22, %f114, %f112, %p52;
-       mul.f32         %f116, %f115, 0f3FB8AA3B;
-       cvt.rzi.f32.f32 %f117, %f116;
-       mov.f32         %f118, 0fBF317200;
-       fma.rn.f32      %f119, %f117, %f118, %f115;
-       mov.f32         %f120, 0fB5BFBE8E;
-       fma.rn.f32      %f121, %f117, %f120, %f119;
-       mul.f32         %f52, %f121, 0f3FB8AA3B;
-       // inline asm
-       ex2.approx.ftz.f32 %f51,%f52;
-       // inline asm
-       add.f32         %f122, %f117, 0f00000000;
+       mov.b32          %f111, %r28;
+       add.f32         %f112, %f110, 0f37000000;
+       selp.f32        %f113, %f111, %f107, %p52;
+       selp.f32        %f22, %f112, %f110, %p52;
+       mul.f32         %f114, %f113, 0f3FB8AA3B;
+       cvt.rzi.f32.f32 %f115, %f114;
+       mov.f32         %f116, 0fBF317200;
+       fma.rn.f32      %f117, %f115, %f116, %f113;
+       mov.f32         %f118, 0fB5BFBE8E;
+       fma.rn.f32      %f119, %f115, %f118, %f117;
+       mul.f32         %f120, %f119, 0f3FB8AA3B;
+       ex2.approx.ftz.f32      %f121, %f120;
+       add.f32         %f122, %f115, 0f00000000;
        ex2.approx.f32  %f123, %f122;
-       mul.f32         %f124, %f51, %f123;
-       setp.lt.f32     %p53, %f115, 0fC2D20000;
+       mul.f32         %f124, %f121, %f123;
+       setp.lt.f32     %p53, %f113, 0fC2D20000;
        selp.f32        %f125, 0f00000000, %f124, %p53;
-       setp.gt.f32     %p54, %f115, 0f42D20000;
+       setp.gt.f32     %p54, %f113, 0f42D20000;
        selp.f32        %f131, 0f7F800000, %f125, %p54;
        setp.eq.f32     %p55, %f131, 0f7F800000;
-       @%p55 bra       BB29_48;
+       @%p55 bra       BB29_54;
 
        fma.rn.f32      %f131, %f131, %f22, %f131;
 
-BB29_48:
+BB29_54:
        setp.lt.f32     %p56, %f1, 0f00000000;
        setp.eq.f32     %p57, %f19, 0f3F800000;
        and.pred        %p1, %p56, %p57;
        mov.b32          %r29, %f131;
        xor.b32         %r30, %r29, -2147483648;
        mov.b32          %f126, %r30;
-       selp.f32        %f132, %f126, %f131, %p1;
+       selp.f32        %f133, %f126, %f131, %p1;
        setp.eq.f32     %p58, %f1, 0f00000000;
-       @%p58 bra       BB29_51;
-       bra.uni         BB29_49;
+       @%p58 bra       BB29_57;
+       bra.uni         BB29_55;
 
-BB29_51:
+BB29_57:
        add.f32         %f128, %f1, %f1;
        mov.b32          %r31, %f128;
        selp.b32        %r32, %r31, 0, %p57;
        or.b32          %r33, %r32, 2139095040;
        setp.lt.f32     %p62, %f2, 0f00000000;
        selp.b32        %r34, %r33, %r32, %p62;
-       mov.b32          %f132, %r34;
-       bra.uni         BB29_52;
+       mov.b32          %f133, %r34;
+       bra.uni         BB29_58;
 
-BB29_27:
-       setp.gt.s32     %p11, %r7, 15;
-       @%p11 bra       BB29_31;
+BB29_33:
+       setp.gt.s32     %p11, %r11, 15;
+       @%p11 bra       BB29_37;
 
-       setp.eq.s32     %p15, %r7, 14;
-       @%p15 bra       BB29_40;
-       bra.uni         BB29_29;
+       setp.eq.s32     %p15, %r11, 14;
+       @%p15 bra       BB29_46;
+       bra.uni         BB29_35;
 
-BB29_40:
+BB29_46:
        cvt.rni.s64.f32 %rd10, %f1;
-       cvt.rni.s64.f32 %rd11, %f2;
        cvt.u32.u64     %r17, %rd10;
+       cvt.rni.s64.f32 %rd11, %f2;
        cvt.u32.u64     %r18, %rd11;
        or.b32          %r19, %r18, %r17;
        setp.eq.s32     %p41, %r19, 0;
-       selp.f32        %f133, 0f00000000, 0f3F800000, %p41;
-       bra.uni         BB29_62;
+       selp.f32        %f134, 0f00000000, 0f3F800000, %p41;
+       bra.uni         BB29_68;
 
-BB29_7:
-       setp.eq.s32     %p31, %r7, 2;
-       @%p31 bra       BB29_60;
-       bra.uni         BB29_8;
+BB29_13:
+       setp.eq.s32     %p31, %r11, 2;
+       @%p31 bra       BB29_66;
+       bra.uni         BB29_14;
 
-BB29_60:
-       mul.f32         %f133, %f1, %f2;
-       bra.uni         BB29_62;
+BB29_66:
+       mul.f32         %f134, %f1, %f2;
+       bra.uni         BB29_68;
 
-BB29_23:
-       setp.eq.s32     %p18, %r7, 11;
-       @%p18 bra       BB29_42;
+BB29_29:
+       setp.eq.s32     %p18, %r11, 11;
+       @%p18 bra       BB29_48;
 
-       setp.eq.s32     %p19, %r7, 12;
-       @%p19 bra       BB29_41;
-       bra.uni         BB29_25;
+       setp.eq.s32     %p19, %r11, 12;
+       @%p19 bra       BB29_47;
+       bra.uni         BB29_31;
 
-BB29_41:
-       max.f32         %f133, %f1, %f2;
-       bra.uni         BB29_62;
+BB29_47:
+       max.f32         %f134, %f1, %f2;
+       bra.uni         BB29_68;
 
-BB29_14:
-       setp.eq.s32     %p25, %r7, 6;
-       @%p25 bra       BB29_45;
+BB29_20:
+       setp.eq.s32     %p25, %r11, 6;
+       @%p25 bra       BB29_51;
 
-       setp.eq.s32     %p26, %r7, 7;
-       @%p26 bra       BB29_44;
-       bra.uni         BB29_16;
+       setp.eq.s32     %p26, %r11, 7;
+       @%p26 bra       BB29_50;
+       bra.uni         BB29_22;
 
-BB29_44:
+BB29_50:
        setp.gt.f32     %p46, %f1, %f2;
-       selp.f32        %f133, 0f3F800000, 0f00000000, %p46;
-       bra.uni         BB29_62;
+       selp.f32        %f134, 0f3F800000, 0f00000000, %p46;
+       bra.uni         BB29_68;
 
-BB29_31:
-       setp.eq.s32     %p12, %r7, 16;
-       @%p12 bra       BB29_39;
+BB29_37:
+       setp.eq.s32     %p12, %r11, 16;
+       @%p12 bra       BB29_45;
 
-       setp.eq.s32     %p13, %r7, 17;
-       @%p13 bra       BB29_36;
-       bra.uni         BB29_33;
+       setp.eq.s32     %p13, %r11, 17;
+       @%p13 bra       BB29_42;
+       bra.uni         BB29_39;
 
-BB29_36:
+BB29_42:
        setp.eq.f32     %p36, %f2, 0f00000000;
        setp.eq.f32     %p37, %f2, 0f80000000;
        or.pred         %p38, %p36, %p37;
-       mov.f32         %f133, 0f7FC00000;
-       @%p38 bra       BB29_62;
+       mov.f32         %f134, 0f7FC00000;
+       @%p38 bra       BB29_68;
 
-       div.rn.f32      %f133, %f1, %f2;
-       abs.f32         %f43, %f133;
+       div.rn.f32      %f134, %f1, %f2;
+       abs.f32         %f43, %f134;
        setp.geu.f32    %p39, %f43, 0f7F800000;
-       @%p39 bra       BB29_62;
+       @%p39 bra       BB29_68;
 
-       cvt.rmi.f32.f32 %f44, %f133;
+       cvt.rmi.f32.f32 %f44, %f134;
        mul.f32         %f45, %f2, %f44;
-       sub.f32         %f133, %f1, %f45;
-       bra.uni         BB29_62;
+       sub.f32         %f134, %f1, %f45;
+       bra.uni         BB29_68;
 
-BB29_5:
-       setp.eq.s32     %p34, %r7, 1;
-       @%p34 bra       BB29_6;
-       bra.uni         BB29_62;
+BB29_11:
+       setp.eq.s32     %p34, %r11, 1;
+       @%p34 bra       BB29_12;
+       bra.uni         BB29_68;
 
-BB29_6:
-       sub.f32         %f133, %f1, %f2;
-       bra.uni         BB29_62;
+BB29_12:
+       sub.f32         %f134, %f1, %f2;
+       bra.uni         BB29_68;
 
-BB29_21:
-       setp.eq.s32     %p22, %r7, 10;
-       @%p22 bra       BB29_22;
-       bra.uni         BB29_62;
+BB29_27:
+       setp.eq.s32     %p22, %r11, 10;
+       @%p22 bra       BB29_28;
+       bra.uni         BB29_68;
 
-BB29_22:
+BB29_28:
        setp.neu.f32    %p43, %f1, %f2;
-       selp.f32        %f133, 0f3F800000, 0f00000000, %p43;
-       bra.uni         BB29_62;
+       selp.f32        %f134, 0f3F800000, 0f00000000, %p43;
+       bra.uni         BB29_68;
 
-BB29_12:
-       setp.eq.s32     %p29, %r7, 5;
-       @%p29 bra       BB29_13;
-       bra.uni         BB29_62;
+BB29_18:
+       setp.eq.s32     %p29, %r11, 5;
+       @%p29 bra       BB29_19;
+       bra.uni         BB29_68;
 
-BB29_13:
+BB29_19:
        setp.lt.f32     %p48, %f1, %f2;
-       selp.f32        %f133, 0f3F800000, 0f00000000, %p48;
-       bra.uni         BB29_62;
+       selp.f32        %f134, 0f3F800000, 0f00000000, %p48;
+       bra.uni         BB29_68;
 
-BB29_29:
-       setp.eq.s32     %p16, %r7, 15;
-       @%p16 bra       BB29_30;
-       bra.uni         BB29_62;
+BB29_35:
+       setp.eq.s32     %p16, %r11, 15;
+       @%p16 bra       BB29_36;
+       bra.uni         BB29_68;
 
-BB29_30:
+BB29_36:
        mul.f32         %f47, %f1, %f2;
        mov.f32         %f48, 0f3F800000;
-       sub.f32         %f133, %f48, %f47;
-       bra.uni         BB29_62;
+       sub.f32         %f134, %f48, %f47;
+       bra.uni         BB29_68;
 
-BB29_8:
-       setp.eq.s32     %p32, %r7, 3;
-       @%p32 bra       BB29_9;
-       bra.uni         BB29_62;
+BB29_14:
+       setp.eq.s32     %p32, %r11, 3;
+       @%p32 bra       BB29_15;
+       bra.uni         BB29_68;
 
-BB29_9:
-       div.rn.f32      %f133, %f1, %f2;
-       bra.uni         BB29_62;
+BB29_15:
+       div.rn.f32      %f134, %f1, %f2;
+       bra.uni         BB29_68;
 
-BB29_42:
-       min.f32         %f133, %f1, %f2;
-       bra.uni         BB29_62;
+BB29_48:
+       min.f32         %f134, %f1, %f2;
+       bra.uni         BB29_68;
 
-BB29_25:
-       setp.eq.s32     %p20, %r7, 13;
-       @%p20 bra       BB29_26;
-       bra.uni         BB29_62;
+BB29_31:
+       setp.eq.s32     %p20, %r11, 13;
+       @%p20 bra       BB29_32;
+       bra.uni         BB29_68;
 
-BB29_26:
+BB29_32:
        cvt.rni.s64.f32 %rd12, %f1;
-       cvt.rni.s64.f32 %rd13, %f2;
        cvt.u32.u64     %r20, %rd12;
+       cvt.rni.s64.f32 %rd13, %f2;
        cvt.u32.u64     %r21, %rd13;
        and.b32         %r22, %r21, %r20;
        setp.eq.s32     %p42, %r22, 0;
-       selp.f32        %f133, 0f00000000, 0f3F800000, %p42;
-       bra.uni         BB29_62;
+       selp.f32        %f134, 0f00000000, 0f3F800000, %p42;
+       bra.uni         BB29_68;
 
-BB29_45:
-       setp.le.f32     %p47, %f1, %f2;
-       selp.f32        %f133, 0f3F800000, 0f00000000, %p47;
-       bra.uni         BB29_62;
+BB29_51:
+       setp.gtu.f32    %p47, %f1, %f2;
+       selp.f32        %f134, 0f00000000, 0f3F800000, %p47;
+       bra.uni         BB29_68;
 
-BB29_16:
-       setp.eq.s32     %p27, %r7, 8;
-       @%p27 bra       BB29_17;
-       bra.uni         BB29_62;
+BB29_22:
+       setp.eq.s32     %p27, %r11, 8;
+       @%p27 bra       BB29_23;
+       bra.uni         BB29_68;
 
-BB29_17:
-       setp.ge.f32     %p45, %f1, %f2;
-       selp.f32        %f133, 0f3F800000, 0f00000000, %p45;
-       bra.uni         BB29_62;
+BB29_23:
+       setp.ltu.f32    %p45, %f1, %f2;
+       selp.f32        %f134, 0f00000000, 0f3F800000, %p45;
+       bra.uni         BB29_68;
 
-BB29_39:
+BB29_45:
        setp.neu.f32    %p40, %f1, 0f00000000;
        sub.f32         %f46, %f1, %f2;
-       selp.f32        %f133, %f46, 0f00000000, %p40;
-       bra.uni         BB29_62;
+       selp.f32        %f134, %f46, 0f00000000, %p40;
+       bra.uni         BB29_68;
 
-BB29_33:
-       setp.ne.s32     %p14, %r7, 18;
-       @%p14 bra       BB29_62;
+BB29_39:
+       setp.ne.s32     %p14, %r11, 18;
+       @%p14 bra       BB29_68;
 
-       div.rn.f32      %f133, %f1, %f2;
-       abs.f32         %f41, %f133;
+       div.rn.f32      %f134, %f1, %f2;
+       abs.f32         %f41, %f134;
        setp.geu.f32    %p35, %f41, 0f7F800000;
-       @%p35 bra       BB29_62;
+       @%p35 bra       BB29_68;
 
-       cvt.rmi.f32.f32 %f133, %f133;
-       bra.uni         BB29_62;
+       cvt.rmi.f32.f32 %f134, %f134;
+       bra.uni         BB29_68;
 
-BB29_49:
+BB29_55:
        setp.geu.f32    %p59, %f1, 0f00000000;
-       @%p59 bra       BB29_52;
+       @%p59 bra       BB29_58;
 
        cvt.rzi.f32.f32 %f127, %f2;
        setp.neu.f32    %p60, %f127, %f2;
-       selp.f32        %f132, 0f7FFFFFFF, %f132, %p60;
+       selp.f32        %f133, 0f7FFFFFFF, %f133, %p60;
 
-BB29_52:
+BB29_58:
        add.f32         %f129, %f20, %f21;
        mov.b32          %r35, %f129;
        setp.lt.s32     %p63, %r35, 2139095040;
-       @%p63 bra       BB29_59;
+       @%p63 bra       BB29_65;
 
        setp.gtu.f32    %p64, %f20, 0f7F800000;
        setp.gtu.f32    %p65, %f21, 0f7F800000;
        or.pred         %p66, %p64, %p65;
-       @%p66 bra       BB29_58;
-       bra.uni         BB29_54;
+       @%p66 bra       BB29_64;
+       bra.uni         BB29_60;
 
-BB29_58:
-       add.f32         %f132, %f1, %f2;
-       bra.uni         BB29_59;
+BB29_64:
+       add.f32         %f133, %f1, %f2;
+       bra.uni         BB29_65;
 
-BB29_54:
+BB29_60:
        setp.eq.f32     %p67, %f21, 0f7F800000;
-       @%p67 bra       BB29_57;
-       bra.uni         BB29_55;
+       @%p67 bra       BB29_63;
+       bra.uni         BB29_61;
 
-BB29_57:
+BB29_63:
        setp.gt.f32     %p70, %f20, 0f3F800000;
        selp.b32        %r39, 2139095040, 0, %p70;
        xor.b32         %r40, %r39, 2139095040;
@@ -2857,33 +2861,33 @@ BB29_57:
        selp.b32        %r41, %r40, %r39, %p71;
        mov.b32          %f130, %r41;
        setp.eq.f32     %p72, %f1, 0fBF800000;
-       selp.f32        %f132, 0f3F800000, %f130, %p72;
-       bra.uni         BB29_59;
+       selp.f32        %f133, 0f3F800000, %f130, %p72;
+       bra.uni         BB29_65;
 
-BB29_55:
+BB29_61:
        setp.neu.f32    %p68, %f20, 0f7F800000;
-       @%p68 bra       BB29_59;
+       @%p68 bra       BB29_65;
 
-       setp.ge.f32     %p69, %f2, 0f00000000;
-       selp.b32        %r36, 2139095040, 0, %p69;
+       setp.ltu.f32    %p69, %f2, 0f00000000;
+       selp.b32        %r36, 0, 2139095040, %p69;
        or.b32          %r37, %r36, -2147483648;
        selp.b32        %r38, %r37, %r36, %p1;
-       mov.b32          %f132, %r38;
+       mov.b32          %f133, %r38;
 
-BB29_59:
+BB29_65:
        setp.eq.f32     %p73, %f2, 0f00000000;
        setp.eq.f32     %p74, %f1, 0f3F800000;
        or.pred         %p75, %p74, %p73;
-       selp.f32        %f133, 0f3F800000, %f132, %p75;
+       selp.f32        %f134, 0f3F800000, %f133, %p75;
 
-BB29_62:
+BB29_68:
        cvta.to.global.u64      %rd14, %rd3;
        mul.wide.s32    %rd15, %r3, 4;
        add.s64         %rd16, %rd14, %rd15;
-       st.global.f32   [%rd16], %f133;
+       st.global.f32   [%rd16], %f134;
        bar.sync        0;
 
-BB29_63:
+BB29_69:
        ret;
 }
 
@@ -2897,9 +2901,9 @@ BB29_63:
        .param .u32 matrix_scalar_op_d_param_5
 )
 {
-       .reg .pred      %p<141>;
-       .reg .b32       %r<86>;
-       .reg .f64       %fd<107>;
+       .reg .pred      %p<133>;
+       .reg .b32       %r<88>;
+       .reg .f64       %fd<99>;
        .reg .b64       %rd<20>;
 
 
@@ -2914,7 +2918,7 @@ BB29_63:
        mov.u32         %r11, %tid.x;
        mad.lo.s32      %r1, %r9, %r10, %r11;
        setp.ge.s32     %p3, %r1, %r8;
-       @%p3 bra        BB30_130;
+       @%p3 bra        BB30_142;
 
        cvta.to.global.u64      %rd6, %rd5;
        cvta.to.global.u64      %rd7, %rd4;
@@ -2923,9 +2927,9 @@ BB29_63:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd1, %rd6, %rd8;
        setp.eq.s32     %p4, %r7, 0;
-       @%p4 bra        BB30_66;
+       @%p4 bra        BB30_72;
 
-       mov.f64         %fd98, 0d7FEFFFFFFFFFFFFF;
+       mov.f64         %fd94, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p5, %r6, 8;
        @%p5 bra        BB30_19;
 
@@ -2936,31 +2940,31 @@ BB29_63:
        @%p26 bra       BB30_8;
 
        setp.eq.s32     %p29, %r6, 0;
-       @%p29 bra       BB30_64;
+       @%p29 bra       BB30_70;
        bra.uni         BB30_6;
 
-BB30_64:
-       add.f64         %fd98, %fd1, %fd68;
-       bra.uni         BB30_65;
+BB30_70:
+       add.f64         %fd94, %fd1, %fd68;
+       bra.uni         BB30_71;
 
-BB30_66:
-       mov.f64         %fd106, 0d7FEFFFFFFFFFFFFF;
-       setp.gt.s32     %p73, %r6, 8;
-       @%p73 bra       BB30_83;
+BB30_72:
+       mov.f64         %fd98, 0d7FEFFFFFFFFFFFFF;
+       setp.gt.s32     %p69, %r6, 8;
+       @%p69 bra       BB30_89;
 
-       setp.gt.s32     %p87, %r6, 3;
-       @%p87 bra       BB30_75;
+       setp.gt.s32     %p83, %r6, 3;
+       @%p83 bra       BB30_81;
 
-       setp.gt.s32     %p94, %r6, 1;
-       @%p94 bra       BB30_72;
+       setp.gt.s32     %p90, %r6, 1;
+       @%p90 bra       BB30_78;
 
-       setp.eq.s32     %p97, %r6, 0;
-       @%p97 bra       BB30_128;
-       bra.uni         BB30_70;
+       setp.eq.s32     %p93, %r6, 0;
+       @%p93 bra       BB30_140;
+       bra.uni         BB30_76;
 
-BB30_128:
-       add.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB30_129;
+BB30_140:
+       add.f64         %fd98, %fd1, %fd68;
+       bra.uni         BB30_141;
 
 BB30_19:
        setp.gt.s32     %p6, %r6, 13;
@@ -2970,39 +2974,39 @@ BB30_19:
        @%p13 bra       BB30_24;
 
        setp.eq.s32     %p17, %r6, 9;
-       @%p17 bra       BB30_46;
+       @%p17 bra       BB30_48;
        bra.uni         BB30_22;
 
-BB30_46:
-       setp.eq.f64     %p46, %fd1, %fd68;
-       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
-       bra.uni         BB30_65;
+BB30_48:
+       setp.eq.f64     %p44, %fd1, %fd68;
+       selp.f64        %fd94, 0d3FF0000000000000, 0d0000000000000000, %p44;
+       bra.uni         BB30_71;
 
-BB30_83:
-       setp.gt.s32     %p74, %r6, 13;
-       @%p74 bra       BB30_92;
+BB30_89:
+       setp.gt.s32     %p70, %r6, 13;
+       @%p70 bra       BB30_98;
 
-       setp.gt.s32     %p81, %r6, 10;
-       @%p81 bra       BB30_88;
+       setp.gt.s32     %p77, %r6, 10;
+       @%p77 bra       BB30_94;
 
-       setp.eq.s32     %p85, %r6, 9;
-       @%p85 bra       BB30_110;
-       bra.uni         BB30_86;
+       setp.eq.s32     %p81, %r6, 9;
+       @%p81 bra       BB30_118;
+       bra.uni         BB30_92;
 
-BB30_110:
-       setp.eq.f64     %p114, %fd1, %fd68;
-       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
-       bra.uni         BB30_129;
+BB30_118:
+       setp.eq.f64     %p108, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p108;
+       bra.uni         BB30_141;
 
 BB30_11:
        setp.gt.s32     %p20, %r6, 5;
        @%p20 bra       BB30_15;
 
        setp.eq.s32     %p24, %r6, 4;
-       @%p24 bra       BB30_49;
+       @%p24 bra       BB30_51;
        bra.uni         BB30_13;
 
-BB30_49:
+BB30_51:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r2}, %fd68;
@@ -3015,7 +3019,7 @@ BB30_49:
        add.s32         %r25, %r24, -1012;
        mov.b64          %rd14, %fd1;
        shl.b64         %rd2, %rd14, %r25;
-       setp.eq.s64     %p51, %rd2, -9223372036854775808;
+       setp.eq.s64     %p49, %rd2, -9223372036854775808;
        abs.f64         %fd18, %fd68;
        // Callseq Start 1
        {
@@ -3032,69 +3036,68 @@ BB30_49:
        param0, 
        param1
        );
-       ld.param.f64    %fd97, [retval0+0];
+       ld.param.f64    %fd24, [retval0+0];
        
        //{
        }// Callseq End 1
-       setp.lt.s32     %p52, %r2, 0;
-       and.pred        %p1, %p52, %p51;
-       @!%p1 bra       BB30_51;
-       bra.uni         BB30_50;
+       setp.lt.s32     %p50, %r2, 0;
+       and.pred        %p1, %p50, %p49;
+       @!%p1 bra       BB30_53;
+       bra.uni         BB30_52;
 
-BB30_50:
+BB30_52:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r26}, %fd97;
+       mov.b64         {%temp, %r26}, %fd24;
        }
        xor.b32         %r27, %r26, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r28, %temp}, %fd97;
+       mov.b64         {%r28, %temp}, %fd24;
        }
-       mov.b64         %fd97, {%r28, %r27};
+       mov.b64         %fd24, {%r28, %r27};
 
-BB30_51:
-       mov.f64         %fd96, %fd97;
-       setp.eq.f64     %p53, %fd68, 0d0000000000000000;
-       @%p53 bra       BB30_54;
-       bra.uni         BB30_52;
+BB30_53:
+       setp.eq.f64     %p51, %fd68, 0d0000000000000000;
+       @%p51 bra       BB30_56;
+       bra.uni         BB30_54;
 
-BB30_54:
-       selp.b32        %r29, %r2, 0, %p51;
+BB30_56:
+       selp.b32        %r29, %r2, 0, %p49;
        or.b32          %r30, %r29, 2146435072;
-       setp.lt.s32     %p57, %r3, 0;
-       selp.b32        %r31, %r30, %r29, %p57;
+       setp.lt.s32     %p55, %r3, 0;
+       selp.b32        %r31, %r30, %r29, %p55;
        mov.u32         %r32, 0;
-       mov.b64         %fd96, {%r32, %r31};
-       bra.uni         BB30_55;
+       mov.b64         %fd24, {%r32, %r31};
+       bra.uni         BB30_57;
 
 BB30_28:
        setp.gt.s32     %p7, %r6, 15;
        @%p7 bra        BB30_32;
 
        setp.eq.s32     %p11, %r6, 14;
-       @%p11 bra       BB30_43;
+       @%p11 bra       BB30_45;
        bra.uni         BB30_30;
 
-BB30_43:
+BB30_45:
        cvt.rni.s64.f64 %rd10, %fd68;
-       cvt.rni.s64.f64 %rd11, %fd1;
        cvt.u32.u64     %r18, %rd10;
+       cvt.rni.s64.f64 %rd11, %fd1;
        cvt.u32.u64     %r19, %rd11;
        or.b32          %r20, %r19, %r18;
-       setp.eq.s32     %p43, %r20, 0;
-       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
-       bra.uni         BB30_65;
+       setp.eq.s32     %p41, %r20, 0;
+       selp.f64        %fd94, 0d0000000000000000, 0d3FF0000000000000, %p41;
+       bra.uni         BB30_71;
 
-BB30_75:
-       setp.gt.s32     %p88, %r6, 5;
-       @%p88 bra       BB30_79;
+BB30_81:
+       setp.gt.s32     %p84, %r6, 5;
+       @%p84 bra       BB30_85;
 
-       setp.eq.s32     %p92, %r6, 4;
-       @%p92 bra       BB30_113;
-       bra.uni         BB30_77;
+       setp.eq.s32     %p88, %r6, 4;
+       @%p88 bra       BB30_121;
+       bra.uni         BB30_83;
 
-BB30_113:
+BB30_121:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r4}, %fd1;
@@ -3103,11 +3106,11 @@ BB30_113:
        .reg .b32 %temp; 
        mov.b64         {%temp, %r5}, %fd68;
        }
-       bfe.u32         %r61, %r5, 20, 11;
-       add.s32         %r62, %r61, -1012;
+       bfe.u32         %r62, %r5, 20, 11;
+       add.s32         %r63, %r62, -1012;
        mov.b64          %rd19, %fd68;
-       shl.b64         %rd3, %rd19, %r62;
-       setp.eq.s64     %p119, %rd3, -9223372036854775808;
+       shl.b64         %rd3, %rd19, %r63;
+       setp.eq.s64     %p113, %rd3, -9223372036854775808;
        abs.f64         %fd51, %fd1;
        // Callseq Start 2
        {
@@ -3124,614 +3127,619 @@ BB30_113:
        param0, 
        param1
        );
-       ld.param.f64    %fd105, [retval0+0];
+       ld.param.f64    %fd57, [retval0+0];
        
        //{
        }// Callseq End 2
-       setp.lt.s32     %p120, %r4, 0;
-       and.pred        %p2, %p120, %p119;
-       @!%p2 bra       BB30_115;
-       bra.uni         BB30_114;
+       setp.lt.s32     %p114, %r4, 0;
+       and.pred        %p2, %p114, %p113;
+       @!%p2 bra       BB30_123;
+       bra.uni         BB30_122;
 
-BB30_114:
+BB30_122:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r63}, %fd105;
+       mov.b64         {%temp, %r64}, %fd57;
        }
-       xor.b32         %r64, %r63, -2147483648;
+       xor.b32         %r65, %r64, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r65, %temp}, %fd105;
+       mov.b64         {%r66, %temp}, %fd57;
        }
-       mov.b64         %fd105, {%r65, %r64};
+       mov.b64         %fd57, {%r66, %r65};
 
-BB30_115:
-       mov.f64         %fd104, %fd105;
-       setp.eq.f64     %p121, %fd1, 0d0000000000000000;
-       @%p121 bra      BB30_118;
-       bra.uni         BB30_116;
+BB30_123:
+       setp.eq.f64     %p115, %fd1, 0d0000000000000000;
+       @%p115 bra      BB30_126;
+       bra.uni         BB30_124;
 
-BB30_118:
-       selp.b32        %r66, %r4, 0, %p119;
-       or.b32          %r67, %r66, 2146435072;
-       setp.lt.s32     %p125, %r5, 0;
-       selp.b32        %r68, %r67, %r66, %p125;
-       mov.u32         %r69, 0;
-       mov.b64         %fd104, {%r69, %r68};
-       bra.uni         BB30_119;
+BB30_126:
+       selp.b32        %r67, %r4, 0, %p113;
+       or.b32          %r68, %r67, 2146435072;
+       setp.lt.s32     %p119, %r5, 0;
+       selp.b32        %r69, %r68, %r67, %p119;
+       mov.u32         %r70, 0;
+       mov.b64         %fd57, {%r70, %r69};
+       bra.uni         BB30_127;
 
-BB30_92:
-       setp.gt.s32     %p75, %r6, 15;
-       @%p75 bra       BB30_96;
+BB30_98:
+       setp.gt.s32     %p71, %r6, 15;
+       @%p71 bra       BB30_102;
 
-       setp.eq.s32     %p79, %r6, 14;
-       @%p79 bra       BB30_107;
-       bra.uni         BB30_94;
+       setp.eq.s32     %p75, %r6, 14;
+       @%p75 bra       BB30_115;
+       bra.uni         BB30_100;
 
-BB30_107:
+BB30_115:
        cvt.rni.s64.f64 %rd15, %fd1;
+       cvt.u32.u64     %r56, %rd15;
        cvt.rni.s64.f64 %rd16, %fd68;
-       cvt.u32.u64     %r55, %rd15;
-       cvt.u32.u64     %r56, %rd16;
-       or.b32          %r57, %r56, %r55;
-       setp.eq.s32     %p111, %r57, 0;
-       selp.f64        %fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
-       bra.uni         BB30_129;
+       cvt.u32.u64     %r57, %rd16;
+       or.b32          %r58, %r57, %r56;
+       setp.eq.s32     %p105, %r58, 0;
+       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p105;
+       bra.uni         BB30_141;
 
 BB30_8:
        setp.eq.s32     %p27, %r6, 2;
-       @%p27 bra       BB30_63;
+       @%p27 bra       BB30_69;
        bra.uni         BB30_9;
 
-BB30_63:
-       mul.f64         %fd98, %fd1, %fd68;
-       bra.uni         BB30_65;
+BB30_69:
+       mul.f64         %fd94, %fd1, %fd68;
+       bra.uni         BB30_71;
 
 BB30_24:
        setp.eq.s32     %p14, %r6, 11;
-       @%p14 bra       BB30_45;
+       @%p14 bra       BB30_47;
 
        setp.eq.s32     %p15, %r6, 12;
-       @%p15 bra       BB30_44;
+       @%p15 bra       BB30_46;
        bra.uni         BB30_26;
 
-BB30_44:
-       max.f64         %fd98, %fd68, %fd1;
-       bra.uni         BB30_65;
+BB30_46:
+       max.f64         %fd94, %fd68, %fd1;
+       bra.uni         BB30_71;
 
 BB30_15:
        setp.eq.s32     %p21, %r6, 6;
-       @%p21 bra       BB30_48;
+       @%p21 bra       BB30_50;
 
        setp.eq.s32     %p22, %r6, 7;
-       @%p22 bra       BB30_47;
+       @%p22 bra       BB30_49;
        bra.uni         BB30_17;
 
-BB30_47:
-       setp.lt.f64     %p48, %fd1, %fd68;
-       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
-       bra.uni         BB30_65;
+BB30_49:
+       setp.lt.f64     %p46, %fd1, %fd68;
+       selp.f64        %fd94, 0d3FF0000000000000, 0d0000000000000000, %p46;
+       bra.uni         BB30_71;
 
 BB30_32:
        setp.eq.s32     %p8, %r6, 16;
-       @%p8 bra        BB30_42;
+       @%p8 bra        BB30_44;
 
        setp.eq.s32     %p9, %r6, 17;
-       @%p9 bra        BB30_38;
+       @%p9 bra        BB30_39;
        bra.uni         BB30_34;
 
-BB30_38:
-       setp.eq.f64     %p35, %fd1, 0d0000000000000000;
-       setp.eq.f64     %p36, %fd1, 0d8000000000000000;
-       or.pred         %p37, %p35, %p36;
-       mov.f64         %fd98, 0d7FF8000000000000;
-       @%p37 bra       BB30_65;
+BB30_39:
+       setp.eq.f64     %p34, %fd1, 0d0000000000000000;
+       setp.eq.f64     %p35, %fd1, 0d8000000000000000;
+       or.pred         %p36, %p34, %p35;
+       mov.f64         %fd94, 0d7FF8000000000000;
+       @%p36 bra       BB30_71;
 
-       div.rn.f64      %fd98, %fd68, %fd1;
-       abs.f64         %fd72, %fd98;
-       setp.gtu.f64    %p38, %fd72, 0d7FF0000000000000;
-       @%p38 bra       BB30_65;
+       div.rn.f64      %fd94, %fd68, %fd1;
+       abs.f64         %fd72, %fd94;
+       setp.gtu.f64    %p37, %fd72, 0d7FF0000000000000;
+       @%p37 bra       BB30_71;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r15, %temp}, %fd98;
+       mov.b64         {%temp, %r15}, %fd94;
        }
+       and.b32         %r16, %r15, 2147483647;
+       setp.ne.s32     %p38, %r16, 2146435072;
+       @%p38 bra       BB30_43;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r16}, %fd98;
+       mov.b64         {%r17, %temp}, %fd94;
        }
-       and.b32         %r17, %r16, 2147483647;
-       setp.ne.s32     %p39, %r17, 2146435072;
-       setp.ne.s32     %p40, %r15, 0;
-       or.pred         %p41, %p39, %p40;
-       @!%p41 bra      BB30_65;
-       bra.uni         BB30_41;
+       setp.eq.s32     %p39, %r17, 0;
+       @%p39 bra       BB30_71;
 
-BB30_41:
-       cvt.rmi.f64.f64 %fd73, %fd98;
+BB30_43:
+       cvt.rmi.f64.f64 %fd73, %fd94;
        mul.f64         %fd74, %fd1, %fd73;
-       sub.f64         %fd98, %fd68, %fd74;
-       bra.uni         BB30_65;
+       sub.f64         %fd94, %fd68, %fd74;
+       bra.uni         BB30_71;
 
-BB30_72:
-       setp.eq.s32     %p95, %r6, 2;
-       @%p95 bra       BB30_127;
-       bra.uni         BB30_73;
+BB30_78:
+       setp.eq.s32     %p91, %r6, 2;
+       @%p91 bra       BB30_139;
+       bra.uni         BB30_79;
 
-BB30_127:
-       mul.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB30_129;
+BB30_139:
+       mul.f64         %fd98, %fd1, %fd68;
+       bra.uni         BB30_141;
 
-BB30_88:
-       setp.eq.s32     %p82, %r6, 11;
-       @%p82 bra       BB30_109;
+BB30_94:
+       setp.eq.s32     %p78, %r6, 11;
+       @%p78 bra       BB30_117;
 
-       setp.eq.s32     %p83, %r6, 12;
-       @%p83 bra       BB30_108;
-       bra.uni         BB30_90;
+       setp.eq.s32     %p79, %r6, 12;
+       @%p79 bra       BB30_116;
+       bra.uni         BB30_96;
 
-BB30_108:
-       max.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB30_129;
+BB30_116:
+       max.f64         %fd98, %fd1, %fd68;
+       bra.uni         BB30_141;
 
-BB30_79:
-       setp.eq.s32     %p89, %r6, 6;
-       @%p89 bra       BB30_112;
+BB30_85:
+       setp.eq.s32     %p85, %r6, 6;
+       @%p85 bra       BB30_120;
 
-       setp.eq.s32     %p90, %r6, 7;
-       @%p90 bra       BB30_111;
-       bra.uni         BB30_81;
+       setp.eq.s32     %p86, %r6, 7;
+       @%p86 bra       BB30_119;
+       bra.uni         BB30_87;
 
-BB30_111:
-       setp.gt.f64     %p116, %fd1, %fd68;
-       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
-       bra.uni         BB30_129;
+BB30_119:
+       setp.gt.f64     %p110, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p110;
+       bra.uni         BB30_141;
 
-BB30_96:
-       setp.eq.s32     %p76, %r6, 16;
-       @%p76 bra       BB30_106;
+BB30_102:
+       setp.eq.s32     %p72, %r6, 16;
+       @%p72 bra       BB30_114;
 
-       setp.eq.s32     %p77, %r6, 17;
-       @%p77 bra       BB30_102;
-       bra.uni         BB30_98;
+       setp.eq.s32     %p73, %r6, 17;
+       @%p73 bra       BB30_109;
+       bra.uni         BB30_104;
 
-BB30_102:
-       setp.eq.f64     %p103, %fd68, 0d0000000000000000;
-       setp.eq.f64     %p104, %fd68, 0d8000000000000000;
-       or.pred         %p105, %p103, %p104;
-       mov.f64         %fd106, 0d7FF8000000000000;
-       @%p105 bra      BB30_129;
+BB30_109:
+       setp.eq.f64     %p98, %fd68, 0d0000000000000000;
+       setp.eq.f64     %p99, %fd68, 0d8000000000000000;
+       or.pred         %p100, %p98, %p99;
+       mov.f64         %fd98, 0d7FF8000000000000;
+       @%p100 bra      BB30_141;
 
-       div.rn.f64      %fd106, %fd1, %fd68;
-       abs.f64         %fd83, %fd106;
-       setp.gtu.f64    %p106, %fd83, 0d7FF0000000000000;
-       @%p106 bra      BB30_129;
+       div.rn.f64      %fd98, %fd1, %fd68;
+       abs.f64         %fd83, %fd98;
+       setp.gtu.f64    %p101, %fd83, 0d7FF0000000000000;
+       @%p101 bra      BB30_141;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r52, %temp}, %fd106;
+       mov.b64         {%temp, %r53}, %fd98;
        }
+       and.b32         %r54, %r53, 2147483647;
+       setp.ne.s32     %p102, %r54, 2146435072;
+       @%p102 bra      BB30_113;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r53}, %fd106;
+       mov.b64         {%r55, %temp}, %fd98;
        }
-       and.b32         %r54, %r53, 2147483647;
-       setp.ne.s32     %p107, %r54, 2146435072;
-       setp.ne.s32     %p108, %r52, 0;
-       or.pred         %p109, %p107, %p108;
-       @!%p109 bra     BB30_129;
-       bra.uni         BB30_105;
-
-BB30_105:
-       cvt.rmi.f64.f64 %fd84, %fd106;
+       setp.eq.s32     %p103, %r55, 0;
+       @%p103 bra      BB30_141;
+
+BB30_113:
+       cvt.rmi.f64.f64 %fd84, %fd98;
        mul.f64         %fd85, %fd84, %fd68;
-       sub.f64         %fd106, %fd1, %fd85;
-       bra.uni         BB30_129;
+       sub.f64         %fd98, %fd1, %fd85;
+       bra.uni         BB30_141;
 
 BB30_6:
        setp.eq.s32     %p30, %r6, 1;
        @%p30 bra       BB30_7;
-       bra.uni         BB30_65;
+       bra.uni         BB30_71;
 
 BB30_7:
-       sub.f64         %fd98, %fd68, %fd1;
-       bra.uni         BB30_65;
+       sub.f64         %fd94, %fd68, %fd1;
+       bra.uni         BB30_71;
 
 BB30_22:
        setp.eq.s32     %p18, %r6, 10;
        @%p18 bra       BB30_23;
-       bra.uni         BB30_65;
+       bra.uni         BB30_71;
 
 BB30_23:
-       setp.neu.f64    %p45, %fd1, %fd68;
-       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
-       bra.uni         BB30_65;
+       setp.neu.f64    %p43, %fd1, %fd68;
+       selp.f64        %fd94, 0d3FF0000000000000, 0d0000000000000000, %p43;
+       bra.uni         BB30_71;
 
 BB30_13:
        setp.eq.s32     %p25, %r6, 5;
        @%p25 bra       BB30_14;
-       bra.uni         BB30_65;
+       bra.uni         BB30_71;
 
 BB30_14:
-       setp.gt.f64     %p50, %fd1, %fd68;
-       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
-       bra.uni         BB30_65;
+       setp.gt.f64     %p48, %fd1, %fd68;
+       selp.f64        %fd94, 0d3FF0000000000000, 0d0000000000000000, %p48;
+       bra.uni         BB30_71;
 
 BB30_30:
        setp.eq.s32     %p12, %r6, 15;
        @%p12 bra       BB30_31;
-       bra.uni         BB30_65;
+       bra.uni         BB30_71;
 
 BB30_31:
        mul.f64         %fd76, %fd1, %fd68;
        mov.f64         %fd77, 0d3FF0000000000000;
-       sub.f64         %fd98, %fd77, %fd76;
-       bra.uni         BB30_65;
+       sub.f64         %fd94, %fd77, %fd76;
+       bra.uni         BB30_71;
 
 BB30_9:
        setp.eq.s32     %p28, %r6, 3;
        @%p28 bra       BB30_10;
-       bra.uni         BB30_65;
+       bra.uni         BB30_71;
 
 BB30_10:
-       div.rn.f64      %fd98, %fd68, %fd1;
-       bra.uni         BB30_65;
+       div.rn.f64      %fd94, %fd68, %fd1;
+       bra.uni         BB30_71;
 
-BB30_45:
-       min.f64         %fd98, %fd68, %fd1;
-       bra.uni         BB30_65;
+BB30_47:
+       min.f64         %fd94, %fd68, %fd1;
+       bra.uni         BB30_71;
 
 BB30_26:
        setp.eq.s32     %p16, %r6, 13;
        @%p16 bra       BB30_27;
-       bra.uni         BB30_65;
+       bra.uni         BB30_71;
 
 BB30_27:
        cvt.rni.s64.f64 %rd12, %fd68;
-       cvt.rni.s64.f64 %rd13, %fd1;
        cvt.u32.u64     %r21, %rd12;
+       cvt.rni.s64.f64 %rd13, %fd1;
        cvt.u32.u64     %r22, %rd13;
        and.b32         %r23, %r22, %r21;
-       setp.eq.s32     %p44, %r23, 0;
-       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
-       bra.uni         BB30_65;
+       setp.eq.s32     %p42, %r23, 0;
+       selp.f64        %fd94, 0d0000000000000000, 0d3FF0000000000000, %p42;
+       bra.uni         BB30_71;
 
-BB30_48:
-       setp.ge.f64     %p49, %fd1, %fd68;
-       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p49;
-       bra.uni         BB30_65;
+BB30_50:
+       setp.ltu.f64    %p47, %fd1, %fd68;
+       selp.f64        %fd94, 0d0000000000000000, 0d3FF0000000000000, %p47;
+       bra.uni         BB30_71;
 
 BB30_17:
        setp.eq.s32     %p23, %r6, 8;
        @%p23 bra       BB30_18;
-       bra.uni         BB30_65;
+       bra.uni         BB30_71;
 
 BB30_18:
-       setp.le.f64     %p47, %fd1, %fd68;
-       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p47;
-       bra.uni         BB30_65;
+       setp.gtu.f64    %p45, %fd1, %fd68;
+       selp.f64        %fd94, 0d0000000000000000, 0d3FF0000000000000, %p45;
+       bra.uni         BB30_71;
 
-BB30_42:
-       setp.neu.f64    %p42, %fd68, 0d0000000000000000;
+BB30_44:
+       setp.neu.f64    %p40, %fd68, 0d0000000000000000;
        sub.f64         %fd75, %fd68, %fd1;
-       selp.f64        %fd98, %fd75, 0d0000000000000000, %p42;
-       bra.uni         BB30_65;
+       selp.f64        %fd94, %fd75, 0d0000000000000000, %p40;
+       bra.uni         BB30_71;
 
 BB30_34:
        setp.ne.s32     %p10, %r6, 18;
-       @%p10 bra       BB30_65;
+       @%p10 bra       BB30_71;
 
-       div.rn.f64      %fd98, %fd68, %fd1;
-       abs.f64         %fd70, %fd98;
+       div.rn.f64      %fd94, %fd68, %fd1;
+       abs.f64         %fd70, %fd94;
        setp.gtu.f64    %p31, %fd70, 0d7FF0000000000000;
-       @%p31 bra       BB30_65;
+       @%p31 bra       BB30_71;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r12, %temp}, %fd98;
+       mov.b64         {%temp, %r12}, %fd94;
        }
+       and.b32         %r13, %r12, 2147483647;
+       setp.ne.s32     %p32, %r13, 2146435072;
+       @%p32 bra       BB30_38;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r13}, %fd98;
+       mov.b64         {%r14, %temp}, %fd94;
        }
-       and.b32         %r14, %r13, 2147483647;
-       setp.ne.s32     %p32, %r14, 2146435072;
-       setp.ne.s32     %p33, %r12, 0;
-       or.pred         %p34, %p32, %p33;
-       @!%p34 bra      BB30_65;
-       bra.uni         BB30_37;
+       setp.eq.s32     %p33, %r14, 0;
+       @%p33 bra       BB30_71;
 
-BB30_37:
-       cvt.rmi.f64.f64 %fd98, %fd98;
-       bra.uni         BB30_65;
+BB30_38:
+       cvt.rmi.f64.f64 %fd94, %fd94;
+       bra.uni         BB30_71;
 
-BB30_70:
-       setp.eq.s32     %p98, %r6, 1;
-       @%p98 bra       BB30_71;
-       bra.uni         BB30_129;
+BB30_76:
+       setp.eq.s32     %p94, %r6, 1;
+       @%p94 bra       BB30_77;
+       bra.uni         BB30_141;
 
-BB30_71:
-       sub.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB30_129;
+BB30_77:
+       sub.f64         %fd98, %fd1, %fd68;
+       bra.uni         BB30_141;
 
-BB30_86:
-       setp.eq.s32     %p86, %r6, 10;
-       @%p86 bra       BB30_87;
-       bra.uni         BB30_129;
+BB30_92:
+       setp.eq.s32     %p82, %r6, 10;
+       @%p82 bra       BB30_93;
+       bra.uni         BB30_141;
 
-BB30_87:
-       setp.neu.f64    %p113, %fd1, %fd68;
-       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p113;
-       bra.uni         BB30_129;
+BB30_93:
+       setp.neu.f64    %p107, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p107;
+       bra.uni         BB30_141;
 
-BB30_77:
-       setp.eq.s32     %p93, %r6, 5;
-       @%p93 bra       BB30_78;
-       bra.uni         BB30_129;
+BB30_83:
+       setp.eq.s32     %p89, %r6, 5;
+       @%p89 bra       BB30_84;
+       bra.uni         BB30_141;
 
-BB30_78:
-       setp.lt.f64     %p118, %fd1, %fd68;
-       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p118;
-       bra.uni         BB30_129;
+BB30_84:
+       setp.lt.f64     %p112, %fd1, %fd68;
+       selp.f64        %fd98, 0d3FF0000000000000, 0d0000000000000000, %p112;
+       bra.uni         BB30_141;
 
-BB30_94:
-       setp.eq.s32     %p80, %r6, 15;
-       @%p80 bra       BB30_95;
-       bra.uni         BB30_129;
+BB30_100:
+       setp.eq.s32     %p76, %r6, 15;
+       @%p76 bra       BB30_101;
+       bra.uni         BB30_141;
 
-BB30_95:
+BB30_101:
        mul.f64         %fd87, %fd1, %fd68;
        mov.f64         %fd88, 0d3FF0000000000000;
-       sub.f64         %fd106, %fd88, %fd87;
-       bra.uni         BB30_129;
+       sub.f64         %fd98, %fd88, %fd87;
+       bra.uni         BB30_141;
 
-BB30_73:
-       setp.eq.s32     %p96, %r6, 3;
-       @%p96 bra       BB30_74;
-       bra.uni         BB30_129;
+BB30_79:
+       setp.eq.s32     %p92, %r6, 3;
+       @%p92 bra       BB30_80;
+       bra.uni         BB30_141;
 
-BB30_74:
-       div.rn.f64      %fd106, %fd1, %fd68;
-       bra.uni         BB30_129;
+BB30_80:
+       div.rn.f64      %fd98, %fd1, %fd68;
+       bra.uni         BB30_141;
 
-BB30_109:
-       min.f64         %fd106, %fd1, %fd68;
-       bra.uni         BB30_129;
+BB30_117:
+       min.f64         %fd98, %fd1, %fd68;
+       bra.uni         BB30_141;
 
-BB30_90:
-       setp.eq.s32     %p84, %r6, 13;
-       @%p84 bra       BB30_91;
-       bra.uni         BB30_129;
+BB30_96:
+       setp.eq.s32     %p80, %r6, 13;
+       @%p80 bra       BB30_97;
+       bra.uni         BB30_141;
 
-BB30_91:
+BB30_97:
        cvt.rni.s64.f64 %rd17, %fd1;
+       cvt.u32.u64     %r59, %rd17;
        cvt.rni.s64.f64 %rd18, %fd68;
-       cvt.u32.u64     %r58, %rd17;
-       cvt.u32.u64     %r59, %rd18;
-       and.b32         %r60, %r59, %r58;
-       setp.eq.s32     %p112, %r60, 0;
-       selp.f64        %fd106, 0d0000000000000000, 0d3FF0000000000000, %p112;
-       bra.uni         BB30_129;
-
-BB30_112:
-       setp.le.f64     %p117, %fd1, %fd68;
-       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p117;
-       bra.uni         BB30_129;
+       cvt.u32.u64     %r60, %rd18;
+       and.b32         %r61, %r60, %r59;
+       setp.eq.s32     %p106, %r61, 0;
+       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p106;
+       bra.uni         BB30_141;
 
-BB30_81:
-       setp.eq.s32     %p91, %r6, 8;
-       @%p91 bra       BB30_82;
-       bra.uni         BB30_129;
+BB30_120:
+       setp.gtu.f64    %p111, %fd1, %fd68;
+       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p111;
+       bra.uni         BB30_141;
+
+BB30_87:
+       setp.eq.s32     %p87, %r6, 8;
+       @%p87 bra       BB30_88;
+       bra.uni         BB30_141;
 
-BB30_82:
-       setp.ge.f64     %p115, %fd1, %fd68;
-       selp.f64        %fd106, 0d3FF0000000000000, 0d0000000000000000, %p115;
-       bra.uni         BB30_129;
+BB30_88:
+       setp.ltu.f64    %p109, %fd1, %fd68;
+       selp.f64        %fd98, 0d0000000000000000, 0d3FF0000000000000, %p109;
+       bra.uni         BB30_141;
 
-BB30_106:
-       setp.neu.f64    %p110, %fd1, 0d0000000000000000;
+BB30_114:
+       setp.neu.f64    %p104, %fd1, 0d0000000000000000;
        sub.f64         %fd86, %fd1, %fd68;
-       selp.f64        %fd106, %fd86, 0d0000000000000000, %p110;
-       bra.uni         BB30_129;
+       selp.f64        %fd98, %fd86, 0d0000000000000000, %p104;
+       bra.uni         BB30_141;
 
-BB30_98:
-       setp.ne.s32     %p78, %r6, 18;
-       @%p78 bra       BB30_129;
+BB30_104:
+       setp.ne.s32     %p74, %r6, 18;
+       @%p74 bra       BB30_141;
 
-       div.rn.f64      %fd106, %fd1, %fd68;
-       abs.f64         %fd81, %fd106;
-       setp.gtu.f64    %p99, %fd81, 0d7FF0000000000000;
-       @%p99 bra       BB30_129;
+       div.rn.f64      %fd98, %fd1, %fd68;
+       abs.f64         %fd81, %fd98;
+       setp.gtu.f64    %p95, %fd81, 0d7FF0000000000000;
+       @%p95 bra       BB30_141;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r49, %temp}, %fd106;
+       mov.b64         {%temp, %r50}, %fd98;
        }
+       and.b32         %r51, %r50, 2147483647;
+       setp.ne.s32     %p96, %r51, 2146435072;
+       @%p96 bra       BB30_108;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r50}, %fd106;
+       mov.b64         {%r52, %temp}, %fd98;
        }
-       and.b32         %r51, %r50, 2147483647;
-       setp.ne.s32     %p100, %r51, 2146435072;
-       setp.ne.s32     %p101, %r49, 0;
-       or.pred         %p102, %p100, %p101;
-       @!%p102 bra     BB30_129;
-       bra.uni         BB30_101;
+       setp.eq.s32     %p97, %r52, 0;
+       @%p97 bra       BB30_141;
 
-BB30_101:
-       cvt.rmi.f64.f64 %fd106, %fd106;
-       bra.uni         BB30_129;
+BB30_108:
+       cvt.rmi.f64.f64 %fd98, %fd98;
+       bra.uni         BB30_141;
 
-BB30_52:
-       setp.gt.s32     %p54, %r2, -1;
-       @%p54 bra       BB30_55;
+BB30_54:
+       setp.gt.s32     %p52, %r2, -1;
+       @%p52 bra       BB30_57;
 
        cvt.rzi.f64.f64 %fd78, %fd1;
-       setp.neu.f64    %p55, %fd78, %fd1;
-       selp.f64        %fd96, 0dFFF8000000000000, %fd96, %p55;
+       setp.neu.f64    %p53, %fd78, %fd1;
+       selp.f64        %fd24, 0dFFF8000000000000, %fd24, %p53;
 
-BB30_55:
-       mov.f64         %fd24, %fd96;
-       add.f64         %fd25, %fd1, %fd68;
+BB30_57:
+       add.f64         %fd93, %fd1, %fd68;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r33}, %fd25;
+       mov.b64         {%temp, %r33}, %fd93;
        }
        and.b32         %r34, %r33, 2146435072;
-       setp.ne.s32     %p58, %r34, 2146435072;
-       mov.f64         %fd95, %fd24;
-       @%p58 bra       BB30_62;
+       setp.ne.s32     %p56, %r34, 2146435072;
+       @%p56 bra       BB30_58;
 
-       setp.gtu.f64    %p59, %fd18, 0d7FF0000000000000;
-       mov.f64         %fd95, %fd25;
-       @%p59 bra       BB30_62;
+       setp.gtu.f64    %p57, %fd18, 0d7FF0000000000000;
+       @%p57 bra       BB30_68;
 
        abs.f64         %fd79, %fd1;
-       setp.gtu.f64    %p60, %fd79, 0d7FF0000000000000;
-       mov.f64         %fd94, %fd25;
-       mov.f64         %fd95, %fd94;
-       @%p60 bra       BB30_62;
+       setp.gtu.f64    %p58, %fd79, 0d7FF0000000000000;
+       @%p58 bra       BB30_68;
+
+       and.b32         %r35, %r3, 2147483647;
+       setp.ne.s32     %p59, %r35, 2146435072;
+       @%p59 bra       BB30_63;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r35, %temp}, %fd1;
-       }
-       and.b32         %r36, %r3, 2147483647;
-       setp.eq.s32     %p61, %r36, 2146435072;
-       setp.eq.s32     %p62, %r35, 0;
-       and.pred        %p63, %p61, %p62;
-       @%p63 bra       BB30_61;
-       bra.uni         BB30_59;
-
-BB30_61:
-       setp.gt.f64     %p67, %fd18, 0d3FF0000000000000;
-       selp.b32        %r44, 2146435072, 0, %p67;
-       xor.b32         %r45, %r44, 2146435072;
-       setp.lt.s32     %p68, %r3, 0;
-       selp.b32        %r46, %r45, %r44, %p68;
-       setp.eq.f64     %p69, %fd68, 0dBFF0000000000000;
-       selp.b32        %r47, 1072693248, %r46, %p69;
-       mov.u32         %r48, 0;
-       mov.b64         %fd95, {%r48, %r47};
-       bra.uni         BB30_62;
-
-BB30_116:
-       setp.gt.s32     %p122, %r4, -1;
-       @%p122 bra      BB30_119;
+       mov.b64         {%r36, %temp}, %fd1;
+       }
+       setp.eq.s32     %p60, %r36, 0;
+       @%p60 bra       BB30_67;
 
-       cvt.rzi.f64.f64 %fd89, %fd68;
-       setp.neu.f64    %p123, %fd89, %fd68;
-       selp.f64        %fd104, 0dFFF8000000000000, %fd104, %p123;
+BB30_63:
+       and.b32         %r37, %r2, 2147483647;
+       setp.ne.s32     %p61, %r37, 2146435072;
+       @%p61 bra       BB30_64;
 
-BB30_119:
-       mov.f64         %fd57, %fd104;
-       add.f64         %fd58, %fd1, %fd68;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r70}, %fd58;
+       mov.b64         {%r38, %temp}, %fd68;
        }
-       and.b32         %r71, %r70, 2146435072;
-       setp.ne.s32     %p126, %r71, 2146435072;
-       mov.f64         %fd103, %fd57;
-       @%p126 bra      BB30_126;
+       setp.ne.s32     %p62, %r38, 0;
+       mov.f64         %fd93, %fd24;
+       @%p62 bra       BB30_68;
 
-       setp.gtu.f64    %p127, %fd51, 0d7FF0000000000000;
-       mov.f64         %fd103, %fd58;
-       @%p127 bra      BB30_126;
+       shr.s32         %r39, %r3, 31;
+       and.b32         %r40, %r39, -2146435072;
+       add.s32         %r41, %r40, 2146435072;
+       or.b32          %r42, %r41, -2147483648;
+       selp.b32        %r43, %r42, %r41, %p1;
+       mov.u32         %r44, 0;
+       mov.b64         %fd93, {%r44, %r43};
+       bra.uni         BB30_68;
 
-       abs.f64         %fd90, %fd68;
-       setp.gtu.f64    %p128, %fd90, 0d7FF0000000000000;
-       mov.f64         %fd102, %fd58;
-       mov.f64         %fd103, %fd102;
-       @%p128 bra      BB30_126;
+BB30_58:
+       mov.f64         %fd93, %fd24;
 
+BB30_68:
+       setp.eq.f64     %p66, %fd1, 0d0000000000000000;
+       setp.eq.f64     %p67, %fd68, 0d3FF0000000000000;
+       or.pred         %p68, %p67, %p66;
+       selp.f64        %fd94, 0d3FF0000000000000, %fd93, %p68;
+
+BB30_71:
+       st.global.f64   [%rd1], %fd94;
+       bra.uni         BB30_142;
+
+BB30_124:
+       setp.gt.s32     %p116, %r4, -1;
+       @%p116 bra      BB30_127;
+
+       cvt.rzi.f64.f64 %fd89, %fd68;
+       setp.neu.f64    %p117, %fd89, %fd68;
+       selp.f64        %fd57, 0dFFF8000000000000, %fd57, %p117;
+
+BB30_127:
+       add.f64         %fd97, %fd1, %fd68;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r72, %temp}, %fd68;
+       mov.b64         {%temp, %r71}, %fd97;
        }
+       and.b32         %r72, %r71, 2146435072;
+       setp.ne.s32     %p120, %r72, 2146435072;
+       @%p120 bra      BB30_128;
+
+       setp.gtu.f64    %p121, %fd51, 0d7FF0000000000000;
+       @%p121 bra      BB30_138;
+
+       abs.f64         %fd90, %fd68;
+       setp.gtu.f64    %p122, %fd90, 0d7FF0000000000000;
+       @%p122 bra      BB30_138;
+
        and.b32         %r73, %r5, 2147483647;
-       setp.eq.s32     %p129, %r73, 2146435072;
-       setp.eq.s32     %p130, %r72, 0;
-       and.pred        %p131, %p129, %p130;
-       @%p131 bra      BB30_125;
-       bra.uni         BB30_123;
-
-BB30_125:
-       setp.gt.f64     %p135, %fd51, 0d3FF0000000000000;
-       selp.b32        %r81, 2146435072, 0, %p135;
-       xor.b32         %r82, %r81, 2146435072;
-       setp.lt.s32     %p136, %r5, 0;
-       selp.b32        %r83, %r82, %r81, %p136;
-       setp.eq.f64     %p137, %fd1, 0dBFF0000000000000;
-       selp.b32        %r84, 1072693248, %r83, %p137;
-       mov.u32         %r85, 0;
-       mov.b64         %fd103, {%r85, %r84};
-       bra.uni         BB30_126;
-
-BB30_59:
+       setp.ne.s32     %p123, %r73, 2146435072;
+       @%p123 bra      BB30_133;
+
        {
        .reg .b32 %temp; 
-       mov.b64         {%r37, %temp}, %fd68;
+       mov.b64         {%r74, %temp}, %fd68;
        }
-       and.b32         %r38, %r2, 2147483647;
-       setp.eq.s32     %p64, %r38, 2146435072;
-       setp.eq.s32     %p65, %r37, 0;
-       and.pred        %p66, %p64, %p65;
-       mov.f64         %fd95, %fd24;
-       @!%p66 bra      BB30_62;
-       bra.uni         BB30_60;
-
-BB30_60:
-       shr.s32         %r39, %r3, 31;
-       and.b32         %r40, %r39, -2146435072;
-       selp.b32        %r41, -1048576, 2146435072, %p1;
-       add.s32         %r42, %r41, %r40;
-       mov.u32         %r43, 0;
-       mov.b64         %fd95, {%r43, %r42};
-
-BB30_62:
-       setp.eq.f64     %p70, %fd1, 0d0000000000000000;
-       setp.eq.f64     %p71, %fd68, 0d3FF0000000000000;
-       or.pred         %p72, %p71, %p70;
-       selp.f64        %fd98, 0d3FF0000000000000, %fd95, %p72;
+       setp.eq.s32     %p124, %r74, 0;
+       @%p124 bra      BB30_137;
 
-BB30_65:
-       st.global.f64   [%rd1], %fd98;
-       bra.uni         BB30_130;
+BB30_133:
+       and.b32         %r75, %r4, 2147483647;
+       setp.ne.s32     %p125, %r75, 2146435072;
+       @%p125 bra      BB30_134;
 
-BB30_123:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r74, %temp}, %fd1;
+       mov.b64         {%r76, %temp}, %fd1;
        }
-       and.b32         %r75, %r4, 2147483647;
-       setp.eq.s32     %p132, %r75, 2146435072;
-       setp.eq.s32     %p133, %r74, 0;
-       and.pred        %p134, %p132, %p133;
-       mov.f64         %fd103, %fd57;
-       @!%p134 bra     BB30_126;
-       bra.uni         BB30_124;
+       setp.ne.s32     %p126, %r76, 0;
+       mov.f64         %fd97, %fd57;
+       @%p126 bra      BB30_138;
 
-BB30_124:
-       shr.s32         %r76, %r5, 31;
-       and.b32         %r77, %r76, -2146435072;
-       selp.b32        %r78, -1048576, 2146435072, %p2;
-       add.s32         %r79, %r78, %r77;
-       mov.u32         %r80, 0;
-       mov.b64         %fd103, {%r80, %r79};
+       shr.s32         %r77, %r5, 31;
+       and.b32         %r78, %r77, -2146435072;
+       add.s32         %r79, %r78, 2146435072;
+       or.b32          %r80, %r79, -2147483648;
+       selp.b32        %r81, %r80, %r79, %p2;
+       mov.u32         %r82, 0;
+       mov.b64         %fd97, {%r82, %r81};
+       bra.uni         BB30_138;
 
-BB30_126:
-       setp.eq.f64     %p138, %fd68, 0d0000000000000000;
-       setp.eq.f64     %p139, %fd1, 0d3FF0000000000000;
-       or.pred         %p140, %p139, %p138;
-       selp.f64        %fd106, 0d3FF0000000000000, %fd103, %p140;
+BB30_128:
+       mov.f64         %fd97, %fd57;
+
+BB30_138:
+       setp.eq.f64     %p130, %fd68, 0d0000000000000000;
+       setp.eq.f64     %p131, %fd1, 0d3FF0000000000000;
+       or.pred         %p132, %p131, %p130;
+       selp.f64        %fd98, 0d3FF0000000000000, %fd97, %p132;
 
-BB30_129:

<TRUNCATED>

Reply via email to