[SYSTEMML-1411] Add CPU and GPU bias multiply operation

This operation is similar to bias_add, except that it performs
element-wise multiplication rather than addition. Fusing the pattern
into a single operator avoids an unnecessary multiplication with a
matrix of ones and the subsequent reshape that would otherwise be
needed to broadcast the bias. The pattern is common in deep learning
layers such as batch normalization; a reference sketch of the
semantics is given below.
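
For reference only, and assuming the same input layout as bias_add
(a row-major rlen x clen matrix whose clen = C*PQ columns hold C
channels of PQ spatial values each), the semantics of the fused
operator can be sketched in plain C++ as follows. The helper name
bias_multiply_ref is hypothetical and simply mirrors the CUDA kernel
added by this commit:

    // Illustrative reference loop (hypothetical helper, not part of the commit):
    // ret[i, j] = input[i, j] * bias[j / PQ], i.e. one scale factor per channel.
    void bias_multiply_ref(const double* input, const double* bias, double* ret,
                           int rlen, int clen, int PQ) {
      for (int i = 0; i < rlen; i++) {
        for (int j = 0; j < clen; j++) {
          int index = i * clen + j;                  // row-major offset
          ret[index] = input[index] * bias[j / PQ];  // broadcast bias over P*Q
        }
      }
    }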

Closes #433.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/d127dfa2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/d127dfa2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/d127dfa2

Branch: refs/heads/master
Commit: d127dfa2d3e8a8c58b742e1722f797a9f6968955
Parents: 6e7e887
Author: Niketan Pansare <[email protected]>
Authored: Fri Mar 17 12:33:49 2017 -0800
Committer: Niketan Pansare <[email protected]>
Committed: Fri Mar 17 13:33:49 2017 -0700

----------------------------------------------------------------------
 src/main/cpp/kernels/SystemML.cu                |   12 +
 src/main/cpp/kernels/SystemML.ptx               | 1312 +++++++++---------
 .../org/apache/sysml/hops/ConvolutionOp.java    |    6 +-
 src/main/java/org/apache/sysml/hops/Hop.java    |    3 +-
 .../apache/sysml/lops/ConvolutionTransform.java |    7 +-
 .../sysml/parser/BuiltinFunctionExpression.java |    3 +
 .../org/apache/sysml/parser/DMLTranslator.java  |    9 +
 .../org/apache/sysml/parser/Expression.java     |    2 +-
 .../instructions/CPInstructionParser.java       |    1 +
 .../instructions/GPUInstructionParser.java      |    1 +
 .../cp/ConvolutionCPInstruction.java            |   36 +-
 .../gpu/ConvolutionGPUInstruction.java          |   17 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |   37 +
 .../sysml/runtime/matrix/data/LibMatrixDNN.java |   95 +-
 14 files changed, 895 insertions(+), 646 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 7bb2c34..eca2a49 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -111,6 +111,18 @@ __global__ void bias_add(double* input,  double* bias, double* ret, int rlen, in
        }
 }
 
+// Performs a similar operation to bias_add, except element-wise multiplication instead of addition
+extern "C"
+__global__ void bias_multiply(double* input,  double* bias, double* ret, int rlen, int clen, int PQ) {
+       int ix = blockIdx.x * blockDim.x + threadIdx.x;
+       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+       if(ix < rlen && iy < clen) {
+               int index = ix * clen + iy;
+               int biasIndex = iy / PQ;
+               ret[index] = input[index] * bias[biasIndex];
+       }
+}
+
 // Compares the value and set
 extern "C"
 __global__ void compare_and_set(double* A,  double* ret, int rlen, int clen, double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) {
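
For illustration only, a minimal standalone host-side launch of the new
bias_multiply kernel with a 2D grid over (rlen x clen) could look like the
sketch below. The wrapper is hypothetical: in this commit the kernel is
actually invoked from LibMatrixCUDA.java, and the block/grid sizes here are
assumptions rather than the values used by SystemML. The sketch assumes it
is compiled together with SystemML.cu (or with relocatable device code).

    #include <cuda_runtime.h>

    // Forward declaration of the kernel defined in SystemML.cu above.
    extern "C" __global__ void bias_multiply(double* input, double* bias,
                                             double* ret, int rlen, int clen,
                                             int PQ);

    // Hypothetical host-side wrapper; all pointers are device pointers.
    void launch_bias_multiply(double* dInput, double* dBias, double* dRet,
                              int rlen, int clen, int PQ) {
      dim3 block(32, 32);                          // assumed 2D thread block
      dim3 grid((rlen + block.x - 1) / block.x,    // x dimension covers rlen rows
                (clen + block.y - 1) / block.y);   // y dimension covers clen columns
      bias_multiply<<<grid, block>>>(dInput, dBias, dRet, rlen, clen, PQ);
      cudaDeviceSynchronize();                     // wait for the kernel to finish
    }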

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index efaf29b..3fd5c07 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21124049
-// Cuda compilation tools, release 8.0, V8.0.44
+// Compiler Build ID: CL-21112126
+// Cuda compilation tools, release 8.0, V8.0.43
 // Based on LLVM 3.4svn
 //
 
@@ -227,6 +227,62 @@ BB3_2:
        ret;
 }
 
+       // .globl       bias_multiply
+.visible .entry bias_multiply(
+       .param .u64 bias_multiply_param_0,
+       .param .u64 bias_multiply_param_1,
+       .param .u64 bias_multiply_param_2,
+       .param .u32 bias_multiply_param_3,
+       .param .u32 bias_multiply_param_4,
+       .param .u32 bias_multiply_param_5
+)
+{
+       .reg .pred      %p<4>;
+       .reg .b32       %r<14>;
+       .reg .f64       %fd<4>;
+       .reg .b64       %rd<12>;
+
+
+       ld.param.u64    %rd1, [bias_multiply_param_0];
+       ld.param.u64    %rd2, [bias_multiply_param_1];
+       ld.param.u64    %rd3, [bias_multiply_param_2];
+       ld.param.u32    %r5, [bias_multiply_param_3];
+       ld.param.u32    %r3, [bias_multiply_param_4];
+       ld.param.u32    %r4, [bias_multiply_param_5];
+       mov.u32         %r6, %ctaid.x;
+       mov.u32         %r7, %ntid.x;
+       mov.u32         %r8, %tid.x;
+       mad.lo.s32      %r1, %r7, %r6, %r8;
+       mov.u32         %r9, %ntid.y;
+       mov.u32         %r10, %ctaid.y;
+       mov.u32         %r11, %tid.y;
+       mad.lo.s32      %r2, %r9, %r10, %r11;
+       setp.lt.s32     %p1, %r1, %r5;
+       setp.lt.s32     %p2, %r2, %r3;
+       and.pred        %p3, %p1, %p2;
+       @!%p3 bra       BB4_2;
+       bra.uni         BB4_1;
+
+BB4_1:
+       cvta.to.global.u64      %rd4, %rd1;
+       mad.lo.s32      %r12, %r1, %r3, %r2;
+       mul.wide.s32    %rd5, %r12, 8;
+       add.s64         %rd6, %rd4, %rd5;
+       div.s32         %r13, %r2, %r4;
+       cvta.to.global.u64      %rd7, %rd2;
+       mul.wide.s32    %rd8, %r13, 8;
+       add.s64         %rd9, %rd7, %rd8;
+       ld.global.f64   %fd1, [%rd9];
+       ld.global.f64   %fd2, [%rd6];
+       mul.f64         %fd3, %fd2, %fd1;
+       cvta.to.global.u64      %rd10, %rd3;
+       add.s64         %rd11, %rd10, %rd5;
+       st.global.f64   [%rd11], %fd3;
+
+BB4_2:
+       ret;
+}
+
        // .globl       compare_and_set
 .visible .entry compare_and_set(
        .param .u64 compare_and_set_param_0,
@@ -267,10 +323,10 @@ BB3_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.lt.s32     %p2, %r11, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB4_6;
-       bra.uni         BB4_1;
+       @!%p3 bra       BB5_6;
+       bra.uni         BB5_1;
 
-BB4_1:
+BB5_1:
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.s32    %rd5, %r1, 8;
        add.s64         %rd6, %rd4, %rd5;
@@ -280,26 +336,26 @@ BB4_1:
        setp.lt.f64     %p4, %fd8, %fd3;
        cvta.to.global.u64      %rd7, %rd3;
        add.s64         %rd1, %rd7, %rd5;
-       @%p4 bra        BB4_5;
-       bra.uni         BB4_2;
+       @%p4 bra        BB5_5;
+       bra.uni         BB5_2;
 
-BB4_5:
+BB5_5:
        st.global.f64   [%rd1], %fd4;
-       bra.uni         BB4_6;
+       bra.uni         BB5_6;
 
-BB4_2:
+BB5_2:
        setp.lt.f64     %p5, %fd1, %fd2;
-       @%p5 bra        BB4_4;
-       bra.uni         BB4_3;
+       @%p5 bra        BB5_4;
+       bra.uni         BB5_3;
 
-BB4_4:
+BB5_4:
        st.global.f64   [%rd1], %fd5;
-       bra.uni         BB4_6;
+       bra.uni         BB5_6;
 
-BB4_3:
+BB5_3:
        st.global.f64   [%rd1], %fd6;
 
-BB4_6:
+BB5_6:
        ret;
 }
 
@@ -340,42 +396,42 @@ BB4_6:
        setp.lt.s32     %p2, %r1, %r14;
        setp.lt.s32     %p3, %r2, %r10;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB5_53;
-       bra.uni         BB5_1;
+       @!%p4 bra       BB6_53;
+       bra.uni         BB6_1;
 
-BB5_1:
+BB6_1:
        mad.lo.s32      %r3, %r1, %r10, %r2;
        setp.eq.s32     %p5, %r11, 1;
        mov.u32         %r53, %r1;
-       @%p5 bra        BB5_5;
+       @%p5 bra        BB6_5;
 
        setp.ne.s32     %p6, %r11, 2;
        mov.u32         %r54, %r3;
-       @%p6 bra        BB5_4;
+       @%p6 bra        BB6_4;
 
        mov.u32         %r54, %r2;
 
-BB5_4:
+BB6_4:
        mov.u32         %r48, %r54;
        mov.u32         %r4, %r48;
        mov.u32         %r53, %r4;
 
-BB5_5:
+BB6_5:
        mov.u32         %r5, %r53;
        setp.eq.s32     %p7, %r12, 1;
        mov.u32         %r51, %r1;
-       @%p7 bra        BB5_9;
+       @%p7 bra        BB6_9;
 
        setp.ne.s32     %p8, %r12, 2;
        mov.u32         %r52, %r3;
-       @%p8 bra        BB5_8;
+       @%p8 bra        BB6_8;
 
        mov.u32         %r52, %r2;
 
-BB5_8:
+BB6_8:
        mov.u32         %r51, %r52;
 
-BB5_9:
+BB6_9:
        cvta.to.global.u64      %rd5, %rd3;
        cvta.to.global.u64      %rd6, %rd2;
        mul.wide.s32    %rd7, %r5, 8;
@@ -386,47 +442,47 @@ BB5_9:
        ld.global.f64   %fd2, [%rd10];
        mov.f64         %fd38, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p9, %r13, 5;
-       @%p9 bra        BB5_19;
+       @%p9 bra        BB6_19;
 
        setp.gt.s32     %p19, %r13, 2;
-       @%p19 bra       BB5_15;
+       @%p19 bra       BB6_15;
 
        setp.eq.s32     %p23, %r13, 0;
-       @%p23 bra       BB5_51;
+       @%p23 bra       BB6_51;
 
        setp.eq.s32     %p24, %r13, 1;
-       @%p24 bra       BB5_50;
-       bra.uni         BB5_13;
+       @%p24 bra       BB6_50;
+       bra.uni         BB6_13;
 
-BB5_50:
+BB6_50:
        sub.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB5_52;
+       bra.uni         BB6_52;
 
-BB5_19:
+BB6_19:
        setp.gt.s32     %p10, %r13, 8;
-       @%p10 bra       BB5_24;
+       @%p10 bra       BB6_24;
 
        setp.eq.s32     %p16, %r13, 6;
-       @%p16 bra       BB5_34;
+       @%p16 bra       BB6_34;
 
        setp.eq.s32     %p17, %r13, 7;
-       @%p17 bra       BB5_33;
-       bra.uni         BB5_22;
+       @%p17 bra       BB6_33;
+       bra.uni         BB6_22;
 
-BB5_33:
+BB6_33:
        setp.gt.f64     %p29, %fd1, %fd2;
        selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29;
-       bra.uni         BB5_52;
+       bra.uni         BB6_52;
 
-BB5_15:
+BB6_15:
        setp.eq.s32     %p20, %r13, 3;
-       @%p20 bra       BB5_49;
+       @%p20 bra       BB6_49;
 
        setp.eq.s32     %p21, %r13, 4;
-       @%p21 bra       BB5_35;
-       bra.uni         BB5_17;
+       @%p21 bra       BB6_35;
+       bra.uni         BB6_17;
 
-BB5_35:
+BB6_35:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r8}, %fd1;
@@ -462,10 +518,10 @@ BB5_35:
        }// Callseq End 0
        setp.lt.s32     %p33, %r8, 0;
        and.pred        %p1, %p33, %p32;
-       @!%p1 bra       BB5_37;
-       bra.uni         BB5_36;
+       @!%p1 bra       BB6_37;
+       bra.uni         BB6_36;
 
-BB5_36:
+BB6_36:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r23}, %fd37;
@@ -477,111 +533,111 @@ BB5_36:
        }
        mov.b64         %fd37, {%r25, %r24};
 
-BB5_37:
+BB6_37:
        mov.f64         %fd36, %fd37;
        setp.eq.f64     %p34, %fd1, 0d0000000000000000;
-       @%p34 bra       BB5_40;
-       bra.uni         BB5_38;
+       @%p34 bra       BB6_40;
+       bra.uni         BB6_38;
 
-BB5_40:
+BB6_40:
        selp.b32        %r26, %r8, 0, %p32;
        or.b32          %r27, %r26, 2146435072;
        setp.lt.s32     %p38, %r9, 0;
        selp.b32        %r28, %r27, %r26, %p38;
        mov.u32         %r29, 0;
        mov.b64         %fd36, {%r29, %r28};
-       bra.uni         BB5_41;
+       bra.uni         BB6_41;
 
-BB5_24:
+BB6_24:
        setp.gt.s32     %p11, %r13, 10;
-       @%p11 bra       BB5_28;
+       @%p11 bra       BB6_28;
 
        setp.eq.s32     %p14, %r13, 9;
-       @%p14 bra       BB5_32;
-       bra.uni         BB5_26;
+       @%p14 bra       BB6_32;
+       bra.uni         BB6_26;
 
-BB5_32:
+BB6_32:
        setp.eq.f64     %p27, %fd1, %fd2;
        selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27;
-       bra.uni         BB5_52;
+       bra.uni         BB6_52;
 
-BB5_28:
+BB6_28:
        setp.eq.s32     %p12, %r13, 11;
-       @%p12 bra       BB5_31;
-       bra.uni         BB5_29;
+       @%p12 bra       BB6_31;
+       bra.uni         BB6_29;
 
-BB5_31:
+BB6_31:
        min.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB5_52;
+       bra.uni         BB6_52;
 
-BB5_51:
+BB6_51:
        add.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB5_52;
+       bra.uni         BB6_52;
 
-BB5_13:
+BB6_13:
        setp.eq.s32     %p25, %r13, 2;
-       @%p25 bra       BB5_14;
-       bra.uni         BB5_52;
+       @%p25 bra       BB6_14;
+       bra.uni         BB6_52;
 
-BB5_14:
+BB6_14:
        mul.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB5_52;
+       bra.uni         BB6_52;
 
-BB5_34:
+BB6_34:
        setp.le.f64     %p30, %fd1, %fd2;
        selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30;
-       bra.uni         BB5_52;
+       bra.uni         BB6_52;
 
-BB5_22:
+BB6_22:
        setp.eq.s32     %p18, %r13, 8;
-       @%p18 bra       BB5_23;
-       bra.uni         BB5_52;
+       @%p18 bra       BB6_23;
+       bra.uni         BB6_52;
 
-BB5_23:
+BB6_23:
        setp.ge.f64     %p28, %fd1, %fd2;
        selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28;
-       bra.uni         BB5_52;
+       bra.uni         BB6_52;
 
-BB5_49:
+BB6_49:
        div.rn.f64      %fd38, %fd1, %fd2;
-       bra.uni         BB5_52;
+       bra.uni         BB6_52;
 
-BB5_17:
+BB6_17:
        setp.eq.s32     %p22, %r13, 5;
-       @%p22 bra       BB5_18;
-       bra.uni         BB5_52;
+       @%p22 bra       BB6_18;
+       bra.uni         BB6_52;
 
-BB5_18:
+BB6_18:
        setp.lt.f64     %p31, %fd1, %fd2;
        selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31;
-       bra.uni         BB5_52;
+       bra.uni         BB6_52;
 
-BB5_26:
+BB6_26:
        setp.eq.s32     %p15, %r13, 10;
-       @%p15 bra       BB5_27;
-       bra.uni         BB5_52;
+       @%p15 bra       BB6_27;
+       bra.uni         BB6_52;
 
-BB5_27:
+BB6_27:
        setp.neu.f64    %p26, %fd1, %fd2;
        selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB5_52;
+       bra.uni         BB6_52;
 
-BB5_29:
+BB6_29:
        setp.ne.s32     %p13, %r13, 12;
-       @%p13 bra       BB5_52;
+       @%p13 bra       BB6_52;
 
        max.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB5_52;
+       bra.uni         BB6_52;
 
-BB5_38:
+BB6_38:
        setp.gt.s32     %p35, %r8, -1;
-       @%p35 bra       BB5_41;
+       @%p35 bra       BB6_41;
 
        cvt.rzi.f64.f64 %fd29, %fd2;
        setp.neu.f64    %p36, %fd29, %fd2;
        selp.f64        %fd36, 0dFFF8000000000000, %fd36, %p36;
 
-BB5_41:
+BB6_41:
        mov.f64         %fd17, %fd36;
        add.f64         %fd18, %fd1, %fd2;
        {
@@ -591,17 +647,17 @@ BB5_41:
        and.b32         %r31, %r30, 2146435072;
        setp.ne.s32     %p39, %r31, 2146435072;
        mov.f64         %fd35, %fd17;
-       @%p39 bra       BB5_48;
+       @%p39 bra       BB6_48;
 
        setp.gtu.f64    %p40, %fd11, 0d7FF0000000000000;
        mov.f64         %fd35, %fd18;
-       @%p40 bra       BB5_48;
+       @%p40 bra       BB6_48;
 
        abs.f64         %fd30, %fd2;
        setp.gtu.f64    %p41, %fd30, 0d7FF0000000000000;
        mov.f64         %fd34, %fd18;
        mov.f64         %fd35, %fd34;
-       @%p41 bra       BB5_48;
+       @%p41 bra       BB6_48;
 
        {
        .reg .b32 %temp; 
@@ -611,10 +667,10 @@ BB5_41:
        setp.eq.s32     %p42, %r33, 2146435072;
        setp.eq.s32     %p43, %r32, 0;
        and.pred        %p44, %p42, %p43;
-       @%p44 bra       BB5_47;
-       bra.uni         BB5_45;
+       @%p44 bra       BB6_47;
+       bra.uni         BB6_45;
 
-BB5_47:
+BB6_47:
        setp.gt.f64     %p48, %fd11, 0d3FF0000000000000;
        selp.b32        %r41, 2146435072, 0, %p48;
        xor.b32         %r42, %r41, 2146435072;
@@ -624,9 +680,9 @@ BB5_47:
        selp.b32        %r44, 1072693248, %r43, %p50;
        mov.u32         %r45, 0;
        mov.b64         %fd35, {%r45, %r44};
-       bra.uni         BB5_48;
+       bra.uni         BB6_48;
 
-BB5_45:
+BB6_45:
        {
        .reg .b32 %temp; 
        mov.b64         {%r34, %temp}, %fd1;
@@ -636,10 +692,10 @@ BB5_45:
        setp.eq.s32     %p46, %r34, 0;
        and.pred        %p47, %p45, %p46;
        mov.f64         %fd35, %fd17;
-       @!%p47 bra      BB5_48;
-       bra.uni         BB5_46;
+       @!%p47 bra      BB6_48;
+       bra.uni         BB6_46;
 
-BB5_46:
+BB6_46:
        shr.s32         %r36, %r9, 31;
        and.b32         %r37, %r36, -2146435072;
        selp.b32        %r38, -1048576, 2146435072, %p1;
@@ -647,20 +703,20 @@ BB5_46:
        mov.u32         %r40, 0;
        mov.b64         %fd35, {%r40, %r39};
 
-BB5_48:
+BB6_48:
        setp.eq.f64     %p51, %fd2, 0d0000000000000000;
        setp.eq.f64     %p52, %fd1, 0d3FF0000000000000;
        or.pred         %p53, %p52, %p51;
        selp.f64        %fd38, 0d3FF0000000000000, %fd35, %p53;
 
-BB5_52:
+BB6_52:
        cvta.to.global.u64      %rd12, %rd4;
        mul.wide.s32    %rd13, %r3, 8;
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd38;
        bar.sync        0;
 
-BB5_53:
+BB6_53:
        ret;
 }
 
@@ -691,7 +747,7 @@ BB5_53:
        mov.u32         %r11, %tid.x;
        mad.lo.s32      %r1, %r10, %r9, %r11;
        setp.ge.s32     %p3, %r1, %r8;
-       @%p3 bra        BB6_90;
+       @%p3 bra        BB7_90;
 
        cvta.to.global.u64      %rd6, %rd5;
        cvta.to.global.u64      %rd7, %rd4;
@@ -700,86 +756,86 @@ BB5_53:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd1, %rd6, %rd8;
        setp.eq.s32     %p4, %r7, 0;
-       @%p4 bra        BB6_46;
+       @%p4 bra        BB7_46;
 
        mov.f64         %fd66, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p5, %r6, 5;
-       @%p5 bra        BB6_12;
+       @%p5 bra        BB7_12;
 
        setp.gt.s32     %p15, %r6, 2;
-       @%p15 bra       BB6_8;
+       @%p15 bra       BB7_8;
 
        setp.eq.s32     %p19, %r6, 0;
-       @%p19 bra       BB6_44;
+       @%p19 bra       BB7_44;
 
        setp.eq.s32     %p20, %r6, 1;
-       @%p20 bra       BB6_43;
-       bra.uni         BB6_6;
+       @%p20 bra       BB7_43;
+       bra.uni         BB7_6;
 
-BB6_43:
+BB7_43:
        sub.f64         %fd66, %fd52, %fd1;
-       bra.uni         BB6_45;
+       bra.uni         BB7_45;
 
-BB6_46:
+BB7_46:
        mov.f64         %fd74, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p50, %r6, 5;
-       @%p50 bra       BB6_56;
+       @%p50 bra       BB7_56;
 
        setp.gt.s32     %p60, %r6, 2;
-       @%p60 bra       BB6_52;
+       @%p60 bra       BB7_52;
 
        setp.eq.s32     %p64, %r6, 0;
-       @%p64 bra       BB6_88;
+       @%p64 bra       BB7_88;
 
        setp.eq.s32     %p65, %r6, 1;
-       @%p65 bra       BB6_87;
-       bra.uni         BB6_50;
+       @%p65 bra       BB7_87;
+       bra.uni         BB7_50;
 
-BB6_87:
+BB7_87:
        sub.f64         %fd74, %fd1, %fd52;
-       bra.uni         BB6_89;
+       bra.uni         BB7_89;
 
-BB6_12:
+BB7_12:
        setp.gt.s32     %p6, %r6, 8;
-       @%p6 bra        BB6_17;
+       @%p6 bra        BB7_17;
 
        setp.eq.s32     %p12, %r6, 6;
-       @%p12 bra       BB6_27;
+       @%p12 bra       BB7_27;
 
        setp.eq.s32     %p13, %r6, 7;
-       @%p13 bra       BB6_26;
-       bra.uni         BB6_15;
+       @%p13 bra       BB7_26;
+       bra.uni         BB7_15;
 
-BB6_26:
+BB7_26:
        setp.lt.f64     %p25, %fd1, %fd52;
        selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25;
-       bra.uni         BB6_45;
+       bra.uni         BB7_45;
 
-BB6_56:
+BB7_56:
        setp.gt.s32     %p51, %r6, 8;
-       @%p51 bra       BB6_61;
+       @%p51 bra       BB7_61;
 
        setp.eq.s32     %p57, %r6, 6;
-       @%p57 bra       BB6_71;
+       @%p57 bra       BB7_71;
 
        setp.eq.s32     %p58, %r6, 7;
-       @%p58 bra       BB6_70;
-       bra.uni         BB6_59;
+       @%p58 bra       BB7_70;
+       bra.uni         BB7_59;
 
-BB6_70:
+BB7_70:
        setp.gt.f64     %p70, %fd1, %fd52;
        selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p70;
-       bra.uni         BB6_89;
+       bra.uni         BB7_89;
 
-BB6_8:
+BB7_8:
        setp.eq.s32     %p16, %r6, 3;
-       @%p16 bra       BB6_42;
+       @%p16 bra       BB7_42;
 
        setp.eq.s32     %p17, %r6, 4;
-       @%p17 bra       BB6_28;
-       bra.uni         BB6_10;
+       @%p17 bra       BB7_28;
+       bra.uni         BB7_10;
 
-BB6_28:
+BB7_28:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r2}, %fd52;
@@ -815,10 +871,10 @@ BB6_28:
        }// Callseq End 1
        setp.lt.s32     %p29, %r2, 0;
        and.pred        %p1, %p29, %p28;
-       @!%p1 bra       BB6_30;
-       bra.uni         BB6_29;
+       @!%p1 bra       BB7_30;
+       bra.uni         BB7_29;
 
-BB6_29:
+BB7_29:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r14}, %fd65;
@@ -830,43 +886,43 @@ BB6_29:
        }
        mov.b64         %fd65, {%r16, %r15};
 
-BB6_30:
+BB7_30:
        mov.f64         %fd64, %fd65;
        setp.eq.f64     %p30, %fd52, 0d0000000000000000;
-       @%p30 bra       BB6_33;
-       bra.uni         BB6_31;
+       @%p30 bra       BB7_33;
+       bra.uni         BB7_31;
 
-BB6_33:
+BB7_33:
        selp.b32        %r17, %r2, 0, %p28;
        or.b32          %r18, %r17, 2146435072;
        setp.lt.s32     %p34, %r3, 0;
        selp.b32        %r19, %r18, %r17, %p34;
        mov.u32         %r20, 0;
        mov.b64         %fd64, {%r20, %r19};
-       bra.uni         BB6_34;
+       bra.uni         BB7_34;
 
-BB6_17:
+BB7_17:
        setp.gt.s32     %p7, %r6, 10;
-       @%p7 bra        BB6_21;
+       @%p7 bra        BB7_21;
 
        setp.eq.s32     %p10, %r6, 9;
-       @%p10 bra       BB6_25;
-       bra.uni         BB6_19;
+       @%p10 bra       BB7_25;
+       bra.uni         BB7_19;
 
-BB6_25:
+BB7_25:
        setp.eq.f64     %p23, %fd1, %fd52;
        selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23;
-       bra.uni         BB6_45;
+       bra.uni         BB7_45;
 
-BB6_52:
+BB7_52:
        setp.eq.s32     %p61, %r6, 3;
-       @%p61 bra       BB6_86;
+       @%p61 bra       BB7_86;
 
        setp.eq.s32     %p62, %r6, 4;
-       @%p62 bra       BB6_72;
-       bra.uni         BB6_54;
+       @%p62 bra       BB7_72;
+       bra.uni         BB7_54;
 
-BB6_72:
+BB7_72:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r4}, %fd1;
@@ -902,10 +958,10 @@ BB6_72:
        }// Callseq End 2
        setp.lt.s32     %p74, %r4, 0;
        and.pred        %p2, %p74, %p73;
-       @!%p2 bra       BB6_74;
-       bra.uni         BB6_73;
+       @!%p2 bra       BB7_74;
+       bra.uni         BB7_73;
 
-BB6_73:
+BB7_73:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r39}, %fd73;
@@ -917,179 +973,179 @@ BB6_73:
        }
        mov.b64         %fd73, {%r41, %r40};
 
-BB6_74:
+BB7_74:
        mov.f64         %fd72, %fd73;
        setp.eq.f64     %p75, %fd1, 0d0000000000000000;
-       @%p75 bra       BB6_77;
-       bra.uni         BB6_75;
+       @%p75 bra       BB7_77;
+       bra.uni         BB7_75;
 
-BB6_77:
+BB7_77:
        selp.b32        %r42, %r4, 0, %p73;
        or.b32          %r43, %r42, 2146435072;
        setp.lt.s32     %p79, %r5, 0;
        selp.b32        %r44, %r43, %r42, %p79;
        mov.u32         %r45, 0;
        mov.b64         %fd72, {%r45, %r44};
-       bra.uni         BB6_78;
+       bra.uni         BB7_78;
 
-BB6_61:
+BB7_61:
        setp.gt.s32     %p52, %r6, 10;
-       @%p52 bra       BB6_65;
+       @%p52 bra       BB7_65;
 
        setp.eq.s32     %p55, %r6, 9;
-       @%p55 bra       BB6_69;
-       bra.uni         BB6_63;
+       @%p55 bra       BB7_69;
+       bra.uni         BB7_63;
 
-BB6_69:
+BB7_69:
        setp.eq.f64     %p68, %fd1, %fd52;
        selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p68;
-       bra.uni         BB6_89;
+       bra.uni         BB7_89;
 
-BB6_21:
+BB7_21:
        setp.eq.s32     %p8, %r6, 11;
-       @%p8 bra        BB6_24;
-       bra.uni         BB6_22;
+       @%p8 bra        BB7_24;
+       bra.uni         BB7_22;
 
-BB6_24:
+BB7_24:
        min.f64         %fd66, %fd52, %fd1;
-       bra.uni         BB6_45;
+       bra.uni         BB7_45;
 
-BB6_44:
+BB7_44:
        add.f64         %fd66, %fd1, %fd52;
-       bra.uni         BB6_45;
+       bra.uni         BB7_45;
 
-BB6_6:
+BB7_6:
        setp.eq.s32     %p21, %r6, 2;
-       @%p21 bra       BB6_7;
-       bra.uni         BB6_45;
+       @%p21 bra       BB7_7;
+       bra.uni         BB7_45;
 
-BB6_7:
+BB7_7:
        mul.f64         %fd66, %fd1, %fd52;
-       bra.uni         BB6_45;
+       bra.uni         BB7_45;
 
-BB6_27:
+BB7_27:
        setp.ge.f64     %p26, %fd1, %fd52;
        selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB6_45;
+       bra.uni         BB7_45;
 
-BB6_15:
+BB7_15:
        setp.eq.s32     %p14, %r6, 8;
-       @%p14 bra       BB6_16;
-       bra.uni         BB6_45;
+       @%p14 bra       BB7_16;
+       bra.uni         BB7_45;
 
-BB6_16:
+BB7_16:
        setp.le.f64     %p24, %fd1, %fd52;
        selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24;
-       bra.uni         BB6_45;
+       bra.uni         BB7_45;
 
-BB6_42:
+BB7_42:
        div.rn.f64      %fd66, %fd52, %fd1;
-       bra.uni         BB6_45;
+       bra.uni         BB7_45;
 
-BB6_10:
+BB7_10:
        setp.eq.s32     %p18, %r6, 5;
-       @%p18 bra       BB6_11;
-       bra.uni         BB6_45;
+       @%p18 bra       BB7_11;
+       bra.uni         BB7_45;
 
-BB6_11:
+BB7_11:
        setp.gt.f64     %p27, %fd1, %fd52;
        selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p27;
-       bra.uni         BB6_45;
+       bra.uni         BB7_45;
 
-BB6_65:
+BB7_65:
        setp.eq.s32     %p53, %r6, 11;
-       @%p53 bra       BB6_68;
-       bra.uni         BB6_66;
+       @%p53 bra       BB7_68;
+       bra.uni         BB7_66;
 
-BB6_68:
+BB7_68:
        min.f64         %fd74, %fd1, %fd52;
-       bra.uni         BB6_89;
+       bra.uni         BB7_89;
 
-BB6_19:
+BB7_19:
        setp.eq.s32     %p11, %r6, 10;
-       @%p11 bra       BB6_20;
-       bra.uni         BB6_45;
+       @%p11 bra       BB7_20;
+       bra.uni         BB7_45;
 
-BB6_20:
+BB7_20:
        setp.neu.f64    %p22, %fd1, %fd52;
        selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22;
-       bra.uni         BB6_45;
+       bra.uni         BB7_45;
 
-BB6_22:
+BB7_22:
        setp.ne.s32     %p9, %r6, 12;
-       @%p9 bra        BB6_45;
+       @%p9 bra        BB7_45;
 
        max.f64         %fd66, %fd52, %fd1;
-       bra.uni         BB6_45;
+       bra.uni         BB7_45;
 
-BB6_88:
+BB7_88:
        add.f64         %fd74, %fd1, %fd52;
-       bra.uni         BB6_89;
+       bra.uni         BB7_89;
 
-BB6_50:
+BB7_50:
        setp.eq.s32     %p66, %r6, 2;
-       @%p66 bra       BB6_51;
-       bra.uni         BB6_89;
+       @%p66 bra       BB7_51;
+       bra.uni         BB7_89;
 
-BB6_51:
+BB7_51:
        mul.f64         %fd74, %fd1, %fd52;
-       bra.uni         BB6_89;
+       bra.uni         BB7_89;
 
-BB6_71:
+BB7_71:
        setp.le.f64     %p71, %fd1, %fd52;
        selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p71;
-       bra.uni         BB6_89;
+       bra.uni         BB7_89;
 
-BB6_59:
+BB7_59:
        setp.eq.s32     %p59, %r6, 8;
-       @%p59 bra       BB6_60;
-       bra.uni         BB6_89;
+       @%p59 bra       BB7_60;
+       bra.uni         BB7_89;
 
-BB6_60:
+BB7_60:
        setp.ge.f64     %p69, %fd1, %fd52;
        selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p69;
-       bra.uni         BB6_89;
+       bra.uni         BB7_89;
 
-BB6_86:
+BB7_86:
        div.rn.f64      %fd74, %fd1, %fd52;
-       bra.uni         BB6_89;
+       bra.uni         BB7_89;
 
-BB6_54:
+BB7_54:
        setp.eq.s32     %p63, %r6, 5;
-       @%p63 bra       BB6_55;
-       bra.uni         BB6_89;
+       @%p63 bra       BB7_55;
+       bra.uni         BB7_89;
 
-BB6_55:
+BB7_55:
        setp.lt.f64     %p72, %fd1, %fd52;
        selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p72;
-       bra.uni         BB6_89;
+       bra.uni         BB7_89;
 
-BB6_63:
+BB7_63:
        setp.eq.s32     %p56, %r6, 10;
-       @%p56 bra       BB6_64;
-       bra.uni         BB6_89;
+       @%p56 bra       BB7_64;
+       bra.uni         BB7_89;
 
-BB6_64:
+BB7_64:
        setp.neu.f64    %p67, %fd1, %fd52;
        selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p67;
-       bra.uni         BB6_89;
+       bra.uni         BB7_89;
 
-BB6_66:
+BB7_66:
        setp.ne.s32     %p54, %r6, 12;
-       @%p54 bra       BB6_89;
+       @%p54 bra       BB7_89;
 
        max.f64         %fd74, %fd1, %fd52;
-       bra.uni         BB6_89;
+       bra.uni         BB7_89;
 
-BB6_31:
+BB7_31:
        setp.gt.s32     %p31, %r2, -1;
-       @%p31 bra       BB6_34;
+       @%p31 bra       BB7_34;
 
        cvt.rzi.f64.f64 %fd54, %fd1;
        setp.neu.f64    %p32, %fd54, %fd1;
        selp.f64        %fd64, 0dFFF8000000000000, %fd64, %p32;
 
-BB6_34:
+BB7_34:
        mov.f64         %fd16, %fd64;
        add.f64         %fd17, %fd1, %fd52;
        {
@@ -1099,17 +1155,17 @@ BB6_34:
        and.b32         %r22, %r21, 2146435072;
        setp.ne.s32     %p35, %r22, 2146435072;
        mov.f64         %fd63, %fd16;
-       @%p35 bra       BB6_41;
+       @%p35 bra       BB7_41;
 
        setp.gtu.f64    %p36, %fd10, 0d7FF0000000000000;
        mov.f64         %fd63, %fd17;
-       @%p36 bra       BB6_41;
+       @%p36 bra       BB7_41;
 
        abs.f64         %fd55, %fd1;
        setp.gtu.f64    %p37, %fd55, 0d7FF0000000000000;
        mov.f64         %fd62, %fd17;
        mov.f64         %fd63, %fd62;
-       @%p37 bra       BB6_41;
+       @%p37 bra       BB7_41;
 
        {
        .reg .b32 %temp; 
@@ -1119,10 +1175,10 @@ BB6_34:
        setp.eq.s32     %p38, %r24, 2146435072;
        setp.eq.s32     %p39, %r23, 0;
        and.pred        %p40, %p38, %p39;
-       @%p40 bra       BB6_40;
-       bra.uni         BB6_38;
+       @%p40 bra       BB7_40;
+       bra.uni         BB7_38;
 
-BB6_40:
+BB7_40:
        setp.gt.f64     %p44, %fd10, 0d3FF0000000000000;
        selp.b32        %r32, 2146435072, 0, %p44;
        xor.b32         %r33, %r32, 2146435072;
@@ -1132,17 +1188,17 @@ BB6_40:
        selp.b32        %r35, 1072693248, %r34, %p46;
        mov.u32         %r36, 0;
        mov.b64         %fd63, {%r36, %r35};
-       bra.uni         BB6_41;
+       bra.uni         BB7_41;
 
-BB6_75:
+BB7_75:
        setp.gt.s32     %p76, %r4, -1;
-       @%p76 bra       BB6_78;
+       @%p76 bra       BB7_78;
 
        cvt.rzi.f64.f64 %fd57, %fd52;
        setp.neu.f64    %p77, %fd57, %fd52;
        selp.f64        %fd72, 0dFFF8000000000000, %fd72, %p77;
 
-BB6_78:
+BB7_78:
        mov.f64         %fd41, %fd72;
        add.f64         %fd42, %fd1, %fd52;
        {
@@ -1152,17 +1208,17 @@ BB6_78:
        and.b32         %r47, %r46, 2146435072;
        setp.ne.s32     %p80, %r47, 2146435072;
        mov.f64         %fd71, %fd41;
-       @%p80 bra       BB6_85;
+       @%p80 bra       BB7_85;
 
        setp.gtu.f64    %p81, %fd35, 0d7FF0000000000000;
        mov.f64         %fd71, %fd42;
-       @%p81 bra       BB6_85;
+       @%p81 bra       BB7_85;
 
        abs.f64         %fd58, %fd52;
        setp.gtu.f64    %p82, %fd58, 0d7FF0000000000000;
        mov.f64         %fd70, %fd42;
        mov.f64         %fd71, %fd70;
-       @%p82 bra       BB6_85;
+       @%p82 bra       BB7_85;
 
        {
        .reg .b32 %temp; 
@@ -1172,10 +1228,10 @@ BB6_78:
        setp.eq.s32     %p83, %r49, 2146435072;
        setp.eq.s32     %p84, %r48, 0;
        and.pred        %p85, %p83, %p84;
-       @%p85 bra       BB6_84;
-       bra.uni         BB6_82;
+       @%p85 bra       BB7_84;
+       bra.uni         BB7_82;
 
-BB6_84:
+BB7_84:
        setp.gt.f64     %p89, %fd35, 0d3FF0000000000000;
        selp.b32        %r57, 2146435072, 0, %p89;
        xor.b32         %r58, %r57, 2146435072;
@@ -1185,9 +1241,9 @@ BB6_84:
        selp.b32        %r60, 1072693248, %r59, %p91;
        mov.u32         %r61, 0;
        mov.b64         %fd71, {%r61, %r60};
-       bra.uni         BB6_85;
+       bra.uni         BB7_85;
 
-BB6_38:
+BB7_38:
        {
        .reg .b32 %temp; 
        mov.b64         {%r25, %temp}, %fd52;
@@ -1197,10 +1253,10 @@ BB6_38:
        setp.eq.s32     %p42, %r25, 0;
        and.pred        %p43, %p41, %p42;
        mov.f64         %fd63, %fd16;
-       @!%p43 bra      BB6_41;
-       bra.uni         BB6_39;
+       @!%p43 bra      BB7_41;
+       bra.uni         BB7_39;
 
-BB6_39:
+BB7_39:
        shr.s32         %r27, %r3, 31;
        and.b32         %r28, %r27, -2146435072;
        selp.b32        %r29, -1048576, 2146435072, %p1;
@@ -1208,17 +1264,17 @@ BB6_39:
        mov.u32         %r31, 0;
        mov.b64         %fd63, {%r31, %r30};
 
-BB6_41:
+BB7_41:
        setp.eq.f64     %p47, %fd1, 0d0000000000000000;
        setp.eq.f64     %p48, %fd52, 0d3FF0000000000000;
        or.pred         %p49, %p48, %p47;
        selp.f64        %fd66, 0d3FF0000000000000, %fd63, %p49;
 
-BB6_45:
+BB7_45:
        st.global.f64   [%rd1], %fd66;
-       bra.uni         BB6_90;
+       bra.uni         BB7_90;
 
-BB6_82:
+BB7_82:
        {
        .reg .b32 %temp; 
        mov.b64         {%r50, %temp}, %fd1;
@@ -1228,10 +1284,10 @@ BB6_82:
        setp.eq.s32     %p87, %r50, 0;
        and.pred        %p88, %p86, %p87;
        mov.f64         %fd71, %fd41;
-       @!%p88 bra      BB6_85;
-       bra.uni         BB6_83;
+       @!%p88 bra      BB7_85;
+       bra.uni         BB7_83;
 
-BB6_83:
+BB7_83:
        shr.s32         %r52, %r5, 31;
        and.b32         %r53, %r52, -2146435072;
        selp.b32        %r54, -1048576, 2146435072, %p2;
@@ -1239,16 +1295,16 @@ BB6_83:
        mov.u32         %r56, 0;
        mov.b64         %fd71, {%r56, %r55};
 
-BB6_85:
+BB7_85:
        setp.eq.f64     %p92, %fd52, 0d0000000000000000;
        setp.eq.f64     %p93, %fd1, 0d3FF0000000000000;
        or.pred         %p94, %p93, %p92;
        selp.f64        %fd74, 0d3FF0000000000000, %fd71, %p94;
 
-BB6_89:
+BB7_89:
        st.global.f64   [%rd1], %fd74;
 
-BB6_90:
+BB7_90:
        bar.sync        0;
        ret;
 }
@@ -1274,14 +1330,14 @@ BB6_90:
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r4, %r3, %r5;
        setp.ge.s32     %p1, %r1, %r2;
-       @%p1 bra        BB7_2;
+       @%p1 bra        BB8_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        st.global.f64   [%rd4], %fd1;
 
-BB7_2:
+BB8_2:
        ret;
 }
 
@@ -1309,9 +1365,9 @@ BB7_2:
        mov.f64         %fd76, 0d0000000000000000;
        mov.f64         %fd77, %fd76;
        setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB8_4;
+       @%p1 bra        BB9_4;
 
-BB8_1:
+BB9_1:
        mov.f64         %fd1, %fd77;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r32, 8;
@@ -1320,23 +1376,23 @@ BB8_1:
        add.f64         %fd78, %fd1, %fd30;
        add.s32         %r3, %r32, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB8_3;
+       @%p2 bra        BB9_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd31, [%rd9];
        add.f64         %fd78, %fd78, %fd31;
 
-BB8_3:
+BB9_3:
        mov.f64         %fd77, %fd78;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r32, %r12, %r13, %r32;
        setp.lt.u32     %p3, %r32, %r5;
        mov.f64         %fd76, %fd77;
-       @%p3 bra        BB8_1;
+       @%p3 bra        BB9_1;
 
-BB8_4:
+BB9_4:
        mov.f64         %fd74, %fd76;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -1344,130 +1400,130 @@ BB8_4:
        st.shared.f64   [%rd1], %fd74;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB8_8;
+       @%p4 bra        BB9_8;
 
        setp.gt.u32     %p5, %r6, 511;
        mov.f64         %fd75, %fd74;
-       @%p5 bra        BB8_7;
+       @%p5 bra        BB9_7;
 
        ld.shared.f64   %fd32, [%rd1+4096];
        add.f64         %fd75, %fd74, %fd32;
        st.shared.f64   [%rd1], %fd75;
 
-BB8_7:
+BB9_7:
        mov.f64         %fd74, %fd75;
        bar.sync        0;
 
-BB8_8:
+BB9_8:
        mov.f64         %fd72, %fd74;
        setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB8_12;
+       @%p6 bra        BB9_12;
 
        setp.gt.u32     %p7, %r6, 255;
        mov.f64         %fd73, %fd72;
-       @%p7 bra        BB8_11;
+       @%p7 bra        BB9_11;
 
        ld.shared.f64   %fd33, [%rd1+2048];
        add.f64         %fd73, %fd72, %fd33;
        st.shared.f64   [%rd1], %fd73;
 
-BB8_11:
+BB9_11:
        mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB8_12:
+BB9_12:
        mov.f64         %fd70, %fd72;
        setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB8_16;
+       @%p8 bra        BB9_16;
 
        setp.gt.u32     %p9, %r6, 127;
        mov.f64         %fd71, %fd70;
-       @%p9 bra        BB8_15;
+       @%p9 bra        BB9_15;
 
        ld.shared.f64   %fd34, [%rd1+1024];
        add.f64         %fd71, %fd70, %fd34;
        st.shared.f64   [%rd1], %fd71;
 
-BB8_15:
+BB9_15:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB8_16:
+BB9_16:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB8_20;
+       @%p10 bra       BB9_20;
 
        setp.gt.u32     %p11, %r6, 63;
        mov.f64         %fd69, %fd68;
-       @%p11 bra       BB8_19;
+       @%p11 bra       BB9_19;
 
        ld.shared.f64   %fd35, [%rd1+512];
        add.f64         %fd69, %fd68, %fd35;
        st.shared.f64   [%rd1], %fd69;
 
-BB8_19:
+BB9_19:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB8_20:
+BB9_20:
        mov.f64         %fd67, %fd68;
        setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB8_33;
+       @%p12 bra       BB9_33;
 
        setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB8_23;
+       @%p13 bra       BB9_23;
 
        ld.volatile.shared.f64  %fd36, [%rd1+256];
        add.f64         %fd67, %fd67, %fd36;
        st.volatile.shared.f64  [%rd1], %fd67;
 
-BB8_23:
+BB9_23:
        mov.f64         %fd66, %fd67;
        setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB8_25;
+       @%p14 bra       BB9_25;
 
        ld.volatile.shared.f64  %fd37, [%rd1+128];
        add.f64         %fd66, %fd66, %fd37;
        st.volatile.shared.f64  [%rd1], %fd66;
 
-BB8_25:
+BB9_25:
        mov.f64         %fd65, %fd66;
        setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB8_27;
+       @%p15 bra       BB9_27;
 
        ld.volatile.shared.f64  %fd38, [%rd1+64];
        add.f64         %fd65, %fd65, %fd38;
        st.volatile.shared.f64  [%rd1], %fd65;
 
-BB8_27:
+BB9_27:
        mov.f64         %fd64, %fd65;
        setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB8_29;
+       @%p16 bra       BB9_29;
 
        ld.volatile.shared.f64  %fd39, [%rd1+32];
        add.f64         %fd64, %fd64, %fd39;
        st.volatile.shared.f64  [%rd1], %fd64;
 
-BB8_29:
+BB9_29:
        mov.f64         %fd63, %fd64;
        setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB8_31;
+       @%p17 bra       BB9_31;
 
        ld.volatile.shared.f64  %fd40, [%rd1+16];
        add.f64         %fd63, %fd63, %fd40;
        st.volatile.shared.f64  [%rd1], %fd63;
 
-BB8_31:
+BB9_31:
        setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB8_33;
+       @%p18 bra       BB9_33;
 
        ld.volatile.shared.f64  %fd41, [%rd1+8];
        add.f64         %fd42, %fd63, %fd41;
        st.volatile.shared.f64  [%rd1], %fd42;
 
-BB8_33:
+BB9_33:
        setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB8_35;
+       @%p19 bra       BB9_35;
 
        ld.shared.f64   %fd43, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -1475,7 +1531,7 @@ BB8_33:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd43;
 
-BB8_35:
+BB9_35:
        ret;
 }
 
@@ -1499,17 +1555,17 @@ BB8_35:
        ld.param.u32    %r4, [reduce_row_sum_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB9_35;
+       @%p1 bra        BB10_35;
 
        mov.u32         %r38, %tid.x;
        mov.f64         %fd72, 0d0000000000000000;
        mov.f64         %fd73, %fd72;
        setp.ge.u32     %p2, %r38, %r4;
-       @%p2 bra        BB9_4;
+       @%p2 bra        BB10_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB9_3:
+BB10_3:
        mad.lo.s32      %r8, %r6, %r4, %r38;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -1519,9 +1575,9 @@ BB9_3:
        add.s32         %r38, %r9, %r38;
        setp.lt.u32     %p3, %r38, %r4;
        mov.f64         %fd72, %fd73;
-       @%p3 bra        BB9_3;
+       @%p3 bra        BB10_3;
 
-BB9_4:
+BB10_4:
        mov.f64         %fd70, %fd72;
        mov.u32         %r10, %tid.x;
        mul.wide.u32    %rd6, %r10, 8;
@@ -1531,130 +1587,130 @@ BB9_4:
        bar.sync        0;
        mov.u32         %r11, %ntid.x;
        setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB9_8;
+       @%p4 bra        BB10_8;
 
        setp.gt.u32     %p5, %r10, 511;
        mov.f64         %fd71, %fd70;
-       @%p5 bra        BB9_7;
+       @%p5 bra        BB10_7;
 
        ld.shared.f64   %fd29, [%rd8+4096];
        add.f64         %fd71, %fd70, %fd29;
        st.shared.f64   [%rd8], %fd71;
 
-BB9_7:
+BB10_7:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB9_8:
+BB10_8:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB9_12;
+       @%p6 bra        BB10_12;
 
        setp.gt.u32     %p7, %r10, 255;
        mov.f64         %fd69, %fd68;
-       @%p7 bra        BB9_11;
+       @%p7 bra        BB10_11;
 
        ld.shared.f64   %fd30, [%rd8+2048];
        add.f64         %fd69, %fd68, %fd30;
        st.shared.f64   [%rd8], %fd69;
 
-BB9_11:
+BB10_11:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB9_12:
+BB10_12:
        mov.f64         %fd66, %fd68;
        setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB9_16;
+       @%p8 bra        BB10_16;
 
        setp.gt.u32     %p9, %r10, 127;
        mov.f64         %fd67, %fd66;
-       @%p9 bra        BB9_15;
+       @%p9 bra        BB10_15;
 
        ld.shared.f64   %fd31, [%rd8+1024];
        add.f64         %fd67, %fd66, %fd31;
        st.shared.f64   [%rd8], %fd67;
 
-BB9_15:
+BB10_15:
        mov.f64         %fd66, %fd67;
        bar.sync        0;
 
-BB9_16:
+BB10_16:
        mov.f64         %fd64, %fd66;
        setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB9_20;
+       @%p10 bra       BB10_20;
 
        setp.gt.u32     %p11, %r10, 63;
        mov.f64         %fd65, %fd64;
-       @%p11 bra       BB9_19;
+       @%p11 bra       BB10_19;
 
        ld.shared.f64   %fd32, [%rd8+512];
        add.f64         %fd65, %fd64, %fd32;
        st.shared.f64   [%rd8], %fd65;
 
-BB9_19:
+BB10_19:
        mov.f64         %fd64, %fd65;
        bar.sync        0;
 
-BB9_20:
+BB10_20:
        mov.f64         %fd63, %fd64;
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB9_33;
+       @%p12 bra       BB10_33;
 
        setp.lt.u32     %p13, %r11, 64;
-       @%p13 bra       BB9_23;
+       @%p13 bra       BB10_23;
 
        ld.volatile.shared.f64  %fd33, [%rd8+256];
        add.f64         %fd63, %fd63, %fd33;
        st.volatile.shared.f64  [%rd8], %fd63;
 
-BB9_23:
+BB10_23:
        mov.f64         %fd62, %fd63;
        setp.lt.u32     %p14, %r11, 32;
-       @%p14 bra       BB9_25;
+       @%p14 bra       BB10_25;
 
        ld.volatile.shared.f64  %fd34, [%rd8+128];
        add.f64         %fd62, %fd62, %fd34;
        st.volatile.shared.f64  [%rd8], %fd62;
 
-BB9_25:
+BB10_25:
        mov.f64         %fd61, %fd62;
        setp.lt.u32     %p15, %r11, 16;
-       @%p15 bra       BB9_27;
+       @%p15 bra       BB10_27;
 
        ld.volatile.shared.f64  %fd35, [%rd8+64];
        add.f64         %fd61, %fd61, %fd35;
        st.volatile.shared.f64  [%rd8], %fd61;
 
-BB9_27:
+BB10_27:
        mov.f64         %fd60, %fd61;
        setp.lt.u32     %p16, %r11, 8;
-       @%p16 bra       BB9_29;
+       @%p16 bra       BB10_29;
 
        ld.volatile.shared.f64  %fd36, [%rd8+32];
        add.f64         %fd60, %fd60, %fd36;
        st.volatile.shared.f64  [%rd8], %fd60;
 
-BB9_29:
+BB10_29:
        mov.f64         %fd59, %fd60;
        setp.lt.u32     %p17, %r11, 4;
-       @%p17 bra       BB9_31;
+       @%p17 bra       BB10_31;
 
        ld.volatile.shared.f64  %fd37, [%rd8+16];
        add.f64         %fd59, %fd59, %fd37;
        st.volatile.shared.f64  [%rd8], %fd59;
 
-BB9_31:
+BB10_31:
        setp.lt.u32     %p18, %r11, 2;
-       @%p18 bra       BB9_33;
+       @%p18 bra       BB10_33;
 
        ld.volatile.shared.f64  %fd38, [%rd8+8];
        add.f64         %fd39, %fd59, %fd38;
        st.volatile.shared.f64  [%rd8], %fd39;
 
-BB9_33:
+BB10_33:
        setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB9_35;
+       @%p19 bra       BB10_35;
 
        ld.shared.f64   %fd40, [sdata];
        cvta.to.global.u64      %rd39, %rd2;
@@ -1662,7 +1718,7 @@ BB9_33:
        add.s64         %rd41, %rd39, %rd40;
        st.global.f64   [%rd41], %fd40;
 
-BB9_35:
+BB10_35:
        ret;
 }
 
@@ -1689,18 +1745,18 @@ BB9_35:
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB10_5;
+       @%p1 bra        BB11_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;
        mov.f64         %fd8, 0d0000000000000000;
        mov.f64         %fd9, %fd8;
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB10_4;
+       @%p2 bra        BB11_4;
 
        mov.u32         %r10, %r1;
 
-BB10_3:
+BB11_3:
        mov.u32         %r3, %r10;
        mul.wide.u32    %rd4, %r3, 8;
        add.s64         %rd5, %rd1, %rd4;
@@ -1710,15 +1766,15 @@ BB10_3:
        setp.lt.u32     %p3, %r4, %r2;
        mov.u32         %r10, %r4;
        mov.f64         %fd8, %fd9;
-       @%p3 bra        BB10_3;
+       @%p3 bra        BB11_3;
 
-BB10_4:
+BB11_4:
        cvta.to.global.u64      %rd6, %rd3;
        mul.wide.u32    %rd7, %r1, 8;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd8;
 
-BB10_5:
+BB11_5:
        ret;
 }
 
@@ -1746,9 +1802,9 @@ BB10_5:
        mov.f64         %fd76, 0dFFEFFFFFFFFFFFFF;
        mov.f64         %fd77, %fd76;
        setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB11_4;
+       @%p1 bra        BB12_4;
 
-BB11_1:
+BB12_1:
        mov.f64         %fd1, %fd77;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r32, 8;
@@ -1757,23 +1813,23 @@ BB11_1:
        max.f64         %fd78, %fd1, %fd30;
        add.s32         %r3, %r32, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB11_3;
+       @%p2 bra        BB12_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd31, [%rd9];
        max.f64         %fd78, %fd78, %fd31;
 
-BB11_3:
+BB12_3:
        mov.f64         %fd77, %fd78;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r32, %r12, %r13, %r32;
        setp.lt.u32     %p3, %r32, %r5;
        mov.f64         %fd76, %fd77;
-       @%p3 bra        BB11_1;
+       @%p3 bra        BB12_1;
 
-BB11_4:
+BB12_4:
        mov.f64         %fd74, %fd76;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -1781,130 +1837,130 @@ BB11_4:
        st.shared.f64   [%rd1], %fd74;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB11_8;
+       @%p4 bra        BB12_8;
 
        setp.gt.u32     %p5, %r6, 511;
        mov.f64         %fd75, %fd74;
-       @%p5 bra        BB11_7;
+       @%p5 bra        BB12_7;
 
        ld.shared.f64   %fd32, [%rd1+4096];
        max.f64         %fd75, %fd74, %fd32;
        st.shared.f64   [%rd1], %fd75;
 
-BB11_7:
+BB12_7:
        mov.f64         %fd74, %fd75;
        bar.sync        0;
 
-BB11_8:
+BB12_8:
        mov.f64         %fd72, %fd74;
        setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB11_12;
+       @%p6 bra        BB12_12;
 
        setp.gt.u32     %p7, %r6, 255;
        mov.f64         %fd73, %fd72;
-       @%p7 bra        BB11_11;
+       @%p7 bra        BB12_11;
 
        ld.shared.f64   %fd33, [%rd1+2048];
        max.f64         %fd73, %fd72, %fd33;
        st.shared.f64   [%rd1], %fd73;
 
-BB11_11:
+BB12_11:
        mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB11_12:
+BB12_12:
        mov.f64         %fd70, %fd72;
        setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB11_16;
+       @%p8 bra        BB12_16;
 
        setp.gt.u32     %p9, %r6, 127;
        mov.f64         %fd71, %fd70;
-       @%p9 bra        BB11_15;
+       @%p9 bra        BB12_15;
 
        ld.shared.f64   %fd34, [%rd1+1024];
        max.f64         %fd71, %fd70, %fd34;
        st.shared.f64   [%rd1], %fd71;
 
-BB11_15:
+BB12_15:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB11_16:
+BB12_16:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB11_20;
+       @%p10 bra       BB12_20;
 
        setp.gt.u32     %p11, %r6, 63;
        mov.f64         %fd69, %fd68;
-       @%p11 bra       BB11_19;
+       @%p11 bra       BB12_19;
 
        ld.shared.f64   %fd35, [%rd1+512];
        max.f64         %fd69, %fd68, %fd35;
        st.shared.f64   [%rd1], %fd69;
 
-BB11_19:
+BB12_19:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB11_20:
+BB12_20:
        mov.f64         %fd67, %fd68;
        setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB11_33;
+       @%p12 bra       BB12_33;
 
        setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB11_23;
+       @%p13 bra       BB12_23;
 
        ld.volatile.shared.f64  %fd36, [%rd1+256];
        max.f64         %fd67, %fd67, %fd36;
        st.volatile.shared.f64  [%rd1], %fd67;
 
-BB11_23:
+BB12_23:
        mov.f64         %fd66, %fd67;
        setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB11_25;
+       @%p14 bra       BB12_25;
 
        ld.volatile.shared.f64  %fd37, [%rd1+128];
        max.f64         %fd66, %fd66, %fd37;
        st.volatile.shared.f64  [%rd1], %fd66;
 
-BB11_25:
+BB12_25:
        mov.f64         %fd65, %fd66;
        setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB11_27;
+       @%p15 bra       BB12_27;
 
        ld.volatile.shared.f64  %fd38, [%rd1+64];
        max.f64         %fd65, %fd65, %fd38;
        st.volatile.shared.f64  [%rd1], %fd65;
 
-BB11_27:
+BB12_27:
        mov.f64         %fd64, %fd65;
        setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB11_29;
+       @%p16 bra       BB12_29;
 
        ld.volatile.shared.f64  %fd39, [%rd1+32];
        max.f64         %fd64, %fd64, %fd39;
        st.volatile.shared.f64  [%rd1], %fd64;
 
-BB11_29:
+BB12_29:
        mov.f64         %fd63, %fd64;
        setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB11_31;
+       @%p17 bra       BB12_31;
 
        ld.volatile.shared.f64  %fd40, [%rd1+16];
        max.f64         %fd63, %fd63, %fd40;
        st.volatile.shared.f64  [%rd1], %fd63;
 
-BB11_31:
+BB12_31:
        setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB11_33;
+       @%p18 bra       BB12_33;
 
        ld.volatile.shared.f64  %fd41, [%rd1+8];
        max.f64         %fd42, %fd63, %fd41;
        st.volatile.shared.f64  [%rd1], %fd42;
 
-BB11_33:
+BB12_33:
        setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB11_35;
+       @%p19 bra       BB12_35;
 
        ld.shared.f64   %fd43, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -1912,7 +1968,7 @@ BB11_33:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd43;
 
-BB11_35:
+BB12_35:
        ret;
 }
 
@@ -1936,17 +1992,17 @@ BB11_35:
        ld.param.u32    %r4, [reduce_row_max_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB12_35;
+       @%p1 bra        BB13_35;
 
        mov.u32         %r38, %tid.x;
        mov.f64         %fd72, 0dFFEFFFFFFFFFFFFF;
        mov.f64         %fd73, %fd72;
        setp.ge.u32     %p2, %r38, %r4;
-       @%p2 bra        BB12_4;
+       @%p2 bra        BB13_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB12_3:
+BB13_3:
        mad.lo.s32      %r8, %r6, %r4, %r38;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -1956,9 +2012,9 @@ BB12_3:
        add.s32         %r38, %r9, %r38;
        setp.lt.u32     %p3, %r38, %r4;
        mov.f64         %fd72, %fd73;
-       @%p3 bra        BB12_3;
+       @%p3 bra        BB13_3;
 
-BB12_4:
+BB13_4:
        mov.f64         %fd70, %fd72;
        mov.u32         %r10, %tid.x;
        mul.wide.u32    %rd6, %r10, 8;
@@ -1968,130 +2024,130 @@ BB12_4:
        bar.sync        0;
        mov.u32         %r11, %ntid.x;
        setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB12_8;
+       @%p4 bra        BB13_8;
 
        setp.gt.u32     %p5, %r10, 511;
        mov.f64         %fd71, %fd70;
-       @%p5 bra        BB12_7;
+       @%p5 bra        BB13_7;
 
        ld.shared.f64   %fd29, [%rd8+4096];
        max.f64         %fd71, %fd70, %fd29;
        st.shared.f64   [%rd8], %fd71;
 
-BB12_7:
+BB13_7:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB12_8:
+BB13_8:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB12_12;
+       @%p6 bra        BB13_12;
 
        setp.gt.u32     %p7, %r10, 255;
        mov.f64         %fd69, %fd68;
-       @%p7 bra        BB12_11;
+       @%p7 bra        BB13_11;
 
        ld.shared.f64   %fd30, [%rd8+2048];
        max.f64         %fd69, %fd68, %fd30;
        st.shared.f64   [%rd8], %fd69;
 
-BB12_11:
+BB13_11:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB12_12:
+BB13_12:
        mov.f64         %fd66, %fd68;
        setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB12_16;
+       @%p8 bra        BB13_16;
 
        setp.gt.u32     %p9, %r10, 127;
        mov.f64         %fd67, %fd66;
-       @%p9 bra        BB12_15;
+       @%p9 bra        BB13_15;
 
        ld.shared.f64   %fd31, [%rd8+1024];
        max.f64         %fd67, %fd66, %fd31;
        st.shared.f64   [%rd8], %fd67;
 
-BB12_15:
+BB13_15:
        mov.f64         %fd66, %fd67;
        bar.sync        0;
 
-BB12_16:
+BB13_16:
        mov.f64         %fd64, %fd66;
        setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB12_20;
+       @%p10 bra       BB13_20;
 
        setp.gt.u32     %p11, %r10, 63;
        mov.f64         %fd65, %fd64;
-       @%p11 bra       BB12_19;
+       @%p11 bra       BB13_19;
 
        ld.shared.f64   %fd32, [%rd8+512];
        max.f64         %fd65, %fd64, %fd32;
        st.shared.f64   [%rd8], %fd65;
 
-BB12_19:
+BB13_19:
        mov.f64         %fd64, %fd65;
        bar.sync        0;
 
-BB12_20:
+BB13_20:
        mov.f64         %fd63, %fd64;
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB12_33;
+       @%p12 bra       BB13_33;
 
        setp.lt.u32     %p13, %r11, 64;
-       @%p13 bra       BB12_23;
+       @%p13 bra       BB13_23;
 
        ld.volatile.shared.f64  %fd33, [%rd8+256];
        max.f64         %fd63, %fd63, %fd33;
        st.volatile.shared.f64  [%rd8], %fd63;
 
-BB12_23:
+BB13_23:
        mov.f64         %fd62, %fd63;
        setp.lt.u32     %p14, %r11, 32;
-       @%p14 bra       BB12_25;
+       @%p14 bra       BB13_25;
 
        ld.volatile.shared.f64  %fd34, [%rd8+128];
        max.f64         %fd62, %fd62, %fd34;
        st.volatile.shared.f64  [%rd8], %fd62;
 
-BB12_25:
+BB13_25:
        mov.f64         %fd61, %fd62;
        setp.lt.u32     %p15, %r11, 16;
-       @%p15 bra       BB12_27;
+       @%p15 bra       BB13_27;
 
        ld.volatile.shared.f64  %fd35, [%rd8+64];
        max.f64         %fd61, %fd61, %fd35;
        st.volatile.shared.f64  [%rd8], %fd61;
 
-BB12_27:
+BB13_27:
        mov.f64         %fd60, %fd61;
        setp.lt.u32     %p16, %r11, 8;
-       @%p16 bra       BB12_29;
+       @%p16 bra       BB13_29;
 
        ld.volatile.shared.f64  %fd36, [%rd8+32];
        max.f64         %fd60, %fd60, %fd36;
        st.volatile.shared.f64  [%rd8], %fd60;
 
-BB12_29:
+BB13_29:
        mov.f64         %fd59, %fd60;
        setp.lt.u32     %p17, %r11, 4;
-       @%p17 bra       BB12_31;
+       @%p17 bra       BB13_31;
 
        ld.volatile.shared.f64  %fd37, [%rd8+16];
        max.f64         %fd59, %fd59, %fd37;
        st.volatile.shared.f64  [%rd8], %fd59;
 
-BB12_31:
+BB13_31:
        setp.lt.u32     %p18, %r11, 2;
-       @%p18 bra       BB12_33;
+       @%p18 bra       BB13_33;
 
        ld.volatile.shared.f64  %fd38, [%rd8+8];
        max.f64         %fd39, %fd59, %fd38;
        st.volatile.shared.f64  [%rd8], %fd39;
 
-BB12_33:
+BB13_33:
        setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB12_35;
+       @%p19 bra       BB13_35;
 
        ld.shared.f64   %fd40, [sdata];
        cvta.to.global.u64      %rd39, %rd2;
@@ -2099,7 +2155,7 @@ BB12_33:
        add.s64         %rd41, %rd39, %rd40;
        st.global.f64   [%rd41], %fd40;
 
-BB12_35:
+BB13_35:
        ret;
 }
 
@@ -2126,18 +2182,18 @@ BB12_35:
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB13_5;
+       @%p1 bra        BB14_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;
        mov.f64         %fd8, 0dFFEFFFFFFFFFFFFF;
        mov.f64         %fd9, %fd8;
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB13_4;
+       @%p2 bra        BB14_4;
 
        mov.u32         %r10, %r1;
 
-BB13_3:
+BB14_3:
        mov.u32         %r3, %r10;
        mul.wide.u32    %rd4, %r3, 8;
        add.s64         %rd5, %rd1, %rd4;
@@ -2147,15 +2203,15 @@ BB13_3:
        setp.lt.u32     %p3, %r4, %r2;
        mov.u32         %r10, %r4;
        mov.f64         %fd8, %fd9;
-       @%p3 bra        BB13_3;
+       @%p3 bra        BB14_3;
 
-BB13_4:
+BB14_4:
        cvta.to.global.u64      %rd6, %rd3;
        mul.wide.u32    %rd7, %r1, 8;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd8;
 
-BB13_5:
+BB14_5:
        ret;
 }
 
@@ -2183,9 +2239,9 @@ BB13_5:
        mov.f64         %fd76, 0d7FEFFFFFFFFFFFFF;
        mov.f64         %fd77, %fd76;
        setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB14_4;
+       @%p1 bra        BB15_4;
 
-BB14_1:
+BB15_1:
        mov.f64         %fd1, %fd77;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r32, 8;
@@ -2194,23 +2250,23 @@ BB14_1:
        min.f64         %fd78, %fd1, %fd30;
        add.s32         %r3, %r32, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB14_3;
+       @%p2 bra        BB15_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd31, [%rd9];
        min.f64         %fd78, %fd78, %fd31;
 
-BB14_3:
+BB15_3:
        mov.f64         %fd77, %fd78;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r32, %r12, %r13, %r32;
        setp.lt.u32     %p3, %r32, %r5;
        mov.f64         %fd76, %fd77;
-       @%p3 bra        BB14_1;
+       @%p3 bra        BB15_1;
 
-BB14_4:
+BB15_4:
        mov.f64         %fd74, %fd76;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -2218,130 +2274,130 @@ BB14_4:
        st.shared.f64   [%rd1], %fd74;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB14_8;
+       @%p4 bra        BB15_8;
 
        setp.gt.u32     %p5, %r6, 511;
        mov.f64         %fd75, %fd74;
-       @%p5 bra        BB14_7;
+       @%p5 bra        BB15_7;
 
        ld.shared.f64   %fd32, [%rd1+4096];
        min.f64         %fd75, %fd74, %fd32;
        st.shared.f64   [%rd1], %fd75;
 
-BB14_7:
+BB15_7:
        mov.f64         %fd74, %fd75;
        bar.sync        0;
 
-BB14_8:
+BB15_8:
        mov.f64         %fd72, %fd74;
        setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB14_12;
+       @%p6 bra        BB15_12;
 
        setp.gt.u32     %p7, %r6, 255;
        mov.f64         %fd73, %fd72;
-       @%p7 bra        BB14_11;
+       @%p7 bra        BB15_11;
 
        ld.shared.f64   %fd33, [%rd1+2048];
        min.f64         %fd73, %fd72, %fd33;
        st.shared.f64   [%rd1], %fd73;
 
-BB14_11:
+BB15_11:
        mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB14_12:
+BB15_12:
        mov.f64         %fd70, %fd72;
        setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB14_16;
+       @%p8 bra        BB15_16;
 
        setp.gt.u32     %p9, %r6, 127;
        mov.f64         %fd71, %fd70;
-       @%p9 bra        BB14_15;
+       @%p9 bra        BB15_15;
 
        ld.shared.f64   %fd34, [%rd1+1024];
        min.f64         %fd71, %fd70, %fd34;
        st.shared.f64   [%rd1], %fd71;
 
-BB14_15:
+BB15_15:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB14_16:
+BB15_16:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB14_20;
+       @%p10 bra       BB15_20;
 
        setp.gt.u32     %p11, %r6, 63;
        mov.f64         %fd69, %fd68;
-       @%p11 bra       BB14_19;
+       @%p11 bra       BB15_19;
 
        ld.shared.f64   %fd35, [%rd1+512];
        min.f64         %fd69, %fd68, %fd35;
        st.shared.f64   [%rd1], %fd69;
 
-BB14_19:
+BB15_19:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB14_20:
+BB15_20:
        mov.f64         %fd67, %fd68;
        setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB14_33;
+       @%p12 bra       BB15_33;
 
        setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB14_23;
+       @%p13 bra       BB15_23;
 
        ld.volatile.shared.f64  %fd36, [%rd1+256];
        min.f64         %fd67, %fd67, %fd36;
        st.volatile.shared.f64  [%rd1], %fd67;
 
-BB14_23:
+BB15_23:
        mov.f64         %fd66, %fd67;
        setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB14_25;
+       @%p14 bra       BB15_25;
 
        ld.volatile.shared.f64  %fd37, [%rd1+128];
        min.f64         %fd66, %fd66, %fd37;
        st.volatile.shared.f64  [%rd1], %fd66;
 
-BB14_25:
+BB15_25:
        mov.f64         %fd65, %fd66;
        setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB14_27;
+       @%p15 bra       BB15_27;
 
        ld.volatile.shared.f64  %fd38, [%rd1+64];
        min.f64         %fd65, %fd65, %fd38;
        st.volatile.shared.f64  [%rd1], %fd65;
 
-BB14_27:
+BB15_27:
        mov.f64         %fd64, %fd65;
        setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB14_29;
+       @%p16 bra       BB15_29;
 
        ld.volatile.shared.f64  %fd39, [%rd1+32];
        min.f64         %fd64, %fd64, %fd39;
        st.volatile.shared.f64  [%rd1], %fd64;
 
-BB14_29:
+BB15_29:
        mov.f64         %fd63, %fd64;
        setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB14_31;
+       @%p17 bra       BB15_31;
 
        ld.volatile.shared.f64  %fd40, [%rd1+16];
        min.f64         %fd63, %fd63, %fd40;
        st.volatile.shared.f64  [%rd1], %fd63;
 
-BB14_31:
+BB15_31:
        setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB14_33;
+       @%p18 bra       BB15_33;
 
        ld.volatile.shared.f64  %fd41, [%rd1+8];
        min.f64         %fd42, %fd63, %fd41;
        st.volatile.shared.f64  [%rd1], %fd42;
 
-BB14_33:
+BB15_33:
        setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB14_35;
+       @%p19 bra       BB15_35;
 
        ld.shared.f64   %fd43, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -2349,7 +2405,7 @@ BB14_33:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd43;
 
-BB14_35:
+BB15_35:
        ret;
 }
 
@@ -2373,17 +2429,17 @@ BB14_35:
        ld.param.u32    %r4, [reduce_row_min_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB15_35;
+       @%p1 bra        BB16_35;
 
        mov.u32         %r38, %tid.x;
        mov.f64         %fd72, 0d7FEFFFFFFFFFFFFF;
        mov.f64         %fd73, %fd72;
        setp.ge.u32     %p2, %r38, %r4;
-       @%p2 bra        BB15_4;
+       @%p2 bra        BB16_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB15_3:
+BB16_3:
        mad.lo.s32      %r8, %r6, %r4, %r38;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -2393,9 +2449,9 @@ BB15_3:
        add.s32         %r38, %r9, %r38;
        setp.lt.u32     %p3, %r38, %r4;
        mov.f64         %fd72, %fd73;
-       @%p3 bra        BB15_3;
+       @%p3 bra        BB16_3;
 
-BB15_4:
+BB16_4:
        mov.f64         %fd70, %fd72;
        mov.u32         %r10, %tid.x;
        mul.wide.u32    %rd6, %r10, 8;
@@ -2405,130 +2461,130 @@ BB15_4:
        bar.sync        0;
        mov.u32         %r11, %ntid.x;
        setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB15_8;
+       @%p4 bra        BB16_8;
 
        setp.gt.u32     %p5, %r10, 511;
        mov.f64         %fd71, %fd70;
-       @%p5 bra        BB15_7;
+       @%p5 bra        BB16_7;
 
        ld.shared.f64   %fd29, [%rd8+4096];
        min.f64         %fd71, %fd70, %fd29;
        st.shared.f64   [%rd8], %fd71;
 
-BB15_7:
+BB16_7:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB15_8:
+BB16_8:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB15_12;
+       @%p6 bra        BB16_12;
 
        setp.gt.u32     %p7, %r10, 255;
        mov.f64         %fd69, %fd68;
-       @%p7 bra        BB15_11;
+       @%p7 bra        BB16_11;
 
        ld.shared.f64   %fd30, [%rd8+2048];
        min.f64         %fd69, %fd68, %fd30;
        st.shared.f64   [%rd8], %fd69;
 
-BB15_11:
+BB16_11:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB15_12:
+BB16_12:
        mov.f64         %fd66, %fd68;
        setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB15_16;
+       @%p8 bra        BB16_16;
 
        setp.gt.u32     %p9, %r10, 127;
        mov.f64         %fd67, %fd66;
-       @%p9 bra        BB15_15;
+       @%p9 bra        BB16_15;
 
        ld.shared.f64   %fd31, [%rd8+1024];
        min.f64         %fd67, %fd66, %fd31;
        st.shared.f64   [%rd8], %fd67;
 
-BB15_15:
+BB16_15:
        mov.f64         %fd66, %fd67;
        bar.sync        0;
 
-BB15_16:
+BB16_16:
        mov.f64         %fd64, %fd66;
        setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB15_20;
+       @%p10 bra       BB16_20;
 
        setp.gt.u32     %p11, %r10, 63;
        mov.f64         %fd65, %fd64;
-       @%p11 bra       BB15_19;
+       @%p11 bra       BB16_19;
 
        ld.shared.f64   %fd32, [%rd8+512];
        min.f64         %fd65, %fd64, %fd32;
        st.shared.f64   [%rd8], %fd65;
 
-BB15_19:
+BB16_19:
        mov.f64         %fd64, %fd65;
        bar.sync        0;
 
-BB15_20:
+BB16_20:
        mov.f64         %fd63, %fd64;
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB15_33;
+       @%p12 bra       BB16_33;
 
        setp.lt.u32     %p13, %r11, 64;
-       @%p13 bra       BB15_23;
+       @%p13 bra       BB16_23;
 
        ld.volatile.shared.f64  %fd33, [%rd8+256];
        min.f64         %fd63, %fd63, %fd33;
        st.volatile.shared.f64  [%rd8], %fd63;
 
-BB15_23:
+BB16_23:
        mov.f64         %fd62, %fd63;
        setp.lt.u32     %p14, %r11, 32;
-       @%p14 bra       BB15_25;
+       @%p14 bra       BB16_25;
 
        ld.volatile.shared.f64  %fd34, [%rd8+128];
        min.f64         %fd62, %fd62, %fd34;
        st.volatile.shared.f64  [%rd8], %fd62;
 
-BB15_25:
+BB16_25:
        mov.f64         %fd61, %fd62;
        setp.lt.u32     %p15, %r11, 16;
-       @%p15 bra       BB15_27;
+       @%p15 bra       BB16_27;
 
        ld.volatile.shared.f64  %fd35, [%rd8+64];
        min.f64         %fd61, %fd61, %fd35;
        st.volatile.shared.f64  [%rd8], %fd61;
 
-BB15_27:
+BB16_27:
        mov.f64         %fd60, %fd61;
        setp.lt.u32     %p16, %r11, 8;
-       @%p16 bra       BB15_29;
+       @%p16 bra       BB16_29;
 
        ld.volatile.shared.f64  %fd36, [%rd8+32];
        min.f64         %fd60, %fd60, %fd36;
        st.volatile.shared.f64  [%rd8], %fd60;
 
-BB15_29:
+BB16_29:
        mov.f64         %fd59, %fd60;
        setp.lt.u32     %p17, %r11, 4;
-       @%p17 bra       BB15_31;
+       @%p17 bra       BB16_31;
 
        ld.volatile.shared.f64  %fd37, [%rd8+16];
        min.f64         %fd59, %fd59, %fd37;
        st.volatile.shared.f64  [%rd8], %fd59;
 
-BB15_31:
+BB16_31:
        setp.lt.u32     %p18, %r11, 2;
-       @%p18 bra       BB15_33;
+       @%p18 bra       BB16_33;
 
        ld.volatile.shared.f64  %fd38, [%rd8+8];
        min.f64         %fd39, %fd59, %fd38;
        st.volatile.shared.f64  [%rd8], %fd39;
 
-BB15_33:
+BB16_33:
        setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB15_35;
+       @%p19 bra       BB16_35;
 
        ld.shared.f64   %fd40, [sdata];
        cvta.to.global.u64      %rd39, %rd2;
@@ -2536,7 +2592,7 @@ BB15_33:
        add.s64         %rd41, %rd39, %rd40;
        st.global.f64   [%rd41], %fd40;
 
-BB15_35:
+BB16_35:
        ret;
 }
 
@@ -2563,18 +2619,18 @@ BB15_35:
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB16_5;
+       @%p1 bra        BB17_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;
        mov.f64         %fd8, 0d7FEFFFFFFFFFFFFF;
        mov.f64         %fd9, %fd8;
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB16_4;
+       @%p2 bra        BB17_4;
 
        mov.u32         %r10, %r1;
 
-BB16_3:
+BB17_3:
        mov.u32         %r3, %r10;
        mul.wide.u32    %rd4, %r3, 8;
        add.s64         %rd5, %rd1, %rd4;
@@ -2584,15 +2640,15 @@ BB16_3:
        setp.lt.u32     %p3, %r4, %r2;
        mov.u32         %r10, %r4;
        mov.f64         %fd8, %fd9;
-       @%p3 bra        BB16_3;
+       @%p3 bra        BB17_3;
 
-BB16_4:
+BB17_4:
        cvta.to.global.u64      %rd6, %rd3;
        mul.wide.u32    %rd7, %r1, 8;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd8;
 
-BB16_5:
+BB17_5:
        ret;
 }
 
@@ -2620,9 +2676,9 @@ BB16_5:
        mov.f64         %fd76, 0d3FF0000000000000;
        mov.f64         %fd77, %fd76;
        setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB17_4;
+       @%p1 bra        BB18_4;
 
-BB17_1:
+BB18_1:
        mov.f64         %fd1, %fd77;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r32, 8;
@@ -2631,23 +2687,23 @@ BB17_1:
        mul.f64         %fd78, %fd1, %fd30;
        add.s32         %r3, %r32, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB17_3;
+       @%p2 bra        BB18_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd31, [%rd9];
        mul.f64         %fd78, %fd78, %fd31;
 
-BB17_3:
+BB18_3:
        mov.f64         %fd77, %fd78;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r32, %r12, %r13, %r32;
        setp.lt.u32     %p3, %r32, %r5;
        mov.f64         %fd76, %fd77;
-       @%p3 bra        BB17_1;
+       @%p3 bra        BB18_1;
 
-BB17_4:
+BB18_4:
        mov.f64         %fd74, %fd76;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -2655,130 +2711,130 @@ BB17_4:
        st.shared.f64   [%rd1], %fd74;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB17_8;
+       @%p4 bra        BB18_8;
 
        setp.gt.u32     %p5, %r6, 511;
        mov.f64         %fd75, %fd74;
-       @%p5 bra        BB17_7;
+       @%p5 bra        BB18_7;
 
        ld.shared.f64   %fd32, [%rd1+4096];
        mul.f64         %fd75, %fd74, %fd32;
        st.shared.f64   [%rd1], %fd75;
 
-BB17_7:
+BB18_7:
        mov.f64         %fd74, %fd75;
        bar.sync        0;
 
-BB17_8:
+BB18_8:
        mov.f64         %fd72, %fd74;
        setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB17_12;
+       @%p6 bra        BB18_12;
 
        setp.gt.u32     %p7, %r6, 255;
        mov.f64         %fd73, %fd72;
-       @%p7 bra        BB17_11;
+       @%p7 bra        BB18_11;
 
        ld.shared.f64   %fd33, [%rd1+2048];
        mul.f64         %fd73, %fd72, %fd33;
        st.shared.f64   [%rd1], %fd73;
 
-BB17_11:
+BB18_11:
        mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB17_12:
+BB18_12:
        mov.f64         %fd70, %fd72;
        setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB17_16;
+       @%p8 bra        BB18_16;
 
        setp.gt.u32     %p9, %r6, 127;
        mov.f64         %fd71, %fd70;
-       @%p9 bra        BB17_15;
+       @%p9 bra        BB18_15;
 
        ld.shared.f64   %fd34, [%rd1+1024];
        mul.f64         %fd71, %fd70, %fd34;
        st.shared.f64   [%rd1], %fd71;
 
-BB17_15:
+BB18_15:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB17_16:
+BB18_16:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB17_20;
+       @%p10 bra       BB18_20;
 
        setp.gt.u32     %p11, %r6, 63;
        mov.f64         %fd69, %fd68;
-       @%p11 bra       BB17_19;
+       @%p11 bra       BB18_19;
 
        ld.shared.f64   %fd35, [%rd1+512];
        mul.f64         %fd69, %fd68, %fd35;
        st.shared.f64   [%rd1], %fd69;
 
-BB17_19:
+BB18_19:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB17_20:
+BB18_20:
        mov.f64         %fd67, %fd68;
        setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB17_33;
+       @%p12 bra       BB18_33;
 
        setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB17_23;
+       @%p13 bra       BB18_23;
 
        ld.volatile.shared.f64  %fd36, [%rd1+256];
        mul.f64         %fd67, %fd67, %fd36;
        st.volatile.shared.f64  [%rd1], %fd67;
 
-BB17_23:
+BB18_23:
        mov.f64         %fd66, %fd67;
        setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB17_25;
+       @%p14 bra       BB18_25;
 
        ld.volatile.shared.f64  %fd37, [%rd1+128];
        mul.f64         %fd66, %fd66, %fd37;
        st.volatile.shared.f64  [%rd1], %fd66;
 
-BB17_25:
+BB18_25:
        mov.f64         %fd65, %fd66;
        setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB17_27;
+       @%p15 bra       BB18_27;
 
        ld.volatile.shared.f64  %fd38, [%rd1+64];
        mul.f64         %fd65, %fd65, %fd38;
        st.volatile.shared.f64  [%rd1], %fd65;
 
-BB17_27:
+BB18_27:
        mov.f64         %fd64, %fd65;
        setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB17_29;
+       @%p16 bra       BB18_29;
 
        ld.volatile.shared.f64  %fd39, [%rd1+32];
        mul.f64         %fd64, %fd64, %fd39;
        st.volatile.shared.f64  [%rd1], %fd64;
 
-BB17_29:
+BB18_29:
        mov.f64         %fd63, %fd64;
        setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB17_31;
+       @%p17 bra       BB18_31;
 
        ld.volatile.shared.f64  %fd40, [%rd1+16];
        mul.f64         %fd63, %fd63, %fd40;
        st.volatile.shared.f64  [%rd1], %fd63;
 
-BB17_31:
+BB18_31:
        setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB17_33;
+       @%p18 bra       BB18_33;
 
        ld.volatile.shared.f64  %fd41, [%rd1+8];
        mul.f64         %fd42, %fd63, %fd41;
        st.volatile.shared.f64  [%rd1], %fd42;
 
-BB17_33:
+BB18_33:
        setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB17_35;
+       @%p19 bra       BB18_35;
 
        ld.shared.f64   %fd43, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -2786,7 +2842,7 @@ BB17_33:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd43;
 
-BB17_35:
+BB18_35:
        ret;
 }
 
@@ -2801,7 +2857,7 @@ BB17_35:
        .reg .pred      %p<20>;
        .reg .b32       %r<39>;
        .reg .f64       %fd<76>;
-       .reg .b64       %rd<43>;
+       .reg .b64       %rd<42>;
 
 
        ld.param.u64    %rd1, [reduce_row_mean_param_0];
@@ -2810,17 +2866,17 @@ BB17_35:
        ld.param.u32    %r4, [reduce_row_mean_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB18_35;
+       @%p1 bra        BB19_35;
 
        mov.u32         %r38, %tid.x;
        mov.f64         %fd74, 0d0000000000000000;
        mov.f64         %fd75, %fd74;
        setp.ge.u32     %p2, %r38, %r4;
-       @%p2 bra        BB18_4;
+       @%p2 bra        BB19_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB18_3:
+BB19_3:
        mad.lo.s32      %r8, %r6, %r4, %r38;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -2830,9 +2886,9 @@ BB18_3:
        add.s32         %r38, %r9, %r38;
        setp.lt.u32     %p3, %r38, %r4;
        mov.f64         %fd74, %fd75;
-       @%p3 bra        BB18_3;
+       @%p3 bra        BB19_3;
 
-BB18_4:
+BB19_4:
        mov.f64         %fd72, %fd74;
        mov.u32         %r10, %tid.x;
        mul.wide.u32    %rd6, %r10, 8;
@@ -2842,141 +2898,140 @@ BB18_4:
        bar.sync        0;
        mov.u32         %r11, %ntid.x;
        setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB18_8;
+       @%p4 bra        BB19_8;
 
        setp.gt.u32     %p5, %r10, 511;
        mov.f64         %fd73, %fd72;
-       @%p5 bra        BB18_7;
+       @%p5 bra        BB19_7;
 
        ld.shared.f64   %fd29, [%rd8+4096];
        add.f64         %fd73, %fd72, %fd29;
        st.shared.f64   [%rd8], %fd73;
 
-BB18_7:
+BB19_7:
        mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB18_8:
+BB19_8:
        mov.f64         %fd70, %fd72;
        setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB18_12;
+       @%p6 bra        BB19_12;
 
        setp.gt.u32     %p7, %r10, 255;
        mov.f64         %fd71, %fd70;
-       @%p7 bra        BB18_11;
+       @%p7 bra        BB19_11;
 
        ld.shared.f64   %fd30, [%rd8+2048];
        add.f64         %fd71, %fd70, %fd30;
        st.shared.f64   [%rd8], %fd71;
 
-BB18_11:
+BB19_11:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB18_12:
+BB19_12:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB18_16;
+       @%p8 bra        BB19_16;
 
        setp.gt.u32     %p9, %r10, 127;
        mov.f64         %fd69, %fd68;
-       @%p9 bra        BB18_15;
+       @%p9 bra        BB19_15;
 
        ld.shared.f64   %fd31, [%rd8+1024];
        add.f64         %fd69, %fd68, %fd31;
        st.shared.f64   [%rd8], %fd69;
 
-BB18_15:
+BB19_15:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB18_16:
+BB19_16:
        mov.f64         %fd66, %fd68;
        setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB18_20;
+       @%p10 bra       BB19_20;
 
        setp.gt.u32     %p11, %r10, 63;
        mov.f64         %fd67, %fd66;
-       @%p11 bra       BB18_19;
+       @%p11 bra       BB19_19;
 
        ld.shared.f64   %fd32, [%rd8+512];
        add.f64         %fd67, %fd66, %fd32;
        st.shared.f64   [%rd8], %fd67;
 
-BB18_19:
+BB19_19:
        mov.f64         %fd66, %fd67;
        bar.sync        0;
 
-BB18_20:
+BB19_20:
        mov.f64         %fd65, %fd66;
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB18_33;
+       @%p12 bra       BB19_33;
 
        setp.lt.u32     %p13, %r11, 64;
-       @%p13 bra       BB18_23;
+       @%p13 bra       BB19_23;
 
        ld.volatile.shared.f64  %fd33, [%rd8+256];
        add.f64         %fd65, %fd65, %fd33;
        st.volatile.shared.f64  [%rd8], %fd65;
 
-BB18_23:
+BB19_23:
        mov.f64         %fd64, %fd65;
        setp.lt.u32     %p14, %r11, 32;
-       @%p14 bra       BB18_25;
+       @%p14 bra       BB19_25;
 
        ld.volatile.shared.f64  %fd34, [%rd8+128];
        add.f64         %fd64, %fd64, %fd34;
        st.volatile.shared.f64  [%rd8], %fd64;
 
-BB18_25:
+BB19_25:
        mov.f64         %fd63, %fd64;
        setp.lt.u32     %p15, %r11, 16;
-       @%p15 bra       BB18_27;
+       @%p15 bra       BB19_27;
 
        ld.volatile.shared.f64  %fd35, [%rd8+64];
        add.f64         %fd63, %fd63, %fd35;
        st.volatile.shared.f64  [%rd8], %fd63;
 
-BB18_27:
+BB19_27:
        mov.f64         %fd62, %fd63;
        setp.lt.u32     %p16, %r11, 8;
-       @%p16 bra       BB18_29;
+       @%p16 bra       BB19_29;
 
        ld.volatile.shared.f64  %fd36, [%rd8+32];
        add.f64         %fd62, %fd62, %fd36;
        st.volatile.shared.f64  [%rd8], %fd62;
 
-BB18_29:
+BB19_29:
        mov.f64         %fd61, %fd62;
        setp.lt.u32     %p17, %r11, 4;
-       @%p17 bra       BB18_31;
+       @%p17 bra       BB19_31;
 
        ld.volatile.shared.f64  %fd37, [%rd8+16];
        add.f64         %fd61, %fd61, %fd37;
        st.volatile.shared.f64  [%rd8], %fd61;
 
-BB18_31:
+BB19_31:
        setp.lt.u32     %p18, %r11, 2;
-       @%p18 bra       BB18_33;
+       @%p18 bra       BB19_33;
 
        ld.volatile.shared.f64  %fd38, [%rd8+8];
        add.f64         %fd39, %fd61, %fd38;
        st.volatile.shared.f64  [%rd8], %fd39;
 
-BB18_33:
+BB19_33:
        setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB18_35;
+       @%p19 bra       BB19_35;
 
        ld.shared.f64   %fd40, [sdata];
-       cvt.u64.u32     %rd39, %r4;
-       cvt.rn.f64.s64  %fd41, %rd39;
+       cvt.rn.f64.s32  %fd41, %r4;
        div.rn.f64      %fd42, %fd40, %fd41;
-       cvta.to.global.u64      %rd40, %rd2;
-       mul.wide.u32    %rd41, %r6, 8;
-       add.s64         %rd42, %rd40, %rd41;
-       st.global.f64   [%rd42], %fd42;
+       cvta.to.global.u64      %rd39, %rd2;
+       mul.wide.u32    %rd40, %r6, 8;
+       add.s64         %rd41, %rd39, %rd40;
+       st.global.f64   [%rd41], %fd42;
 
-BB18_35:
+BB19_35:
        ret;
 }
 
@@ -2991,7 +3046,7 @@ BB18_35:
        .reg .pred      %p<4>;
        .reg .b32       %r<11>;
        .reg .f64       %fd<12>;
-       .reg .b64       %rd<10>;
+       .reg .b64       %rd<9>;
 
 
        ld.param.u64    %rd2, [reduce_col_mean_param_0];
@@ -3003,18 +3058,18 @@ BB18_35:
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB19_5;
+       @%p1 bra        BB20_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;
        mov.f64         %fd10, 0d0000000000000000;
        mov.f64         %fd11, %fd10;
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB19_4;
+       @%p2 bra        BB20_4;
 
        mov.u32         %r10, %r1;
 
-BB19_3:
+BB20_3:
        mov.u32         %r3, %r10;
        mul.wide.u32    %rd4, %r3, 8;
        add.s64         %rd5, %rd1, %rd4;
@@ -3024,18 +3079,17 @@ BB19_3:
        setp.lt.u32     %p3, %r4, %r2;
        mov.u32         %r10, %r4;
        mov.f64         %fd10, %fd11;
-       @%p3 bra        BB19_3;
+       @%p3 bra        BB20_3;
 
-BB19_4:
+BB20_4:
        cvta.to.global.u64      %rd6, %rd3;
-       cvt.u64.u32     %rd7, %r5;
-       cvt.rn.f64.s64  %fd7, %rd7;
+       cvt.rn.f64.s32  %fd7, %r5;
        div.rn.f64      %fd8, %fd10, %fd7;
-       mul.wide.u32    %rd8, %r1, 8;
-       add.s64         %rd9, %rd6, %rd8;
-       st.global.f64   [%rd9], %fd8;
+       mul.wide.u32    %rd7, %r1, 8;
+       add.s64         %rd8, %rd6, %rd7;
+       st.global.f64   [%rd8], %fd8;
 
-BB19_5:
+BB20_5:
        ret;
 }
 
@@ -3061,7 +3115,7 @@ BB19_5:
        mov.u32         %r8, %tid.x;
        mad.lo.s32      %r1, %r7, %r6, %r8;
        setp.ge.u32     %p1, %r1, %r5;
-       @%p1 bra        BB20_5;
+       @%p1 bra        BB21_5;
 
        cvta.to.global.u64      %rd4, %rd2;
        cvt.s64.s32     %rd1, %r1;
@@ -3121,13 +3175,13 @@ BB19_5:
        mov.b32          %f2, %r11;
        abs.f32         %f1, %f2;
        setp.lt.f32     %p2, %f1, 0f4086232B;
-       @%p2 bra        BB20_4;
+       @%p2 bra        BB21_4;
 
        setp.lt.f64     %p3, %fd1, 0d0000000000000000;
        add.f64         %fd37, %fd1, 0d7FF0000000000000;
        selp.f64        %fd40, 0d0000000000000000, %fd37, %p3;
        setp.geu.f32    %p4, %f1, 0f40874800;
-       @%p4 bra        BB20_4;
+       @%p4 bra        BB21_4;
 
        shr.u32         %r12, %r2, 31;
        add.s32         %r13, %r2, %r12;
@@ -3142,13 +3196,13 @@ BB19_5:
        mov.b64         %fd39, {%r20, %r19};
        mul.f64         %fd40, %fd38, %fd39;
 
-BB20_4:
+BB21_4:
        cvta.to.global.u64      %rd7, %rd3;
        shl.b64         %rd8, %rd1, 3;
        add.s64         %rd9, %rd7, %rd8;
        st.global.f64   [%rd9], %fd40;
 
-BB20_5:
+BB21_5:
        ret;
 }
 
@@ -3175,7 +3229,7 @@ BB20_5:
        }
        shr.u32         %r50, %r49, 20;
        setp.ne.s32     %p1, %r50, 0;
-       @%p1 bra        BB21_2;
+       @%p1 bra        BB22_2;
 
        mul.f64         %fd14, %fd12, 0d4350000000000000;
        {
@@ -3189,13 +3243,13 @@ BB20_5:
        shr.u32         %r16, %r49, 20;
        add.s32         %r50, %r16, -54;
 
-BB21_2:
+BB22_2:
        add.s32         %r51, %r50, -1023;
        and.b32         %r17, %r49, -2146435073;
        or.b32          %r18, %r17, 1072693248;
        mov.b64         %fd132, {%r48, %r18};
        setp.lt.u32     %p2, %r18, 1073127583;
-       @%p2 bra        BB21_4;
+       @%p2 bra        BB22_4;
 
        {
        .reg .b32 %temp; 
@@ -3209,7 +3263,7 @@ BB21_2:
        mov.b64         %fd132, {%r19, %r21};
        add.s32         %r51, %r50, -1022;
 
-BB21_4:
+BB22_4:
        add.f64         %fd16, %fd132, 0d3FF0000000000000;
        // inline asm
        rcp.approx.ftz.f64 %fd15,%fd16;
@@ -3374,13 +3428,13 @@ BB21_4:
        mov.b32          %f2, %r35;
        abs.f32         %f1, %f2;
        setp.lt.f32     %p4, %f1, 0f4086232B;
-       @%p4 bra        BB21_7;
+       @%p4 bra        BB22_7;
 
        setp.lt.f64     %p5, %fd4, 0d0000000000000000;
        add.f64         %fd129, %fd4, 0d7FF0000000000000;
        selp.f64        %fd133, 0d0000000000000000, %fd129, %p5;
        setp.geu.f32    %p6, %f1, 0f40874800;
-       @%p6 bra        BB21_7;
+       @%p6 bra        BB22_7;
 
        shr.u32         %r36, %r13, 31;
        add.s32         %r37, %r13, %r36;
@@ -3395,7 +3449,7 @@ BB21_4:
        mov.b64         %fd131, {%r44, %r43};
        mul.f64         %fd133, %fd130, %fd131;
 
-BB21_7:
+BB22_7:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r45}, %fd133;
@@ -3408,13 +3462,13 @@ BB21_7:
        }
        setp.ne.s32     %p8, %r47, 0;
        or.pred         %p9, %p8, %p7;
-       @!%p9 bra       BB21_9;
-       bra.uni         BB21_8;
+       @!%p9 bra       BB22_9;
+       bra.uni         BB22_8;
 
-BB21_8:
+BB22_8:
        fma.rn.f64      %fd133, %fd133, %fd5, %fd133;
 
-BB21_9:
+BB22_9:
        st.param.f64    [func_retval0+0], %fd133;
        ret;
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java 
b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 7a28bb1..567c793 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -93,6 +93,7 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                        case DIRECT_CONV2D_BACKWARD_DATA:
                        case DIRECT_CONV2D_BACKWARD_FILTER:
                        case BIAS_ADD:
+                       case BIAS_MULTIPLY:
                        {       
                                if(et == ExecType.CP || et == ExecType.GPU || 
et == ExecType.SPARK) {
                                        setLops(constructConvolutionLops(et, 
inputs));
@@ -125,6 +126,7 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                        case DIRECT_CONV2D_BACKWARD_DATA:
                                return 14;
                        case BIAS_ADD:
+                       case BIAS_MULTIPLY:
                                return 2;
                        default:
                                return 13;
@@ -247,7 +249,7 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                // [numRows, numCols, NNZ] 
                long[] ret = new long[3];
                
-               if(op == ConvOp.BIAS_ADD) {
+               if(op == ConvOp.BIAS_ADD || op == ConvOp.BIAS_MULTIPLY) {
                        MatrixCharacteristics[] mc = 
memo.getAllInputStats(getInput());
                        ret[0] = mc[0].rowsKnown() ? mc[0].getRows() : -1;
                        ret[1] = mc[0].colsKnown() ? mc[0].getCols() : -1;
@@ -394,7 +396,7 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
        @Override
        public void refreshSizeInformation()
        {
-               if(op == ConvOp.BIAS_ADD) {
+               if(op == ConvOp.BIAS_ADD || op == ConvOp.BIAS_MULTIPLY) {
                        Hop input1 = getInput().get(0);
                        setDim1(input1.getDim1());
                        setDim2(input1.getDim2());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/hops/Hop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/Hop.java 
b/src/main/java/org/apache/sysml/hops/Hop.java
index 75b24de..86741ba 100644
--- a/src/main/java/org/apache/sysml/hops/Hop.java
+++ b/src/main/java/org/apache/sysml/hops/Hop.java
@@ -1024,7 +1024,7 @@ public abstract class Hop
        public enum ConvOp {
                MAX_POOLING, MAX_POOLING_BACKWARD,
                DIRECT_CONV2D, DIRECT_CONV2D_BACKWARD_FILTER, 
DIRECT_CONV2D_BACKWARD_DATA,
-               BIAS_ADD
+               BIAS_ADD, BIAS_MULTIPLY
        };
        
        public enum DataGenMethod {
@@ -1091,6 +1091,7 @@ public abstract class Hop
                HopsConv2Lops.put(ConvOp.MAX_POOLING_BACKWARD, 
org.apache.sysml.lops.ConvolutionTransform.OperationTypes.MAX_POOLING_BACKWARD);
                HopsConv2Lops.put(ConvOp.DIRECT_CONV2D, 
org.apache.sysml.lops.ConvolutionTransform.OperationTypes.DIRECT_CONV2D);
                HopsConv2Lops.put(ConvOp.BIAS_ADD, 
org.apache.sysml.lops.ConvolutionTransform.OperationTypes.BIAS_ADD);
+               HopsConv2Lops.put(ConvOp.BIAS_MULTIPLY, 
org.apache.sysml.lops.ConvolutionTransform.OperationTypes.BIAS_MULTIPLY);
                HopsConv2Lops.put(ConvOp.DIRECT_CONV2D_BACKWARD_FILTER, 
org.apache.sysml.lops.ConvolutionTransform.OperationTypes.DIRECT_CONV2D_BACKWARD_FILTER);
                HopsConv2Lops.put(ConvOp.DIRECT_CONV2D_BACKWARD_DATA, 
org.apache.sysml.lops.ConvolutionTransform.OperationTypes.DIRECT_CONV2D_BACKWARD_DATA);
        }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java 
b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
index cea2c93..6d8885a 100644
--- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
+++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
@@ -32,7 +32,7 @@ public class ConvolutionTransform extends Lop
        public enum OperationTypes {
                MAX_POOLING, MAX_POOLING_BACKWARD, RELU_MAX_POOLING, 
RELU_BACKWARD,
                DIRECT_CONV2D, DIRECT_CONV2D_BACKWARD_FILTER, 
DIRECT_CONV2D_BACKWARD_DATA,
-               BIAS_ADD, DIRECT_CONV2D_BIAS_ADD
+               BIAS_ADD, DIRECT_CONV2D_BIAS_ADD, BIAS_MULTIPLY
        };
        
        private OperationTypes operation = null;
@@ -126,6 +126,9 @@ public class ConvolutionTransform extends Lop
                
                case BIAS_ADD:
                        return "bias_add";
+               
+               case BIAS_MULTIPLY:
+                       return "bias_multiply";
                        
                case DIRECT_CONV2D_BACKWARD_FILTER:
                        return "conv2d_backward_filter";
@@ -140,7 +143,7 @@ public class ConvolutionTransform extends Lop
        }
        
        public String getInstructions(String input, String bias, String output) 
throws LopsException {
-               if(operation == OperationTypes.BIAS_ADD || operation == 
OperationTypes.RELU_BACKWARD) {
+               if(operation == OperationTypes.BIAS_ADD || operation == 
OperationTypes.BIAS_MULTIPLY || operation == OperationTypes.RELU_BACKWARD) {
                        StringBuilder sb = new StringBuilder();
                        sb.append( getExecType() );
                        

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java 
b/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
index 19565ee..2cb869e 100644
--- a/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
+++ b/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
@@ -1105,6 +1105,7 @@ public class BuiltinFunctionExpression extends 
DataIdentifier
                        break;
                
                case BIAS_ADD:
+               case BIAS_MULTIPLY:
                {
                        Expression input = _args[0];
                        Expression bias = _args[1];
@@ -1615,6 +1616,8 @@ public class BuiltinFunctionExpression extends 
DataIdentifier
                         bifop = Expression.BuiltinFunctionOp.CONV2D;
                else if (functionName.equals("bias_add"))
                         bifop = Expression.BuiltinFunctionOp.BIAS_ADD;
+               else if (functionName.equals("bias_multiply"))
+                        bifop = Expression.BuiltinFunctionOp.BIAS_MULTIPLY;
                else if (functionName.equals("conv2d_backward_filter"))
                         bifop = 
Expression.BuiltinFunctionOp.CONV2D_BACKWARD_FILTER;
                else if (functionName.equals("conv2d_backward_data"))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/parser/DMLTranslator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/DMLTranslator.java 
b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
index b99dd37..aa09fe5 100644
--- a/src/main/java/org/apache/sysml/parser/DMLTranslator.java
+++ b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
@@ -2720,6 +2720,15 @@ public class DMLTranslator
                        setBlockSizeAndRefreshSizeInfo(expr, currBuiltinOp);
                        break;
                }
+               case BIAS_MULTIPLY:
+               {
+                       ArrayList<Hop> inHops1 = new ArrayList<Hop>();
+                       inHops1.add(expr);
+                       inHops1.add(expr2);
+                       currBuiltinOp = new ConvolutionOp(target.getName(), 
target.getDataType(), target.getValueType(), Hop.ConvOp.BIAS_MULTIPLY, inHops1);
+                       setBlockSizeAndRefreshSizeInfo(expr, currBuiltinOp);
+                       break;
+               }
                case AVG_POOL:
                case MAX_POOL:
                {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/parser/Expression.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/Expression.java 
b/src/main/java/org/apache/sysml/parser/Expression.java
index 9c3b248..1bb7b35 100644
--- a/src/main/java/org/apache/sysml/parser/Expression.java
+++ b/src/main/java/org/apache/sysml/parser/Expression.java
@@ -90,7 +90,7 @@ public abstract class Expression
                CUMSUM,
                DIAG,
                EIGEN,
-               CONV2D, CONV2D_BACKWARD_FILTER, CONV2D_BACKWARD_DATA, BIAS_ADD,
+               CONV2D, CONV2D_BACKWARD_FILTER, CONV2D_BACKWARD_DATA, BIAS_ADD, 
BIAS_MULTIPLY,
                MAX_POOL, AVG_POOL, MAX_POOL_BACKWARD,
                EXP,
                FLOOR,

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index f0603b4..bb5e01a 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -232,6 +232,7 @@ public class CPInstructionParser extends InstructionParser
                String2CPInstructionType.put( "conv2d_backward_filter"      , 
CPINSTRUCTION_TYPE.Convolution);
                String2CPInstructionType.put( "conv2d_backward_data"      , 
CPINSTRUCTION_TYPE.Convolution);
                String2CPInstructionType.put( "bias_add"      , 
CPINSTRUCTION_TYPE.Convolution);
+               String2CPInstructionType.put( "bias_multiply"      , 
CPINSTRUCTION_TYPE.Convolution);
                
                // Quaternary instruction opcodes
                String2CPInstructionType.put( "wsloss"  , 
CPINSTRUCTION_TYPE.Quaternary);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 366015f..4051d6a 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -48,6 +48,7 @@ public class GPUInstructionParser  extends InstructionParser
                String2GPUInstructionType.put( "maxpooling",             
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "maxpooling_backward",    
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "bias_add",                      
 GPUINSTRUCTION_TYPE.Convolution);
+               String2GPUInstructionType.put( "bias_multiply",                 
         GPUINSTRUCTION_TYPE.Convolution);
 
                // Matrix Multiply Operators
                String2GPUInstructionType.put( "ba+*",                   
GPUINSTRUCTION_TYPE.AggregateBinary);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 3513201..f839990 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -44,8 +44,8 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction
        public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand 
out, String opcode, String istr, int numThreads) throws DMLRuntimeException {
                super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, 
out,
                                opcode, istr);
-               if( !(opcode.equals("bias_add") || 
opcode.equals("relu_backward")) ) {
-                       throw new DMLRuntimeException("Incorrect usage. 
Expected the opcode to be bias_add or relu_backward, but found " + opcode);
+               if( !(opcode.equals("bias_add") || 
opcode.equals("relu_backward") || opcode.equals("bias_multiply") ) ) {
+                       throw new DMLRuntimeException("Incorrect usage. 
Expected the opcode to be bias_add or bias_multiply or relu_backward, but found 
" + opcode);
                }
                _in2 = in2;
                _cptype = CPINSTRUCTION_TYPE.Convolution;
@@ -195,7 +195,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction
                        return new ConvolutionCPInstruction(in, in2, in3, out, 
opcode, str, stride,
                                        padding, input_shape, filter_shape, k);
                }
-               else if (opcode.equalsIgnoreCase("bias_add") || 
opcode.equals("relu_backward")) {
+               else if (opcode.equalsIgnoreCase("bias_add") || 
opcode.equals("relu_backward") || opcode.equalsIgnoreCase("bias_multiply") ) {
                        InstructionUtils.checkNumFields(parts, 4);
                        CPOperand in = new CPOperand(parts[1]);
                        CPOperand in2 = new CPOperand(parts[2]);
@@ -262,6 +262,32 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction
                ec.setMatrixOutput(getOutputVariableName(), outputBlock);
        }
        
+       public void processBiasMultiplyInstruction(ExecutionContext ec) throws 
DMLRuntimeException {
+               MatrixBlock input = ec.getMatrixInput(input1.getName());
+               MatrixBlock bias = ec.getMatrixInput(_in2.getName());
+               MatrixBlock outputBlock = null;
+               
+               if(bias.getNumColumns() != 1) {
+                       throw new DMLRuntimeException("Expected the number of 
columns of bias matrix to be 1, but found " + bias.getNumColumns());
+               }
+               
+               if(bias.isEmptyBlock()) {
+                       // Anything multiplied by zero is zero
+                       outputBlock = new MatrixBlock(input.getNumRows(), 
input.getNumColumns(), true);
+               }
+               else {
+                       // As we always fill the output first with bias
+                       outputBlock = new MatrixBlock(input.getNumRows(), 
input.getNumColumns(), false);
+                       outputBlock.allocateDenseBlock();
+                       LibMatrixDNN.biasMultiply(input, bias, outputBlock, 
_numThreads);
+               }
+               
+               // release inputs/outputs
+               ec.releaseMatrixInput(input1.getName());
+               ec.releaseMatrixInput(_in2.getName());
+               ec.setMatrixOutput(getOutputVariableName(), outputBlock);
+       }
+       
        
        @Override
        public void processInstruction(ExecutionContext ec)
@@ -270,6 +296,10 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction
                        processBiasAddInstruction(ec);
                        return;
                }
+               else if (instOpcode.equalsIgnoreCase("bias_multiply")) {
+                       processBiasMultiplyInstruction(ec);
+                       return;
+               }
                else if (instOpcode.equalsIgnoreCase("relu_backward")) {
                        processReluBackwardInstruction(ec);
                        return;
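
For reference, the semantics implemented by this new CP path can be sketched on plain double arrays as follows. This is a minimal, single-threaded illustration and not the multi-threaded LibMatrixDNN.biasMultiply invoked above; the class and method names are hypothetical. Each of the K bias values scales one contiguous block of PQ = numColumns / K columns per row, which is also why an empty (all-zero) bias lets processBiasMultiplyInstruction short-circuit to an empty output block.

    // Minimal sketch of the bias_multiply semantics; BiasMultiplySketch and
    // naiveBiasMultiply are illustrative names only.
    public class BiasMultiplySketch {
        // input is [rows x (K*PQ)], bias is a [K x 1] column flattened to double[K]:
        // out[i][j] = input[i][j] * bias[j / PQ]
        static double[][] naiveBiasMultiply(double[][] input, double[] bias) {
            int rows = input.length, cols = input[0].length, K = bias.length;
            if (cols % K != 0)
                throw new IllegalArgumentException("columns must be a multiple of K");
            int PQ = cols / K;
            double[][] out = new double[rows][cols];
            for (int i = 0; i < rows; i++)
                for (int j = 0; j < cols; j++)
                    out[i][j] = input[i][j] * bias[j / PQ];
            return out;
        }

        public static void main(String[] args) {
            // One image row with K=2 channels and PQ=3 values per channel:
            // bias {2, 10} scales the first and second half of the row.
            double[][] in = {{1, 2, 3, 4, 5, 6}};
            double[] bias = {2, 10};
            System.out.println(java.util.Arrays.deepToString(naiveBiasMultiply(in, bias)));
            // prints [[2.0, 4.0, 6.0, 40.0, 50.0, 60.0]]
        }
    }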

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 7460d6b..a02115d 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -44,8 +44,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction
        
        public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, 
CPOperand out, String opcode, String istr) throws DMLRuntimeException {
                super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), 
opcode, istr);
-               if(!(opcode.equals("bias_add") || 
opcode.equals("relu_backward"))) {
-                       throw new DMLRuntimeException("Incorrect usage. 
Expected the opcode to be bias_add or relu_backward, but found " + opcode);
+               if(!(opcode.equals("bias_add") || 
opcode.equals("bias_multiply") || opcode.equals("relu_backward"))) {
+                       throw new DMLRuntimeException("Incorrect usage. 
Expected the opcode to be bias_add or bias_multiply or relu_backward, but found 
" + opcode);
                }
                _input1 = in1;
                _input2 = in2;
@@ -166,7 +166,7 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                        return new ConvolutionGPUInstruction(in1, null, out, 
opcode, str, stride,
                                        padding, input_shape, filter_shape);
                }
-               else if( opcode.equalsIgnoreCase("bias_add") || 
opcode.equalsIgnoreCase("relu_backward") ) {
+               else if( opcode.equalsIgnoreCase("bias_add") || 
opcode.equalsIgnoreCase("relu_backward") || 
opcode.equalsIgnoreCase("bias_multiply")  ) {
                        InstructionUtils.checkNumFields(parts, 3);
                        CPOperand in1 = new CPOperand(parts[1]);
                        CPOperand in2 = new CPOperand(parts[2]);
@@ -178,14 +178,17 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                }
        }
 
-       public void processBiasInstruction(ExecutionContext ec) throws 
DMLRuntimeException {
+       public void processBiasInstruction(String instOpcode, ExecutionContext 
ec) throws DMLRuntimeException {
                GPUStatistics.incrementNoOfExecutedGPUInst();
                MatrixObject input = getMatrixInputForGPUInstruction(ec, 
_input1.getName());
                MatrixObject bias = getMatrixInputForGPUInstruction(ec, 
_input2.getName());
                
                ec.setMetaData(_output.getName(), input.getNumRows(), 
input.getNumColumns());
                MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
_output.getName());
-               LibMatrixCUDA.biasAdd(getExtendedOpcode(), input, bias, out);
+               if(instOpcode.equalsIgnoreCase("bias_add"))
+                       LibMatrixCUDA.biasAdd(getExtendedOpcode(), input, bias, 
out);
+               else if(instOpcode.equalsIgnoreCase("bias_multiply"))
+                       LibMatrixCUDA.biasMultiply(getExtendedOpcode(), input, 
bias, out);
                // release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());
@@ -210,8 +213,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
        public void processInstruction(ExecutionContext ec) 
                        throws DMLRuntimeException 
        {
-               if (instOpcode.equalsIgnoreCase("bias_add")) {
-                       processBiasInstruction(ec);
+               if (instOpcode.equalsIgnoreCase("bias_add") || 
instOpcode.equalsIgnoreCase("bias_multiply")) {
+                       processBiasInstruction(instOpcode, ec);
                        return;
                }
                else if (instOpcode.equalsIgnoreCase("relu_backward")) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 02f1d55..8074e3a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -433,6 +433,43 @@ public class LibMatrixCUDA {
                if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
 
        }
+       
+       /**
+        * Performs the operation corresponding to the DML script:
+        * ones = matrix(1, rows=1, cols=Hout*Wout)
+        * output = input * matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
+        * This operation is often used together with conv2d and hence we have introduced the bias_multiply(input, bias) built-in function
+        * @param instName the invoking instruction's name for record {@link 
Statistics}.
+        * @param input input image
+        * @param bias bias
+        * @param outputBlock output
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public static void biasMultiply(String instName, MatrixObject input, 
MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
+               if(isInSparseFormat(input)) {
+                       
((JCudaObject)input.getGPUObject()).sparseToDense(instName);
+               }
+               if(isInSparseFormat(bias)) {
+                       
((JCudaObject)bias.getGPUObject()).sparseToDense(instName);
+               }
+               long rows = input.getNumRows();
+               long cols = input.getNumColumns();
+               long K = bias.getNumRows();
+               long PQ = cols / K;
+               if(bias.getNumColumns() != 1 || cols % K != 0) {
+                       throw new DMLRuntimeException("Incorrect inputs for 
bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + 
bias.getNumColumns() + "]");
+               }
+               Pointer imagePointer = 
((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr;
+               Pointer biasPointer = 
((JCudaObject)bias.getGPUObject()).jcudaDenseMatrixPtr;
+               Pointer outputPointer = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
+               long t1 = 0;
+               if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+               kernels.launchKernel("bias_multiply",
+                                               
ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
+                                               imagePointer, biasPointer, 
outputPointer, (int)rows, (int)cols, (int) PQ);
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
+
+       }
 
        /**
         * Performs the operation corresponding to the DML script:
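
For intuition, the intermediate that the DML pattern quoted in the biasMultiply javadoc above would materialize (the bias column replicated across PQ columns via bias %*% ones and reshaped to a single row) can be sketched in Java as below. The fused bias_multiply kernel avoids building this row vector; BiasExpansionSketch and expandBias are hypothetical names, not part of LibMatrixCUDA.

    // Hypothetical helper: builds the replicated-bias row vector
    // (bias %*% ones, flattened) that the fused kernel never materializes.
    public class BiasExpansionSketch {
        static double[] expandBias(double[] bias, int PQ) {
            double[] expanded = new double[bias.length * PQ];   // [1 x K*PQ] row vector
            for (int k = 0; k < bias.length; k++)
                for (int q = 0; q < PQ; q++)
                    expanded[k * PQ + q] = bias[k];              // expanded[k*PQ + q] = bias[k]
            return expanded;
        }
    }
    // Elementwise multiplication of each input row with expandBias(bias, PQ)
    // matches what the bias_multiply kernel launch above computes in one pass.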
