[SYSTEMML-1411] Add CPU and GPU bias multiply operation

This operation is similar to bias_add, except that it performs element-wise multiplication instead of addition. It avoids the otherwise unnecessary multiplication with a matrix of ones and the subsequent reshape. This pattern is common in deep learning layers such as batch normalization.
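For reference, bias_multiply computes ret[n, c*PQ + pq] = input[n, c*PQ + pq] * bias[c], i.e. each channel of the flattened NCHW input is scaled by its bias value. The plain-Java sketch below illustrates these semantics only; it is not the LibMatrixDNN/LibMatrixCUDA implementation, and the class/method names and the rlen x clen layout with clen = K*PQ are assumptions made here for illustration.

// Reference semantics of bias_multiply: ret[i*clen + j] = input[i*clen + j] * bias[j / PQ].
// Illustrative sketch only; the real code paths are ConvolutionCPInstruction/LibMatrixDNN (CPU)
// and ConvolutionGPUInstruction/LibMatrixCUDA (GPU).
public class BiasMultiplySketch {
  static double[] biasMultiply(double[] input, double[] bias, int rlen, int clen, int PQ) {
    double[] ret = new double[rlen * clen];
    for (int i = 0; i < rlen; i++) {        // rows: images in the batch
      for (int j = 0; j < clen; j++) {      // columns: channel-major pixels, clen = K * PQ
        int index = i * clen + j;
        int biasIndex = j / PQ;             // channel index, same mapping as the CUDA kernel below
        ret[index] = input[index] * bias[biasIndex];
      }
    }
    return ret;
  }

  public static void main(String[] args) {
    // 1 image, 2 channels, 3 pixels per channel; the bias scales each channel.
    double[] input = {1, 2, 3, 4, 5, 6};
    double[] bias  = {10, 100};
    System.out.println(java.util.Arrays.toString(biasMultiply(input, bias, 1, 6, 3)));
    // prints [10.0, 20.0, 30.0, 400.0, 500.0, 600.0]
  }
}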
Closes #433. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/d127dfa2 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/d127dfa2 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/d127dfa2 Branch: refs/heads/master Commit: d127dfa2d3e8a8c58b742e1722f797a9f6968955 Parents: 6e7e887 Author: Niketan Pansare <[email protected]> Authored: Fri Mar 17 12:33:49 2017 -0800 Committer: Niketan Pansare <[email protected]> Committed: Fri Mar 17 13:33:49 2017 -0700 ---------------------------------------------------------------------- src/main/cpp/kernels/SystemML.cu | 12 + src/main/cpp/kernels/SystemML.ptx | 1312 +++++++++--------- .../org/apache/sysml/hops/ConvolutionOp.java | 6 +- src/main/java/org/apache/sysml/hops/Hop.java | 3 +- .../apache/sysml/lops/ConvolutionTransform.java | 7 +- .../sysml/parser/BuiltinFunctionExpression.java | 3 + .../org/apache/sysml/parser/DMLTranslator.java | 9 + .../org/apache/sysml/parser/Expression.java | 2 +- .../instructions/CPInstructionParser.java | 1 + .../instructions/GPUInstructionParser.java | 1 + .../cp/ConvolutionCPInstruction.java | 36 +- .../gpu/ConvolutionGPUInstruction.java | 17 +- .../runtime/matrix/data/LibMatrixCUDA.java | 37 + .../sysml/runtime/matrix/data/LibMatrixDNN.java | 95 +- 14 files changed, 895 insertions(+), 646 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/cpp/kernels/SystemML.cu ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu index 7bb2c34..eca2a49 100644 --- a/src/main/cpp/kernels/SystemML.cu +++ b/src/main/cpp/kernels/SystemML.cu @@ -111,6 +111,18 @@ __global__ void bias_add(double* input, double* bias, double* ret, int rlen, in } } +// Performs similar operation as bias_add except elementwise multiplication instead of add +extern "C" +__global__ void bias_multiply(double* input, double* bias, double* ret, int rlen, int clen, int PQ) { + int ix = blockIdx.x * blockDim.x + threadIdx.x; + int iy = blockIdx.y * blockDim.y + threadIdx.y; + if(ix < rlen && iy < clen) { + int index = ix * clen + iy; + int biasIndex = iy / PQ; + ret[index] = input[index] * bias[biasIndex]; + } +} + // Compares the value and set extern "C" __global__ void compare_and_set(double* A, double* ret, int rlen, int clen, double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index efaf29b..3fd5c07 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -1,8 +1,8 @@ // // Generated by NVIDIA NVVM Compiler // -// Compiler Build ID: CL-21124049 -// Cuda compilation tools, release 8.0, V8.0.44 +// Compiler Build ID: CL-21112126 +// Cuda compilation tools, release 8.0, V8.0.43 // Based on LLVM 3.4svn // @@ -227,6 +227,62 @@ BB3_2: ret; } + // .globl bias_multiply +.visible .entry bias_multiply( + .param .u64 bias_multiply_param_0, + .param .u64 bias_multiply_param_1, + .param .u64 bias_multiply_param_2, + .param .u32 bias_multiply_param_3, + .param .u32 bias_multiply_param_4, + .param .u32 
bias_multiply_param_5 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<14>; + .reg .f64 %fd<4>; + .reg .b64 %rd<12>; + + + ld.param.u64 %rd1, [bias_multiply_param_0]; + ld.param.u64 %rd2, [bias_multiply_param_1]; + ld.param.u64 %rd3, [bias_multiply_param_2]; + ld.param.u32 %r5, [bias_multiply_param_3]; + ld.param.u32 %r3, [bias_multiply_param_4]; + ld.param.u32 %r4, [bias_multiply_param_5]; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %ntid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r7, %r6, %r8; + mov.u32 %r9, %ntid.y; + mov.u32 %r10, %ctaid.y; + mov.u32 %r11, %tid.y; + mad.lo.s32 %r2, %r9, %r10, %r11; + setp.lt.s32 %p1, %r1, %r5; + setp.lt.s32 %p2, %r2, %r3; + and.pred %p3, %p1, %p2; + @!%p3 bra BB4_2; + bra.uni BB4_1; + +BB4_1: + cvta.to.global.u64 %rd4, %rd1; + mad.lo.s32 %r12, %r1, %r3, %r2; + mul.wide.s32 %rd5, %r12, 8; + add.s64 %rd6, %rd4, %rd5; + div.s32 %r13, %r2, %r4; + cvta.to.global.u64 %rd7, %rd2; + mul.wide.s32 %rd8, %r13, 8; + add.s64 %rd9, %rd7, %rd8; + ld.global.f64 %fd1, [%rd9]; + ld.global.f64 %fd2, [%rd6]; + mul.f64 %fd3, %fd2, %fd1; + cvta.to.global.u64 %rd10, %rd3; + add.s64 %rd11, %rd10, %rd5; + st.global.f64 [%rd11], %fd3; + +BB4_2: + ret; +} + // .globl compare_and_set .visible .entry compare_and_set( .param .u64 compare_and_set_param_0, @@ -267,10 +323,10 @@ BB3_2: setp.lt.s32 %p1, %r7, %r2; setp.lt.s32 %p2, %r11, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB4_6; - bra.uni BB4_1; + @!%p3 bra BB5_6; + bra.uni BB5_1; -BB4_1: +BB5_1: cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 8; add.s64 %rd6, %rd4, %rd5; @@ -280,26 +336,26 @@ BB4_1: setp.lt.f64 %p4, %fd8, %fd3; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd1, %rd7, %rd5; - @%p4 bra BB4_5; - bra.uni BB4_2; + @%p4 bra BB5_5; + bra.uni BB5_2; -BB4_5: +BB5_5: st.global.f64 [%rd1], %fd4; - bra.uni BB4_6; + bra.uni BB5_6; -BB4_2: +BB5_2: setp.lt.f64 %p5, %fd1, %fd2; - @%p5 bra BB4_4; - bra.uni BB4_3; + @%p5 bra BB5_4; + bra.uni BB5_3; -BB4_4: +BB5_4: st.global.f64 [%rd1], %fd5; - bra.uni BB4_6; + bra.uni BB5_6; -BB4_3: +BB5_3: st.global.f64 [%rd1], %fd6; -BB4_6: +BB5_6: ret; } @@ -340,42 +396,42 @@ BB4_6: setp.lt.s32 %p2, %r1, %r14; setp.lt.s32 %p3, %r2, %r10; and.pred %p4, %p2, %p3; - @!%p4 bra BB5_53; - bra.uni BB5_1; + @!%p4 bra BB6_53; + bra.uni BB6_1; -BB5_1: +BB6_1: mad.lo.s32 %r3, %r1, %r10, %r2; setp.eq.s32 %p5, %r11, 1; mov.u32 %r53, %r1; - @%p5 bra BB5_5; + @%p5 bra BB6_5; setp.ne.s32 %p6, %r11, 2; mov.u32 %r54, %r3; - @%p6 bra BB5_4; + @%p6 bra BB6_4; mov.u32 %r54, %r2; -BB5_4: +BB6_4: mov.u32 %r48, %r54; mov.u32 %r4, %r48; mov.u32 %r53, %r4; -BB5_5: +BB6_5: mov.u32 %r5, %r53; setp.eq.s32 %p7, %r12, 1; mov.u32 %r51, %r1; - @%p7 bra BB5_9; + @%p7 bra BB6_9; setp.ne.s32 %p8, %r12, 2; mov.u32 %r52, %r3; - @%p8 bra BB5_8; + @%p8 bra BB6_8; mov.u32 %r52, %r2; -BB5_8: +BB6_8: mov.u32 %r51, %r52; -BB5_9: +BB6_9: cvta.to.global.u64 %rd5, %rd3; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r5, 8; @@ -386,47 +442,47 @@ BB5_9: ld.global.f64 %fd2, [%rd10]; mov.f64 %fd38, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p9, %r13, 5; - @%p9 bra BB5_19; + @%p9 bra BB6_19; setp.gt.s32 %p19, %r13, 2; - @%p19 bra BB5_15; + @%p19 bra BB6_15; setp.eq.s32 %p23, %r13, 0; - @%p23 bra BB5_51; + @%p23 bra BB6_51; setp.eq.s32 %p24, %r13, 1; - @%p24 bra BB5_50; - bra.uni BB5_13; + @%p24 bra BB6_50; + bra.uni BB6_13; -BB5_50: +BB6_50: sub.f64 %fd38, %fd1, %fd2; - bra.uni BB5_52; + bra.uni BB6_52; -BB5_19: +BB6_19: setp.gt.s32 %p10, %r13, 8; - @%p10 bra BB5_24; + @%p10 bra BB6_24; setp.eq.s32 %p16, %r13, 6; - @%p16 bra BB5_34; + @%p16 bra BB6_34; 
setp.eq.s32 %p17, %r13, 7; - @%p17 bra BB5_33; - bra.uni BB5_22; + @%p17 bra BB6_33; + bra.uni BB6_22; -BB5_33: +BB6_33: setp.gt.f64 %p29, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29; - bra.uni BB5_52; + bra.uni BB6_52; -BB5_15: +BB6_15: setp.eq.s32 %p20, %r13, 3; - @%p20 bra BB5_49; + @%p20 bra BB6_49; setp.eq.s32 %p21, %r13, 4; - @%p21 bra BB5_35; - bra.uni BB5_17; + @%p21 bra BB6_35; + bra.uni BB6_17; -BB5_35: +BB6_35: { .reg .b32 %temp; mov.b64 {%temp, %r8}, %fd1; @@ -462,10 +518,10 @@ BB5_35: }// Callseq End 0 setp.lt.s32 %p33, %r8, 0; and.pred %p1, %p33, %p32; - @!%p1 bra BB5_37; - bra.uni BB5_36; + @!%p1 bra BB6_37; + bra.uni BB6_36; -BB5_36: +BB6_36: { .reg .b32 %temp; mov.b64 {%temp, %r23}, %fd37; @@ -477,111 +533,111 @@ BB5_36: } mov.b64 %fd37, {%r25, %r24}; -BB5_37: +BB6_37: mov.f64 %fd36, %fd37; setp.eq.f64 %p34, %fd1, 0d0000000000000000; - @%p34 bra BB5_40; - bra.uni BB5_38; + @%p34 bra BB6_40; + bra.uni BB6_38; -BB5_40: +BB6_40: selp.b32 %r26, %r8, 0, %p32; or.b32 %r27, %r26, 2146435072; setp.lt.s32 %p38, %r9, 0; selp.b32 %r28, %r27, %r26, %p38; mov.u32 %r29, 0; mov.b64 %fd36, {%r29, %r28}; - bra.uni BB5_41; + bra.uni BB6_41; -BB5_24: +BB6_24: setp.gt.s32 %p11, %r13, 10; - @%p11 bra BB5_28; + @%p11 bra BB6_28; setp.eq.s32 %p14, %r13, 9; - @%p14 bra BB5_32; - bra.uni BB5_26; + @%p14 bra BB6_32; + bra.uni BB6_26; -BB5_32: +BB6_32: setp.eq.f64 %p27, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27; - bra.uni BB5_52; + bra.uni BB6_52; -BB5_28: +BB6_28: setp.eq.s32 %p12, %r13, 11; - @%p12 bra BB5_31; - bra.uni BB5_29; + @%p12 bra BB6_31; + bra.uni BB6_29; -BB5_31: +BB6_31: min.f64 %fd38, %fd1, %fd2; - bra.uni BB5_52; + bra.uni BB6_52; -BB5_51: +BB6_51: add.f64 %fd38, %fd1, %fd2; - bra.uni BB5_52; + bra.uni BB6_52; -BB5_13: +BB6_13: setp.eq.s32 %p25, %r13, 2; - @%p25 bra BB5_14; - bra.uni BB5_52; + @%p25 bra BB6_14; + bra.uni BB6_52; -BB5_14: +BB6_14: mul.f64 %fd38, %fd1, %fd2; - bra.uni BB5_52; + bra.uni BB6_52; -BB5_34: +BB6_34: setp.le.f64 %p30, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30; - bra.uni BB5_52; + bra.uni BB6_52; -BB5_22: +BB6_22: setp.eq.s32 %p18, %r13, 8; - @%p18 bra BB5_23; - bra.uni BB5_52; + @%p18 bra BB6_23; + bra.uni BB6_52; -BB5_23: +BB6_23: setp.ge.f64 %p28, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28; - bra.uni BB5_52; + bra.uni BB6_52; -BB5_49: +BB6_49: div.rn.f64 %fd38, %fd1, %fd2; - bra.uni BB5_52; + bra.uni BB6_52; -BB5_17: +BB6_17: setp.eq.s32 %p22, %r13, 5; - @%p22 bra BB5_18; - bra.uni BB5_52; + @%p22 bra BB6_18; + bra.uni BB6_52; -BB5_18: +BB6_18: setp.lt.f64 %p31, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31; - bra.uni BB5_52; + bra.uni BB6_52; -BB5_26: +BB6_26: setp.eq.s32 %p15, %r13, 10; - @%p15 bra BB5_27; - bra.uni BB5_52; + @%p15 bra BB6_27; + bra.uni BB6_52; -BB5_27: +BB6_27: setp.neu.f64 %p26, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB5_52; + bra.uni BB6_52; -BB5_29: +BB6_29: setp.ne.s32 %p13, %r13, 12; - @%p13 bra BB5_52; + @%p13 bra BB6_52; max.f64 %fd38, %fd1, %fd2; - bra.uni BB5_52; + bra.uni BB6_52; -BB5_38: +BB6_38: setp.gt.s32 %p35, %r8, -1; - @%p35 bra BB5_41; + @%p35 bra BB6_41; cvt.rzi.f64.f64 %fd29, %fd2; setp.neu.f64 %p36, %fd29, %fd2; selp.f64 %fd36, 0dFFF8000000000000, %fd36, %p36; -BB5_41: +BB6_41: mov.f64 %fd17, %fd36; add.f64 %fd18, %fd1, %fd2; { @@ -591,17 +647,17 @@ BB5_41: and.b32 %r31, %r30, 2146435072; setp.ne.s32 %p39, %r31, 2146435072; 
mov.f64 %fd35, %fd17; - @%p39 bra BB5_48; + @%p39 bra BB6_48; setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000; mov.f64 %fd35, %fd18; - @%p40 bra BB5_48; + @%p40 bra BB6_48; abs.f64 %fd30, %fd2; setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000; mov.f64 %fd34, %fd18; mov.f64 %fd35, %fd34; - @%p41 bra BB5_48; + @%p41 bra BB6_48; { .reg .b32 %temp; @@ -611,10 +667,10 @@ BB5_41: setp.eq.s32 %p42, %r33, 2146435072; setp.eq.s32 %p43, %r32, 0; and.pred %p44, %p42, %p43; - @%p44 bra BB5_47; - bra.uni BB5_45; + @%p44 bra BB6_47; + bra.uni BB6_45; -BB5_47: +BB6_47: setp.gt.f64 %p48, %fd11, 0d3FF0000000000000; selp.b32 %r41, 2146435072, 0, %p48; xor.b32 %r42, %r41, 2146435072; @@ -624,9 +680,9 @@ BB5_47: selp.b32 %r44, 1072693248, %r43, %p50; mov.u32 %r45, 0; mov.b64 %fd35, {%r45, %r44}; - bra.uni BB5_48; + bra.uni BB6_48; -BB5_45: +BB6_45: { .reg .b32 %temp; mov.b64 {%r34, %temp}, %fd1; @@ -636,10 +692,10 @@ BB5_45: setp.eq.s32 %p46, %r34, 0; and.pred %p47, %p45, %p46; mov.f64 %fd35, %fd17; - @!%p47 bra BB5_48; - bra.uni BB5_46; + @!%p47 bra BB6_48; + bra.uni BB6_46; -BB5_46: +BB6_46: shr.s32 %r36, %r9, 31; and.b32 %r37, %r36, -2146435072; selp.b32 %r38, -1048576, 2146435072, %p1; @@ -647,20 +703,20 @@ BB5_46: mov.u32 %r40, 0; mov.b64 %fd35, {%r40, %r39}; -BB5_48: +BB6_48: setp.eq.f64 %p51, %fd2, 0d0000000000000000; setp.eq.f64 %p52, %fd1, 0d3FF0000000000000; or.pred %p53, %p52, %p51; selp.f64 %fd38, 0d3FF0000000000000, %fd35, %p53; -BB5_52: +BB6_52: cvta.to.global.u64 %rd12, %rd4; mul.wide.s32 %rd13, %r3, 8; add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd38; bar.sync 0; -BB5_53: +BB6_53: ret; } @@ -691,7 +747,7 @@ BB5_53: mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; setp.ge.s32 %p3, %r1, %r8; - @%p3 bra BB6_90; + @%p3 bra BB7_90; cvta.to.global.u64 %rd6, %rd5; cvta.to.global.u64 %rd7, %rd4; @@ -700,86 +756,86 @@ BB5_53: ld.global.f64 %fd1, [%rd9]; add.s64 %rd1, %rd6, %rd8; setp.eq.s32 %p4, %r7, 0; - @%p4 bra BB6_46; + @%p4 bra BB7_46; mov.f64 %fd66, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p5, %r6, 5; - @%p5 bra BB6_12; + @%p5 bra BB7_12; setp.gt.s32 %p15, %r6, 2; - @%p15 bra BB6_8; + @%p15 bra BB7_8; setp.eq.s32 %p19, %r6, 0; - @%p19 bra BB6_44; + @%p19 bra BB7_44; setp.eq.s32 %p20, %r6, 1; - @%p20 bra BB6_43; - bra.uni BB6_6; + @%p20 bra BB7_43; + bra.uni BB7_6; -BB6_43: +BB7_43: sub.f64 %fd66, %fd52, %fd1; - bra.uni BB6_45; + bra.uni BB7_45; -BB6_46: +BB7_46: mov.f64 %fd74, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p50, %r6, 5; - @%p50 bra BB6_56; + @%p50 bra BB7_56; setp.gt.s32 %p60, %r6, 2; - @%p60 bra BB6_52; + @%p60 bra BB7_52; setp.eq.s32 %p64, %r6, 0; - @%p64 bra BB6_88; + @%p64 bra BB7_88; setp.eq.s32 %p65, %r6, 1; - @%p65 bra BB6_87; - bra.uni BB6_50; + @%p65 bra BB7_87; + bra.uni BB7_50; -BB6_87: +BB7_87: sub.f64 %fd74, %fd1, %fd52; - bra.uni BB6_89; + bra.uni BB7_89; -BB6_12: +BB7_12: setp.gt.s32 %p6, %r6, 8; - @%p6 bra BB6_17; + @%p6 bra BB7_17; setp.eq.s32 %p12, %r6, 6; - @%p12 bra BB6_27; + @%p12 bra BB7_27; setp.eq.s32 %p13, %r6, 7; - @%p13 bra BB6_26; - bra.uni BB6_15; + @%p13 bra BB7_26; + bra.uni BB7_15; -BB6_26: +BB7_26: setp.lt.f64 %p25, %fd1, %fd52; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25; - bra.uni BB6_45; + bra.uni BB7_45; -BB6_56: +BB7_56: setp.gt.s32 %p51, %r6, 8; - @%p51 bra BB6_61; + @%p51 bra BB7_61; setp.eq.s32 %p57, %r6, 6; - @%p57 bra BB6_71; + @%p57 bra BB7_71; setp.eq.s32 %p58, %r6, 7; - @%p58 bra BB6_70; - bra.uni BB6_59; + @%p58 bra BB7_70; + bra.uni BB7_59; -BB6_70: +BB7_70: setp.gt.f64 %p70, %fd1, %fd52; selp.f64 %fd74, 0d3FF0000000000000, 
0d0000000000000000, %p70; - bra.uni BB6_89; + bra.uni BB7_89; -BB6_8: +BB7_8: setp.eq.s32 %p16, %r6, 3; - @%p16 bra BB6_42; + @%p16 bra BB7_42; setp.eq.s32 %p17, %r6, 4; - @%p17 bra BB6_28; - bra.uni BB6_10; + @%p17 bra BB7_28; + bra.uni BB7_10; -BB6_28: +BB7_28: { .reg .b32 %temp; mov.b64 {%temp, %r2}, %fd52; @@ -815,10 +871,10 @@ BB6_28: }// Callseq End 1 setp.lt.s32 %p29, %r2, 0; and.pred %p1, %p29, %p28; - @!%p1 bra BB6_30; - bra.uni BB6_29; + @!%p1 bra BB7_30; + bra.uni BB7_29; -BB6_29: +BB7_29: { .reg .b32 %temp; mov.b64 {%temp, %r14}, %fd65; @@ -830,43 +886,43 @@ BB6_29: } mov.b64 %fd65, {%r16, %r15}; -BB6_30: +BB7_30: mov.f64 %fd64, %fd65; setp.eq.f64 %p30, %fd52, 0d0000000000000000; - @%p30 bra BB6_33; - bra.uni BB6_31; + @%p30 bra BB7_33; + bra.uni BB7_31; -BB6_33: +BB7_33: selp.b32 %r17, %r2, 0, %p28; or.b32 %r18, %r17, 2146435072; setp.lt.s32 %p34, %r3, 0; selp.b32 %r19, %r18, %r17, %p34; mov.u32 %r20, 0; mov.b64 %fd64, {%r20, %r19}; - bra.uni BB6_34; + bra.uni BB7_34; -BB6_17: +BB7_17: setp.gt.s32 %p7, %r6, 10; - @%p7 bra BB6_21; + @%p7 bra BB7_21; setp.eq.s32 %p10, %r6, 9; - @%p10 bra BB6_25; - bra.uni BB6_19; + @%p10 bra BB7_25; + bra.uni BB7_19; -BB6_25: +BB7_25: setp.eq.f64 %p23, %fd1, %fd52; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23; - bra.uni BB6_45; + bra.uni BB7_45; -BB6_52: +BB7_52: setp.eq.s32 %p61, %r6, 3; - @%p61 bra BB6_86; + @%p61 bra BB7_86; setp.eq.s32 %p62, %r6, 4; - @%p62 bra BB6_72; - bra.uni BB6_54; + @%p62 bra BB7_72; + bra.uni BB7_54; -BB6_72: +BB7_72: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; @@ -902,10 +958,10 @@ BB6_72: }// Callseq End 2 setp.lt.s32 %p74, %r4, 0; and.pred %p2, %p74, %p73; - @!%p2 bra BB6_74; - bra.uni BB6_73; + @!%p2 bra BB7_74; + bra.uni BB7_73; -BB6_73: +BB7_73: { .reg .b32 %temp; mov.b64 {%temp, %r39}, %fd73; @@ -917,179 +973,179 @@ BB6_73: } mov.b64 %fd73, {%r41, %r40}; -BB6_74: +BB7_74: mov.f64 %fd72, %fd73; setp.eq.f64 %p75, %fd1, 0d0000000000000000; - @%p75 bra BB6_77; - bra.uni BB6_75; + @%p75 bra BB7_77; + bra.uni BB7_75; -BB6_77: +BB7_77: selp.b32 %r42, %r4, 0, %p73; or.b32 %r43, %r42, 2146435072; setp.lt.s32 %p79, %r5, 0; selp.b32 %r44, %r43, %r42, %p79; mov.u32 %r45, 0; mov.b64 %fd72, {%r45, %r44}; - bra.uni BB6_78; + bra.uni BB7_78; -BB6_61: +BB7_61: setp.gt.s32 %p52, %r6, 10; - @%p52 bra BB6_65; + @%p52 bra BB7_65; setp.eq.s32 %p55, %r6, 9; - @%p55 bra BB6_69; - bra.uni BB6_63; + @%p55 bra BB7_69; + bra.uni BB7_63; -BB6_69: +BB7_69: setp.eq.f64 %p68, %fd1, %fd52; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p68; - bra.uni BB6_89; + bra.uni BB7_89; -BB6_21: +BB7_21: setp.eq.s32 %p8, %r6, 11; - @%p8 bra BB6_24; - bra.uni BB6_22; + @%p8 bra BB7_24; + bra.uni BB7_22; -BB6_24: +BB7_24: min.f64 %fd66, %fd52, %fd1; - bra.uni BB6_45; + bra.uni BB7_45; -BB6_44: +BB7_44: add.f64 %fd66, %fd1, %fd52; - bra.uni BB6_45; + bra.uni BB7_45; -BB6_6: +BB7_6: setp.eq.s32 %p21, %r6, 2; - @%p21 bra BB6_7; - bra.uni BB6_45; + @%p21 bra BB7_7; + bra.uni BB7_45; -BB6_7: +BB7_7: mul.f64 %fd66, %fd1, %fd52; - bra.uni BB6_45; + bra.uni BB7_45; -BB6_27: +BB7_27: setp.ge.f64 %p26, %fd1, %fd52; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB6_45; + bra.uni BB7_45; -BB6_15: +BB7_15: setp.eq.s32 %p14, %r6, 8; - @%p14 bra BB6_16; - bra.uni BB6_45; + @%p14 bra BB7_16; + bra.uni BB7_45; -BB6_16: +BB7_16: setp.le.f64 %p24, %fd1, %fd52; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24; - bra.uni BB6_45; + bra.uni BB7_45; -BB6_42: +BB7_42: div.rn.f64 %fd66, %fd52, %fd1; - bra.uni 
BB6_45; + bra.uni BB7_45; -BB6_10: +BB7_10: setp.eq.s32 %p18, %r6, 5; - @%p18 bra BB6_11; - bra.uni BB6_45; + @%p18 bra BB7_11; + bra.uni BB7_45; -BB6_11: +BB7_11: setp.gt.f64 %p27, %fd1, %fd52; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p27; - bra.uni BB6_45; + bra.uni BB7_45; -BB6_65: +BB7_65: setp.eq.s32 %p53, %r6, 11; - @%p53 bra BB6_68; - bra.uni BB6_66; + @%p53 bra BB7_68; + bra.uni BB7_66; -BB6_68: +BB7_68: min.f64 %fd74, %fd1, %fd52; - bra.uni BB6_89; + bra.uni BB7_89; -BB6_19: +BB7_19: setp.eq.s32 %p11, %r6, 10; - @%p11 bra BB6_20; - bra.uni BB6_45; + @%p11 bra BB7_20; + bra.uni BB7_45; -BB6_20: +BB7_20: setp.neu.f64 %p22, %fd1, %fd52; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22; - bra.uni BB6_45; + bra.uni BB7_45; -BB6_22: +BB7_22: setp.ne.s32 %p9, %r6, 12; - @%p9 bra BB6_45; + @%p9 bra BB7_45; max.f64 %fd66, %fd52, %fd1; - bra.uni BB6_45; + bra.uni BB7_45; -BB6_88: +BB7_88: add.f64 %fd74, %fd1, %fd52; - bra.uni BB6_89; + bra.uni BB7_89; -BB6_50: +BB7_50: setp.eq.s32 %p66, %r6, 2; - @%p66 bra BB6_51; - bra.uni BB6_89; + @%p66 bra BB7_51; + bra.uni BB7_89; -BB6_51: +BB7_51: mul.f64 %fd74, %fd1, %fd52; - bra.uni BB6_89; + bra.uni BB7_89; -BB6_71: +BB7_71: setp.le.f64 %p71, %fd1, %fd52; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p71; - bra.uni BB6_89; + bra.uni BB7_89; -BB6_59: +BB7_59: setp.eq.s32 %p59, %r6, 8; - @%p59 bra BB6_60; - bra.uni BB6_89; + @%p59 bra BB7_60; + bra.uni BB7_89; -BB6_60: +BB7_60: setp.ge.f64 %p69, %fd1, %fd52; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p69; - bra.uni BB6_89; + bra.uni BB7_89; -BB6_86: +BB7_86: div.rn.f64 %fd74, %fd1, %fd52; - bra.uni BB6_89; + bra.uni BB7_89; -BB6_54: +BB7_54: setp.eq.s32 %p63, %r6, 5; - @%p63 bra BB6_55; - bra.uni BB6_89; + @%p63 bra BB7_55; + bra.uni BB7_89; -BB6_55: +BB7_55: setp.lt.f64 %p72, %fd1, %fd52; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p72; - bra.uni BB6_89; + bra.uni BB7_89; -BB6_63: +BB7_63: setp.eq.s32 %p56, %r6, 10; - @%p56 bra BB6_64; - bra.uni BB6_89; + @%p56 bra BB7_64; + bra.uni BB7_89; -BB6_64: +BB7_64: setp.neu.f64 %p67, %fd1, %fd52; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p67; - bra.uni BB6_89; + bra.uni BB7_89; -BB6_66: +BB7_66: setp.ne.s32 %p54, %r6, 12; - @%p54 bra BB6_89; + @%p54 bra BB7_89; max.f64 %fd74, %fd1, %fd52; - bra.uni BB6_89; + bra.uni BB7_89; -BB6_31: +BB7_31: setp.gt.s32 %p31, %r2, -1; - @%p31 bra BB6_34; + @%p31 bra BB7_34; cvt.rzi.f64.f64 %fd54, %fd1; setp.neu.f64 %p32, %fd54, %fd1; selp.f64 %fd64, 0dFFF8000000000000, %fd64, %p32; -BB6_34: +BB7_34: mov.f64 %fd16, %fd64; add.f64 %fd17, %fd1, %fd52; { @@ -1099,17 +1155,17 @@ BB6_34: and.b32 %r22, %r21, 2146435072; setp.ne.s32 %p35, %r22, 2146435072; mov.f64 %fd63, %fd16; - @%p35 bra BB6_41; + @%p35 bra BB7_41; setp.gtu.f64 %p36, %fd10, 0d7FF0000000000000; mov.f64 %fd63, %fd17; - @%p36 bra BB6_41; + @%p36 bra BB7_41; abs.f64 %fd55, %fd1; setp.gtu.f64 %p37, %fd55, 0d7FF0000000000000; mov.f64 %fd62, %fd17; mov.f64 %fd63, %fd62; - @%p37 bra BB6_41; + @%p37 bra BB7_41; { .reg .b32 %temp; @@ -1119,10 +1175,10 @@ BB6_34: setp.eq.s32 %p38, %r24, 2146435072; setp.eq.s32 %p39, %r23, 0; and.pred %p40, %p38, %p39; - @%p40 bra BB6_40; - bra.uni BB6_38; + @%p40 bra BB7_40; + bra.uni BB7_38; -BB6_40: +BB7_40: setp.gt.f64 %p44, %fd10, 0d3FF0000000000000; selp.b32 %r32, 2146435072, 0, %p44; xor.b32 %r33, %r32, 2146435072; @@ -1132,17 +1188,17 @@ BB6_40: selp.b32 %r35, 1072693248, %r34, %p46; mov.u32 %r36, 0; mov.b64 %fd63, {%r36, %r35}; - bra.uni BB6_41; + 
bra.uni BB7_41; -BB6_75: +BB7_75: setp.gt.s32 %p76, %r4, -1; - @%p76 bra BB6_78; + @%p76 bra BB7_78; cvt.rzi.f64.f64 %fd57, %fd52; setp.neu.f64 %p77, %fd57, %fd52; selp.f64 %fd72, 0dFFF8000000000000, %fd72, %p77; -BB6_78: +BB7_78: mov.f64 %fd41, %fd72; add.f64 %fd42, %fd1, %fd52; { @@ -1152,17 +1208,17 @@ BB6_78: and.b32 %r47, %r46, 2146435072; setp.ne.s32 %p80, %r47, 2146435072; mov.f64 %fd71, %fd41; - @%p80 bra BB6_85; + @%p80 bra BB7_85; setp.gtu.f64 %p81, %fd35, 0d7FF0000000000000; mov.f64 %fd71, %fd42; - @%p81 bra BB6_85; + @%p81 bra BB7_85; abs.f64 %fd58, %fd52; setp.gtu.f64 %p82, %fd58, 0d7FF0000000000000; mov.f64 %fd70, %fd42; mov.f64 %fd71, %fd70; - @%p82 bra BB6_85; + @%p82 bra BB7_85; { .reg .b32 %temp; @@ -1172,10 +1228,10 @@ BB6_78: setp.eq.s32 %p83, %r49, 2146435072; setp.eq.s32 %p84, %r48, 0; and.pred %p85, %p83, %p84; - @%p85 bra BB6_84; - bra.uni BB6_82; + @%p85 bra BB7_84; + bra.uni BB7_82; -BB6_84: +BB7_84: setp.gt.f64 %p89, %fd35, 0d3FF0000000000000; selp.b32 %r57, 2146435072, 0, %p89; xor.b32 %r58, %r57, 2146435072; @@ -1185,9 +1241,9 @@ BB6_84: selp.b32 %r60, 1072693248, %r59, %p91; mov.u32 %r61, 0; mov.b64 %fd71, {%r61, %r60}; - bra.uni BB6_85; + bra.uni BB7_85; -BB6_38: +BB7_38: { .reg .b32 %temp; mov.b64 {%r25, %temp}, %fd52; @@ -1197,10 +1253,10 @@ BB6_38: setp.eq.s32 %p42, %r25, 0; and.pred %p43, %p41, %p42; mov.f64 %fd63, %fd16; - @!%p43 bra BB6_41; - bra.uni BB6_39; + @!%p43 bra BB7_41; + bra.uni BB7_39; -BB6_39: +BB7_39: shr.s32 %r27, %r3, 31; and.b32 %r28, %r27, -2146435072; selp.b32 %r29, -1048576, 2146435072, %p1; @@ -1208,17 +1264,17 @@ BB6_39: mov.u32 %r31, 0; mov.b64 %fd63, {%r31, %r30}; -BB6_41: +BB7_41: setp.eq.f64 %p47, %fd1, 0d0000000000000000; setp.eq.f64 %p48, %fd52, 0d3FF0000000000000; or.pred %p49, %p48, %p47; selp.f64 %fd66, 0d3FF0000000000000, %fd63, %p49; -BB6_45: +BB7_45: st.global.f64 [%rd1], %fd66; - bra.uni BB6_90; + bra.uni BB7_90; -BB6_82: +BB7_82: { .reg .b32 %temp; mov.b64 {%r50, %temp}, %fd1; @@ -1228,10 +1284,10 @@ BB6_82: setp.eq.s32 %p87, %r50, 0; and.pred %p88, %p86, %p87; mov.f64 %fd71, %fd41; - @!%p88 bra BB6_85; - bra.uni BB6_83; + @!%p88 bra BB7_85; + bra.uni BB7_83; -BB6_83: +BB7_83: shr.s32 %r52, %r5, 31; and.b32 %r53, %r52, -2146435072; selp.b32 %r54, -1048576, 2146435072, %p2; @@ -1239,16 +1295,16 @@ BB6_83: mov.u32 %r56, 0; mov.b64 %fd71, {%r56, %r55}; -BB6_85: +BB7_85: setp.eq.f64 %p92, %fd52, 0d0000000000000000; setp.eq.f64 %p93, %fd1, 0d3FF0000000000000; or.pred %p94, %p93, %p92; selp.f64 %fd74, 0d3FF0000000000000, %fd71, %p94; -BB6_89: +BB7_89: st.global.f64 [%rd1], %fd74; -BB6_90: +BB7_90: bar.sync 0; ret; } @@ -1274,14 +1330,14 @@ BB6_90: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.s32 %p1, %r1, %r2; - @%p1 bra BB7_2; + @%p1 bra BB8_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 8; add.s64 %rd4, %rd2, %rd3; st.global.f64 [%rd4], %fd1; -BB7_2: +BB8_2: ret; } @@ -1309,9 +1365,9 @@ BB7_2: mov.f64 %fd76, 0d0000000000000000; mov.f64 %fd77, %fd76; setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB8_4; + @%p1 bra BB9_4; -BB8_1: +BB9_1: mov.f64 %fd1, %fd77; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r32, 8; @@ -1320,23 +1376,23 @@ BB8_1: add.f64 %fd78, %fd1, %fd30; add.s32 %r3, %r32, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB8_3; + @%p2 bra BB9_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd31, [%rd9]; add.f64 %fd78, %fd78, %fd31; -BB8_3: +BB9_3: mov.f64 %fd77, %fd78; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r32, %r12, %r13, %r32; setp.lt.u32 %p3, 
%r32, %r5; mov.f64 %fd76, %fd77; - @%p3 bra BB8_1; + @%p3 bra BB9_1; -BB8_4: +BB9_4: mov.f64 %fd74, %fd76; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -1344,130 +1400,130 @@ BB8_4: st.shared.f64 [%rd1], %fd74; bar.sync 0; setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB8_8; + @%p4 bra BB9_8; setp.gt.u32 %p5, %r6, 511; mov.f64 %fd75, %fd74; - @%p5 bra BB8_7; + @%p5 bra BB9_7; ld.shared.f64 %fd32, [%rd1+4096]; add.f64 %fd75, %fd74, %fd32; st.shared.f64 [%rd1], %fd75; -BB8_7: +BB9_7: mov.f64 %fd74, %fd75; bar.sync 0; -BB8_8: +BB9_8: mov.f64 %fd72, %fd74; setp.lt.u32 %p6, %r9, 512; - @%p6 bra BB8_12; + @%p6 bra BB9_12; setp.gt.u32 %p7, %r6, 255; mov.f64 %fd73, %fd72; - @%p7 bra BB8_11; + @%p7 bra BB9_11; ld.shared.f64 %fd33, [%rd1+2048]; add.f64 %fd73, %fd72, %fd33; st.shared.f64 [%rd1], %fd73; -BB8_11: +BB9_11: mov.f64 %fd72, %fd73; bar.sync 0; -BB8_12: +BB9_12: mov.f64 %fd70, %fd72; setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB8_16; + @%p8 bra BB9_16; setp.gt.u32 %p9, %r6, 127; mov.f64 %fd71, %fd70; - @%p9 bra BB8_15; + @%p9 bra BB9_15; ld.shared.f64 %fd34, [%rd1+1024]; add.f64 %fd71, %fd70, %fd34; st.shared.f64 [%rd1], %fd71; -BB8_15: +BB9_15: mov.f64 %fd70, %fd71; bar.sync 0; -BB8_16: +BB9_16: mov.f64 %fd68, %fd70; setp.lt.u32 %p10, %r9, 128; - @%p10 bra BB8_20; + @%p10 bra BB9_20; setp.gt.u32 %p11, %r6, 63; mov.f64 %fd69, %fd68; - @%p11 bra BB8_19; + @%p11 bra BB9_19; ld.shared.f64 %fd35, [%rd1+512]; add.f64 %fd69, %fd68, %fd35; st.shared.f64 [%rd1], %fd69; -BB8_19: +BB9_19: mov.f64 %fd68, %fd69; bar.sync 0; -BB8_20: +BB9_20: mov.f64 %fd67, %fd68; setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB8_33; + @%p12 bra BB9_33; setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB8_23; + @%p13 bra BB9_23; ld.volatile.shared.f64 %fd36, [%rd1+256]; add.f64 %fd67, %fd67, %fd36; st.volatile.shared.f64 [%rd1], %fd67; -BB8_23: +BB9_23: mov.f64 %fd66, %fd67; setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB8_25; + @%p14 bra BB9_25; ld.volatile.shared.f64 %fd37, [%rd1+128]; add.f64 %fd66, %fd66, %fd37; st.volatile.shared.f64 [%rd1], %fd66; -BB8_25: +BB9_25: mov.f64 %fd65, %fd66; setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB8_27; + @%p15 bra BB9_27; ld.volatile.shared.f64 %fd38, [%rd1+64]; add.f64 %fd65, %fd65, %fd38; st.volatile.shared.f64 [%rd1], %fd65; -BB8_27: +BB9_27: mov.f64 %fd64, %fd65; setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB8_29; + @%p16 bra BB9_29; ld.volatile.shared.f64 %fd39, [%rd1+32]; add.f64 %fd64, %fd64, %fd39; st.volatile.shared.f64 [%rd1], %fd64; -BB8_29: +BB9_29: mov.f64 %fd63, %fd64; setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB8_31; + @%p17 bra BB9_31; ld.volatile.shared.f64 %fd40, [%rd1+16]; add.f64 %fd63, %fd63, %fd40; st.volatile.shared.f64 [%rd1], %fd63; -BB8_31: +BB9_31: setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB8_33; + @%p18 bra BB9_33; ld.volatile.shared.f64 %fd41, [%rd1+8]; add.f64 %fd42, %fd63, %fd41; st.volatile.shared.f64 [%rd1], %fd42; -BB8_33: +BB9_33: setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB8_35; + @%p19 bra BB9_35; ld.shared.f64 %fd43, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -1475,7 +1531,7 @@ BB8_33: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd43; -BB8_35: +BB9_35: ret; } @@ -1499,17 +1555,17 @@ BB8_35: ld.param.u32 %r4, [reduce_row_sum_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB9_35; + @%p1 bra BB10_35; mov.u32 %r38, %tid.x; mov.f64 %fd72, 0d0000000000000000; mov.f64 %fd73, %fd72; setp.ge.u32 %p2, %r38, %r4; - @%p2 bra BB9_4; + @%p2 bra BB10_4; cvta.to.global.u64 %rd3, %rd1; -BB9_3: +BB10_3: mad.lo.s32 %r8, %r6, %r4, %r38; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, 
%rd3, %rd4; @@ -1519,9 +1575,9 @@ BB9_3: add.s32 %r38, %r9, %r38; setp.lt.u32 %p3, %r38, %r4; mov.f64 %fd72, %fd73; - @%p3 bra BB9_3; + @%p3 bra BB10_3; -BB9_4: +BB10_4: mov.f64 %fd70, %fd72; mov.u32 %r10, %tid.x; mul.wide.u32 %rd6, %r10, 8; @@ -1531,130 +1587,130 @@ BB9_4: bar.sync 0; mov.u32 %r11, %ntid.x; setp.lt.u32 %p4, %r11, 1024; - @%p4 bra BB9_8; + @%p4 bra BB10_8; setp.gt.u32 %p5, %r10, 511; mov.f64 %fd71, %fd70; - @%p5 bra BB9_7; + @%p5 bra BB10_7; ld.shared.f64 %fd29, [%rd8+4096]; add.f64 %fd71, %fd70, %fd29; st.shared.f64 [%rd8], %fd71; -BB9_7: +BB10_7: mov.f64 %fd70, %fd71; bar.sync 0; -BB9_8: +BB10_8: mov.f64 %fd68, %fd70; setp.lt.u32 %p6, %r11, 512; - @%p6 bra BB9_12; + @%p6 bra BB10_12; setp.gt.u32 %p7, %r10, 255; mov.f64 %fd69, %fd68; - @%p7 bra BB9_11; + @%p7 bra BB10_11; ld.shared.f64 %fd30, [%rd8+2048]; add.f64 %fd69, %fd68, %fd30; st.shared.f64 [%rd8], %fd69; -BB9_11: +BB10_11: mov.f64 %fd68, %fd69; bar.sync 0; -BB9_12: +BB10_12: mov.f64 %fd66, %fd68; setp.lt.u32 %p8, %r11, 256; - @%p8 bra BB9_16; + @%p8 bra BB10_16; setp.gt.u32 %p9, %r10, 127; mov.f64 %fd67, %fd66; - @%p9 bra BB9_15; + @%p9 bra BB10_15; ld.shared.f64 %fd31, [%rd8+1024]; add.f64 %fd67, %fd66, %fd31; st.shared.f64 [%rd8], %fd67; -BB9_15: +BB10_15: mov.f64 %fd66, %fd67; bar.sync 0; -BB9_16: +BB10_16: mov.f64 %fd64, %fd66; setp.lt.u32 %p10, %r11, 128; - @%p10 bra BB9_20; + @%p10 bra BB10_20; setp.gt.u32 %p11, %r10, 63; mov.f64 %fd65, %fd64; - @%p11 bra BB9_19; + @%p11 bra BB10_19; ld.shared.f64 %fd32, [%rd8+512]; add.f64 %fd65, %fd64, %fd32; st.shared.f64 [%rd8], %fd65; -BB9_19: +BB10_19: mov.f64 %fd64, %fd65; bar.sync 0; -BB9_20: +BB10_20: mov.f64 %fd63, %fd64; setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB9_33; + @%p12 bra BB10_33; setp.lt.u32 %p13, %r11, 64; - @%p13 bra BB9_23; + @%p13 bra BB10_23; ld.volatile.shared.f64 %fd33, [%rd8+256]; add.f64 %fd63, %fd63, %fd33; st.volatile.shared.f64 [%rd8], %fd63; -BB9_23: +BB10_23: mov.f64 %fd62, %fd63; setp.lt.u32 %p14, %r11, 32; - @%p14 bra BB9_25; + @%p14 bra BB10_25; ld.volatile.shared.f64 %fd34, [%rd8+128]; add.f64 %fd62, %fd62, %fd34; st.volatile.shared.f64 [%rd8], %fd62; -BB9_25: +BB10_25: mov.f64 %fd61, %fd62; setp.lt.u32 %p15, %r11, 16; - @%p15 bra BB9_27; + @%p15 bra BB10_27; ld.volatile.shared.f64 %fd35, [%rd8+64]; add.f64 %fd61, %fd61, %fd35; st.volatile.shared.f64 [%rd8], %fd61; -BB9_27: +BB10_27: mov.f64 %fd60, %fd61; setp.lt.u32 %p16, %r11, 8; - @%p16 bra BB9_29; + @%p16 bra BB10_29; ld.volatile.shared.f64 %fd36, [%rd8+32]; add.f64 %fd60, %fd60, %fd36; st.volatile.shared.f64 [%rd8], %fd60; -BB9_29: +BB10_29: mov.f64 %fd59, %fd60; setp.lt.u32 %p17, %r11, 4; - @%p17 bra BB9_31; + @%p17 bra BB10_31; ld.volatile.shared.f64 %fd37, [%rd8+16]; add.f64 %fd59, %fd59, %fd37; st.volatile.shared.f64 [%rd8], %fd59; -BB9_31: +BB10_31: setp.lt.u32 %p18, %r11, 2; - @%p18 bra BB9_33; + @%p18 bra BB10_33; ld.volatile.shared.f64 %fd38, [%rd8+8]; add.f64 %fd39, %fd59, %fd38; st.volatile.shared.f64 [%rd8], %fd39; -BB9_33: +BB10_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB9_35; + @%p19 bra BB10_35; ld.shared.f64 %fd40, [sdata]; cvta.to.global.u64 %rd39, %rd2; @@ -1662,7 +1718,7 @@ BB9_33: add.s64 %rd41, %rd39, %rd40; st.global.f64 [%rd41], %fd40; -BB9_35: +BB10_35: ret; } @@ -1689,18 +1745,18 @@ BB9_35: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB10_5; + @%p1 bra BB11_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f64 %fd8, 0d0000000000000000; mov.f64 %fd9, %fd8; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra 
BB10_4; + @%p2 bra BB11_4; mov.u32 %r10, %r1; -BB10_3: +BB11_3: mov.u32 %r3, %r10; mul.wide.u32 %rd4, %r3, 8; add.s64 %rd5, %rd1, %rd4; @@ -1710,15 +1766,15 @@ BB10_3: setp.lt.u32 %p3, %r4, %r2; mov.u32 %r10, %r4; mov.f64 %fd8, %fd9; - @%p3 bra BB10_3; + @%p3 bra BB11_3; -BB10_4: +BB11_4: cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r1, 8; add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd8; -BB10_5: +BB11_5: ret; } @@ -1746,9 +1802,9 @@ BB10_5: mov.f64 %fd76, 0dFFEFFFFFFFFFFFFF; mov.f64 %fd77, %fd76; setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB11_4; + @%p1 bra BB12_4; -BB11_1: +BB12_1: mov.f64 %fd1, %fd77; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r32, 8; @@ -1757,23 +1813,23 @@ BB11_1: max.f64 %fd78, %fd1, %fd30; add.s32 %r3, %r32, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB11_3; + @%p2 bra BB12_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd31, [%rd9]; max.f64 %fd78, %fd78, %fd31; -BB11_3: +BB12_3: mov.f64 %fd77, %fd78; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r32, %r12, %r13, %r32; setp.lt.u32 %p3, %r32, %r5; mov.f64 %fd76, %fd77; - @%p3 bra BB11_1; + @%p3 bra BB12_1; -BB11_4: +BB12_4: mov.f64 %fd74, %fd76; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -1781,130 +1837,130 @@ BB11_4: st.shared.f64 [%rd1], %fd74; bar.sync 0; setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB11_8; + @%p4 bra BB12_8; setp.gt.u32 %p5, %r6, 511; mov.f64 %fd75, %fd74; - @%p5 bra BB11_7; + @%p5 bra BB12_7; ld.shared.f64 %fd32, [%rd1+4096]; max.f64 %fd75, %fd74, %fd32; st.shared.f64 [%rd1], %fd75; -BB11_7: +BB12_7: mov.f64 %fd74, %fd75; bar.sync 0; -BB11_8: +BB12_8: mov.f64 %fd72, %fd74; setp.lt.u32 %p6, %r9, 512; - @%p6 bra BB11_12; + @%p6 bra BB12_12; setp.gt.u32 %p7, %r6, 255; mov.f64 %fd73, %fd72; - @%p7 bra BB11_11; + @%p7 bra BB12_11; ld.shared.f64 %fd33, [%rd1+2048]; max.f64 %fd73, %fd72, %fd33; st.shared.f64 [%rd1], %fd73; -BB11_11: +BB12_11: mov.f64 %fd72, %fd73; bar.sync 0; -BB11_12: +BB12_12: mov.f64 %fd70, %fd72; setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB11_16; + @%p8 bra BB12_16; setp.gt.u32 %p9, %r6, 127; mov.f64 %fd71, %fd70; - @%p9 bra BB11_15; + @%p9 bra BB12_15; ld.shared.f64 %fd34, [%rd1+1024]; max.f64 %fd71, %fd70, %fd34; st.shared.f64 [%rd1], %fd71; -BB11_15: +BB12_15: mov.f64 %fd70, %fd71; bar.sync 0; -BB11_16: +BB12_16: mov.f64 %fd68, %fd70; setp.lt.u32 %p10, %r9, 128; - @%p10 bra BB11_20; + @%p10 bra BB12_20; setp.gt.u32 %p11, %r6, 63; mov.f64 %fd69, %fd68; - @%p11 bra BB11_19; + @%p11 bra BB12_19; ld.shared.f64 %fd35, [%rd1+512]; max.f64 %fd69, %fd68, %fd35; st.shared.f64 [%rd1], %fd69; -BB11_19: +BB12_19: mov.f64 %fd68, %fd69; bar.sync 0; -BB11_20: +BB12_20: mov.f64 %fd67, %fd68; setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB11_33; + @%p12 bra BB12_33; setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB11_23; + @%p13 bra BB12_23; ld.volatile.shared.f64 %fd36, [%rd1+256]; max.f64 %fd67, %fd67, %fd36; st.volatile.shared.f64 [%rd1], %fd67; -BB11_23: +BB12_23: mov.f64 %fd66, %fd67; setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB11_25; + @%p14 bra BB12_25; ld.volatile.shared.f64 %fd37, [%rd1+128]; max.f64 %fd66, %fd66, %fd37; st.volatile.shared.f64 [%rd1], %fd66; -BB11_25: +BB12_25: mov.f64 %fd65, %fd66; setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB11_27; + @%p15 bra BB12_27; ld.volatile.shared.f64 %fd38, [%rd1+64]; max.f64 %fd65, %fd65, %fd38; st.volatile.shared.f64 [%rd1], %fd65; -BB11_27: +BB12_27: mov.f64 %fd64, %fd65; setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB11_29; + @%p16 bra BB12_29; ld.volatile.shared.f64 %fd39, [%rd1+32]; max.f64 %fd64, %fd64, %fd39; 
st.volatile.shared.f64 [%rd1], %fd64; -BB11_29: +BB12_29: mov.f64 %fd63, %fd64; setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB11_31; + @%p17 bra BB12_31; ld.volatile.shared.f64 %fd40, [%rd1+16]; max.f64 %fd63, %fd63, %fd40; st.volatile.shared.f64 [%rd1], %fd63; -BB11_31: +BB12_31: setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB11_33; + @%p18 bra BB12_33; ld.volatile.shared.f64 %fd41, [%rd1+8]; max.f64 %fd42, %fd63, %fd41; st.volatile.shared.f64 [%rd1], %fd42; -BB11_33: +BB12_33: setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB11_35; + @%p19 bra BB12_35; ld.shared.f64 %fd43, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -1912,7 +1968,7 @@ BB11_33: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd43; -BB11_35: +BB12_35: ret; } @@ -1936,17 +1992,17 @@ BB11_35: ld.param.u32 %r4, [reduce_row_max_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB12_35; + @%p1 bra BB13_35; mov.u32 %r38, %tid.x; mov.f64 %fd72, 0dFFEFFFFFFFFFFFFF; mov.f64 %fd73, %fd72; setp.ge.u32 %p2, %r38, %r4; - @%p2 bra BB12_4; + @%p2 bra BB13_4; cvta.to.global.u64 %rd3, %rd1; -BB12_3: +BB13_3: mad.lo.s32 %r8, %r6, %r4, %r38; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -1956,9 +2012,9 @@ BB12_3: add.s32 %r38, %r9, %r38; setp.lt.u32 %p3, %r38, %r4; mov.f64 %fd72, %fd73; - @%p3 bra BB12_3; + @%p3 bra BB13_3; -BB12_4: +BB13_4: mov.f64 %fd70, %fd72; mov.u32 %r10, %tid.x; mul.wide.u32 %rd6, %r10, 8; @@ -1968,130 +2024,130 @@ BB12_4: bar.sync 0; mov.u32 %r11, %ntid.x; setp.lt.u32 %p4, %r11, 1024; - @%p4 bra BB12_8; + @%p4 bra BB13_8; setp.gt.u32 %p5, %r10, 511; mov.f64 %fd71, %fd70; - @%p5 bra BB12_7; + @%p5 bra BB13_7; ld.shared.f64 %fd29, [%rd8+4096]; max.f64 %fd71, %fd70, %fd29; st.shared.f64 [%rd8], %fd71; -BB12_7: +BB13_7: mov.f64 %fd70, %fd71; bar.sync 0; -BB12_8: +BB13_8: mov.f64 %fd68, %fd70; setp.lt.u32 %p6, %r11, 512; - @%p6 bra BB12_12; + @%p6 bra BB13_12; setp.gt.u32 %p7, %r10, 255; mov.f64 %fd69, %fd68; - @%p7 bra BB12_11; + @%p7 bra BB13_11; ld.shared.f64 %fd30, [%rd8+2048]; max.f64 %fd69, %fd68, %fd30; st.shared.f64 [%rd8], %fd69; -BB12_11: +BB13_11: mov.f64 %fd68, %fd69; bar.sync 0; -BB12_12: +BB13_12: mov.f64 %fd66, %fd68; setp.lt.u32 %p8, %r11, 256; - @%p8 bra BB12_16; + @%p8 bra BB13_16; setp.gt.u32 %p9, %r10, 127; mov.f64 %fd67, %fd66; - @%p9 bra BB12_15; + @%p9 bra BB13_15; ld.shared.f64 %fd31, [%rd8+1024]; max.f64 %fd67, %fd66, %fd31; st.shared.f64 [%rd8], %fd67; -BB12_15: +BB13_15: mov.f64 %fd66, %fd67; bar.sync 0; -BB12_16: +BB13_16: mov.f64 %fd64, %fd66; setp.lt.u32 %p10, %r11, 128; - @%p10 bra BB12_20; + @%p10 bra BB13_20; setp.gt.u32 %p11, %r10, 63; mov.f64 %fd65, %fd64; - @%p11 bra BB12_19; + @%p11 bra BB13_19; ld.shared.f64 %fd32, [%rd8+512]; max.f64 %fd65, %fd64, %fd32; st.shared.f64 [%rd8], %fd65; -BB12_19: +BB13_19: mov.f64 %fd64, %fd65; bar.sync 0; -BB12_20: +BB13_20: mov.f64 %fd63, %fd64; setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB12_33; + @%p12 bra BB13_33; setp.lt.u32 %p13, %r11, 64; - @%p13 bra BB12_23; + @%p13 bra BB13_23; ld.volatile.shared.f64 %fd33, [%rd8+256]; max.f64 %fd63, %fd63, %fd33; st.volatile.shared.f64 [%rd8], %fd63; -BB12_23: +BB13_23: mov.f64 %fd62, %fd63; setp.lt.u32 %p14, %r11, 32; - @%p14 bra BB12_25; + @%p14 bra BB13_25; ld.volatile.shared.f64 %fd34, [%rd8+128]; max.f64 %fd62, %fd62, %fd34; st.volatile.shared.f64 [%rd8], %fd62; -BB12_25: +BB13_25: mov.f64 %fd61, %fd62; setp.lt.u32 %p15, %r11, 16; - @%p15 bra BB12_27; + @%p15 bra BB13_27; ld.volatile.shared.f64 %fd35, [%rd8+64]; max.f64 %fd61, %fd61, %fd35; st.volatile.shared.f64 [%rd8], %fd61; -BB12_27: 
+BB13_27: mov.f64 %fd60, %fd61; setp.lt.u32 %p16, %r11, 8; - @%p16 bra BB12_29; + @%p16 bra BB13_29; ld.volatile.shared.f64 %fd36, [%rd8+32]; max.f64 %fd60, %fd60, %fd36; st.volatile.shared.f64 [%rd8], %fd60; -BB12_29: +BB13_29: mov.f64 %fd59, %fd60; setp.lt.u32 %p17, %r11, 4; - @%p17 bra BB12_31; + @%p17 bra BB13_31; ld.volatile.shared.f64 %fd37, [%rd8+16]; max.f64 %fd59, %fd59, %fd37; st.volatile.shared.f64 [%rd8], %fd59; -BB12_31: +BB13_31: setp.lt.u32 %p18, %r11, 2; - @%p18 bra BB12_33; + @%p18 bra BB13_33; ld.volatile.shared.f64 %fd38, [%rd8+8]; max.f64 %fd39, %fd59, %fd38; st.volatile.shared.f64 [%rd8], %fd39; -BB12_33: +BB13_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB12_35; + @%p19 bra BB13_35; ld.shared.f64 %fd40, [sdata]; cvta.to.global.u64 %rd39, %rd2; @@ -2099,7 +2155,7 @@ BB12_33: add.s64 %rd41, %rd39, %rd40; st.global.f64 [%rd41], %fd40; -BB12_35: +BB13_35: ret; } @@ -2126,18 +2182,18 @@ BB12_35: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB13_5; + @%p1 bra BB14_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f64 %fd8, 0dFFEFFFFFFFFFFFFF; mov.f64 %fd9, %fd8; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB13_4; + @%p2 bra BB14_4; mov.u32 %r10, %r1; -BB13_3: +BB14_3: mov.u32 %r3, %r10; mul.wide.u32 %rd4, %r3, 8; add.s64 %rd5, %rd1, %rd4; @@ -2147,15 +2203,15 @@ BB13_3: setp.lt.u32 %p3, %r4, %r2; mov.u32 %r10, %r4; mov.f64 %fd8, %fd9; - @%p3 bra BB13_3; + @%p3 bra BB14_3; -BB13_4: +BB14_4: cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r1, 8; add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd8; -BB13_5: +BB14_5: ret; } @@ -2183,9 +2239,9 @@ BB13_5: mov.f64 %fd76, 0d7FEFFFFFFFFFFFFF; mov.f64 %fd77, %fd76; setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB14_4; + @%p1 bra BB15_4; -BB14_1: +BB15_1: mov.f64 %fd1, %fd77; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r32, 8; @@ -2194,23 +2250,23 @@ BB14_1: min.f64 %fd78, %fd1, %fd30; add.s32 %r3, %r32, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB14_3; + @%p2 bra BB15_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd31, [%rd9]; min.f64 %fd78, %fd78, %fd31; -BB14_3: +BB15_3: mov.f64 %fd77, %fd78; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r32, %r12, %r13, %r32; setp.lt.u32 %p3, %r32, %r5; mov.f64 %fd76, %fd77; - @%p3 bra BB14_1; + @%p3 bra BB15_1; -BB14_4: +BB15_4: mov.f64 %fd74, %fd76; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -2218,130 +2274,130 @@ BB14_4: st.shared.f64 [%rd1], %fd74; bar.sync 0; setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB14_8; + @%p4 bra BB15_8; setp.gt.u32 %p5, %r6, 511; mov.f64 %fd75, %fd74; - @%p5 bra BB14_7; + @%p5 bra BB15_7; ld.shared.f64 %fd32, [%rd1+4096]; min.f64 %fd75, %fd74, %fd32; st.shared.f64 [%rd1], %fd75; -BB14_7: +BB15_7: mov.f64 %fd74, %fd75; bar.sync 0; -BB14_8: +BB15_8: mov.f64 %fd72, %fd74; setp.lt.u32 %p6, %r9, 512; - @%p6 bra BB14_12; + @%p6 bra BB15_12; setp.gt.u32 %p7, %r6, 255; mov.f64 %fd73, %fd72; - @%p7 bra BB14_11; + @%p7 bra BB15_11; ld.shared.f64 %fd33, [%rd1+2048]; min.f64 %fd73, %fd72, %fd33; st.shared.f64 [%rd1], %fd73; -BB14_11: +BB15_11: mov.f64 %fd72, %fd73; bar.sync 0; -BB14_12: +BB15_12: mov.f64 %fd70, %fd72; setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB14_16; + @%p8 bra BB15_16; setp.gt.u32 %p9, %r6, 127; mov.f64 %fd71, %fd70; - @%p9 bra BB14_15; + @%p9 bra BB15_15; ld.shared.f64 %fd34, [%rd1+1024]; min.f64 %fd71, %fd70, %fd34; st.shared.f64 [%rd1], %fd71; -BB14_15: +BB15_15: mov.f64 %fd70, %fd71; bar.sync 0; -BB14_16: +BB15_16: mov.f64 %fd68, %fd70; setp.lt.u32 %p10, %r9, 
128; - @%p10 bra BB14_20; + @%p10 bra BB15_20; setp.gt.u32 %p11, %r6, 63; mov.f64 %fd69, %fd68; - @%p11 bra BB14_19; + @%p11 bra BB15_19; ld.shared.f64 %fd35, [%rd1+512]; min.f64 %fd69, %fd68, %fd35; st.shared.f64 [%rd1], %fd69; -BB14_19: +BB15_19: mov.f64 %fd68, %fd69; bar.sync 0; -BB14_20: +BB15_20: mov.f64 %fd67, %fd68; setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB14_33; + @%p12 bra BB15_33; setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB14_23; + @%p13 bra BB15_23; ld.volatile.shared.f64 %fd36, [%rd1+256]; min.f64 %fd67, %fd67, %fd36; st.volatile.shared.f64 [%rd1], %fd67; -BB14_23: +BB15_23: mov.f64 %fd66, %fd67; setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB14_25; + @%p14 bra BB15_25; ld.volatile.shared.f64 %fd37, [%rd1+128]; min.f64 %fd66, %fd66, %fd37; st.volatile.shared.f64 [%rd1], %fd66; -BB14_25: +BB15_25: mov.f64 %fd65, %fd66; setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB14_27; + @%p15 bra BB15_27; ld.volatile.shared.f64 %fd38, [%rd1+64]; min.f64 %fd65, %fd65, %fd38; st.volatile.shared.f64 [%rd1], %fd65; -BB14_27: +BB15_27: mov.f64 %fd64, %fd65; setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB14_29; + @%p16 bra BB15_29; ld.volatile.shared.f64 %fd39, [%rd1+32]; min.f64 %fd64, %fd64, %fd39; st.volatile.shared.f64 [%rd1], %fd64; -BB14_29: +BB15_29: mov.f64 %fd63, %fd64; setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB14_31; + @%p17 bra BB15_31; ld.volatile.shared.f64 %fd40, [%rd1+16]; min.f64 %fd63, %fd63, %fd40; st.volatile.shared.f64 [%rd1], %fd63; -BB14_31: +BB15_31: setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB14_33; + @%p18 bra BB15_33; ld.volatile.shared.f64 %fd41, [%rd1+8]; min.f64 %fd42, %fd63, %fd41; st.volatile.shared.f64 [%rd1], %fd42; -BB14_33: +BB15_33: setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB14_35; + @%p19 bra BB15_35; ld.shared.f64 %fd43, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -2349,7 +2405,7 @@ BB14_33: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd43; -BB14_35: +BB15_35: ret; } @@ -2373,17 +2429,17 @@ BB14_35: ld.param.u32 %r4, [reduce_row_min_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB15_35; + @%p1 bra BB16_35; mov.u32 %r38, %tid.x; mov.f64 %fd72, 0d7FEFFFFFFFFFFFFF; mov.f64 %fd73, %fd72; setp.ge.u32 %p2, %r38, %r4; - @%p2 bra BB15_4; + @%p2 bra BB16_4; cvta.to.global.u64 %rd3, %rd1; -BB15_3: +BB16_3: mad.lo.s32 %r8, %r6, %r4, %r38; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -2393,9 +2449,9 @@ BB15_3: add.s32 %r38, %r9, %r38; setp.lt.u32 %p3, %r38, %r4; mov.f64 %fd72, %fd73; - @%p3 bra BB15_3; + @%p3 bra BB16_3; -BB15_4: +BB16_4: mov.f64 %fd70, %fd72; mov.u32 %r10, %tid.x; mul.wide.u32 %rd6, %r10, 8; @@ -2405,130 +2461,130 @@ BB15_4: bar.sync 0; mov.u32 %r11, %ntid.x; setp.lt.u32 %p4, %r11, 1024; - @%p4 bra BB15_8; + @%p4 bra BB16_8; setp.gt.u32 %p5, %r10, 511; mov.f64 %fd71, %fd70; - @%p5 bra BB15_7; + @%p5 bra BB16_7; ld.shared.f64 %fd29, [%rd8+4096]; min.f64 %fd71, %fd70, %fd29; st.shared.f64 [%rd8], %fd71; -BB15_7: +BB16_7: mov.f64 %fd70, %fd71; bar.sync 0; -BB15_8: +BB16_8: mov.f64 %fd68, %fd70; setp.lt.u32 %p6, %r11, 512; - @%p6 bra BB15_12; + @%p6 bra BB16_12; setp.gt.u32 %p7, %r10, 255; mov.f64 %fd69, %fd68; - @%p7 bra BB15_11; + @%p7 bra BB16_11; ld.shared.f64 %fd30, [%rd8+2048]; min.f64 %fd69, %fd68, %fd30; st.shared.f64 [%rd8], %fd69; -BB15_11: +BB16_11: mov.f64 %fd68, %fd69; bar.sync 0; -BB15_12: +BB16_12: mov.f64 %fd66, %fd68; setp.lt.u32 %p8, %r11, 256; - @%p8 bra BB15_16; + @%p8 bra BB16_16; setp.gt.u32 %p9, %r10, 127; mov.f64 %fd67, %fd66; - @%p9 bra BB15_15; + @%p9 bra BB16_15; ld.shared.f64 %fd31, [%rd8+1024]; min.f64 %fd67, 
%fd66, %fd31; st.shared.f64 [%rd8], %fd67; -BB15_15: +BB16_15: mov.f64 %fd66, %fd67; bar.sync 0; -BB15_16: +BB16_16: mov.f64 %fd64, %fd66; setp.lt.u32 %p10, %r11, 128; - @%p10 bra BB15_20; + @%p10 bra BB16_20; setp.gt.u32 %p11, %r10, 63; mov.f64 %fd65, %fd64; - @%p11 bra BB15_19; + @%p11 bra BB16_19; ld.shared.f64 %fd32, [%rd8+512]; min.f64 %fd65, %fd64, %fd32; st.shared.f64 [%rd8], %fd65; -BB15_19: +BB16_19: mov.f64 %fd64, %fd65; bar.sync 0; -BB15_20: +BB16_20: mov.f64 %fd63, %fd64; setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB15_33; + @%p12 bra BB16_33; setp.lt.u32 %p13, %r11, 64; - @%p13 bra BB15_23; + @%p13 bra BB16_23; ld.volatile.shared.f64 %fd33, [%rd8+256]; min.f64 %fd63, %fd63, %fd33; st.volatile.shared.f64 [%rd8], %fd63; -BB15_23: +BB16_23: mov.f64 %fd62, %fd63; setp.lt.u32 %p14, %r11, 32; - @%p14 bra BB15_25; + @%p14 bra BB16_25; ld.volatile.shared.f64 %fd34, [%rd8+128]; min.f64 %fd62, %fd62, %fd34; st.volatile.shared.f64 [%rd8], %fd62; -BB15_25: +BB16_25: mov.f64 %fd61, %fd62; setp.lt.u32 %p15, %r11, 16; - @%p15 bra BB15_27; + @%p15 bra BB16_27; ld.volatile.shared.f64 %fd35, [%rd8+64]; min.f64 %fd61, %fd61, %fd35; st.volatile.shared.f64 [%rd8], %fd61; -BB15_27: +BB16_27: mov.f64 %fd60, %fd61; setp.lt.u32 %p16, %r11, 8; - @%p16 bra BB15_29; + @%p16 bra BB16_29; ld.volatile.shared.f64 %fd36, [%rd8+32]; min.f64 %fd60, %fd60, %fd36; st.volatile.shared.f64 [%rd8], %fd60; -BB15_29: +BB16_29: mov.f64 %fd59, %fd60; setp.lt.u32 %p17, %r11, 4; - @%p17 bra BB15_31; + @%p17 bra BB16_31; ld.volatile.shared.f64 %fd37, [%rd8+16]; min.f64 %fd59, %fd59, %fd37; st.volatile.shared.f64 [%rd8], %fd59; -BB15_31: +BB16_31: setp.lt.u32 %p18, %r11, 2; - @%p18 bra BB15_33; + @%p18 bra BB16_33; ld.volatile.shared.f64 %fd38, [%rd8+8]; min.f64 %fd39, %fd59, %fd38; st.volatile.shared.f64 [%rd8], %fd39; -BB15_33: +BB16_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB15_35; + @%p19 bra BB16_35; ld.shared.f64 %fd40, [sdata]; cvta.to.global.u64 %rd39, %rd2; @@ -2536,7 +2592,7 @@ BB15_33: add.s64 %rd41, %rd39, %rd40; st.global.f64 [%rd41], %fd40; -BB15_35: +BB16_35: ret; } @@ -2563,18 +2619,18 @@ BB15_35: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB16_5; + @%p1 bra BB17_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f64 %fd8, 0d7FEFFFFFFFFFFFFF; mov.f64 %fd9, %fd8; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB16_4; + @%p2 bra BB17_4; mov.u32 %r10, %r1; -BB16_3: +BB17_3: mov.u32 %r3, %r10; mul.wide.u32 %rd4, %r3, 8; add.s64 %rd5, %rd1, %rd4; @@ -2584,15 +2640,15 @@ BB16_3: setp.lt.u32 %p3, %r4, %r2; mov.u32 %r10, %r4; mov.f64 %fd8, %fd9; - @%p3 bra BB16_3; + @%p3 bra BB17_3; -BB16_4: +BB17_4: cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r1, 8; add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd8; -BB16_5: +BB17_5: ret; } @@ -2620,9 +2676,9 @@ BB16_5: mov.f64 %fd76, 0d3FF0000000000000; mov.f64 %fd77, %fd76; setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB17_4; + @%p1 bra BB18_4; -BB17_1: +BB18_1: mov.f64 %fd1, %fd77; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r32, 8; @@ -2631,23 +2687,23 @@ BB17_1: mul.f64 %fd78, %fd1, %fd30; add.s32 %r3, %r32, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB17_3; + @%p2 bra BB18_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd31, [%rd9]; mul.f64 %fd78, %fd78, %fd31; -BB17_3: +BB18_3: mov.f64 %fd77, %fd78; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r32, %r12, %r13, %r32; setp.lt.u32 %p3, %r32, %r5; mov.f64 %fd76, %fd77; - @%p3 bra BB17_1; + @%p3 bra BB18_1; -BB17_4: +BB18_4: mov.f64 
%fd74, %fd76; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -2655,130 +2711,130 @@ BB17_4: st.shared.f64 [%rd1], %fd74; bar.sync 0; setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB17_8; + @%p4 bra BB18_8; setp.gt.u32 %p5, %r6, 511; mov.f64 %fd75, %fd74; - @%p5 bra BB17_7; + @%p5 bra BB18_7; ld.shared.f64 %fd32, [%rd1+4096]; mul.f64 %fd75, %fd74, %fd32; st.shared.f64 [%rd1], %fd75; -BB17_7: +BB18_7: mov.f64 %fd74, %fd75; bar.sync 0; -BB17_8: +BB18_8: mov.f64 %fd72, %fd74; setp.lt.u32 %p6, %r9, 512; - @%p6 bra BB17_12; + @%p6 bra BB18_12; setp.gt.u32 %p7, %r6, 255; mov.f64 %fd73, %fd72; - @%p7 bra BB17_11; + @%p7 bra BB18_11; ld.shared.f64 %fd33, [%rd1+2048]; mul.f64 %fd73, %fd72, %fd33; st.shared.f64 [%rd1], %fd73; -BB17_11: +BB18_11: mov.f64 %fd72, %fd73; bar.sync 0; -BB17_12: +BB18_12: mov.f64 %fd70, %fd72; setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB17_16; + @%p8 bra BB18_16; setp.gt.u32 %p9, %r6, 127; mov.f64 %fd71, %fd70; - @%p9 bra BB17_15; + @%p9 bra BB18_15; ld.shared.f64 %fd34, [%rd1+1024]; mul.f64 %fd71, %fd70, %fd34; st.shared.f64 [%rd1], %fd71; -BB17_15: +BB18_15: mov.f64 %fd70, %fd71; bar.sync 0; -BB17_16: +BB18_16: mov.f64 %fd68, %fd70; setp.lt.u32 %p10, %r9, 128; - @%p10 bra BB17_20; + @%p10 bra BB18_20; setp.gt.u32 %p11, %r6, 63; mov.f64 %fd69, %fd68; - @%p11 bra BB17_19; + @%p11 bra BB18_19; ld.shared.f64 %fd35, [%rd1+512]; mul.f64 %fd69, %fd68, %fd35; st.shared.f64 [%rd1], %fd69; -BB17_19: +BB18_19: mov.f64 %fd68, %fd69; bar.sync 0; -BB17_20: +BB18_20: mov.f64 %fd67, %fd68; setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB17_33; + @%p12 bra BB18_33; setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB17_23; + @%p13 bra BB18_23; ld.volatile.shared.f64 %fd36, [%rd1+256]; mul.f64 %fd67, %fd67, %fd36; st.volatile.shared.f64 [%rd1], %fd67; -BB17_23: +BB18_23: mov.f64 %fd66, %fd67; setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB17_25; + @%p14 bra BB18_25; ld.volatile.shared.f64 %fd37, [%rd1+128]; mul.f64 %fd66, %fd66, %fd37; st.volatile.shared.f64 [%rd1], %fd66; -BB17_25: +BB18_25: mov.f64 %fd65, %fd66; setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB17_27; + @%p15 bra BB18_27; ld.volatile.shared.f64 %fd38, [%rd1+64]; mul.f64 %fd65, %fd65, %fd38; st.volatile.shared.f64 [%rd1], %fd65; -BB17_27: +BB18_27: mov.f64 %fd64, %fd65; setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB17_29; + @%p16 bra BB18_29; ld.volatile.shared.f64 %fd39, [%rd1+32]; mul.f64 %fd64, %fd64, %fd39; st.volatile.shared.f64 [%rd1], %fd64; -BB17_29: +BB18_29: mov.f64 %fd63, %fd64; setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB17_31; + @%p17 bra BB18_31; ld.volatile.shared.f64 %fd40, [%rd1+16]; mul.f64 %fd63, %fd63, %fd40; st.volatile.shared.f64 [%rd1], %fd63; -BB17_31: +BB18_31: setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB17_33; + @%p18 bra BB18_33; ld.volatile.shared.f64 %fd41, [%rd1+8]; mul.f64 %fd42, %fd63, %fd41; st.volatile.shared.f64 [%rd1], %fd42; -BB17_33: +BB18_33: setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB17_35; + @%p19 bra BB18_35; ld.shared.f64 %fd43, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -2786,7 +2842,7 @@ BB17_33: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd43; -BB17_35: +BB18_35: ret; } @@ -2801,7 +2857,7 @@ BB17_35: .reg .pred %p<20>; .reg .b32 %r<39>; .reg .f64 %fd<76>; - .reg .b64 %rd<43>; + .reg .b64 %rd<42>; ld.param.u64 %rd1, [reduce_row_mean_param_0]; @@ -2810,17 +2866,17 @@ BB17_35: ld.param.u32 %r4, [reduce_row_mean_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB18_35; + @%p1 bra BB19_35; mov.u32 %r38, %tid.x; mov.f64 %fd74, 0d0000000000000000; mov.f64 %fd75, %fd74; setp.ge.u32 %p2, %r38, %r4; - @%p2 
bra BB18_4; + @%p2 bra BB19_4; cvta.to.global.u64 %rd3, %rd1; -BB18_3: +BB19_3: mad.lo.s32 %r8, %r6, %r4, %r38; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -2830,9 +2886,9 @@ BB18_3: add.s32 %r38, %r9, %r38; setp.lt.u32 %p3, %r38, %r4; mov.f64 %fd74, %fd75; - @%p3 bra BB18_3; + @%p3 bra BB19_3; -BB18_4: +BB19_4: mov.f64 %fd72, %fd74; mov.u32 %r10, %tid.x; mul.wide.u32 %rd6, %r10, 8; @@ -2842,141 +2898,140 @@ BB18_4: bar.sync 0; mov.u32 %r11, %ntid.x; setp.lt.u32 %p4, %r11, 1024; - @%p4 bra BB18_8; + @%p4 bra BB19_8; setp.gt.u32 %p5, %r10, 511; mov.f64 %fd73, %fd72; - @%p5 bra BB18_7; + @%p5 bra BB19_7; ld.shared.f64 %fd29, [%rd8+4096]; add.f64 %fd73, %fd72, %fd29; st.shared.f64 [%rd8], %fd73; -BB18_7: +BB19_7: mov.f64 %fd72, %fd73; bar.sync 0; -BB18_8: +BB19_8: mov.f64 %fd70, %fd72; setp.lt.u32 %p6, %r11, 512; - @%p6 bra BB18_12; + @%p6 bra BB19_12; setp.gt.u32 %p7, %r10, 255; mov.f64 %fd71, %fd70; - @%p7 bra BB18_11; + @%p7 bra BB19_11; ld.shared.f64 %fd30, [%rd8+2048]; add.f64 %fd71, %fd70, %fd30; st.shared.f64 [%rd8], %fd71; -BB18_11: +BB19_11: mov.f64 %fd70, %fd71; bar.sync 0; -BB18_12: +BB19_12: mov.f64 %fd68, %fd70; setp.lt.u32 %p8, %r11, 256; - @%p8 bra BB18_16; + @%p8 bra BB19_16; setp.gt.u32 %p9, %r10, 127; mov.f64 %fd69, %fd68; - @%p9 bra BB18_15; + @%p9 bra BB19_15; ld.shared.f64 %fd31, [%rd8+1024]; add.f64 %fd69, %fd68, %fd31; st.shared.f64 [%rd8], %fd69; -BB18_15: +BB19_15: mov.f64 %fd68, %fd69; bar.sync 0; -BB18_16: +BB19_16: mov.f64 %fd66, %fd68; setp.lt.u32 %p10, %r11, 128; - @%p10 bra BB18_20; + @%p10 bra BB19_20; setp.gt.u32 %p11, %r10, 63; mov.f64 %fd67, %fd66; - @%p11 bra BB18_19; + @%p11 bra BB19_19; ld.shared.f64 %fd32, [%rd8+512]; add.f64 %fd67, %fd66, %fd32; st.shared.f64 [%rd8], %fd67; -BB18_19: +BB19_19: mov.f64 %fd66, %fd67; bar.sync 0; -BB18_20: +BB19_20: mov.f64 %fd65, %fd66; setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB18_33; + @%p12 bra BB19_33; setp.lt.u32 %p13, %r11, 64; - @%p13 bra BB18_23; + @%p13 bra BB19_23; ld.volatile.shared.f64 %fd33, [%rd8+256]; add.f64 %fd65, %fd65, %fd33; st.volatile.shared.f64 [%rd8], %fd65; -BB18_23: +BB19_23: mov.f64 %fd64, %fd65; setp.lt.u32 %p14, %r11, 32; - @%p14 bra BB18_25; + @%p14 bra BB19_25; ld.volatile.shared.f64 %fd34, [%rd8+128]; add.f64 %fd64, %fd64, %fd34; st.volatile.shared.f64 [%rd8], %fd64; -BB18_25: +BB19_25: mov.f64 %fd63, %fd64; setp.lt.u32 %p15, %r11, 16; - @%p15 bra BB18_27; + @%p15 bra BB19_27; ld.volatile.shared.f64 %fd35, [%rd8+64]; add.f64 %fd63, %fd63, %fd35; st.volatile.shared.f64 [%rd8], %fd63; -BB18_27: +BB19_27: mov.f64 %fd62, %fd63; setp.lt.u32 %p16, %r11, 8; - @%p16 bra BB18_29; + @%p16 bra BB19_29; ld.volatile.shared.f64 %fd36, [%rd8+32]; add.f64 %fd62, %fd62, %fd36; st.volatile.shared.f64 [%rd8], %fd62; -BB18_29: +BB19_29: mov.f64 %fd61, %fd62; setp.lt.u32 %p17, %r11, 4; - @%p17 bra BB18_31; + @%p17 bra BB19_31; ld.volatile.shared.f64 %fd37, [%rd8+16]; add.f64 %fd61, %fd61, %fd37; st.volatile.shared.f64 [%rd8], %fd61; -BB18_31: +BB19_31: setp.lt.u32 %p18, %r11, 2; - @%p18 bra BB18_33; + @%p18 bra BB19_33; ld.volatile.shared.f64 %fd38, [%rd8+8]; add.f64 %fd39, %fd61, %fd38; st.volatile.shared.f64 [%rd8], %fd39; -BB18_33: +BB19_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB18_35; + @%p19 bra BB19_35; ld.shared.f64 %fd40, [sdata]; - cvt.u64.u32 %rd39, %r4; - cvt.rn.f64.s64 %fd41, %rd39; + cvt.rn.f64.s32 %fd41, %r4; div.rn.f64 %fd42, %fd40, %fd41; - cvta.to.global.u64 %rd40, %rd2; - mul.wide.u32 %rd41, %r6, 8; - add.s64 %rd42, %rd40, %rd41; - st.global.f64 [%rd42], %fd42; + 
cvta.to.global.u64 %rd39, %rd2; + mul.wide.u32 %rd40, %r6, 8; + add.s64 %rd41, %rd39, %rd40; + st.global.f64 [%rd41], %fd42; -BB18_35: +BB19_35: ret; } @@ -2991,7 +3046,7 @@ BB18_35: .reg .pred %p<4>; .reg .b32 %r<11>; .reg .f64 %fd<12>; - .reg .b64 %rd<10>; + .reg .b64 %rd<9>; ld.param.u64 %rd2, [reduce_col_mean_param_0]; @@ -3003,18 +3058,18 @@ BB18_35: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB19_5; + @%p1 bra BB20_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f64 %fd10, 0d0000000000000000; mov.f64 %fd11, %fd10; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB19_4; + @%p2 bra BB20_4; mov.u32 %r10, %r1; -BB19_3: +BB20_3: mov.u32 %r3, %r10; mul.wide.u32 %rd4, %r3, 8; add.s64 %rd5, %rd1, %rd4; @@ -3024,18 +3079,17 @@ BB19_3: setp.lt.u32 %p3, %r4, %r2; mov.u32 %r10, %r4; mov.f64 %fd10, %fd11; - @%p3 bra BB19_3; + @%p3 bra BB20_3; -BB19_4: +BB20_4: cvta.to.global.u64 %rd6, %rd3; - cvt.u64.u32 %rd7, %r5; - cvt.rn.f64.s64 %fd7, %rd7; + cvt.rn.f64.s32 %fd7, %r5; div.rn.f64 %fd8, %fd10, %fd7; - mul.wide.u32 %rd8, %r1, 8; - add.s64 %rd9, %rd6, %rd8; - st.global.f64 [%rd9], %fd8; + mul.wide.u32 %rd7, %r1, 8; + add.s64 %rd8, %rd6, %rd7; + st.global.f64 [%rd8], %fd8; -BB19_5: +BB20_5: ret; } @@ -3061,7 +3115,7 @@ BB19_5: mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r7, %r6, %r8; setp.ge.u32 %p1, %r1, %r5; - @%p1 bra BB20_5; + @%p1 bra BB21_5; cvta.to.global.u64 %rd4, %rd2; cvt.s64.s32 %rd1, %r1; @@ -3121,13 +3175,13 @@ BB19_5: mov.b32 %f2, %r11; abs.f32 %f1, %f2; setp.lt.f32 %p2, %f1, 0f4086232B; - @%p2 bra BB20_4; + @%p2 bra BB21_4; setp.lt.f64 %p3, %fd1, 0d0000000000000000; add.f64 %fd37, %fd1, 0d7FF0000000000000; selp.f64 %fd40, 0d0000000000000000, %fd37, %p3; setp.geu.f32 %p4, %f1, 0f40874800; - @%p4 bra BB20_4; + @%p4 bra BB21_4; shr.u32 %r12, %r2, 31; add.s32 %r13, %r2, %r12; @@ -3142,13 +3196,13 @@ BB19_5: mov.b64 %fd39, {%r20, %r19}; mul.f64 %fd40, %fd38, %fd39; -BB20_4: +BB21_4: cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 3; add.s64 %rd9, %rd7, %rd8; st.global.f64 [%rd9], %fd40; -BB20_5: +BB21_5: ret; } @@ -3175,7 +3229,7 @@ BB20_5: } shr.u32 %r50, %r49, 20; setp.ne.s32 %p1, %r50, 0; - @%p1 bra BB21_2; + @%p1 bra BB22_2; mul.f64 %fd14, %fd12, 0d4350000000000000; { @@ -3189,13 +3243,13 @@ BB20_5: shr.u32 %r16, %r49, 20; add.s32 %r50, %r16, -54; -BB21_2: +BB22_2: add.s32 %r51, %r50, -1023; and.b32 %r17, %r49, -2146435073; or.b32 %r18, %r17, 1072693248; mov.b64 %fd132, {%r48, %r18}; setp.lt.u32 %p2, %r18, 1073127583; - @%p2 bra BB21_4; + @%p2 bra BB22_4; { .reg .b32 %temp; @@ -3209,7 +3263,7 @@ BB21_2: mov.b64 %fd132, {%r19, %r21}; add.s32 %r51, %r50, -1022; -BB21_4: +BB22_4: add.f64 %fd16, %fd132, 0d3FF0000000000000; // inline asm rcp.approx.ftz.f64 %fd15,%fd16; @@ -3374,13 +3428,13 @@ BB21_4: mov.b32 %f2, %r35; abs.f32 %f1, %f2; setp.lt.f32 %p4, %f1, 0f4086232B; - @%p4 bra BB21_7; + @%p4 bra BB22_7; setp.lt.f64 %p5, %fd4, 0d0000000000000000; add.f64 %fd129, %fd4, 0d7FF0000000000000; selp.f64 %fd133, 0d0000000000000000, %fd129, %p5; setp.geu.f32 %p6, %f1, 0f40874800; - @%p6 bra BB21_7; + @%p6 bra BB22_7; shr.u32 %r36, %r13, 31; add.s32 %r37, %r13, %r36; @@ -3395,7 +3449,7 @@ BB21_4: mov.b64 %fd131, {%r44, %r43}; mul.f64 %fd133, %fd130, %fd131; -BB21_7: +BB22_7: { .reg .b32 %temp; mov.b64 {%temp, %r45}, %fd133; @@ -3408,13 +3462,13 @@ BB21_7: } setp.ne.s32 %p8, %r47, 0; or.pred %p9, %p8, %p7; - @!%p9 bra BB21_9; - bra.uni BB21_8; + @!%p9 bra BB22_9; + bra.uni BB22_8; -BB21_8: +BB22_8: fma.rn.f64 %fd133, %fd133, %fd5, %fd133; -BB21_9: 
+BB22_9: st.param.f64 [func_retval0+0], %fd133; ret; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/hops/ConvolutionOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java index 7a28bb1..567c793 100644 --- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java +++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java @@ -93,6 +93,7 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop case DIRECT_CONV2D_BACKWARD_DATA: case DIRECT_CONV2D_BACKWARD_FILTER: case BIAS_ADD: + case BIAS_MULTIPLY: { if(et == ExecType.CP || et == ExecType.GPU || et == ExecType.SPARK) { setLops(constructConvolutionLops(et, inputs)); @@ -125,6 +126,7 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop case DIRECT_CONV2D_BACKWARD_DATA: return 14; case BIAS_ADD: + case BIAS_MULTIPLY: return 2; default: return 13; @@ -247,7 +249,7 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop // [numRows, numCols, NNZ] long[] ret = new long[3]; - if(op == ConvOp.BIAS_ADD) { + if(op == ConvOp.BIAS_ADD || op == ConvOp.BIAS_MULTIPLY) { MatrixCharacteristics[] mc = memo.getAllInputStats(getInput()); ret[0] = mc[0].rowsKnown() ? mc[0].getRows() : -1; ret[1] = mc[0].colsKnown() ? mc[0].getCols() : -1; @@ -394,7 +396,7 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop @Override public void refreshSizeInformation() { - if(op == ConvOp.BIAS_ADD) { + if(op == ConvOp.BIAS_ADD || op == ConvOp.BIAS_MULTIPLY) { Hop input1 = getInput().get(0); setDim1(input1.getDim1()); setDim2(input1.getDim2()); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/hops/Hop.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java index 75b24de..86741ba 100644 --- a/src/main/java/org/apache/sysml/hops/Hop.java +++ b/src/main/java/org/apache/sysml/hops/Hop.java @@ -1024,7 +1024,7 @@ public abstract class Hop public enum ConvOp { MAX_POOLING, MAX_POOLING_BACKWARD, DIRECT_CONV2D, DIRECT_CONV2D_BACKWARD_FILTER, DIRECT_CONV2D_BACKWARD_DATA, - BIAS_ADD + BIAS_ADD, BIAS_MULTIPLY }; public enum DataGenMethod { @@ -1091,6 +1091,7 @@ public abstract class Hop HopsConv2Lops.put(ConvOp.MAX_POOLING_BACKWARD, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.MAX_POOLING_BACKWARD); HopsConv2Lops.put(ConvOp.DIRECT_CONV2D, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.DIRECT_CONV2D); HopsConv2Lops.put(ConvOp.BIAS_ADD, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.BIAS_ADD); + HopsConv2Lops.put(ConvOp.BIAS_MULTIPLY, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.BIAS_MULTIPLY); HopsConv2Lops.put(ConvOp.DIRECT_CONV2D_BACKWARD_FILTER, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.DIRECT_CONV2D_BACKWARD_FILTER); HopsConv2Lops.put(ConvOp.DIRECT_CONV2D_BACKWARD_DATA, org.apache.sysml.lops.ConvolutionTransform.OperationTypes.DIRECT_CONV2D_BACKWARD_DATA); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java 
b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java index cea2c93..6d8885a 100644 --- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java +++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java @@ -32,7 +32,7 @@ public class ConvolutionTransform extends Lop public enum OperationTypes { MAX_POOLING, MAX_POOLING_BACKWARD, RELU_MAX_POOLING, RELU_BACKWARD, DIRECT_CONV2D, DIRECT_CONV2D_BACKWARD_FILTER, DIRECT_CONV2D_BACKWARD_DATA, - BIAS_ADD, DIRECT_CONV2D_BIAS_ADD + BIAS_ADD, DIRECT_CONV2D_BIAS_ADD, BIAS_MULTIPLY }; private OperationTypes operation = null; @@ -126,6 +126,9 @@ public class ConvolutionTransform extends Lop case BIAS_ADD: return "bias_add"; + + case BIAS_MULTIPLY: + return "bias_multiply"; case DIRECT_CONV2D_BACKWARD_FILTER: return "conv2d_backward_filter"; @@ -140,7 +143,7 @@ public class ConvolutionTransform extends Lop } public String getInstructions(String input, String bias, String output) throws LopsException { - if(operation == OperationTypes.BIAS_ADD || operation == OperationTypes.RELU_BACKWARD) { + if(operation == OperationTypes.BIAS_ADD || operation == OperationTypes.BIAS_MULTIPLY || operation == OperationTypes.RELU_BACKWARD) { StringBuilder sb = new StringBuilder(); sb.append( getExecType() ); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java b/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java index 19565ee..2cb869e 100644 --- a/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java +++ b/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java @@ -1105,6 +1105,7 @@ public class BuiltinFunctionExpression extends DataIdentifier break; case BIAS_ADD: + case BIAS_MULTIPLY: { Expression input = _args[0]; Expression bias = _args[1]; @@ -1615,6 +1616,8 @@ public class BuiltinFunctionExpression extends DataIdentifier bifop = Expression.BuiltinFunctionOp.CONV2D; else if (functionName.equals("bias_add")) bifop = Expression.BuiltinFunctionOp.BIAS_ADD; + else if (functionName.equals("bias_multiply")) + bifop = Expression.BuiltinFunctionOp.BIAS_MULTIPLY; else if (functionName.equals("conv2d_backward_filter")) bifop = Expression.BuiltinFunctionOp.CONV2D_BACKWARD_FILTER; else if (functionName.equals("conv2d_backward_data")) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/parser/DMLTranslator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/parser/DMLTranslator.java b/src/main/java/org/apache/sysml/parser/DMLTranslator.java index b99dd37..aa09fe5 100644 --- a/src/main/java/org/apache/sysml/parser/DMLTranslator.java +++ b/src/main/java/org/apache/sysml/parser/DMLTranslator.java @@ -2720,6 +2720,15 @@ public class DMLTranslator setBlockSizeAndRefreshSizeInfo(expr, currBuiltinOp); break; } + case BIAS_MULTIPLY: + { + ArrayList<Hop> inHops1 = new ArrayList<Hop>(); + inHops1.add(expr); + inHops1.add(expr2); + currBuiltinOp = new ConvolutionOp(target.getName(), target.getDataType(), target.getValueType(), Hop.ConvOp.BIAS_MULTIPLY, inHops1); + setBlockSizeAndRefreshSizeInfo(expr, currBuiltinOp); + break; + } case AVG_POOL: case MAX_POOL: { 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/parser/Expression.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/parser/Expression.java b/src/main/java/org/apache/sysml/parser/Expression.java index 9c3b248..1bb7b35 100644 --- a/src/main/java/org/apache/sysml/parser/Expression.java +++ b/src/main/java/org/apache/sysml/parser/Expression.java @@ -90,7 +90,7 @@ public abstract class Expression CUMSUM, DIAG, EIGEN, - CONV2D, CONV2D_BACKWARD_FILTER, CONV2D_BACKWARD_DATA, BIAS_ADD, + CONV2D, CONV2D_BACKWARD_FILTER, CONV2D_BACKWARD_DATA, BIAS_ADD, BIAS_MULTIPLY, MAX_POOL, AVG_POOL, MAX_POOL_BACKWARD, EXP, FLOOR, http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java index f0603b4..bb5e01a 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java @@ -232,6 +232,7 @@ public class CPInstructionParser extends InstructionParser String2CPInstructionType.put( "conv2d_backward_filter" , CPINSTRUCTION_TYPE.Convolution); String2CPInstructionType.put( "conv2d_backward_data" , CPINSTRUCTION_TYPE.Convolution); String2CPInstructionType.put( "bias_add" , CPINSTRUCTION_TYPE.Convolution); + String2CPInstructionType.put( "bias_multiply" , CPINSTRUCTION_TYPE.Convolution); // Quaternary instruction opcodes String2CPInstructionType.put( "wsloss" , CPINSTRUCTION_TYPE.Quaternary); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java index 366015f..4051d6a 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java @@ -48,6 +48,7 @@ public class GPUInstructionParser extends InstructionParser String2GPUInstructionType.put( "maxpooling", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "maxpooling_backward", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "bias_add", GPUINSTRUCTION_TYPE.Convolution); + String2GPUInstructionType.put( "bias_multiply", GPUINSTRUCTION_TYPE.Convolution); // Matrix Multiply Operators String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java index 3513201..f839990 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java +++ 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java @@ -44,8 +44,8 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr, int numThreads) throws DMLRuntimeException { super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr); - if( !(opcode.equals("bias_add") || opcode.equals("relu_backward")) ) { - throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add or relu_backward, but found " + opcode); + if( !(opcode.equals("bias_add") || opcode.equals("relu_backward") || opcode.equals("bias_multiply") ) ) { + throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found " + opcode); } _in2 = in2; _cptype = CPINSTRUCTION_TYPE.Convolution; @@ -195,7 +195,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction return new ConvolutionCPInstruction(in, in2, in3, out, opcode, str, stride, padding, input_shape, filter_shape, k); } - else if (opcode.equalsIgnoreCase("bias_add") || opcode.equals("relu_backward")) { + else if (opcode.equalsIgnoreCase("bias_add") || opcode.equals("relu_backward") || opcode.equalsIgnoreCase("bias_multiply") ) { InstructionUtils.checkNumFields(parts, 4); CPOperand in = new CPOperand(parts[1]); CPOperand in2 = new CPOperand(parts[2]); @@ -262,6 +262,32 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction ec.setMatrixOutput(getOutputVariableName(), outputBlock); } + public void processBiasMultiplyInstruction(ExecutionContext ec) throws DMLRuntimeException { + MatrixBlock input = ec.getMatrixInput(input1.getName()); + MatrixBlock bias = ec.getMatrixInput(_in2.getName()); + MatrixBlock outputBlock = null; + + if(bias.getNumColumns() != 1) { + throw new DMLRuntimeException("Expected the number of columns of bias matrix to be 1, but found " + bias.getNumColumns()); + } + + if(bias.isEmptyBlock()) { + // Anything multiplied by zero is zero + outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(), true); + } + else { + // As we always fill the output first with bias + outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(), false); + outputBlock.allocateDenseBlock(); + LibMatrixDNN.biasMultiply(input, bias, outputBlock, _numThreads); + } + + // release inputs/outputs + ec.releaseMatrixInput(input1.getName()); + ec.releaseMatrixInput(_in2.getName()); + ec.setMatrixOutput(getOutputVariableName(), outputBlock); + } + @Override public void processInstruction(ExecutionContext ec) @@ -270,6 +296,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction processBiasAddInstruction(ec); return; } + else if (instOpcode.equalsIgnoreCase("bias_multiply")) { + processBiasMultiplyInstruction(ec); + return; + } else if (instOpcode.equalsIgnoreCase("relu_backward")) { processReluBackwardInstruction(ec); return; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java index 7460d6b..a02115d 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java +++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java @@ -44,8 +44,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr) throws DMLRuntimeException { super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr); - if(!(opcode.equals("bias_add") || opcode.equals("relu_backward"))) { - throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add or relu_backward, but found " + opcode); + if(!(opcode.equals("bias_add") || opcode.equals("bias_multiply") || opcode.equals("relu_backward"))) { + throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found " + opcode); } _input1 = in1; _input2 = in2; @@ -166,7 +166,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction return new ConvolutionGPUInstruction(in1, null, out, opcode, str, stride, padding, input_shape, filter_shape); } - else if( opcode.equalsIgnoreCase("bias_add") || opcode.equalsIgnoreCase("relu_backward") ) { + else if( opcode.equalsIgnoreCase("bias_add") || opcode.equalsIgnoreCase("relu_backward") || opcode.equalsIgnoreCase("bias_multiply") ) { InstructionUtils.checkNumFields(parts, 3); CPOperand in1 = new CPOperand(parts[1]); CPOperand in2 = new CPOperand(parts[2]); @@ -178,14 +178,17 @@ public class ConvolutionGPUInstruction extends GPUInstruction } } - public void processBiasInstruction(ExecutionContext ec) throws DMLRuntimeException { + public void processBiasInstruction(String instOpcode, ExecutionContext ec) throws DMLRuntimeException { GPUStatistics.incrementNoOfExecutedGPUInst(); MatrixObject input = getMatrixInputForGPUInstruction(ec, _input1.getName()); MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input2.getName()); ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns()); MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName()); - LibMatrixCUDA.biasAdd(getExtendedOpcode(), input, bias, out); + if(instOpcode.equalsIgnoreCase("bias_add")) + LibMatrixCUDA.biasAdd(getExtendedOpcode(), input, bias, out); + else if(instOpcode.equalsIgnoreCase("bias_multiply")) + LibMatrixCUDA.biasMultiply(getExtendedOpcode(), input, bias, out); // release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); @@ -210,8 +213,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction public void processInstruction(ExecutionContext ec) throws DMLRuntimeException { - if (instOpcode.equalsIgnoreCase("bias_add")) { - processBiasInstruction(ec); + if (instOpcode.equalsIgnoreCase("bias_add") || instOpcode.equalsIgnoreCase("bias_multiply")) { + processBiasInstruction(instOpcode, ec); return; } else if (instOpcode.equalsIgnoreCase("relu_backward")) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d127dfa2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index 02f1d55..8074e3a 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -433,6 +433,43 @@ public class LibMatrixCUDA { if 
(GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); } + + /** + * Performs the operation corresponding to the DML script: + * ones = matrix(1, rows=1, cols=Hout*Wout) + * output = input * matrix(bias %*% ones, rows=1, cols=F*Hout*Wout) + * This pattern commonly appears in layers such as batch normalization, hence the bias_multiply(input, bias) built-in function + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param input input image + * @param bias bias + * @param outputBlock output + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void biasMultiply(String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException { + if(isInSparseFormat(input)) { + ((JCudaObject)input.getGPUObject()).sparseToDense(instName); + } + if(isInSparseFormat(bias)) { + ((JCudaObject)bias.getGPUObject()).sparseToDense(instName); + } + long rows = input.getNumRows(); + long cols = input.getNumColumns(); + long K = bias.getNumRows(); + long PQ = cols / K; + if(bias.getNumColumns() != 1 || cols % K != 0) { + throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]"); + } + Pointer imagePointer = ((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr; + Pointer biasPointer = ((JCudaObject)bias.getGPUObject()).jcudaDenseMatrixPtr; + Pointer outputPointer = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; + long t1 = 0; + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + kernels.launchKernel("bias_multiply", + ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols), + imagePointer, biasPointer, outputPointer, (int)rows, (int)cols, (int) PQ); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1); + + } /** * Performs the operation corresponding to the DML script:

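Editor's note on the semantics: the hunks above wire bias_multiply through the parser, HOPs/LOPs, CP and GPU instruction parsers, and the new CUDA kernel. As a reading aid only -- none of the following is part of commit d127dfa2, and the class and method names are illustrative -- here is a minimal, self-contained Java sketch of what both back ends compute for a dense row-major input of shape [N x (K*Hout*Wout)] and a [K x 1] bias: each bias value scales one contiguous block of PQ = Hout*Wout columns per row, which is exactly the biasIndex = iy / PQ indexing used in the bias_multiply CUDA kernel.

// BiasMultiplySketch.java -- illustrative reference only, not code from this commit.
// Semantics: ret[i, j] = input[i, j] * bias[j / PQ] for a dense, row-major
// input of shape [rlen x clen] with clen = K * PQ and a bias of length K.
public class BiasMultiplySketch {

    public static double[] biasMultiply(double[] input, double[] bias,
                                        int rlen, int clen, int PQ) {
        if (clen % PQ != 0 || clen / PQ != bias.length) {
            throw new IllegalArgumentException("clen must equal K*PQ with K = bias.length");
        }
        double[] ret = new double[rlen * clen];
        for (int i = 0; i < rlen; i++) {
            for (int j = 0; j < clen; j++) {
                int index = i * clen + j;   // same flat index as the CUDA kernel
                int biasIndex = j / PQ;     // channel that column j belongs to
                ret[index] = input[index] * bias[biasIndex];
            }
        }
        return ret;
    }

    public static void main(String[] args) {
        // One row (N=1) with K=2 channels of PQ=3 values each.
        double[] input = {1, 2, 3, 4, 5, 6};
        double[] bias  = {10, 100};         // one scale factor per channel
        double[] out   = biasMultiply(input, bias, 1, 6, 3);
        // Expected: [10.0, 20.0, 30.0, 400.0, 500.0, 600.0]
        System.out.println(java.util.Arrays.toString(out));
    }
}

In DML, out = bias_multiply(input, bias) computes the same result as the explicit expansion out = input * matrix(bias %*% matrix(1, rows=1, cols=Hout*Wout), rows=1, cols=F*Hout*Wout) shown in the LibMatrixCUDA Javadoc above, without materializing the ones matrix or the reshape.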