Repository: incubator-systemml Updated Branches: refs/heads/master afe61b5a2 -> 0ff4f14b6
[SYSTEMML-540] Bugfix for GPU bias_add and minor cleanup Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/0ff4f14b Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/0ff4f14b Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/0ff4f14b Branch: refs/heads/master Commit: 0ff4f14b6bf64e462b7911b50c9c286b93d6690e Parents: afe61b5 Author: Niketan Pansare <[email protected]> Authored: Tue Jan 10 16:00:32 2017 -0800 Committer: Niketan Pansare <[email protected]> Committed: Tue Jan 10 16:00:32 2017 -0800 ---------------------------------------------------------------------- src/main/cpp/kernels/SystemML.cu | 18 +- src/main/cpp/kernels/SystemML.ptx | 920 ++++++++++--------- .../cp/ConvolutionCPInstruction.java | 10 +- .../gpu/ConvolutionGPUInstruction.java | 10 +- .../runtime/matrix/data/LibMatrixCUDA.java | 122 ++- .../sysml/runtime/matrix/data/LibMatrixDNN.java | 113 ++- 6 files changed, 685 insertions(+), 508 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/0ff4f14b/src/main/cpp/kernels/SystemML.cu ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu index 7e32f0e..5964707 100644 --- a/src/main/cpp/kernels/SystemML.cu +++ b/src/main/cpp/kernels/SystemML.cu @@ -116,8 +116,9 @@ __global__ void relu(double* A, double* ret, int rlen, int clen) { } } +// This method computes the backpropagation errors for previous layer of relu operation extern "C" -__global__ void relu_backward(double* X, double* dout, double* ret, int rlen, int clen) { +__global__ void reluBackward(double* X, double* dout, double* ret, int rlen, int clen) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; if(ix < rlen && iy < clen) { @@ -126,6 +127,21 @@ __global__ void relu_backward(double* X, double* dout, double* ret, int rlen, i } } +// Performs the operation corresponding to the DML script: +// ones = matrix(1, rows=1, cols=Hout*Wout) +// output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout) +// This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function +extern "C" +__global__ void biasAdd(double* input, double* bias, double* ret, int rlen, int clen, int PQ) { + int ix = blockIdx.x * blockDim.x + threadIdx.x; + int iy = blockIdx.y * blockDim.y + threadIdx.y; + if(ix < rlen && iy < clen) { + int index = ix * clen + iy; + int biasIndex = iy / PQ; + ret[index] = input[index] + bias[biasIndex]; + } +} + // Compares the value and set extern "C" __global__ void compareAndSet(double* A, double* ret, int rlen, int clen, double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/0ff4f14b/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index e30e00a..51ddb41 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -197,13 +197,13 @@ BB3_2: ret; } - // .globl relu_backward -.visible .entry relu_backward( - .param .u64 relu_backward_param_0, - .param .u64 relu_backward_param_1, - .param .u64 relu_backward_param_2, - .param .u32 
relu_backward_param_3, - .param .u32 relu_backward_param_4 + // .globl reluBackward +.visible .entry reluBackward( + .param .u64 reluBackward_param_0, + .param .u64 reluBackward_param_1, + .param .u64 reluBackward_param_2, + .param .u32 reluBackward_param_3, + .param .u32 reluBackward_param_4 ) { .reg .pred %p<5>; @@ -212,11 +212,11 @@ BB3_2: .reg .b64 %rd<14>; - ld.param.u64 %rd2, [relu_backward_param_0]; - ld.param.u64 %rd3, [relu_backward_param_1]; - ld.param.u64 %rd4, [relu_backward_param_2]; - ld.param.u32 %r4, [relu_backward_param_3]; - ld.param.u32 %r3, [relu_backward_param_4]; + ld.param.u64 %rd2, [reluBackward_param_0]; + ld.param.u64 %rd3, [reluBackward_param_1]; + ld.param.u64 %rd4, [reluBackward_param_2]; + ld.param.u32 %r4, [reluBackward_param_3]; + ld.param.u32 %r3, [reluBackward_param_4]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; @@ -257,6 +257,62 @@ BB4_4: ret; } + // .globl biasAdd +.visible .entry biasAdd( + .param .u64 biasAdd_param_0, + .param .u64 biasAdd_param_1, + .param .u64 biasAdd_param_2, + .param .u32 biasAdd_param_3, + .param .u32 biasAdd_param_4, + .param .u32 biasAdd_param_5 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<14>; + .reg .f64 %fd<4>; + .reg .b64 %rd<12>; + + + ld.param.u64 %rd1, [biasAdd_param_0]; + ld.param.u64 %rd2, [biasAdd_param_1]; + ld.param.u64 %rd3, [biasAdd_param_2]; + ld.param.u32 %r5, [biasAdd_param_3]; + ld.param.u32 %r3, [biasAdd_param_4]; + ld.param.u32 %r4, [biasAdd_param_5]; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %ntid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r7, %r6, %r8; + mov.u32 %r9, %ntid.y; + mov.u32 %r10, %ctaid.y; + mov.u32 %r11, %tid.y; + mad.lo.s32 %r2, %r9, %r10, %r11; + setp.lt.s32 %p1, %r1, %r5; + setp.lt.s32 %p2, %r2, %r3; + and.pred %p3, %p1, %p2; + @!%p3 bra BB5_2; + bra.uni BB5_1; + +BB5_1: + cvta.to.global.u64 %rd4, %rd1; + mad.lo.s32 %r12, %r1, %r3, %r2; + mul.wide.s32 %rd5, %r12, 8; + add.s64 %rd6, %rd4, %rd5; + div.s32 %r13, %r2, %r4; + cvta.to.global.u64 %rd7, %rd2; + mul.wide.s32 %rd8, %r13, 8; + add.s64 %rd9, %rd7, %rd8; + ld.global.f64 %fd1, [%rd9]; + ld.global.f64 %fd2, [%rd6]; + add.f64 %fd3, %fd2, %fd1; + cvta.to.global.u64 %rd10, %rd3; + add.s64 %rd11, %rd10, %rd5; + st.global.f64 [%rd11], %fd3; + +BB5_2: + ret; +} + // .globl compareAndSet .visible .entry compareAndSet( .param .u64 compareAndSet_param_0, @@ -297,10 +353,10 @@ BB4_4: setp.lt.s32 %p1, %r7, %r2; setp.lt.s32 %p2, %r11, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB5_6; - bra.uni BB5_1; + @!%p3 bra BB6_6; + bra.uni BB6_1; -BB5_1: +BB6_1: cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 8; add.s64 %rd6, %rd4, %rd5; @@ -310,26 +366,26 @@ BB5_1: setp.lt.f64 %p4, %fd8, %fd3; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd1, %rd7, %rd5; - @%p4 bra BB5_5; - bra.uni BB5_2; + @%p4 bra BB6_5; + bra.uni BB6_2; -BB5_5: +BB6_5: st.global.f64 [%rd1], %fd4; - bra.uni BB5_6; + bra.uni BB6_6; -BB5_2: +BB6_2: setp.lt.f64 %p5, %fd1, %fd2; - @%p5 bra BB5_4; - bra.uni BB5_3; + @%p5 bra BB6_4; + bra.uni BB6_3; -BB5_4: +BB6_4: st.global.f64 [%rd1], %fd5; - bra.uni BB5_6; + bra.uni BB6_6; -BB5_3: +BB6_3: st.global.f64 [%rd1], %fd6; -BB5_6: +BB6_6: ret; } @@ -370,42 +426,42 @@ BB5_6: setp.lt.s32 %p2, %r1, %r14; setp.lt.s32 %p3, %r2, %r10; and.pred %p4, %p2, %p3; - @!%p4 bra BB6_53; - bra.uni BB6_1; + @!%p4 bra BB7_53; + bra.uni BB7_1; -BB6_1: +BB7_1: mad.lo.s32 %r3, %r1, %r10, %r2; setp.eq.s32 %p5, %r11, 1; mov.u32 %r53, %r1; - @%p5 bra BB6_5; + @%p5 bra BB7_5; setp.ne.s32 %p6, %r11, 2; mov.u32 %r54, %r3; - @%p6 bra BB6_4; + @%p6 bra BB7_4; 
mov.u32 %r54, %r2; -BB6_4: +BB7_4: mov.u32 %r48, %r54; mov.u32 %r4, %r48; mov.u32 %r53, %r4; -BB6_5: +BB7_5: mov.u32 %r5, %r53; setp.eq.s32 %p7, %r12, 1; mov.u32 %r51, %r1; - @%p7 bra BB6_9; + @%p7 bra BB7_9; setp.ne.s32 %p8, %r12, 2; mov.u32 %r52, %r3; - @%p8 bra BB6_8; + @%p8 bra BB7_8; mov.u32 %r52, %r2; -BB6_8: +BB7_8: mov.u32 %r51, %r52; -BB6_9: +BB7_9: cvta.to.global.u64 %rd5, %rd3; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r5, 8; @@ -416,47 +472,47 @@ BB6_9: ld.global.f64 %fd2, [%rd10]; mov.f64 %fd38, 0dC08F380000000000; setp.gt.s32 %p9, %r13, 5; - @%p9 bra BB6_19; + @%p9 bra BB7_19; setp.gt.s32 %p19, %r13, 2; - @%p19 bra BB6_15; + @%p19 bra BB7_15; setp.eq.s32 %p23, %r13, 0; - @%p23 bra BB6_51; + @%p23 bra BB7_51; setp.eq.s32 %p24, %r13, 1; - @%p24 bra BB6_50; - bra.uni BB6_13; + @%p24 bra BB7_50; + bra.uni BB7_13; -BB6_50: +BB7_50: sub.f64 %fd38, %fd1, %fd2; - bra.uni BB6_52; + bra.uni BB7_52; -BB6_19: +BB7_19: setp.gt.s32 %p10, %r13, 8; - @%p10 bra BB6_24; + @%p10 bra BB7_24; setp.eq.s32 %p16, %r13, 6; - @%p16 bra BB6_34; + @%p16 bra BB7_34; setp.eq.s32 %p17, %r13, 7; - @%p17 bra BB6_33; - bra.uni BB6_22; + @%p17 bra BB7_33; + bra.uni BB7_22; -BB6_33: +BB7_33: setp.gt.f64 %p29, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29; - bra.uni BB6_52; + bra.uni BB7_52; -BB6_15: +BB7_15: setp.eq.s32 %p20, %r13, 3; - @%p20 bra BB6_49; + @%p20 bra BB7_49; setp.eq.s32 %p21, %r13, 4; - @%p21 bra BB6_35; - bra.uni BB6_17; + @%p21 bra BB7_35; + bra.uni BB7_17; -BB6_35: +BB7_35: { .reg .b32 %temp; mov.b64 {%temp, %r8}, %fd1; @@ -492,10 +548,10 @@ BB6_35: }// Callseq End 0 setp.lt.s32 %p33, %r8, 0; and.pred %p1, %p33, %p32; - @!%p1 bra BB6_37; - bra.uni BB6_36; + @!%p1 bra BB7_37; + bra.uni BB7_36; -BB6_36: +BB7_36: { .reg .b32 %temp; mov.b64 {%temp, %r23}, %fd37; @@ -507,111 +563,111 @@ BB6_36: } mov.b64 %fd37, {%r25, %r24}; -BB6_37: +BB7_37: mov.f64 %fd36, %fd37; setp.eq.f64 %p34, %fd1, 0d0000000000000000; - @%p34 bra BB6_40; - bra.uni BB6_38; + @%p34 bra BB7_40; + bra.uni BB7_38; -BB6_40: +BB7_40: selp.b32 %r26, %r8, 0, %p32; or.b32 %r27, %r26, 2146435072; setp.lt.s32 %p38, %r9, 0; selp.b32 %r28, %r27, %r26, %p38; mov.u32 %r29, 0; mov.b64 %fd36, {%r29, %r28}; - bra.uni BB6_41; + bra.uni BB7_41; -BB6_24: +BB7_24: setp.gt.s32 %p11, %r13, 10; - @%p11 bra BB6_28; + @%p11 bra BB7_28; setp.eq.s32 %p14, %r13, 9; - @%p14 bra BB6_32; - bra.uni BB6_26; + @%p14 bra BB7_32; + bra.uni BB7_26; -BB6_32: +BB7_32: setp.eq.f64 %p27, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27; - bra.uni BB6_52; + bra.uni BB7_52; -BB6_28: +BB7_28: setp.eq.s32 %p12, %r13, 11; - @%p12 bra BB6_31; - bra.uni BB6_29; + @%p12 bra BB7_31; + bra.uni BB7_29; -BB6_31: +BB7_31: min.f64 %fd38, %fd1, %fd2; - bra.uni BB6_52; + bra.uni BB7_52; -BB6_51: +BB7_51: add.f64 %fd38, %fd1, %fd2; - bra.uni BB6_52; + bra.uni BB7_52; -BB6_13: +BB7_13: setp.eq.s32 %p25, %r13, 2; - @%p25 bra BB6_14; - bra.uni BB6_52; + @%p25 bra BB7_14; + bra.uni BB7_52; -BB6_14: +BB7_14: mul.f64 %fd38, %fd1, %fd2; - bra.uni BB6_52; + bra.uni BB7_52; -BB6_34: +BB7_34: setp.le.f64 %p30, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30; - bra.uni BB6_52; + bra.uni BB7_52; -BB6_22: +BB7_22: setp.eq.s32 %p18, %r13, 8; - @%p18 bra BB6_23; - bra.uni BB6_52; + @%p18 bra BB7_23; + bra.uni BB7_52; -BB6_23: +BB7_23: setp.ge.f64 %p28, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28; - bra.uni BB6_52; + bra.uni BB7_52; -BB6_49: +BB7_49: div.rn.f64 %fd38, %fd1, %fd2; - 
bra.uni BB6_52; + bra.uni BB7_52; -BB6_17: +BB7_17: setp.eq.s32 %p22, %r13, 5; - @%p22 bra BB6_18; - bra.uni BB6_52; + @%p22 bra BB7_18; + bra.uni BB7_52; -BB6_18: +BB7_18: setp.lt.f64 %p31, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31; - bra.uni BB6_52; + bra.uni BB7_52; -BB6_26: +BB7_26: setp.eq.s32 %p15, %r13, 10; - @%p15 bra BB6_27; - bra.uni BB6_52; + @%p15 bra BB7_27; + bra.uni BB7_52; -BB6_27: +BB7_27: setp.neu.f64 %p26, %fd1, %fd2; selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB6_52; + bra.uni BB7_52; -BB6_29: +BB7_29: setp.ne.s32 %p13, %r13, 12; - @%p13 bra BB6_52; + @%p13 bra BB7_52; max.f64 %fd38, %fd1, %fd2; - bra.uni BB6_52; + bra.uni BB7_52; -BB6_38: +BB7_38: setp.gt.s32 %p35, %r8, -1; - @%p35 bra BB6_41; + @%p35 bra BB7_41; cvt.rzi.f64.f64 %fd29, %fd2; setp.neu.f64 %p36, %fd29, %fd2; selp.f64 %fd36, 0dFFF8000000000000, %fd36, %p36; -BB6_41: +BB7_41: mov.f64 %fd17, %fd36; add.f64 %fd18, %fd1, %fd2; { @@ -621,17 +677,17 @@ BB6_41: and.b32 %r31, %r30, 2146435072; setp.ne.s32 %p39, %r31, 2146435072; mov.f64 %fd35, %fd17; - @%p39 bra BB6_48; + @%p39 bra BB7_48; setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000; mov.f64 %fd35, %fd18; - @%p40 bra BB6_48; + @%p40 bra BB7_48; abs.f64 %fd30, %fd2; setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000; mov.f64 %fd34, %fd18; mov.f64 %fd35, %fd34; - @%p41 bra BB6_48; + @%p41 bra BB7_48; { .reg .b32 %temp; @@ -641,10 +697,10 @@ BB6_41: setp.eq.s32 %p42, %r33, 2146435072; setp.eq.s32 %p43, %r32, 0; and.pred %p44, %p42, %p43; - @%p44 bra BB6_47; - bra.uni BB6_45; + @%p44 bra BB7_47; + bra.uni BB7_45; -BB6_47: +BB7_47: setp.gt.f64 %p48, %fd11, 0d3FF0000000000000; selp.b32 %r41, 2146435072, 0, %p48; xor.b32 %r42, %r41, 2146435072; @@ -654,9 +710,9 @@ BB6_47: selp.b32 %r44, 1072693248, %r43, %p50; mov.u32 %r45, 0; mov.b64 %fd35, {%r45, %r44}; - bra.uni BB6_48; + bra.uni BB7_48; -BB6_45: +BB7_45: { .reg .b32 %temp; mov.b64 {%r34, %temp}, %fd1; @@ -666,10 +722,10 @@ BB6_45: setp.eq.s32 %p46, %r34, 0; and.pred %p47, %p45, %p46; mov.f64 %fd35, %fd17; - @!%p47 bra BB6_48; - bra.uni BB6_46; + @!%p47 bra BB7_48; + bra.uni BB7_46; -BB6_46: +BB7_46: shr.s32 %r36, %r9, 31; and.b32 %r37, %r36, -2146435072; selp.b32 %r38, -1048576, 2146435072, %p1; @@ -677,19 +733,19 @@ BB6_46: mov.u32 %r40, 0; mov.b64 %fd35, {%r40, %r39}; -BB6_48: +BB7_48: setp.eq.f64 %p51, %fd2, 0d0000000000000000; setp.eq.f64 %p52, %fd1, 0d3FF0000000000000; or.pred %p53, %p52, %p51; selp.f64 %fd38, 0d3FF0000000000000, %fd35, %p53; -BB6_52: +BB7_52: cvta.to.global.u64 %rd12, %rd4; mul.wide.s32 %rd13, %r3, 8; add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd38; -BB6_53: +BB7_53: ret; } @@ -728,7 +784,7 @@ BB6_53: mad.lo.s32 %r1, %r14, %r15, %r17; mul.lo.s32 %r18, %r9, %r8; setp.ge.s32 %p3, %r1, %r18; - @%p3 bra BB7_88; + @%p3 bra BB8_88; cvta.to.global.u64 %rd6, %rd5; cvta.to.global.u64 %rd7, %rd4; @@ -737,178 +793,178 @@ BB6_53: ld.global.f64 %fd1, [%rd9]; add.s64 %rd1, %rd6, %rd8; setp.eq.s32 %p4, %r7, 0; - @%p4 bra BB7_45; + @%p4 bra BB8_45; setp.eq.s32 %p5, %r6, 0; - @%p5 bra BB7_43; + @%p5 bra BB8_43; mov.f64 %fd66, 0dC08F380000000000; setp.gt.s32 %p6, %r6, 6; - @%p6 bra BB7_13; + @%p6 bra BB8_13; setp.gt.s32 %p14, %r6, 3; - @%p14 bra BB7_9; + @%p14 bra BB8_9; setp.eq.s32 %p18, %r6, 1; - @%p18 bra BB7_42; + @%p18 bra BB8_42; setp.eq.s32 %p19, %r6, 2; - @%p19 bra BB7_41; - bra.uni BB7_7; + @%p19 bra BB8_41; + bra.uni BB8_7; -BB7_41: +BB8_41: mul.f64 %fd66, %fd1, %fd52; - bra.uni BB7_44; + bra.uni BB8_44; -BB7_45: +BB8_45: setp.eq.s32 
%p49, %r6, 0; - @%p49 bra BB7_86; + @%p49 bra BB8_86; mov.f64 %fd74, 0dC08F380000000000; setp.gt.s32 %p50, %r6, 6; - @%p50 bra BB7_56; + @%p50 bra BB8_56; setp.gt.s32 %p58, %r6, 3; - @%p58 bra BB7_52; + @%p58 bra BB8_52; setp.eq.s32 %p62, %r6, 1; - @%p62 bra BB7_85; + @%p62 bra BB8_85; setp.eq.s32 %p63, %r6, 2; - @%p63 bra BB7_84; - bra.uni BB7_50; + @%p63 bra BB8_84; + bra.uni BB8_50; -BB7_84: +BB8_84: mul.f64 %fd74, %fd1, %fd52; - bra.uni BB7_87; + bra.uni BB8_87; -BB7_43: +BB8_43: add.f64 %fd66, %fd1, %fd52; -BB7_44: +BB8_44: st.global.f64 [%rd1], %fd66; - bra.uni BB7_88; + bra.uni BB8_88; -BB7_13: +BB8_13: setp.gt.s32 %p7, %r6, 9; - @%p7 bra BB7_18; + @%p7 bra BB8_18; setp.eq.s32 %p11, %r6, 7; - @%p11 bra BB7_25; + @%p11 bra BB8_25; setp.eq.s32 %p12, %r6, 8; - @%p12 bra BB7_24; - bra.uni BB7_16; + @%p12 bra BB8_24; + bra.uni BB8_16; -BB7_24: +BB8_24: setp.le.f64 %p23, %fd1, %fd52; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23; - bra.uni BB7_44; + bra.uni BB8_44; -BB7_86: +BB8_86: add.f64 %fd74, %fd1, %fd52; -BB7_87: +BB8_87: st.global.f64 [%rd1], %fd74; -BB7_88: +BB8_88: ret; -BB7_56: +BB8_56: setp.gt.s32 %p51, %r6, 9; - @%p51 bra BB7_61; + @%p51 bra BB8_61; setp.eq.s32 %p55, %r6, 7; - @%p55 bra BB7_68; + @%p55 bra BB8_68; setp.eq.s32 %p56, %r6, 8; - @%p56 bra BB7_67; - bra.uni BB7_59; + @%p56 bra BB8_67; + bra.uni BB8_59; -BB7_67: +BB8_67: setp.ge.f64 %p67, %fd1, %fd52; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p67; - bra.uni BB7_87; + bra.uni BB8_87; -BB7_9: +BB8_9: setp.eq.s32 %p15, %r6, 4; - @%p15 bra BB7_27; + @%p15 bra BB8_27; setp.eq.s32 %p16, %r6, 5; - @%p16 bra BB7_26; - bra.uni BB7_11; + @%p16 bra BB8_26; + bra.uni BB8_11; -BB7_26: +BB8_26: setp.gt.f64 %p26, %fd1, %fd52; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB7_44; + bra.uni BB8_44; -BB7_18: +BB8_18: setp.eq.s32 %p8, %r6, 10; - @%p8 bra BB7_23; + @%p8 bra BB8_23; setp.eq.s32 %p9, %r6, 11; - @%p9 bra BB7_22; - bra.uni BB7_20; + @%p9 bra BB8_22; + bra.uni BB8_20; -BB7_22: +BB8_22: min.f64 %fd66, %fd52, %fd1; - bra.uni BB7_44; + bra.uni BB8_44; -BB7_52: +BB8_52: setp.eq.s32 %p59, %r6, 4; - @%p59 bra BB7_70; + @%p59 bra BB8_70; setp.eq.s32 %p60, %r6, 5; - @%p60 bra BB7_69; - bra.uni BB7_54; + @%p60 bra BB8_69; + bra.uni BB8_54; -BB7_69: +BB8_69: setp.lt.f64 %p70, %fd1, %fd52; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p70; - bra.uni BB7_87; + bra.uni BB8_87; -BB7_61: +BB8_61: setp.eq.s32 %p52, %r6, 10; - @%p52 bra BB7_66; + @%p52 bra BB8_66; setp.eq.s32 %p53, %r6, 11; - @%p53 bra BB7_65; - bra.uni BB7_63; + @%p53 bra BB8_65; + bra.uni BB8_63; -BB7_65: +BB8_65: min.f64 %fd74, %fd1, %fd52; - bra.uni BB7_87; + bra.uni BB8_87; -BB7_42: +BB8_42: sub.f64 %fd66, %fd52, %fd1; - bra.uni BB7_44; + bra.uni BB8_44; -BB7_7: +BB8_7: setp.eq.s32 %p20, %r6, 3; - @%p20 bra BB7_8; - bra.uni BB7_44; + @%p20 bra BB8_8; + bra.uni BB8_44; -BB7_8: +BB8_8: div.rn.f64 %fd66, %fd52, %fd1; - bra.uni BB7_44; + bra.uni BB8_44; -BB7_25: +BB8_25: setp.lt.f64 %p24, %fd1, %fd52; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24; - bra.uni BB7_44; + bra.uni BB8_44; -BB7_16: +BB8_16: setp.eq.s32 %p13, %r6, 9; - @%p13 bra BB7_17; - bra.uni BB7_44; + @%p13 bra BB8_17; + bra.uni BB8_44; -BB7_17: +BB8_17: setp.eq.f64 %p22, %fd1, %fd52; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22; - bra.uni BB7_44; + bra.uni BB8_44; -BB7_27: +BB8_27: { .reg .b32 %temp; mov.b64 {%temp, %r2}, %fd52; @@ -944,10 +1000,10 @@ BB7_27: }// Callseq End 1 setp.lt.s32 %p28, %r2, 0; and.pred 
%p1, %p28, %p27; - @!%p1 bra BB7_29; - bra.uni BB7_28; + @!%p1 bra BB8_29; + bra.uni BB8_28; -BB7_28: +BB8_28: { .reg .b32 %temp; mov.b64 {%temp, %r21}, %fd65; @@ -959,72 +1015,72 @@ BB7_28: } mov.b64 %fd65, {%r23, %r22}; -BB7_29: +BB8_29: mov.f64 %fd64, %fd65; setp.eq.f64 %p29, %fd52, 0d0000000000000000; - @%p29 bra BB7_32; - bra.uni BB7_30; + @%p29 bra BB8_32; + bra.uni BB8_30; -BB7_32: +BB8_32: selp.b32 %r24, %r2, 0, %p27; or.b32 %r25, %r24, 2146435072; setp.lt.s32 %p33, %r3, 0; selp.b32 %r26, %r25, %r24, %p33; mov.u32 %r27, 0; mov.b64 %fd64, {%r27, %r26}; - bra.uni BB7_33; + bra.uni BB8_33; -BB7_11: +BB8_11: setp.eq.s32 %p17, %r6, 6; - @%p17 bra BB7_12; - bra.uni BB7_44; + @%p17 bra BB8_12; + bra.uni BB8_44; -BB7_12: +BB8_12: setp.ge.f64 %p25, %fd1, %fd52; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25; - bra.uni BB7_44; + bra.uni BB8_44; -BB7_23: +BB8_23: setp.neu.f64 %p21, %fd1, %fd52; selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p21; - bra.uni BB7_44; + bra.uni BB8_44; -BB7_20: +BB8_20: setp.ne.s32 %p10, %r6, 12; - @%p10 bra BB7_44; + @%p10 bra BB8_44; max.f64 %fd66, %fd52, %fd1; - bra.uni BB7_44; + bra.uni BB8_44; -BB7_85: +BB8_85: sub.f64 %fd74, %fd1, %fd52; - bra.uni BB7_87; + bra.uni BB8_87; -BB7_50: +BB8_50: setp.eq.s32 %p64, %r6, 3; - @%p64 bra BB7_51; - bra.uni BB7_87; + @%p64 bra BB8_51; + bra.uni BB8_87; -BB7_51: +BB8_51: div.rn.f64 %fd74, %fd1, %fd52; - bra.uni BB7_87; + bra.uni BB8_87; -BB7_68: +BB8_68: setp.gt.f64 %p68, %fd1, %fd52; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p68; - bra.uni BB7_87; + bra.uni BB8_87; -BB7_59: +BB8_59: setp.eq.s32 %p57, %r6, 9; - @%p57 bra BB7_60; - bra.uni BB7_87; + @%p57 bra BB8_60; + bra.uni BB8_87; -BB7_60: +BB8_60: setp.eq.f64 %p66, %fd1, %fd52; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p66; - bra.uni BB7_87; + bra.uni BB8_87; -BB7_70: +BB8_70: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; @@ -1060,10 +1116,10 @@ BB7_70: }// Callseq End 2 setp.lt.s32 %p72, %r4, 0; and.pred %p2, %p72, %p71; - @!%p2 bra BB7_72; - bra.uni BB7_71; + @!%p2 bra BB8_72; + bra.uni BB8_71; -BB7_71: +BB8_71: { .reg .b32 %temp; mov.b64 {%temp, %r46}, %fd73; @@ -1075,52 +1131,52 @@ BB7_71: } mov.b64 %fd73, {%r48, %r47}; -BB7_72: +BB8_72: mov.f64 %fd72, %fd73; setp.eq.f64 %p73, %fd1, 0d0000000000000000; - @%p73 bra BB7_75; - bra.uni BB7_73; + @%p73 bra BB8_75; + bra.uni BB8_73; -BB7_75: +BB8_75: selp.b32 %r49, %r4, 0, %p71; or.b32 %r50, %r49, 2146435072; setp.lt.s32 %p77, %r5, 0; selp.b32 %r51, %r50, %r49, %p77; mov.u32 %r52, 0; mov.b64 %fd72, {%r52, %r51}; - bra.uni BB7_76; + bra.uni BB8_76; -BB7_54: +BB8_54: setp.eq.s32 %p61, %r6, 6; - @%p61 bra BB7_55; - bra.uni BB7_87; + @%p61 bra BB8_55; + bra.uni BB8_87; -BB7_55: +BB8_55: setp.le.f64 %p69, %fd1, %fd52; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p69; - bra.uni BB7_87; + bra.uni BB8_87; -BB7_66: +BB8_66: setp.neu.f64 %p65, %fd1, %fd52; selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p65; - bra.uni BB7_87; + bra.uni BB8_87; -BB7_63: +BB8_63: setp.ne.s32 %p54, %r6, 12; - @%p54 bra BB7_87; + @%p54 bra BB8_87; max.f64 %fd74, %fd1, %fd52; - bra.uni BB7_87; + bra.uni BB8_87; -BB7_30: +BB8_30: setp.gt.s32 %p30, %r2, -1; - @%p30 bra BB7_33; + @%p30 bra BB8_33; cvt.rzi.f64.f64 %fd54, %fd1; setp.neu.f64 %p31, %fd54, %fd1; selp.f64 %fd64, 0dFFF8000000000000, %fd64, %p31; -BB7_33: +BB8_33: mov.f64 %fd16, %fd64; add.f64 %fd17, %fd1, %fd52; { @@ -1130,17 +1186,17 @@ BB7_33: and.b32 %r29, %r28, 2146435072; setp.ne.s32 %p34, %r29, 2146435072; 
mov.f64 %fd63, %fd16; - @%p34 bra BB7_40; + @%p34 bra BB8_40; setp.gtu.f64 %p35, %fd10, 0d7FF0000000000000; mov.f64 %fd63, %fd17; - @%p35 bra BB7_40; + @%p35 bra BB8_40; abs.f64 %fd55, %fd1; setp.gtu.f64 %p36, %fd55, 0d7FF0000000000000; mov.f64 %fd62, %fd17; mov.f64 %fd63, %fd62; - @%p36 bra BB7_40; + @%p36 bra BB8_40; { .reg .b32 %temp; @@ -1150,10 +1206,10 @@ BB7_33: setp.eq.s32 %p37, %r31, 2146435072; setp.eq.s32 %p38, %r30, 0; and.pred %p39, %p37, %p38; - @%p39 bra BB7_39; - bra.uni BB7_37; + @%p39 bra BB8_39; + bra.uni BB8_37; -BB7_39: +BB8_39: setp.gt.f64 %p43, %fd10, 0d3FF0000000000000; selp.b32 %r39, 2146435072, 0, %p43; xor.b32 %r40, %r39, 2146435072; @@ -1163,17 +1219,17 @@ BB7_39: selp.b32 %r42, 1072693248, %r41, %p45; mov.u32 %r43, 0; mov.b64 %fd63, {%r43, %r42}; - bra.uni BB7_40; + bra.uni BB8_40; -BB7_73: +BB8_73: setp.gt.s32 %p74, %r4, -1; - @%p74 bra BB7_76; + @%p74 bra BB8_76; cvt.rzi.f64.f64 %fd57, %fd52; setp.neu.f64 %p75, %fd57, %fd52; selp.f64 %fd72, 0dFFF8000000000000, %fd72, %p75; -BB7_76: +BB8_76: mov.f64 %fd41, %fd72; add.f64 %fd42, %fd1, %fd52; { @@ -1183,17 +1239,17 @@ BB7_76: and.b32 %r54, %r53, 2146435072; setp.ne.s32 %p78, %r54, 2146435072; mov.f64 %fd71, %fd41; - @%p78 bra BB7_83; + @%p78 bra BB8_83; setp.gtu.f64 %p79, %fd35, 0d7FF0000000000000; mov.f64 %fd71, %fd42; - @%p79 bra BB7_83; + @%p79 bra BB8_83; abs.f64 %fd58, %fd52; setp.gtu.f64 %p80, %fd58, 0d7FF0000000000000; mov.f64 %fd70, %fd42; mov.f64 %fd71, %fd70; - @%p80 bra BB7_83; + @%p80 bra BB8_83; { .reg .b32 %temp; @@ -1203,10 +1259,10 @@ BB7_76: setp.eq.s32 %p81, %r56, 2146435072; setp.eq.s32 %p82, %r55, 0; and.pred %p83, %p81, %p82; - @%p83 bra BB7_82; - bra.uni BB7_80; + @%p83 bra BB8_82; + bra.uni BB8_80; -BB7_82: +BB8_82: setp.gt.f64 %p87, %fd35, 0d3FF0000000000000; selp.b32 %r64, 2146435072, 0, %p87; xor.b32 %r65, %r64, 2146435072; @@ -1216,9 +1272,9 @@ BB7_82: selp.b32 %r67, 1072693248, %r66, %p89; mov.u32 %r68, 0; mov.b64 %fd71, {%r68, %r67}; - bra.uni BB7_83; + bra.uni BB8_83; -BB7_37: +BB8_37: { .reg .b32 %temp; mov.b64 {%r32, %temp}, %fd52; @@ -1228,10 +1284,10 @@ BB7_37: setp.eq.s32 %p41, %r32, 0; and.pred %p42, %p40, %p41; mov.f64 %fd63, %fd16; - @!%p42 bra BB7_40; - bra.uni BB7_38; + @!%p42 bra BB8_40; + bra.uni BB8_38; -BB7_38: +BB8_38: shr.s32 %r34, %r3, 31; and.b32 %r35, %r34, -2146435072; selp.b32 %r36, -1048576, 2146435072, %p1; @@ -1239,14 +1295,14 @@ BB7_38: mov.u32 %r38, 0; mov.b64 %fd63, {%r38, %r37}; -BB7_40: +BB8_40: setp.eq.f64 %p46, %fd1, 0d0000000000000000; setp.eq.f64 %p47, %fd52, 0d3FF0000000000000; or.pred %p48, %p47, %p46; selp.f64 %fd66, 0d3FF0000000000000, %fd63, %p48; - bra.uni BB7_44; + bra.uni BB8_44; -BB7_80: +BB8_80: { .reg .b32 %temp; mov.b64 {%r57, %temp}, %fd1; @@ -1256,10 +1312,10 @@ BB7_80: setp.eq.s32 %p85, %r57, 0; and.pred %p86, %p84, %p85; mov.f64 %fd71, %fd41; - @!%p86 bra BB7_83; - bra.uni BB7_81; + @!%p86 bra BB8_83; + bra.uni BB8_81; -BB7_81: +BB8_81: shr.s32 %r59, %r5, 31; and.b32 %r60, %r59, -2146435072; selp.b32 %r61, -1048576, 2146435072, %p2; @@ -1267,12 +1323,12 @@ BB7_81: mov.u32 %r63, 0; mov.b64 %fd71, {%r63, %r62}; -BB7_83: +BB8_83: setp.eq.f64 %p90, %fd52, 0d0000000000000000; setp.eq.f64 %p91, %fd1, 0d3FF0000000000000; or.pred %p92, %p91, %p90; selp.f64 %fd74, 0d3FF0000000000000, %fd71, %p92; - bra.uni BB7_87; + bra.uni BB8_87; } // .globl fill @@ -1296,14 +1352,14 @@ BB7_83: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.s32 %p1, %r1, %r2; - @%p1 bra BB8_2; + @%p1 bra BB9_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, 
%r1, 8; add.s64 %rd4, %rd2, %rd3; st.global.f64 [%rd4], %fd1; -BB8_2: +BB9_2: ret; } @@ -1327,17 +1383,17 @@ BB8_2: ld.param.u32 %r4, [reduce_row_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB9_31; + @%p1 bra BB10_31; mov.u32 %r35, %tid.x; mov.f64 %fd63, 0d0000000000000000; mov.f64 %fd64, %fd63; setp.ge.u32 %p2, %r35, %r4; - @%p2 bra BB9_4; + @%p2 bra BB10_4; cvta.to.global.u64 %rd3, %rd1; -BB9_3: +BB10_3: mad.lo.s32 %r8, %r6, %r4, %r35; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -1347,9 +1403,9 @@ BB9_3: add.s32 %r35, %r9, %r35; setp.lt.u32 %p3, %r35, %r4; mov.f64 %fd63, %fd64; - @%p3 bra BB9_3; + @%p3 bra BB10_3; -BB9_4: +BB10_4: mov.f64 %fd61, %fd63; mov.u32 %r10, %tid.x; mul.wide.u32 %rd6, %r10, 8; @@ -1359,113 +1415,113 @@ BB9_4: bar.sync 0; mov.u32 %r11, %ntid.x; setp.lt.u32 %p4, %r11, 512; - @%p4 bra BB9_8; + @%p4 bra BB10_8; setp.gt.u32 %p5, %r10, 255; mov.f64 %fd62, %fd61; - @%p5 bra BB9_7; + @%p5 bra BB10_7; ld.shared.f64 %fd26, [%rd8+2048]; add.f64 %fd62, %fd61, %fd26; st.shared.f64 [%rd8], %fd62; -BB9_7: +BB10_7: mov.f64 %fd61, %fd62; bar.sync 0; -BB9_8: +BB10_8: mov.f64 %fd59, %fd61; setp.lt.u32 %p6, %r11, 256; - @%p6 bra BB9_12; + @%p6 bra BB10_12; setp.gt.u32 %p7, %r10, 127; mov.f64 %fd60, %fd59; - @%p7 bra BB9_11; + @%p7 bra BB10_11; ld.shared.f64 %fd27, [%rd8+1024]; add.f64 %fd60, %fd59, %fd27; st.shared.f64 [%rd8], %fd60; -BB9_11: +BB10_11: mov.f64 %fd59, %fd60; bar.sync 0; -BB9_12: +BB10_12: mov.f64 %fd57, %fd59; setp.lt.u32 %p8, %r11, 128; - @%p8 bra BB9_16; + @%p8 bra BB10_16; setp.gt.u32 %p9, %r10, 63; mov.f64 %fd58, %fd57; - @%p9 bra BB9_15; + @%p9 bra BB10_15; ld.shared.f64 %fd28, [%rd8+512]; add.f64 %fd58, %fd57, %fd28; st.shared.f64 [%rd8], %fd58; -BB9_15: +BB10_15: mov.f64 %fd57, %fd58; bar.sync 0; -BB9_16: +BB10_16: mov.f64 %fd56, %fd57; setp.gt.u32 %p10, %r10, 31; - @%p10 bra BB9_29; + @%p10 bra BB10_29; setp.lt.u32 %p11, %r11, 64; - @%p11 bra BB9_19; + @%p11 bra BB10_19; ld.volatile.shared.f64 %fd29, [%rd8+256]; add.f64 %fd56, %fd56, %fd29; st.volatile.shared.f64 [%rd8], %fd56; -BB9_19: +BB10_19: mov.f64 %fd55, %fd56; setp.lt.u32 %p12, %r11, 32; - @%p12 bra BB9_21; + @%p12 bra BB10_21; ld.volatile.shared.f64 %fd30, [%rd8+128]; add.f64 %fd55, %fd55, %fd30; st.volatile.shared.f64 [%rd8], %fd55; -BB9_21: +BB10_21: mov.f64 %fd54, %fd55; setp.lt.u32 %p13, %r11, 16; - @%p13 bra BB9_23; + @%p13 bra BB10_23; ld.volatile.shared.f64 %fd31, [%rd8+64]; add.f64 %fd54, %fd54, %fd31; st.volatile.shared.f64 [%rd8], %fd54; -BB9_23: +BB10_23: mov.f64 %fd53, %fd54; setp.lt.u32 %p14, %r11, 8; - @%p14 bra BB9_25; + @%p14 bra BB10_25; ld.volatile.shared.f64 %fd32, [%rd8+32]; add.f64 %fd53, %fd53, %fd32; st.volatile.shared.f64 [%rd8], %fd53; -BB9_25: +BB10_25: mov.f64 %fd52, %fd53; setp.lt.u32 %p15, %r11, 4; - @%p15 bra BB9_27; + @%p15 bra BB10_27; ld.volatile.shared.f64 %fd33, [%rd8+16]; add.f64 %fd52, %fd52, %fd33; st.volatile.shared.f64 [%rd8], %fd52; -BB9_27: +BB10_27: setp.lt.u32 %p16, %r11, 2; - @%p16 bra BB9_29; + @%p16 bra BB10_29; ld.volatile.shared.f64 %fd34, [%rd8+8]; add.f64 %fd35, %fd52, %fd34; st.volatile.shared.f64 [%rd8], %fd35; -BB9_29: +BB10_29: setp.ne.s32 %p17, %r10, 0; - @%p17 bra BB9_31; + @%p17 bra BB10_31; ld.shared.f64 %fd36, [sdata]; cvta.to.global.u64 %rd36, %rd2; @@ -1473,7 +1529,7 @@ BB9_29: add.s64 %rd38, %rd36, %rd37; st.global.f64 [%rd38], %fd36; -BB9_31: +BB10_31: ret; } @@ -1500,18 +1556,18 @@ BB9_31: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB10_5; + @%p1 bra 
BB11_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f64 %fd8, 0d0000000000000000; mov.f64 %fd9, %fd8; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB10_4; + @%p2 bra BB11_4; mov.u32 %r10, %r1; -BB10_3: +BB11_3: mov.u32 %r3, %r10; mul.wide.u32 %rd4, %r3, 8; add.s64 %rd5, %rd1, %rd4; @@ -1521,15 +1577,15 @@ BB10_3: setp.lt.u32 %p3, %r4, %r2; mov.u32 %r10, %r4; mov.f64 %fd8, %fd9; - @%p3 bra BB10_3; + @%p3 bra BB11_3; -BB10_4: +BB11_4: cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r1, 8; add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd8; -BB10_5: +BB11_5: ret; } @@ -1557,9 +1613,9 @@ BB10_5: mov.f64 %fd67, 0d0000000000000000; mov.f64 %fd68, %fd67; setp.ge.u32 %p1, %r30, %r5; - @%p1 bra BB11_4; + @%p1 bra BB12_4; -BB11_1: +BB12_1: mov.f64 %fd1, %fd68; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r30, 8; @@ -1568,23 +1624,23 @@ BB11_1: add.f64 %fd69, %fd1, %fd27; add.s32 %r3, %r30, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB11_3; + @%p2 bra BB12_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd28, [%rd9]; add.f64 %fd69, %fd69, %fd28; -BB11_3: +BB12_3: mov.f64 %fd68, %fd69; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r30, %r12, %r13, %r30; setp.lt.u32 %p3, %r30, %r5; mov.f64 %fd67, %fd68; - @%p3 bra BB11_1; + @%p3 bra BB12_1; -BB11_4: +BB12_4: mov.f64 %fd65, %fd67; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -1592,113 +1648,113 @@ BB11_4: st.shared.f64 [%rd1], %fd65; bar.sync 0; setp.lt.u32 %p4, %r9, 512; - @%p4 bra BB11_8; + @%p4 bra BB12_8; setp.gt.u32 %p5, %r6, 255; mov.f64 %fd66, %fd65; - @%p5 bra BB11_7; + @%p5 bra BB12_7; ld.shared.f64 %fd29, [%rd1+2048]; add.f64 %fd66, %fd65, %fd29; st.shared.f64 [%rd1], %fd66; -BB11_7: +BB12_7: mov.f64 %fd65, %fd66; bar.sync 0; -BB11_8: +BB12_8: mov.f64 %fd63, %fd65; setp.lt.u32 %p6, %r9, 256; - @%p6 bra BB11_12; + @%p6 bra BB12_12; setp.gt.u32 %p7, %r6, 127; mov.f64 %fd64, %fd63; - @%p7 bra BB11_11; + @%p7 bra BB12_11; ld.shared.f64 %fd30, [%rd1+1024]; add.f64 %fd64, %fd63, %fd30; st.shared.f64 [%rd1], %fd64; -BB11_11: +BB12_11: mov.f64 %fd63, %fd64; bar.sync 0; -BB11_12: +BB12_12: mov.f64 %fd61, %fd63; setp.lt.u32 %p8, %r9, 128; - @%p8 bra BB11_16; + @%p8 bra BB12_16; setp.gt.u32 %p9, %r6, 63; mov.f64 %fd62, %fd61; - @%p9 bra BB11_15; + @%p9 bra BB12_15; ld.shared.f64 %fd31, [%rd1+512]; add.f64 %fd62, %fd61, %fd31; st.shared.f64 [%rd1], %fd62; -BB11_15: +BB12_15: mov.f64 %fd61, %fd62; bar.sync 0; -BB11_16: +BB12_16: mov.f64 %fd60, %fd61; setp.gt.u32 %p10, %r6, 31; - @%p10 bra BB11_29; + @%p10 bra BB12_29; setp.lt.u32 %p11, %r9, 64; - @%p11 bra BB11_19; + @%p11 bra BB12_19; ld.volatile.shared.f64 %fd32, [%rd1+256]; add.f64 %fd60, %fd60, %fd32; st.volatile.shared.f64 [%rd1], %fd60; -BB11_19: +BB12_19: mov.f64 %fd59, %fd60; setp.lt.u32 %p12, %r9, 32; - @%p12 bra BB11_21; + @%p12 bra BB12_21; ld.volatile.shared.f64 %fd33, [%rd1+128]; add.f64 %fd59, %fd59, %fd33; st.volatile.shared.f64 [%rd1], %fd59; -BB11_21: +BB12_21: mov.f64 %fd58, %fd59; setp.lt.u32 %p13, %r9, 16; - @%p13 bra BB11_23; + @%p13 bra BB12_23; ld.volatile.shared.f64 %fd34, [%rd1+64]; add.f64 %fd58, %fd58, %fd34; st.volatile.shared.f64 [%rd1], %fd58; -BB11_23: +BB12_23: mov.f64 %fd57, %fd58; setp.lt.u32 %p14, %r9, 8; - @%p14 bra BB11_25; + @%p14 bra BB12_25; ld.volatile.shared.f64 %fd35, [%rd1+32]; add.f64 %fd57, %fd57, %fd35; st.volatile.shared.f64 [%rd1], %fd57; -BB11_25: +BB12_25: mov.f64 %fd56, %fd57; setp.lt.u32 %p15, %r9, 4; - @%p15 bra BB11_27; + @%p15 bra BB12_27; ld.volatile.shared.f64 %fd36, [%rd1+16]; add.f64 
%fd56, %fd56, %fd36; st.volatile.shared.f64 [%rd1], %fd56; -BB11_27: +BB12_27: setp.lt.u32 %p16, %r9, 2; - @%p16 bra BB11_29; + @%p16 bra BB12_29; ld.volatile.shared.f64 %fd37, [%rd1+8]; add.f64 %fd38, %fd56, %fd37; st.volatile.shared.f64 [%rd1], %fd38; -BB11_29: +BB12_29: setp.ne.s32 %p17, %r6, 0; - @%p17 bra BB11_31; + @%p17 bra BB12_31; ld.shared.f64 %fd39, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -1706,7 +1762,7 @@ BB11_29: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd39; -BB11_31: +BB12_31: ret; } @@ -1734,9 +1790,9 @@ BB11_31: mov.f64 %fd67, 0d0010000000000000; mov.f64 %fd68, %fd67; setp.ge.u32 %p1, %r30, %r5; - @%p1 bra BB12_4; + @%p1 bra BB13_4; -BB12_1: +BB13_1: mov.f64 %fd1, %fd68; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r30, 8; @@ -1745,23 +1801,23 @@ BB12_1: max.f64 %fd69, %fd1, %fd27; add.s32 %r3, %r30, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB12_3; + @%p2 bra BB13_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd28, [%rd9]; max.f64 %fd69, %fd69, %fd28; -BB12_3: +BB13_3: mov.f64 %fd68, %fd69; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r30, %r12, %r13, %r30; setp.lt.u32 %p3, %r30, %r5; mov.f64 %fd67, %fd68; - @%p3 bra BB12_1; + @%p3 bra BB13_1; -BB12_4: +BB13_4: mov.f64 %fd65, %fd67; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -1769,113 +1825,113 @@ BB12_4: st.shared.f64 [%rd1], %fd65; bar.sync 0; setp.lt.u32 %p4, %r9, 512; - @%p4 bra BB12_8; + @%p4 bra BB13_8; setp.gt.u32 %p5, %r6, 255; mov.f64 %fd66, %fd65; - @%p5 bra BB12_7; + @%p5 bra BB13_7; ld.shared.f64 %fd29, [%rd1+2048]; max.f64 %fd66, %fd65, %fd29; st.shared.f64 [%rd1], %fd66; -BB12_7: +BB13_7: mov.f64 %fd65, %fd66; bar.sync 0; -BB12_8: +BB13_8: mov.f64 %fd63, %fd65; setp.lt.u32 %p6, %r9, 256; - @%p6 bra BB12_12; + @%p6 bra BB13_12; setp.gt.u32 %p7, %r6, 127; mov.f64 %fd64, %fd63; - @%p7 bra BB12_11; + @%p7 bra BB13_11; ld.shared.f64 %fd30, [%rd1+1024]; max.f64 %fd64, %fd63, %fd30; st.shared.f64 [%rd1], %fd64; -BB12_11: +BB13_11: mov.f64 %fd63, %fd64; bar.sync 0; -BB12_12: +BB13_12: mov.f64 %fd61, %fd63; setp.lt.u32 %p8, %r9, 128; - @%p8 bra BB12_16; + @%p8 bra BB13_16; setp.gt.u32 %p9, %r6, 63; mov.f64 %fd62, %fd61; - @%p9 bra BB12_15; + @%p9 bra BB13_15; ld.shared.f64 %fd31, [%rd1+512]; max.f64 %fd62, %fd61, %fd31; st.shared.f64 [%rd1], %fd62; -BB12_15: +BB13_15: mov.f64 %fd61, %fd62; bar.sync 0; -BB12_16: +BB13_16: mov.f64 %fd60, %fd61; setp.gt.u32 %p10, %r6, 31; - @%p10 bra BB12_29; + @%p10 bra BB13_29; setp.lt.u32 %p11, %r9, 64; - @%p11 bra BB12_19; + @%p11 bra BB13_19; ld.volatile.shared.f64 %fd32, [%rd1+256]; max.f64 %fd60, %fd60, %fd32; st.volatile.shared.f64 [%rd1], %fd60; -BB12_19: +BB13_19: mov.f64 %fd59, %fd60; setp.lt.u32 %p12, %r9, 32; - @%p12 bra BB12_21; + @%p12 bra BB13_21; ld.volatile.shared.f64 %fd33, [%rd1+128]; max.f64 %fd59, %fd59, %fd33; st.volatile.shared.f64 [%rd1], %fd59; -BB12_21: +BB13_21: mov.f64 %fd58, %fd59; setp.lt.u32 %p13, %r9, 16; - @%p13 bra BB12_23; + @%p13 bra BB13_23; ld.volatile.shared.f64 %fd34, [%rd1+64]; max.f64 %fd58, %fd58, %fd34; st.volatile.shared.f64 [%rd1], %fd58; -BB12_23: +BB13_23: mov.f64 %fd57, %fd58; setp.lt.u32 %p14, %r9, 8; - @%p14 bra BB12_25; + @%p14 bra BB13_25; ld.volatile.shared.f64 %fd35, [%rd1+32]; max.f64 %fd57, %fd57, %fd35; st.volatile.shared.f64 [%rd1], %fd57; -BB12_25: +BB13_25: mov.f64 %fd56, %fd57; setp.lt.u32 %p15, %r9, 4; - @%p15 bra BB12_27; + @%p15 bra BB13_27; ld.volatile.shared.f64 %fd36, [%rd1+16]; max.f64 %fd56, %fd56, %fd36; st.volatile.shared.f64 [%rd1], %fd56; 
-BB12_27: +BB13_27: setp.lt.u32 %p16, %r9, 2; - @%p16 bra BB12_29; + @%p16 bra BB13_29; ld.volatile.shared.f64 %fd37, [%rd1+8]; max.f64 %fd38, %fd56, %fd37; st.volatile.shared.f64 [%rd1], %fd38; -BB12_29: +BB13_29: setp.ne.s32 %p17, %r6, 0; - @%p17 bra BB12_31; + @%p17 bra BB13_31; ld.shared.f64 %fd39, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -1883,7 +1939,7 @@ BB12_29: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd39; -BB12_31: +BB13_31: ret; } @@ -1911,9 +1967,9 @@ BB12_31: mov.f64 %fd67, 0d7FEFFFFFFFFFFFFF; mov.f64 %fd68, %fd67; setp.ge.u32 %p1, %r30, %r5; - @%p1 bra BB13_4; + @%p1 bra BB14_4; -BB13_1: +BB14_1: mov.f64 %fd1, %fd68; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r30, 8; @@ -1922,23 +1978,23 @@ BB13_1: min.f64 %fd69, %fd1, %fd27; add.s32 %r3, %r30, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB13_3; + @%p2 bra BB14_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd28, [%rd9]; min.f64 %fd69, %fd69, %fd28; -BB13_3: +BB14_3: mov.f64 %fd68, %fd69; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r30, %r12, %r13, %r30; setp.lt.u32 %p3, %r30, %r5; mov.f64 %fd67, %fd68; - @%p3 bra BB13_1; + @%p3 bra BB14_1; -BB13_4: +BB14_4: mov.f64 %fd65, %fd67; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -1946,113 +2002,113 @@ BB13_4: st.shared.f64 [%rd1], %fd65; bar.sync 0; setp.lt.u32 %p4, %r9, 512; - @%p4 bra BB13_8; + @%p4 bra BB14_8; setp.gt.u32 %p5, %r6, 255; mov.f64 %fd66, %fd65; - @%p5 bra BB13_7; + @%p5 bra BB14_7; ld.shared.f64 %fd29, [%rd1+2048]; min.f64 %fd66, %fd65, %fd29; st.shared.f64 [%rd1], %fd66; -BB13_7: +BB14_7: mov.f64 %fd65, %fd66; bar.sync 0; -BB13_8: +BB14_8: mov.f64 %fd63, %fd65; setp.lt.u32 %p6, %r9, 256; - @%p6 bra BB13_12; + @%p6 bra BB14_12; setp.gt.u32 %p7, %r6, 127; mov.f64 %fd64, %fd63; - @%p7 bra BB13_11; + @%p7 bra BB14_11; ld.shared.f64 %fd30, [%rd1+1024]; min.f64 %fd64, %fd63, %fd30; st.shared.f64 [%rd1], %fd64; -BB13_11: +BB14_11: mov.f64 %fd63, %fd64; bar.sync 0; -BB13_12: +BB14_12: mov.f64 %fd61, %fd63; setp.lt.u32 %p8, %r9, 128; - @%p8 bra BB13_16; + @%p8 bra BB14_16; setp.gt.u32 %p9, %r6, 63; mov.f64 %fd62, %fd61; - @%p9 bra BB13_15; + @%p9 bra BB14_15; ld.shared.f64 %fd31, [%rd1+512]; min.f64 %fd62, %fd61, %fd31; st.shared.f64 [%rd1], %fd62; -BB13_15: +BB14_15: mov.f64 %fd61, %fd62; bar.sync 0; -BB13_16: +BB14_16: mov.f64 %fd60, %fd61; setp.gt.u32 %p10, %r6, 31; - @%p10 bra BB13_29; + @%p10 bra BB14_29; setp.lt.u32 %p11, %r9, 64; - @%p11 bra BB13_19; + @%p11 bra BB14_19; ld.volatile.shared.f64 %fd32, [%rd1+256]; min.f64 %fd60, %fd60, %fd32; st.volatile.shared.f64 [%rd1], %fd60; -BB13_19: +BB14_19: mov.f64 %fd59, %fd60; setp.lt.u32 %p12, %r9, 32; - @%p12 bra BB13_21; + @%p12 bra BB14_21; ld.volatile.shared.f64 %fd33, [%rd1+128]; min.f64 %fd59, %fd59, %fd33; st.volatile.shared.f64 [%rd1], %fd59; -BB13_21: +BB14_21: mov.f64 %fd58, %fd59; setp.lt.u32 %p13, %r9, 16; - @%p13 bra BB13_23; + @%p13 bra BB14_23; ld.volatile.shared.f64 %fd34, [%rd1+64]; min.f64 %fd58, %fd58, %fd34; st.volatile.shared.f64 [%rd1], %fd58; -BB13_23: +BB14_23: mov.f64 %fd57, %fd58; setp.lt.u32 %p14, %r9, 8; - @%p14 bra BB13_25; + @%p14 bra BB14_25; ld.volatile.shared.f64 %fd35, [%rd1+32]; min.f64 %fd57, %fd57, %fd35; st.volatile.shared.f64 [%rd1], %fd57; -BB13_25: +BB14_25: mov.f64 %fd56, %fd57; setp.lt.u32 %p15, %r9, 4; - @%p15 bra BB13_27; + @%p15 bra BB14_27; ld.volatile.shared.f64 %fd36, [%rd1+16]; min.f64 %fd56, %fd56, %fd36; st.volatile.shared.f64 [%rd1], %fd56; -BB13_27: +BB14_27: setp.lt.u32 %p16, %r9, 2; - @%p16 bra 
BB13_29; + @%p16 bra BB14_29; ld.volatile.shared.f64 %fd37, [%rd1+8]; min.f64 %fd38, %fd56, %fd37; st.volatile.shared.f64 [%rd1], %fd38; -BB13_29: +BB14_29: setp.ne.s32 %p17, %r6, 0; - @%p17 bra BB13_31; + @%p17 bra BB14_31; ld.shared.f64 %fd39, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -2060,7 +2116,7 @@ BB13_29: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd39; -BB13_31: +BB14_31: ret; } @@ -2087,7 +2143,7 @@ BB13_31: } shr.u32 %r50, %r49, 20; setp.ne.s32 %p1, %r50, 0; - @%p1 bra BB14_2; + @%p1 bra BB15_2; mul.f64 %fd14, %fd12, 0d4350000000000000; { @@ -2101,13 +2157,13 @@ BB13_31: shr.u32 %r16, %r49, 20; add.s32 %r50, %r16, -54; -BB14_2: +BB15_2: add.s32 %r51, %r50, -1023; and.b32 %r17, %r49, -2146435073; or.b32 %r18, %r17, 1072693248; mov.b64 %fd132, {%r48, %r18}; setp.lt.u32 %p2, %r18, 1073127583; - @%p2 bra BB14_4; + @%p2 bra BB15_4; { .reg .b32 %temp; @@ -2121,7 +2177,7 @@ BB14_2: mov.b64 %fd132, {%r19, %r21}; add.s32 %r51, %r50, -1022; -BB14_4: +BB15_4: add.f64 %fd16, %fd132, 0d3FF0000000000000; // inline asm rcp.approx.ftz.f64 %fd15,%fd16; @@ -2286,13 +2342,13 @@ BB14_4: mov.b32 %f2, %r35; abs.f32 %f1, %f2; setp.lt.f32 %p4, %f1, 0f4086232B; - @%p4 bra BB14_7; + @%p4 bra BB15_7; setp.lt.f64 %p5, %fd4, 0d0000000000000000; add.f64 %fd129, %fd4, 0d7FF0000000000000; selp.f64 %fd133, 0d0000000000000000, %fd129, %p5; setp.geu.f32 %p6, %f1, 0f40874800; - @%p6 bra BB14_7; + @%p6 bra BB15_7; shr.u32 %r36, %r13, 31; add.s32 %r37, %r13, %r36; @@ -2307,7 +2363,7 @@ BB14_4: mov.b64 %fd131, {%r44, %r43}; mul.f64 %fd133, %fd130, %fd131; -BB14_7: +BB15_7: { .reg .b32 %temp; mov.b64 {%temp, %r45}, %fd133; @@ -2320,13 +2376,13 @@ BB14_7: } setp.ne.s32 %p8, %r47, 0; or.pred %p9, %p8, %p7; - @!%p9 bra BB14_9; - bra.uni BB14_8; + @!%p9 bra BB15_9; + bra.uni BB15_8; -BB14_8: +BB15_8: fma.rn.f64 %fd133, %fd133, %fd5, %fd133; -BB14_9: +BB15_9: st.param.f64 [func_retval0+0], %fd133; ret; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/0ff4f14b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java index 9c115c6..997c79b 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java @@ -185,7 +185,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { } else { outputBlock = getDenseOutputBlock(ec, input.getNumRows(), input.getNumColumns()); - LibMatrixDNN.relu_backward(input, dout, outputBlock, _numThreads); + LibMatrixDNN.reluBackward(input, dout, outputBlock, _numThreads); } // release inputs/outputs @@ -213,7 +213,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { else { // As we always fill the output first with bias outputBlock = getDenseOutputBlock(ec, input.getNumRows(), input.getNumColumns()); - LibMatrixDNN.bias_add(input, bias, outputBlock, _numThreads); + LibMatrixDNN.biasAdd(input, bias, outputBlock, _numThreads); } // release inputs/outputs @@ -274,7 +274,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { } else { outputBlock = getDenseOutputBlock(ec, N, C*H*W); - LibMatrixDNN.maxpooling_backward(matBlock, dout, outputBlock, params); + LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params); 
} ec.releaseMatrixInput(_in2.getName()); } @@ -296,7 +296,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { } else { outputBlock = getDenseOutputBlock(ec, K, C*R*S); - LibMatrixDNN.conv2d_backward_filter(matBlock, dout, outputBlock, params); + LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params); } ec.releaseMatrixInput(_in2.getName()); } @@ -307,7 +307,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { } else { outputBlock = getDenseOutputBlock(ec, N, C * H * W); - LibMatrixDNN.conv2d_backward_data(matBlock, dout, outputBlock, params); + LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params); } ec.releaseMatrixInput(_in2.getName()); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/0ff4f14b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java index f25f3a1..5c49a91 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java @@ -148,7 +148,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns()); MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName()); - LibMatrixCUDA.bias_add(input, bias, out); + LibMatrixCUDA.biasAdd(input, bias, out); // release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); @@ -162,7 +162,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName()); ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns()); - LibMatrixCUDA.relu_backward(input, dout, out); + LibMatrixCUDA.reluBackward(input, dout, out); // release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); ec.releaseMatrixInputForGPUInstruction(_input2.getName()); @@ -231,7 +231,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), K, C * R * S); MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName()); - LibMatrixCUDA.conv2d_backward_filter(image, dout, out, N, C, H, W, + LibMatrixCUDA.conv2dBackwardFilter(image, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); // TODO: For now always copy the device data to host // ec.gpuCtx.copyDeviceToHost(outputBlock); @@ -249,7 +249,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), N, C * H * W); MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName()); - LibMatrixCUDA.conv2d_backward_data(filter, dout, out, N, C, H, W, + LibMatrixCUDA.conv2dBackwardData(filter, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("maxpooling")) { @@ -278,7 +278,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction ec.setMetaData(_output.getName(), N, C * H * W); MatrixObject out = ec.getDenseMatrixOutputForGPUInstruction(_output.getName()); - LibMatrixCUDA.maxpooling_backward(image, dout, out, N, C, H, W, + 
LibMatrixCUDA.maxpoolingBackward(image, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/0ff4f14b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index 4cfe79f..f160bc7 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -42,7 +42,6 @@ import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor; import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor; import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor; import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor; -import static jcuda.jcudnn.JCudnn.cudnnActivationBackward; import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION; import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE; import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX; @@ -72,8 +71,6 @@ import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject; import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.CSRPointer; import org.apache.sysml.runtime.matrix.operators.*; import org.apache.sysml.utils.Statistics; -import static jcuda.jcudnn.JCudnn.cudnnAddTensor; - import jcuda.Pointer; import jcuda.Sizeof; import jcuda.jcublas.JCublas2; @@ -245,7 +242,15 @@ public class LibMatrixCUDA { return poolingDesc; } - public static void relu_backward(MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException { + /** + * This method computes the backpropagation errors for previous layer of relu operation + * + * @param input + * @param dout + * @param outputBlock + * @throws DMLRuntimeException + */ + public static void reluBackward(MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException { if(isInSparseFormat(input)) { ((JCudaObject)input.getGPUObject()).sparseToDense(); } @@ -257,53 +262,67 @@ public class LibMatrixCUDA { Pointer imagePointer = ((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr; Pointer doutPointer = ((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr; Pointer outputPointer = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; - kernels.launchKernel("relu_backward", + kernels.launchKernel("reluBackward", ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols), imagePointer, doutPointer, outputPointer, (int)rows, (int)cols); } - public static void bias_add(MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException { + /** + * Performs the operation corresponding to the DML script: + * ones = matrix(1, rows=1, cols=Hout*Wout) + * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout) + * This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function + * + * @param input + * @param bias + * @param outputBlock + * @throws DMLRuntimeException + */ + public static void biasAdd(MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException { if(isInSparseFormat(input)) { ((JCudaObject)input.getGPUObject()).sparseToDense(); } if(isInSparseFormat(bias)) { ((JCudaObject)bias.getGPUObject()).sparseToDense(); } - Pointer alpha = null; - Pointer 
beta = null; - cudnnTensorDescriptor biasTensorDesc = null; - cudnnTensorDescriptor dstTensorDesc = null; - try { - int N = (int) input.getNumRows(); - int K = (int) bias.getNumRows(); - int PQ = (int) input.getNumColumns() / K; - alpha = pointerTo(1.0); // TODO - beta = pointerTo(1.0); - - // Allocate descriptors - biasTensorDesc = allocateTensorDescriptor(1, K, 1, 1); - dstTensorDesc = allocateTensorDescriptor(N, K, PQ, 1); - Pointer imagePointer = ((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr; - Pointer biasPointer = ((JCudaObject)bias.getGPUObject()).jcudaDenseMatrixPtr; - Pointer outputPointer = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; - // TODO: Avoid memcpy by allowing update in-place bias_add - cudaMemcpy(outputPointer, imagePointer, N*K*PQ*Sizeof.DOUBLE, jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice); - cudnnAddTensor(cudnnHandle, alpha, biasTensorDesc, biasPointer, beta, dstTensorDesc, outputPointer); - } - finally { - if(alpha != null) - cudaFree(alpha); - if(beta != null) - cudaFree(beta); - if(biasTensorDesc != null) - cudnnDestroyTensorDescriptor(biasTensorDesc); - if(dstTensorDesc != null) - cudnnDestroyTensorDescriptor(dstTensorDesc); + long rows = input.getNumRows(); + long cols = input.getNumColumns(); + long K = bias.getNumRows(); + long PQ = cols / K; + if(bias.getNumColumns() != 1 || cols % K != 0) { + throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]"); } + Pointer imagePointer = ((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr; + Pointer biasPointer = ((JCudaObject)bias.getGPUObject()).jcudaDenseMatrixPtr; + Pointer outputPointer = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; + kernels.launchKernel("biasAdd", + ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols), + imagePointer, biasPointer, outputPointer, (int)rows, (int)cols, (int) PQ); } - public static void conv2d_backward_filter(MatrixObject image, MatrixObject dout, + /** + * This method computes the backpropogation errors for filter of convolution operation + * + * @param image input image + * @param dout errors from next layer + * @param outputBlock output errors + * @param N number of images + * @param C number of channels + * @param H height + * @param W width + * @param K number of filters + * @param R filter height + * @param S filter width + * @param pad_h pad height + * @param pad_w pad width + * @param stride_h stride height + * @param stride_w stride width + * @param P output activation height + * @param Q output activation width + * @throws DMLRuntimeException + */ + public static void conv2dBackwardFilter(MatrixObject image, MatrixObject dout, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { @@ -1290,7 +1309,28 @@ public class LibMatrixCUDA { //********************************************************************/ - public static void conv2d_backward_data(MatrixObject filter, MatrixObject dout, + /** + * This method computes the backpropogation errors for previous layer of convolution operation + * + * @param filter filter used in conv2d + * @param dout errors from next layer + * @param output output errors + * @param N number of images + * @param C number of channels + * @param H height + * @param W width + * @param K number of filters + * @param R filter height + * @param S filter width + * @param 
pad_h pad height + * @param pad_w pad width + * @param stride_h stride height + * @param stride_w stride width + * @param P output activation height + * @param Q output activation width + * @throws DMLRuntimeException + */ + public static void conv2dBackwardData(MatrixObject filter, MatrixObject dout, MatrixObject output, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { @@ -1426,7 +1466,9 @@ public class LibMatrixCUDA { } /** - * performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...) + * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...) + * This method computes the backpropogation errors for previous layer of maxpooling operation + * * @param image image as matrix object * @param dout delta matrix, output of previous layer * @param outputBlock output matrix @@ -1445,7 +1487,7 @@ public class LibMatrixCUDA { * @param Q (W - S + 1 + 2*pad_w)/stride_w * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public static void maxpooling_backward(MatrixObject image, MatrixObject dout, + public static void maxpoolingBackward(MatrixObject image, MatrixObject dout, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/0ff4f14b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java index 89cdff8..63571c3 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java @@ -126,7 +126,16 @@ public class LibMatrixDNN { } // ------------------------------------------------------------------------------------------------ - public static void conv2d_backward_data(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + /** + * This method computes the backpropogation errors for previous layer of convolution operation + * + * @param filter filter used in conv2d + * @param dout errors from next layer + * @param outputBlock output errors + * @param params + * @throws DMLRuntimeException + */ + public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { params.input1 = filter; params.input2 = dout; params.output = outputBlock; @@ -150,7 +159,16 @@ public class LibMatrixDNN { runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params); } - public static void conv2d_backward_filter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + /** + * This method computes the backpropogation errors for filter of convolution operation + * + * @param image input image + * @param dout errors from next layer + * @param outputBlock output errors + * @param params + * @throws DMLRuntimeException + */ + public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { params.input1 = input; params.input2 = dout; params.output = outputBlock; @@ -355,8 +373,16 @@ 
public class LibMatrixDNN { // ----------------------------------------------------------------------------- } - - public static void maxpooling_backward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + /** + * This method computes the backpropogation errors for previous layer of maxpooling operation + * + * @param input + * @param dout + * @param outputBlock + * @param params + * @throws DMLRuntimeException + */ + public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { params.input1 = input; params.input2 = dout; params.output = outputBlock; @@ -565,7 +591,16 @@ public class LibMatrixDNN { return maxIndex; } - public static void relu_backward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, int numThreads) throws DMLRuntimeException { + /** + * This method computes the backpropagation errors for previous layer of relu operation + * + * @param input + * @param dout + * @param outputBlock + * @param numThreads + * @throws DMLRuntimeException + */ + public static void reluBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, int numThreads) throws DMLRuntimeException { int N = input.getNumRows(); ConvolutionParameters params = new ConvolutionParameters(N, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, numThreads); params.input1 = input; @@ -626,7 +661,20 @@ public class LibMatrixDNN { } } - public static void bias_add(MatrixBlock input, MatrixBlock bias, MatrixBlock outputBlock, int numThreads) throws DMLRuntimeException { + + /** + * Performs the operation corresponding to the DML script: + * ones = matrix(1, rows=1, cols=Hout*Wout) + * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout) + * This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function + * + * @param input + * @param bias + * @param outputBlock + * @param numThreads + * @throws DMLRuntimeException + */ + public static void biasAdd(MatrixBlock input, MatrixBlock bias, MatrixBlock outputBlock, int numThreads) throws DMLRuntimeException { int N = input.getNumRows(); int K = bias.getNumRows(); int PQ = input.getNumColumns() / K; @@ -636,17 +684,28 @@ public class LibMatrixDNN { params.input2 = bias; params.output = outputBlock; + if(!input.isInSparseFormat() && TEST_SPARSE_INPUT) { + input.denseToSparse(); + } + if(!bias.isInSparseFormat() && TEST_SPARSE_FILTER) { + bias.denseToSparse(); + } + + if(bias.getNumColumns() != 1 || input.getNumColumns() % K != 0) { + throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + N + " X " + input.getNumColumns() + "] and bias[" + K + " X " + bias.getNumColumns() + "]"); + } + if(input.isEmptyBlock()) { double [] outputArray = outputBlock.getDenseBlock(); for(int n = 0; n < N; n++) - fillBias(bias, outputArray, n, N, K, PQ); + fillBias(bias, outputArray, n, n+1, N, K, PQ); } else { runConvTask(TaskType.BiasAdd, params); } } - private static void doBiasAdd(int n, ConvolutionParameters params) throws DMLRuntimeException { + private static void doBiasAdd(int n1, int n2, ConvolutionParameters params) throws DMLRuntimeException { double [] outputArray = params.output.getDenseBlock(); int PQ = params.C; int numOutCols = params.input1.getNumColumns(); @@ -655,18 +714,19 @@ public class LibMatrixDNN { double [] inputArr = params.input1.getDenseBlock(); double [] biasArr = params.input2.getDenseBlock(); int K = params.K; - 
final int inputOffset = n*K*PQ; - for(int k = 0; k < K; k++) { - int offset = inputOffset + k*PQ; - for(int pq = 0; pq < PQ; pq++) { - outputArray[offset + pq] = inputArr[offset + pq] + biasArr[k]; + int index = n1*K*PQ; + for(int n = n1; n < n2; n++) { + for(int k = 0; k < K; k++) { + for(int pq = 0; pq < PQ; pq++, index++) { + outputArray[index] = inputArr[index] + biasArr[k]; + } } } } else { - fillBias(params.input2, outputArray, n, params.N, params.K, PQ); + fillBias(params.input2, outputArray, n1, n2, params.N, params.K, PQ); if(params.input1.isInSparseFormat()) { - Iterator<IJV> iter = params.input1.sparseBlock.getIterator(n, n+1); + Iterator<IJV> iter = params.input1.sparseBlock.getIterator(n1, n2); while(iter.hasNext()) { IJV ijv = iter.next(); int i = ijv.getI(); @@ -676,7 +736,7 @@ public class LibMatrixDNN { } else { double [] inputArr = params.input1.getDenseBlock(); - for(int i = n*numOutCols; i < (n+1)*numOutCols; i++) { + for(int i = n1*numOutCols; i < n2*numOutCols; i++) { outputArray[i] += inputArr[i]; } } @@ -684,23 +744,27 @@ public class LibMatrixDNN { } - private static void fillBias(MatrixBlock bias, double [] outputArray, int n, int N, int K, int PQ) { + private static void fillBias(MatrixBlock bias, double [] outputArray, int n1, int n2, int N, int K, int PQ) { if(bias.isInSparseFormat()) { Iterator<IJV> iter = bias.sparseBlock.getIterator(); while(iter.hasNext()) { IJV ijv = iter.next(); int k = ijv.getI(); double val = ijv.getV(); - int fromIndex = n*K*PQ + k*PQ; - Arrays.fill(outputArray, fromIndex, fromIndex + PQ, val); + for(int n = n1; n < n2; n++) { + int fromIndex = n*K*PQ + k*PQ; + Arrays.fill(outputArray, fromIndex, fromIndex + PQ, val); + } } } else { double [] biasArr = bias.getDenseBlock(); - for(int k = 0; k < K; k++) { - int fromIndex = n*K*PQ + k*PQ; - double val = biasArr[k]; - Arrays.fill(outputArray, fromIndex, fromIndex + PQ, val); + for(int n = n1; n < n2; n++) { + for(int k = 0; k < K; k++) { + int fromIndex = n*K*PQ + k*PQ; + double val = biasArr[k]; + Arrays.fill(outputArray, fromIndex, fromIndex + PQ, val); + } } } } @@ -928,8 +992,7 @@ public class LibMatrixDNN { doPoolingBackward(n, params); break; case BiasAdd: - for(int n = n1; n < n2; n++) - doBiasAdd(n, params); + doBiasAdd(n1, n2, params); break; case ReluBackward: for(int n = n1; n < n2; n++)
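For reference, the element mapping implemented by the new biasAdd kernel and by LibMatrixDNN.biasAdd follows the DML comment in the diff (ones = matrix(1, rows=1, cols=Hout*Wout); output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)). The sketch below is a minimal, hypothetical standalone illustration, not part of this commit (class and method names are invented), assuming a dense row-major input of shape N x (K*PQ) and a K x 1 bias; the kernel's biasIndex = iy / PQ is exactly this column-to-filter mapping.

public class BiasAddSketch {

    // Loop form: what the CUDA kernel and the dense CPU path compute per cell.
    static double[] biasAddLoop(double[] input, double[] bias, int N, int K, int PQ) {
        double[] ret = new double[N * K * PQ];
        for (int n = 0; n < N; n++) {
            for (int k = 0; k < K; k++) {
                for (int pq = 0; pq < PQ; pq++) {
                    int index = n * K * PQ + k * PQ + pq;   // row-major cell (row n, column k*PQ+pq)
                    ret[index] = input[index] + bias[k];    // bias value replicated across the PQ cells of filter k
                }
            }
        }
        return ret;
    }

    // Ones-vector form, literally following the DML comment:
    //   ones = matrix(1, rows=1, cols=PQ); output = input + matrix(bias %*% ones, rows=1, cols=K*PQ)
    static double[] biasAddOnes(double[] input, double[] bias, int N, int K, int PQ) {
        double[] rowBias = new double[K * PQ];              // bias %*% ones, flattened to one row
        for (int k = 0; k < K; k++)
            for (int pq = 0; pq < PQ; pq++)
                rowBias[k * PQ + pq] = bias[k] * 1.0;
        double[] ret = new double[N * K * PQ];
        for (int n = 0; n < N; n++)                          // broadcast the row over all N images
            for (int c = 0; c < K * PQ; c++)
                ret[n * K * PQ + c] = input[n * K * PQ + c] + rowBias[c];
        return ret;
    }

    public static void main(String[] args) {
        int N = 2, K = 3, PQ = 4;
        double[] input = new double[N * K * PQ];
        for (int i = 0; i < input.length; i++) input[i] = i;
        double[] bias = {10, 20, 30};
        System.out.println(java.util.Arrays.equals(
            biasAddLoop(input, bias, N, K, PQ), biasAddOnes(input, bias, N, K, PQ)));  // prints true
    }
}

Both forms give identical results; the loop form avoids materializing the bias %*% ones intermediate, which is what the new kernel (and the dense CPU path in doBiasAdd) effectively do.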
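The renamed reluBackward kernel and CPU path compute the gradient of relu with respect to its input. The kernel body is not fully shown in this hunk, so the following is only an assumed sketch of the standard rule (hypothetical names, not from the commit): the incoming error dout is passed through only where the forward input X was positive.

public class ReluBackwardSketch {
    static double[] reluBackward(double[] X, double[] dout) {
        double[] ret = new double[X.length];
        for (int i = 0; i < X.length; i++)
            ret[i] = X[i] > 0 ? dout[i] : 0.0;   // gradient flows only where the forward input was positive
        return ret;
    }

    public static void main(String[] args) {
        double[] X = {-1.0, 2.0, 0.0, 3.5};
        double[] dout = {0.1, 0.2, 0.3, 0.4};
        System.out.println(java.util.Arrays.toString(reluBackward(X, dout)));  // prints [0.0, 0.2, 0.0, 0.4]
    }
}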
