[SYSTEMML-1758] Added cbind and rbind for GPU Closes #570
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4e47b5e1 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4e47b5e1 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4e47b5e1 Branch: refs/heads/master Commit: 4e47b5e10ff1abdf1ef53c2b1b0d80614ec8e416 Parents: cd1ae5b Author: Nakul Jindal <[email protected]> Authored: Thu Jul 13 14:31:47 2017 -0700 Committer: Nakul Jindal <[email protected]> Committed: Thu Jul 13 14:31:47 2017 -0700 ---------------------------------------------------------------------- src/main/cpp/kernels/SystemML.cu | 78 +- src/main/cpp/kernels/SystemML.ptx | 1043 ++++++++++-------- .../java/org/apache/sysml/hops/BinaryOp.java | 21 +- src/main/java/org/apache/sysml/lops/Append.java | 95 ++ .../java/org/apache/sysml/lops/AppendCP.java | 93 -- .../instructions/CPInstructionParser.java | 4 +- .../instructions/GPUInstructionParser.java | 17 +- .../gpu/BuiltinUnaryGPUInstruction.java | 2 +- .../instructions/gpu/GPUInstruction.java | 3 + .../gpu/MatrixAppendGPUInstruction.java | 102 ++ .../runtime/matrix/data/LibMatrixCUDA.java | 109 +- .../org/apache/sysml/test/gpu/AppendTest.java | 108 ++ .../test/integration/gpu/ZPackageSuite.java | 2 + 13 files changed, 1099 insertions(+), 578 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/cpp/kernels/SystemML.cu ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu index 3098282..297269f 100644 --- a/src/main/cpp/kernels/SystemML.cu +++ b/src/main/cpp/kernels/SystemML.cu @@ -216,7 +216,7 @@ __global__ void matrix_matrix_cellwise_op(double* A, double* B, double* C, bIndex = iy; // rlen == 1 C[outIndex] = binaryOp(A[aIndex], B[bIndex], op); //printf("C[%d] = A[%d](%f) B[%d](%f) (%d %d)\n", outIndex, aIndex, A[aIndex], 
bIndex, B[bIndex], (ix+1), (iy+1)); - __syncthreads(); + __syncthreads(); } } @@ -238,9 +238,9 @@ __global__ void matrix_scalar_op(double* A, double scalar, double* C, int size, C[index] = binaryOp(scalar, A[index], op); } else { C[index] = binaryOp(A[index], scalar, op); - } + } } - __syncthreads(); + __syncthreads(); } @@ -259,6 +259,78 @@ __global__ void fill(double* A, double scalar, int lenA) { } /** + * Appends Matrix B to the right side of Matrix A into a new matrix C + * | 1 2 3 4 | | 8 8 8 | | 1 2 3 4 8 8 8 | + * cbind ( | 9 8 7 6 | , | 7 7 7 | ) = | 9 8 7 6 7 7 7 | + * | 4 3 2 1 | | 9 9 9 | | 4 3 2 1 9 9 9 | + * @param A input matrix A allocated on the GPU + * @param B input matrix B allocated on the GPU + * @param C input matrix C allocated on the GPU + * @param rowsA rows in A + * @param colsA columns in A + * @param rowsB rows in B + * @param colsB columns in B + */ +extern "C" +__global__ void cbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) { + int ix = blockIdx.x * blockDim.x + threadIdx.x; + int iy = blockIdx.y * blockDim.y + threadIdx.y; + + int colsC = colsA + colsB; + int rowsC = rowsA; + + // Copy an element of A into C into the appropriate location + if (ix < rowsA && iy < colsA) { + double elemA = A[ix * colsA + iy]; + C[ix * colsC + iy] = elemA; + } + + // Copy an element of B into C into the appropriate location + if (ix < rowsB && iy < colsB) { + double elemB = B[ix * colsB + iy]; + C[ix * colsC + (iy + colsA)] = elemB; + } +} + + +/** + * Appends Matrix B to the bottom of Matrix A into a new matrix C + * | 2 3 4 | | 8 8 8 | | 2 3 4 | + * rbind ( | 8 7 6 | , | 7 7 7 | ) = | 8 7 6 | + * | 3 2 1 | | 3 2 1 | + | 8 8 8 | + | 7 7 7 | + * @param A input matrix A allocated on the GPU + * @param B input matrix B allocated on the GPU + * @param C input matrix C allocated on the GPU + * @param rowsA rows in A + * @param colsA columns in A + * @param rowsB rows in B + * @param colsB columns in B + */ +extern "C" 
+__global__ void rbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) { + int ix = blockIdx.x * blockDim.x + threadIdx.x; + int iy = blockIdx.y * blockDim.y + threadIdx.y; + + int rowsC = rowsA + rowsB; + int colsC = colsA; + + // Copy an element of A into C into the appropriate location + if (ix < rowsA && iy < colsA) { + double elemA = A[ix * colsA + iy]; + C[ix * colsC + iy] = elemA; + } + + // Copy an element of B into C into the appropriate location + if (ix < rowsB && iy < colsB) { + double elemB = B[ix * colsB + iy]; + C[(ix + rowsA) * colsC + iy] = elemB; + } +} + + +/** * Does a reduce operation over all elements of the array. * This method has been adapted from the Reduction sample in the NVIDIA CUDA Samples (v8.0) * and the Reduction example available through jcuda.org http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index ab43758..6884d5b 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -1813,6 +1813,151 @@ BB9_2: ret; } + // .globl cbind +.visible .entry cbind( + .param .u64 cbind_param_0, + .param .u64 cbind_param_1, + .param .u64 cbind_param_2, + .param .u32 cbind_param_3, + .param .u32 cbind_param_4, + .param .u32 cbind_param_5, + .param .u32 cbind_param_6 +) +{ + .reg .pred %p<7>; + .reg .b32 %r<19>; + .reg .f64 %fd<3>; + .reg .b64 %rd<15>; + + + ld.param.u64 %rd2, [cbind_param_0]; + ld.param.u64 %rd3, [cbind_param_1]; + ld.param.u64 %rd4, [cbind_param_2]; + ld.param.u32 %r7, [cbind_param_3]; + ld.param.u32 %r4, [cbind_param_4]; + ld.param.u32 %r5, [cbind_param_5]; + ld.param.u32 %r6, [cbind_param_6]; + cvta.to.global.u64 %rd1, %rd4; + mov.u32 %r8, %ntid.x; + mov.u32 %r9, %ctaid.x; + mov.u32 %r10, %tid.x; + mad.lo.s32 %r1, %r8, %r9, %r10; + mov.u32 %r11, %ntid.y; + mov.u32 %r12, 
%ctaid.y; + mov.u32 %r13, %tid.y; + mad.lo.s32 %r2, %r11, %r12, %r13; + add.s32 %r3, %r6, %r4; + setp.lt.s32 %p1, %r1, %r7; + setp.lt.s32 %p2, %r2, %r4; + and.pred %p3, %p1, %p2; + @!%p3 bra BB10_2; + bra.uni BB10_1; + +BB10_1: + cvta.to.global.u64 %rd5, %rd2; + mad.lo.s32 %r14, %r1, %r4, %r2; + mul.wide.s32 %rd6, %r14, 8; + add.s64 %rd7, %rd5, %rd6; + ld.global.f64 %fd1, [%rd7]; + mad.lo.s32 %r15, %r1, %r3, %r2; + mul.wide.s32 %rd8, %r15, 8; + add.s64 %rd9, %rd1, %rd8; + st.global.f64 [%rd9], %fd1; + +BB10_2: + setp.lt.s32 %p4, %r1, %r5; + setp.lt.s32 %p5, %r2, %r6; + and.pred %p6, %p4, %p5; + @!%p6 bra BB10_4; + bra.uni BB10_3; + +BB10_3: + cvta.to.global.u64 %rd10, %rd3; + mad.lo.s32 %r16, %r1, %r6, %r2; + mul.wide.s32 %rd11, %r16, 8; + add.s64 %rd12, %rd10, %rd11; + ld.global.f64 %fd2, [%rd12]; + mad.lo.s32 %r17, %r1, %r3, %r4; + add.s32 %r18, %r17, %r2; + mul.wide.s32 %rd13, %r18, 8; + add.s64 %rd14, %rd1, %rd13; + st.global.f64 [%rd14], %fd2; + +BB10_4: + ret; +} + + // .globl rbind +.visible .entry rbind( + .param .u64 rbind_param_0, + .param .u64 rbind_param_1, + .param .u64 rbind_param_2, + .param .u32 rbind_param_3, + .param .u32 rbind_param_4, + .param .u32 rbind_param_5, + .param .u32 rbind_param_6 +) +{ + .reg .pred %p<7>; + .reg .b32 %r<17>; + .reg .f64 %fd<3>; + .reg .b64 %rd<14>; + + + ld.param.u64 %rd2, [rbind_param_0]; + ld.param.u64 %rd3, [rbind_param_1]; + ld.param.u64 %rd4, [rbind_param_2]; + ld.param.u32 %r3, [rbind_param_3]; + ld.param.u32 %r4, [rbind_param_4]; + ld.param.u32 %r5, [rbind_param_5]; + ld.param.u32 %r6, [rbind_param_6]; + cvta.to.global.u64 %rd1, %rd4; + mov.u32 %r7, %ntid.x; + mov.u32 %r8, %ctaid.x; + mov.u32 %r9, %tid.x; + mad.lo.s32 %r1, %r7, %r8, %r9; + mov.u32 %r10, %ntid.y; + mov.u32 %r11, %ctaid.y; + mov.u32 %r12, %tid.y; + mad.lo.s32 %r2, %r10, %r11, %r12; + setp.lt.s32 %p1, %r1, %r3; + setp.lt.s32 %p2, %r2, %r4; + and.pred %p3, %p1, %p2; + @!%p3 bra BB11_2; + bra.uni BB11_1; + +BB11_1: + cvta.to.global.u64 %rd5, %rd2; + 
mad.lo.s32 %r13, %r1, %r4, %r2; + mul.wide.s32 %rd6, %r13, 8; + add.s64 %rd7, %rd5, %rd6; + ld.global.f64 %fd1, [%rd7]; + add.s64 %rd8, %rd1, %rd6; + st.global.f64 [%rd8], %fd1; + +BB11_2: + setp.lt.s32 %p4, %r1, %r5; + setp.lt.s32 %p5, %r2, %r6; + and.pred %p6, %p4, %p5; + @!%p6 bra BB11_4; + bra.uni BB11_3; + +BB11_3: + cvta.to.global.u64 %rd9, %rd3; + mad.lo.s32 %r14, %r1, %r6, %r2; + mul.wide.s32 %rd10, %r14, 8; + add.s64 %rd11, %rd9, %rd10; + ld.global.f64 %fd2, [%rd11]; + add.s32 %r15, %r1, %r3; + mad.lo.s32 %r16, %r15, %r4, %r2; + mul.wide.s32 %rd12, %r16, 8; + add.s64 %rd13, %rd1, %rd12; + st.global.f64 [%rd13], %fd2; + +BB11_4: + ret; +} + // .globl reduce_sum .visible .entry reduce_sum( .param .u64 reduce_sum_param_0, @@ -1837,9 +1982,9 @@ BB9_2: mov.f64 %fd76, 0d0000000000000000; mov.f64 %fd77, %fd76; setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB10_4; + @%p1 bra BB12_4; -BB10_1: +BB12_1: mov.f64 %fd1, %fd77; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r32, 8; @@ -1848,23 +1993,23 @@ BB10_1: add.f64 %fd78, %fd1, %fd30; add.s32 %r3, %r32, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB10_3; + @%p2 bra BB12_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd31, [%rd9]; add.f64 %fd78, %fd78, %fd31; -BB10_3: +BB12_3: mov.f64 %fd77, %fd78; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r32, %r12, %r13, %r32; setp.lt.u32 %p3, %r32, %r5; mov.f64 %fd76, %fd77; - @%p3 bra BB10_1; + @%p3 bra BB12_1; -BB10_4: +BB12_4: mov.f64 %fd74, %fd76; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -1872,130 +2017,130 @@ BB10_4: st.shared.f64 [%rd1], %fd74; bar.sync 0; setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB10_8; + @%p4 bra BB12_8; setp.gt.u32 %p5, %r6, 511; mov.f64 %fd75, %fd74; - @%p5 bra BB10_7; + @%p5 bra BB12_7; ld.shared.f64 %fd32, [%rd1+4096]; add.f64 %fd75, %fd74, %fd32; st.shared.f64 [%rd1], %fd75; -BB10_7: +BB12_7: mov.f64 %fd74, %fd75; bar.sync 0; -BB10_8: +BB12_8: mov.f64 %fd72, %fd74; setp.lt.u32 %p6, %r9, 512; - @%p6 bra 
BB10_12; + @%p6 bra BB12_12; setp.gt.u32 %p7, %r6, 255; mov.f64 %fd73, %fd72; - @%p7 bra BB10_11; + @%p7 bra BB12_11; ld.shared.f64 %fd33, [%rd1+2048]; add.f64 %fd73, %fd72, %fd33; st.shared.f64 [%rd1], %fd73; -BB10_11: +BB12_11: mov.f64 %fd72, %fd73; bar.sync 0; -BB10_12: +BB12_12: mov.f64 %fd70, %fd72; setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB10_16; + @%p8 bra BB12_16; setp.gt.u32 %p9, %r6, 127; mov.f64 %fd71, %fd70; - @%p9 bra BB10_15; + @%p9 bra BB12_15; ld.shared.f64 %fd34, [%rd1+1024]; add.f64 %fd71, %fd70, %fd34; st.shared.f64 [%rd1], %fd71; -BB10_15: +BB12_15: mov.f64 %fd70, %fd71; bar.sync 0; -BB10_16: +BB12_16: mov.f64 %fd68, %fd70; setp.lt.u32 %p10, %r9, 128; - @%p10 bra BB10_20; + @%p10 bra BB12_20; setp.gt.u32 %p11, %r6, 63; mov.f64 %fd69, %fd68; - @%p11 bra BB10_19; + @%p11 bra BB12_19; ld.shared.f64 %fd35, [%rd1+512]; add.f64 %fd69, %fd68, %fd35; st.shared.f64 [%rd1], %fd69; -BB10_19: +BB12_19: mov.f64 %fd68, %fd69; bar.sync 0; -BB10_20: +BB12_20: mov.f64 %fd67, %fd68; setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB10_33; + @%p12 bra BB12_33; setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB10_23; + @%p13 bra BB12_23; ld.volatile.shared.f64 %fd36, [%rd1+256]; add.f64 %fd67, %fd67, %fd36; st.volatile.shared.f64 [%rd1], %fd67; -BB10_23: +BB12_23: mov.f64 %fd66, %fd67; setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB10_25; + @%p14 bra BB12_25; ld.volatile.shared.f64 %fd37, [%rd1+128]; add.f64 %fd66, %fd66, %fd37; st.volatile.shared.f64 [%rd1], %fd66; -BB10_25: +BB12_25: mov.f64 %fd65, %fd66; setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB10_27; + @%p15 bra BB12_27; ld.volatile.shared.f64 %fd38, [%rd1+64]; add.f64 %fd65, %fd65, %fd38; st.volatile.shared.f64 [%rd1], %fd65; -BB10_27: +BB12_27: mov.f64 %fd64, %fd65; setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB10_29; + @%p16 bra BB12_29; ld.volatile.shared.f64 %fd39, [%rd1+32]; add.f64 %fd64, %fd64, %fd39; st.volatile.shared.f64 [%rd1], %fd64; -BB10_29: +BB12_29: mov.f64 %fd63, %fd64; setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB10_31; + @%p17 
bra BB12_31; ld.volatile.shared.f64 %fd40, [%rd1+16]; add.f64 %fd63, %fd63, %fd40; st.volatile.shared.f64 [%rd1], %fd63; -BB10_31: +BB12_31: setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB10_33; + @%p18 bra BB12_33; ld.volatile.shared.f64 %fd41, [%rd1+8]; add.f64 %fd42, %fd63, %fd41; st.volatile.shared.f64 [%rd1], %fd42; -BB10_33: +BB12_33: setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB10_35; + @%p19 bra BB12_35; ld.shared.f64 %fd43, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -2003,7 +2148,7 @@ BB10_33: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd43; -BB10_35: +BB12_35: ret; } @@ -2027,17 +2172,17 @@ BB10_35: ld.param.u32 %r4, [reduce_row_sum_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB11_35; + @%p1 bra BB13_35; mov.u32 %r38, %tid.x; mov.f64 %fd72, 0d0000000000000000; mov.f64 %fd73, %fd72; setp.ge.u32 %p2, %r38, %r4; - @%p2 bra BB11_4; + @%p2 bra BB13_4; cvta.to.global.u64 %rd3, %rd1; -BB11_3: +BB13_3: mad.lo.s32 %r8, %r6, %r4, %r38; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -2047,9 +2192,9 @@ BB11_3: add.s32 %r38, %r9, %r38; setp.lt.u32 %p3, %r38, %r4; mov.f64 %fd72, %fd73; - @%p3 bra BB11_3; + @%p3 bra BB13_3; -BB11_4: +BB13_4: mov.f64 %fd70, %fd72; mov.u32 %r10, %tid.x; mul.wide.u32 %rd6, %r10, 8; @@ -2059,130 +2204,130 @@ BB11_4: bar.sync 0; mov.u32 %r11, %ntid.x; setp.lt.u32 %p4, %r11, 1024; - @%p4 bra BB11_8; + @%p4 bra BB13_8; setp.gt.u32 %p5, %r10, 511; mov.f64 %fd71, %fd70; - @%p5 bra BB11_7; + @%p5 bra BB13_7; ld.shared.f64 %fd29, [%rd8+4096]; add.f64 %fd71, %fd70, %fd29; st.shared.f64 [%rd8], %fd71; -BB11_7: +BB13_7: mov.f64 %fd70, %fd71; bar.sync 0; -BB11_8: +BB13_8: mov.f64 %fd68, %fd70; setp.lt.u32 %p6, %r11, 512; - @%p6 bra BB11_12; + @%p6 bra BB13_12; setp.gt.u32 %p7, %r10, 255; mov.f64 %fd69, %fd68; - @%p7 bra BB11_11; + @%p7 bra BB13_11; ld.shared.f64 %fd30, [%rd8+2048]; add.f64 %fd69, %fd68, %fd30; st.shared.f64 [%rd8], %fd69; -BB11_11: +BB13_11: mov.f64 %fd68, %fd69; bar.sync 0; -BB11_12: +BB13_12: 
mov.f64 %fd66, %fd68; setp.lt.u32 %p8, %r11, 256; - @%p8 bra BB11_16; + @%p8 bra BB13_16; setp.gt.u32 %p9, %r10, 127; mov.f64 %fd67, %fd66; - @%p9 bra BB11_15; + @%p9 bra BB13_15; ld.shared.f64 %fd31, [%rd8+1024]; add.f64 %fd67, %fd66, %fd31; st.shared.f64 [%rd8], %fd67; -BB11_15: +BB13_15: mov.f64 %fd66, %fd67; bar.sync 0; -BB11_16: +BB13_16: mov.f64 %fd64, %fd66; setp.lt.u32 %p10, %r11, 128; - @%p10 bra BB11_20; + @%p10 bra BB13_20; setp.gt.u32 %p11, %r10, 63; mov.f64 %fd65, %fd64; - @%p11 bra BB11_19; + @%p11 bra BB13_19; ld.shared.f64 %fd32, [%rd8+512]; add.f64 %fd65, %fd64, %fd32; st.shared.f64 [%rd8], %fd65; -BB11_19: +BB13_19: mov.f64 %fd64, %fd65; bar.sync 0; -BB11_20: +BB13_20: mov.f64 %fd63, %fd64; setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB11_33; + @%p12 bra BB13_33; setp.lt.u32 %p13, %r11, 64; - @%p13 bra BB11_23; + @%p13 bra BB13_23; ld.volatile.shared.f64 %fd33, [%rd8+256]; add.f64 %fd63, %fd63, %fd33; st.volatile.shared.f64 [%rd8], %fd63; -BB11_23: +BB13_23: mov.f64 %fd62, %fd63; setp.lt.u32 %p14, %r11, 32; - @%p14 bra BB11_25; + @%p14 bra BB13_25; ld.volatile.shared.f64 %fd34, [%rd8+128]; add.f64 %fd62, %fd62, %fd34; st.volatile.shared.f64 [%rd8], %fd62; -BB11_25: +BB13_25: mov.f64 %fd61, %fd62; setp.lt.u32 %p15, %r11, 16; - @%p15 bra BB11_27; + @%p15 bra BB13_27; ld.volatile.shared.f64 %fd35, [%rd8+64]; add.f64 %fd61, %fd61, %fd35; st.volatile.shared.f64 [%rd8], %fd61; -BB11_27: +BB13_27: mov.f64 %fd60, %fd61; setp.lt.u32 %p16, %r11, 8; - @%p16 bra BB11_29; + @%p16 bra BB13_29; ld.volatile.shared.f64 %fd36, [%rd8+32]; add.f64 %fd60, %fd60, %fd36; st.volatile.shared.f64 [%rd8], %fd60; -BB11_29: +BB13_29: mov.f64 %fd59, %fd60; setp.lt.u32 %p17, %r11, 4; - @%p17 bra BB11_31; + @%p17 bra BB13_31; ld.volatile.shared.f64 %fd37, [%rd8+16]; add.f64 %fd59, %fd59, %fd37; st.volatile.shared.f64 [%rd8], %fd59; -BB11_31: +BB13_31: setp.lt.u32 %p18, %r11, 2; - @%p18 bra BB11_33; + @%p18 bra BB13_33; ld.volatile.shared.f64 %fd38, [%rd8+8]; add.f64 %fd39, %fd59, 
%fd38; st.volatile.shared.f64 [%rd8], %fd39; -BB11_33: +BB13_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB11_35; + @%p19 bra BB13_35; ld.shared.f64 %fd40, [sdata]; cvta.to.global.u64 %rd39, %rd2; @@ -2190,7 +2335,7 @@ BB11_33: add.s64 %rd41, %rd39, %rd40; st.global.f64 [%rd41], %fd40; -BB11_35: +BB13_35: ret; } @@ -2217,18 +2362,18 @@ BB11_35: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB12_5; + @%p1 bra BB14_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f64 %fd8, 0d0000000000000000; mov.f64 %fd9, %fd8; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB12_4; + @%p2 bra BB14_4; mov.u32 %r10, %r1; -BB12_3: +BB14_3: mov.u32 %r3, %r10; mul.wide.u32 %rd4, %r3, 8; add.s64 %rd5, %rd1, %rd4; @@ -2238,15 +2383,15 @@ BB12_3: setp.lt.u32 %p3, %r4, %r2; mov.u32 %r10, %r4; mov.f64 %fd8, %fd9; - @%p3 bra BB12_3; + @%p3 bra BB14_3; -BB12_4: +BB14_4: cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r1, 8; add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd8; -BB12_5: +BB14_5: ret; } @@ -2274,9 +2419,9 @@ BB12_5: mov.f64 %fd76, 0dFFEFFFFFFFFFFFFF; mov.f64 %fd77, %fd76; setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB13_4; + @%p1 bra BB15_4; -BB13_1: +BB15_1: mov.f64 %fd1, %fd77; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r32, 8; @@ -2285,23 +2430,23 @@ BB13_1: max.f64 %fd78, %fd1, %fd30; add.s32 %r3, %r32, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB13_3; + @%p2 bra BB15_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd31, [%rd9]; max.f64 %fd78, %fd78, %fd31; -BB13_3: +BB15_3: mov.f64 %fd77, %fd78; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r32, %r12, %r13, %r32; setp.lt.u32 %p3, %r32, %r5; mov.f64 %fd76, %fd77; - @%p3 bra BB13_1; + @%p3 bra BB15_1; -BB13_4: +BB15_4: mov.f64 %fd74, %fd76; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -2309,130 +2454,130 @@ BB13_4: st.shared.f64 [%rd1], %fd74; bar.sync 0; setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB13_8; + @%p4 bra BB15_8; 
setp.gt.u32 %p5, %r6, 511; mov.f64 %fd75, %fd74; - @%p5 bra BB13_7; + @%p5 bra BB15_7; ld.shared.f64 %fd32, [%rd1+4096]; max.f64 %fd75, %fd74, %fd32; st.shared.f64 [%rd1], %fd75; -BB13_7: +BB15_7: mov.f64 %fd74, %fd75; bar.sync 0; -BB13_8: +BB15_8: mov.f64 %fd72, %fd74; setp.lt.u32 %p6, %r9, 512; - @%p6 bra BB13_12; + @%p6 bra BB15_12; setp.gt.u32 %p7, %r6, 255; mov.f64 %fd73, %fd72; - @%p7 bra BB13_11; + @%p7 bra BB15_11; ld.shared.f64 %fd33, [%rd1+2048]; max.f64 %fd73, %fd72, %fd33; st.shared.f64 [%rd1], %fd73; -BB13_11: +BB15_11: mov.f64 %fd72, %fd73; bar.sync 0; -BB13_12: +BB15_12: mov.f64 %fd70, %fd72; setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB13_16; + @%p8 bra BB15_16; setp.gt.u32 %p9, %r6, 127; mov.f64 %fd71, %fd70; - @%p9 bra BB13_15; + @%p9 bra BB15_15; ld.shared.f64 %fd34, [%rd1+1024]; max.f64 %fd71, %fd70, %fd34; st.shared.f64 [%rd1], %fd71; -BB13_15: +BB15_15: mov.f64 %fd70, %fd71; bar.sync 0; -BB13_16: +BB15_16: mov.f64 %fd68, %fd70; setp.lt.u32 %p10, %r9, 128; - @%p10 bra BB13_20; + @%p10 bra BB15_20; setp.gt.u32 %p11, %r6, 63; mov.f64 %fd69, %fd68; - @%p11 bra BB13_19; + @%p11 bra BB15_19; ld.shared.f64 %fd35, [%rd1+512]; max.f64 %fd69, %fd68, %fd35; st.shared.f64 [%rd1], %fd69; -BB13_19: +BB15_19: mov.f64 %fd68, %fd69; bar.sync 0; -BB13_20: +BB15_20: mov.f64 %fd67, %fd68; setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB13_33; + @%p12 bra BB15_33; setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB13_23; + @%p13 bra BB15_23; ld.volatile.shared.f64 %fd36, [%rd1+256]; max.f64 %fd67, %fd67, %fd36; st.volatile.shared.f64 [%rd1], %fd67; -BB13_23: +BB15_23: mov.f64 %fd66, %fd67; setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB13_25; + @%p14 bra BB15_25; ld.volatile.shared.f64 %fd37, [%rd1+128]; max.f64 %fd66, %fd66, %fd37; st.volatile.shared.f64 [%rd1], %fd66; -BB13_25: +BB15_25: mov.f64 %fd65, %fd66; setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB13_27; + @%p15 bra BB15_27; ld.volatile.shared.f64 %fd38, [%rd1+64]; max.f64 %fd65, %fd65, %fd38; st.volatile.shared.f64 [%rd1], %fd65; 
-BB13_27: +BB15_27: mov.f64 %fd64, %fd65; setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB13_29; + @%p16 bra BB15_29; ld.volatile.shared.f64 %fd39, [%rd1+32]; max.f64 %fd64, %fd64, %fd39; st.volatile.shared.f64 [%rd1], %fd64; -BB13_29: +BB15_29: mov.f64 %fd63, %fd64; setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB13_31; + @%p17 bra BB15_31; ld.volatile.shared.f64 %fd40, [%rd1+16]; max.f64 %fd63, %fd63, %fd40; st.volatile.shared.f64 [%rd1], %fd63; -BB13_31: +BB15_31: setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB13_33; + @%p18 bra BB15_33; ld.volatile.shared.f64 %fd41, [%rd1+8]; max.f64 %fd42, %fd63, %fd41; st.volatile.shared.f64 [%rd1], %fd42; -BB13_33: +BB15_33: setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB13_35; + @%p19 bra BB15_35; ld.shared.f64 %fd43, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -2440,7 +2585,7 @@ BB13_33: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd43; -BB13_35: +BB15_35: ret; } @@ -2464,17 +2609,17 @@ BB13_35: ld.param.u32 %r4, [reduce_row_max_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB14_35; + @%p1 bra BB16_35; mov.u32 %r38, %tid.x; mov.f64 %fd72, 0dFFEFFFFFFFFFFFFF; mov.f64 %fd73, %fd72; setp.ge.u32 %p2, %r38, %r4; - @%p2 bra BB14_4; + @%p2 bra BB16_4; cvta.to.global.u64 %rd3, %rd1; -BB14_3: +BB16_3: mad.lo.s32 %r8, %r6, %r4, %r38; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -2484,9 +2629,9 @@ BB14_3: add.s32 %r38, %r9, %r38; setp.lt.u32 %p3, %r38, %r4; mov.f64 %fd72, %fd73; - @%p3 bra BB14_3; + @%p3 bra BB16_3; -BB14_4: +BB16_4: mov.f64 %fd70, %fd72; mov.u32 %r10, %tid.x; mul.wide.u32 %rd6, %r10, 8; @@ -2496,130 +2641,130 @@ BB14_4: bar.sync 0; mov.u32 %r11, %ntid.x; setp.lt.u32 %p4, %r11, 1024; - @%p4 bra BB14_8; + @%p4 bra BB16_8; setp.gt.u32 %p5, %r10, 511; mov.f64 %fd71, %fd70; - @%p5 bra BB14_7; + @%p5 bra BB16_7; ld.shared.f64 %fd29, [%rd8+4096]; max.f64 %fd71, %fd70, %fd29; st.shared.f64 [%rd8], %fd71; -BB14_7: +BB16_7: mov.f64 %fd70, %fd71; bar.sync 0; -BB14_8: +BB16_8: mov.f64 %fd68, %fd70; setp.lt.u32 
%p6, %r11, 512; - @%p6 bra BB14_12; + @%p6 bra BB16_12; setp.gt.u32 %p7, %r10, 255; mov.f64 %fd69, %fd68; - @%p7 bra BB14_11; + @%p7 bra BB16_11; ld.shared.f64 %fd30, [%rd8+2048]; max.f64 %fd69, %fd68, %fd30; st.shared.f64 [%rd8], %fd69; -BB14_11: +BB16_11: mov.f64 %fd68, %fd69; bar.sync 0; -BB14_12: +BB16_12: mov.f64 %fd66, %fd68; setp.lt.u32 %p8, %r11, 256; - @%p8 bra BB14_16; + @%p8 bra BB16_16; setp.gt.u32 %p9, %r10, 127; mov.f64 %fd67, %fd66; - @%p9 bra BB14_15; + @%p9 bra BB16_15; ld.shared.f64 %fd31, [%rd8+1024]; max.f64 %fd67, %fd66, %fd31; st.shared.f64 [%rd8], %fd67; -BB14_15: +BB16_15: mov.f64 %fd66, %fd67; bar.sync 0; -BB14_16: +BB16_16: mov.f64 %fd64, %fd66; setp.lt.u32 %p10, %r11, 128; - @%p10 bra BB14_20; + @%p10 bra BB16_20; setp.gt.u32 %p11, %r10, 63; mov.f64 %fd65, %fd64; - @%p11 bra BB14_19; + @%p11 bra BB16_19; ld.shared.f64 %fd32, [%rd8+512]; max.f64 %fd65, %fd64, %fd32; st.shared.f64 [%rd8], %fd65; -BB14_19: +BB16_19: mov.f64 %fd64, %fd65; bar.sync 0; -BB14_20: +BB16_20: mov.f64 %fd63, %fd64; setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB14_33; + @%p12 bra BB16_33; setp.lt.u32 %p13, %r11, 64; - @%p13 bra BB14_23; + @%p13 bra BB16_23; ld.volatile.shared.f64 %fd33, [%rd8+256]; max.f64 %fd63, %fd63, %fd33; st.volatile.shared.f64 [%rd8], %fd63; -BB14_23: +BB16_23: mov.f64 %fd62, %fd63; setp.lt.u32 %p14, %r11, 32; - @%p14 bra BB14_25; + @%p14 bra BB16_25; ld.volatile.shared.f64 %fd34, [%rd8+128]; max.f64 %fd62, %fd62, %fd34; st.volatile.shared.f64 [%rd8], %fd62; -BB14_25: +BB16_25: mov.f64 %fd61, %fd62; setp.lt.u32 %p15, %r11, 16; - @%p15 bra BB14_27; + @%p15 bra BB16_27; ld.volatile.shared.f64 %fd35, [%rd8+64]; max.f64 %fd61, %fd61, %fd35; st.volatile.shared.f64 [%rd8], %fd61; -BB14_27: +BB16_27: mov.f64 %fd60, %fd61; setp.lt.u32 %p16, %r11, 8; - @%p16 bra BB14_29; + @%p16 bra BB16_29; ld.volatile.shared.f64 %fd36, [%rd8+32]; max.f64 %fd60, %fd60, %fd36; st.volatile.shared.f64 [%rd8], %fd60; -BB14_29: +BB16_29: mov.f64 %fd59, %fd60; setp.lt.u32 %p17, 
%r11, 4; - @%p17 bra BB14_31; + @%p17 bra BB16_31; ld.volatile.shared.f64 %fd37, [%rd8+16]; max.f64 %fd59, %fd59, %fd37; st.volatile.shared.f64 [%rd8], %fd59; -BB14_31: +BB16_31: setp.lt.u32 %p18, %r11, 2; - @%p18 bra BB14_33; + @%p18 bra BB16_33; ld.volatile.shared.f64 %fd38, [%rd8+8]; max.f64 %fd39, %fd59, %fd38; st.volatile.shared.f64 [%rd8], %fd39; -BB14_33: +BB16_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB14_35; + @%p19 bra BB16_35; ld.shared.f64 %fd40, [sdata]; cvta.to.global.u64 %rd39, %rd2; @@ -2627,7 +2772,7 @@ BB14_33: add.s64 %rd41, %rd39, %rd40; st.global.f64 [%rd41], %fd40; -BB14_35: +BB16_35: ret; } @@ -2654,18 +2799,18 @@ BB14_35: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB15_5; + @%p1 bra BB17_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f64 %fd8, 0dFFEFFFFFFFFFFFFF; mov.f64 %fd9, %fd8; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB15_4; + @%p2 bra BB17_4; mov.u32 %r10, %r1; -BB15_3: +BB17_3: mov.u32 %r3, %r10; mul.wide.u32 %rd4, %r3, 8; add.s64 %rd5, %rd1, %rd4; @@ -2675,15 +2820,15 @@ BB15_3: setp.lt.u32 %p3, %r4, %r2; mov.u32 %r10, %r4; mov.f64 %fd8, %fd9; - @%p3 bra BB15_3; + @%p3 bra BB17_3; -BB15_4: +BB17_4: cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r1, 8; add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd8; -BB15_5: +BB17_5: ret; } @@ -2711,9 +2856,9 @@ BB15_5: mov.f64 %fd76, 0d7FEFFFFFFFFFFFFF; mov.f64 %fd77, %fd76; setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB16_4; + @%p1 bra BB18_4; -BB16_1: +BB18_1: mov.f64 %fd1, %fd77; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r32, 8; @@ -2722,23 +2867,23 @@ BB16_1: min.f64 %fd78, %fd1, %fd30; add.s32 %r3, %r32, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB16_3; + @%p2 bra BB18_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd31, [%rd9]; min.f64 %fd78, %fd78, %fd31; -BB16_3: +BB18_3: mov.f64 %fd77, %fd78; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r32, %r12, %r13, %r32; setp.lt.u32 %p3, 
%r32, %r5; mov.f64 %fd76, %fd77; - @%p3 bra BB16_1; + @%p3 bra BB18_1; -BB16_4: +BB18_4: mov.f64 %fd74, %fd76; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -2746,130 +2891,130 @@ BB16_4: st.shared.f64 [%rd1], %fd74; bar.sync 0; setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB16_8; + @%p4 bra BB18_8; setp.gt.u32 %p5, %r6, 511; mov.f64 %fd75, %fd74; - @%p5 bra BB16_7; + @%p5 bra BB18_7; ld.shared.f64 %fd32, [%rd1+4096]; min.f64 %fd75, %fd74, %fd32; st.shared.f64 [%rd1], %fd75; -BB16_7: +BB18_7: mov.f64 %fd74, %fd75; bar.sync 0; -BB16_8: +BB18_8: mov.f64 %fd72, %fd74; setp.lt.u32 %p6, %r9, 512; - @%p6 bra BB16_12; + @%p6 bra BB18_12; setp.gt.u32 %p7, %r6, 255; mov.f64 %fd73, %fd72; - @%p7 bra BB16_11; + @%p7 bra BB18_11; ld.shared.f64 %fd33, [%rd1+2048]; min.f64 %fd73, %fd72, %fd33; st.shared.f64 [%rd1], %fd73; -BB16_11: +BB18_11: mov.f64 %fd72, %fd73; bar.sync 0; -BB16_12: +BB18_12: mov.f64 %fd70, %fd72; setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB16_16; + @%p8 bra BB18_16; setp.gt.u32 %p9, %r6, 127; mov.f64 %fd71, %fd70; - @%p9 bra BB16_15; + @%p9 bra BB18_15; ld.shared.f64 %fd34, [%rd1+1024]; min.f64 %fd71, %fd70, %fd34; st.shared.f64 [%rd1], %fd71; -BB16_15: +BB18_15: mov.f64 %fd70, %fd71; bar.sync 0; -BB16_16: +BB18_16: mov.f64 %fd68, %fd70; setp.lt.u32 %p10, %r9, 128; - @%p10 bra BB16_20; + @%p10 bra BB18_20; setp.gt.u32 %p11, %r6, 63; mov.f64 %fd69, %fd68; - @%p11 bra BB16_19; + @%p11 bra BB18_19; ld.shared.f64 %fd35, [%rd1+512]; min.f64 %fd69, %fd68, %fd35; st.shared.f64 [%rd1], %fd69; -BB16_19: +BB18_19: mov.f64 %fd68, %fd69; bar.sync 0; -BB16_20: +BB18_20: mov.f64 %fd67, %fd68; setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB16_33; + @%p12 bra BB18_33; setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB16_23; + @%p13 bra BB18_23; ld.volatile.shared.f64 %fd36, [%rd1+256]; min.f64 %fd67, %fd67, %fd36; st.volatile.shared.f64 [%rd1], %fd67; -BB16_23: +BB18_23: mov.f64 %fd66, %fd67; setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB16_25; + @%p14 bra BB18_25; ld.volatile.shared.f64 %fd37, 
[%rd1+128]; min.f64 %fd66, %fd66, %fd37; st.volatile.shared.f64 [%rd1], %fd66; -BB16_25: +BB18_25: mov.f64 %fd65, %fd66; setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB16_27; + @%p15 bra BB18_27; ld.volatile.shared.f64 %fd38, [%rd1+64]; min.f64 %fd65, %fd65, %fd38; st.volatile.shared.f64 [%rd1], %fd65; -BB16_27: +BB18_27: mov.f64 %fd64, %fd65; setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB16_29; + @%p16 bra BB18_29; ld.volatile.shared.f64 %fd39, [%rd1+32]; min.f64 %fd64, %fd64, %fd39; st.volatile.shared.f64 [%rd1], %fd64; -BB16_29: +BB18_29: mov.f64 %fd63, %fd64; setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB16_31; + @%p17 bra BB18_31; ld.volatile.shared.f64 %fd40, [%rd1+16]; min.f64 %fd63, %fd63, %fd40; st.volatile.shared.f64 [%rd1], %fd63; -BB16_31: +BB18_31: setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB16_33; + @%p18 bra BB18_33; ld.volatile.shared.f64 %fd41, [%rd1+8]; min.f64 %fd42, %fd63, %fd41; st.volatile.shared.f64 [%rd1], %fd42; -BB16_33: +BB18_33: setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB16_35; + @%p19 bra BB18_35; ld.shared.f64 %fd43, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -2877,7 +3022,7 @@ BB16_33: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd43; -BB16_35: +BB18_35: ret; } @@ -2901,17 +3046,17 @@ BB16_35: ld.param.u32 %r4, [reduce_row_min_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB17_35; + @%p1 bra BB19_35; mov.u32 %r38, %tid.x; mov.f64 %fd72, 0d7FEFFFFFFFFFFFFF; mov.f64 %fd73, %fd72; setp.ge.u32 %p2, %r38, %r4; - @%p2 bra BB17_4; + @%p2 bra BB19_4; cvta.to.global.u64 %rd3, %rd1; -BB17_3: +BB19_3: mad.lo.s32 %r8, %r6, %r4, %r38; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -2921,9 +3066,9 @@ BB17_3: add.s32 %r38, %r9, %r38; setp.lt.u32 %p3, %r38, %r4; mov.f64 %fd72, %fd73; - @%p3 bra BB17_3; + @%p3 bra BB19_3; -BB17_4: +BB19_4: mov.f64 %fd70, %fd72; mov.u32 %r10, %tid.x; mul.wide.u32 %rd6, %r10, 8; @@ -2933,130 +3078,130 @@ BB17_4: bar.sync 0; mov.u32 %r11, %ntid.x; setp.lt.u32 %p4, %r11, 1024; - @%p4 bra BB17_8; + @%p4 
bra BB19_8; setp.gt.u32 %p5, %r10, 511; mov.f64 %fd71, %fd70; - @%p5 bra BB17_7; + @%p5 bra BB19_7; ld.shared.f64 %fd29, [%rd8+4096]; min.f64 %fd71, %fd70, %fd29; st.shared.f64 [%rd8], %fd71; -BB17_7: +BB19_7: mov.f64 %fd70, %fd71; bar.sync 0; -BB17_8: +BB19_8: mov.f64 %fd68, %fd70; setp.lt.u32 %p6, %r11, 512; - @%p6 bra BB17_12; + @%p6 bra BB19_12; setp.gt.u32 %p7, %r10, 255; mov.f64 %fd69, %fd68; - @%p7 bra BB17_11; + @%p7 bra BB19_11; ld.shared.f64 %fd30, [%rd8+2048]; min.f64 %fd69, %fd68, %fd30; st.shared.f64 [%rd8], %fd69; -BB17_11: +BB19_11: mov.f64 %fd68, %fd69; bar.sync 0; -BB17_12: +BB19_12: mov.f64 %fd66, %fd68; setp.lt.u32 %p8, %r11, 256; - @%p8 bra BB17_16; + @%p8 bra BB19_16; setp.gt.u32 %p9, %r10, 127; mov.f64 %fd67, %fd66; - @%p9 bra BB17_15; + @%p9 bra BB19_15; ld.shared.f64 %fd31, [%rd8+1024]; min.f64 %fd67, %fd66, %fd31; st.shared.f64 [%rd8], %fd67; -BB17_15: +BB19_15: mov.f64 %fd66, %fd67; bar.sync 0; -BB17_16: +BB19_16: mov.f64 %fd64, %fd66; setp.lt.u32 %p10, %r11, 128; - @%p10 bra BB17_20; + @%p10 bra BB19_20; setp.gt.u32 %p11, %r10, 63; mov.f64 %fd65, %fd64; - @%p11 bra BB17_19; + @%p11 bra BB19_19; ld.shared.f64 %fd32, [%rd8+512]; min.f64 %fd65, %fd64, %fd32; st.shared.f64 [%rd8], %fd65; -BB17_19: +BB19_19: mov.f64 %fd64, %fd65; bar.sync 0; -BB17_20: +BB19_20: mov.f64 %fd63, %fd64; setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB17_33; + @%p12 bra BB19_33; setp.lt.u32 %p13, %r11, 64; - @%p13 bra BB17_23; + @%p13 bra BB19_23; ld.volatile.shared.f64 %fd33, [%rd8+256]; min.f64 %fd63, %fd63, %fd33; st.volatile.shared.f64 [%rd8], %fd63; -BB17_23: +BB19_23: mov.f64 %fd62, %fd63; setp.lt.u32 %p14, %r11, 32; - @%p14 bra BB17_25; + @%p14 bra BB19_25; ld.volatile.shared.f64 %fd34, [%rd8+128]; min.f64 %fd62, %fd62, %fd34; st.volatile.shared.f64 [%rd8], %fd62; -BB17_25: +BB19_25: mov.f64 %fd61, %fd62; setp.lt.u32 %p15, %r11, 16; - @%p15 bra BB17_27; + @%p15 bra BB19_27; ld.volatile.shared.f64 %fd35, [%rd8+64]; min.f64 %fd61, %fd61, %fd35; 
st.volatile.shared.f64 [%rd8], %fd61; -BB17_27: +BB19_27: mov.f64 %fd60, %fd61; setp.lt.u32 %p16, %r11, 8; - @%p16 bra BB17_29; + @%p16 bra BB19_29; ld.volatile.shared.f64 %fd36, [%rd8+32]; min.f64 %fd60, %fd60, %fd36; st.volatile.shared.f64 [%rd8], %fd60; -BB17_29: +BB19_29: mov.f64 %fd59, %fd60; setp.lt.u32 %p17, %r11, 4; - @%p17 bra BB17_31; + @%p17 bra BB19_31; ld.volatile.shared.f64 %fd37, [%rd8+16]; min.f64 %fd59, %fd59, %fd37; st.volatile.shared.f64 [%rd8], %fd59; -BB17_31: +BB19_31: setp.lt.u32 %p18, %r11, 2; - @%p18 bra BB17_33; + @%p18 bra BB19_33; ld.volatile.shared.f64 %fd38, [%rd8+8]; min.f64 %fd39, %fd59, %fd38; st.volatile.shared.f64 [%rd8], %fd39; -BB17_33: +BB19_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB17_35; + @%p19 bra BB19_35; ld.shared.f64 %fd40, [sdata]; cvta.to.global.u64 %rd39, %rd2; @@ -3064,7 +3209,7 @@ BB17_33: add.s64 %rd41, %rd39, %rd40; st.global.f64 [%rd41], %fd40; -BB17_35: +BB19_35: ret; } @@ -3091,18 +3236,18 @@ BB17_35: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB18_5; + @%p1 bra BB20_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f64 %fd8, 0d7FEFFFFFFFFFFFFF; mov.f64 %fd9, %fd8; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB18_4; + @%p2 bra BB20_4; mov.u32 %r10, %r1; -BB18_3: +BB20_3: mov.u32 %r3, %r10; mul.wide.u32 %rd4, %r3, 8; add.s64 %rd5, %rd1, %rd4; @@ -3112,15 +3257,15 @@ BB18_3: setp.lt.u32 %p3, %r4, %r2; mov.u32 %r10, %r4; mov.f64 %fd8, %fd9; - @%p3 bra BB18_3; + @%p3 bra BB20_3; -BB18_4: +BB20_4: cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r1, 8; add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd8; -BB18_5: +BB20_5: ret; } @@ -3148,9 +3293,9 @@ BB18_5: mov.f64 %fd76, 0d3FF0000000000000; mov.f64 %fd77, %fd76; setp.ge.u32 %p1, %r32, %r5; - @%p1 bra BB19_4; + @%p1 bra BB21_4; -BB19_1: +BB21_1: mov.f64 %fd1, %fd77; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r32, 8; @@ -3159,23 +3304,23 @@ BB19_1: mul.f64 %fd78, %fd1, %fd30; add.s32 %r3, %r32, 
%r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB19_3; + @%p2 bra BB21_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd31, [%rd9]; mul.f64 %fd78, %fd78, %fd31; -BB19_3: +BB21_3: mov.f64 %fd77, %fd78; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r32, %r12, %r13, %r32; setp.lt.u32 %p3, %r32, %r5; mov.f64 %fd76, %fd77; - @%p3 bra BB19_1; + @%p3 bra BB21_1; -BB19_4: +BB21_4: mov.f64 %fd74, %fd76; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -3183,130 +3328,130 @@ BB19_4: st.shared.f64 [%rd1], %fd74; bar.sync 0; setp.lt.u32 %p4, %r9, 1024; - @%p4 bra BB19_8; + @%p4 bra BB21_8; setp.gt.u32 %p5, %r6, 511; mov.f64 %fd75, %fd74; - @%p5 bra BB19_7; + @%p5 bra BB21_7; ld.shared.f64 %fd32, [%rd1+4096]; mul.f64 %fd75, %fd74, %fd32; st.shared.f64 [%rd1], %fd75; -BB19_7: +BB21_7: mov.f64 %fd74, %fd75; bar.sync 0; -BB19_8: +BB21_8: mov.f64 %fd72, %fd74; setp.lt.u32 %p6, %r9, 512; - @%p6 bra BB19_12; + @%p6 bra BB21_12; setp.gt.u32 %p7, %r6, 255; mov.f64 %fd73, %fd72; - @%p7 bra BB19_11; + @%p7 bra BB21_11; ld.shared.f64 %fd33, [%rd1+2048]; mul.f64 %fd73, %fd72, %fd33; st.shared.f64 [%rd1], %fd73; -BB19_11: +BB21_11: mov.f64 %fd72, %fd73; bar.sync 0; -BB19_12: +BB21_12: mov.f64 %fd70, %fd72; setp.lt.u32 %p8, %r9, 256; - @%p8 bra BB19_16; + @%p8 bra BB21_16; setp.gt.u32 %p9, %r6, 127; mov.f64 %fd71, %fd70; - @%p9 bra BB19_15; + @%p9 bra BB21_15; ld.shared.f64 %fd34, [%rd1+1024]; mul.f64 %fd71, %fd70, %fd34; st.shared.f64 [%rd1], %fd71; -BB19_15: +BB21_15: mov.f64 %fd70, %fd71; bar.sync 0; -BB19_16: +BB21_16: mov.f64 %fd68, %fd70; setp.lt.u32 %p10, %r9, 128; - @%p10 bra BB19_20; + @%p10 bra BB21_20; setp.gt.u32 %p11, %r6, 63; mov.f64 %fd69, %fd68; - @%p11 bra BB19_19; + @%p11 bra BB21_19; ld.shared.f64 %fd35, [%rd1+512]; mul.f64 %fd69, %fd68, %fd35; st.shared.f64 [%rd1], %fd69; -BB19_19: +BB21_19: mov.f64 %fd68, %fd69; bar.sync 0; -BB19_20: +BB21_20: mov.f64 %fd67, %fd68; setp.gt.u32 %p12, %r6, 31; - @%p12 bra BB19_33; + @%p12 bra BB21_33; 
setp.lt.u32 %p13, %r9, 64; - @%p13 bra BB19_23; + @%p13 bra BB21_23; ld.volatile.shared.f64 %fd36, [%rd1+256]; mul.f64 %fd67, %fd67, %fd36; st.volatile.shared.f64 [%rd1], %fd67; -BB19_23: +BB21_23: mov.f64 %fd66, %fd67; setp.lt.u32 %p14, %r9, 32; - @%p14 bra BB19_25; + @%p14 bra BB21_25; ld.volatile.shared.f64 %fd37, [%rd1+128]; mul.f64 %fd66, %fd66, %fd37; st.volatile.shared.f64 [%rd1], %fd66; -BB19_25: +BB21_25: mov.f64 %fd65, %fd66; setp.lt.u32 %p15, %r9, 16; - @%p15 bra BB19_27; + @%p15 bra BB21_27; ld.volatile.shared.f64 %fd38, [%rd1+64]; mul.f64 %fd65, %fd65, %fd38; st.volatile.shared.f64 [%rd1], %fd65; -BB19_27: +BB21_27: mov.f64 %fd64, %fd65; setp.lt.u32 %p16, %r9, 8; - @%p16 bra BB19_29; + @%p16 bra BB21_29; ld.volatile.shared.f64 %fd39, [%rd1+32]; mul.f64 %fd64, %fd64, %fd39; st.volatile.shared.f64 [%rd1], %fd64; -BB19_29: +BB21_29: mov.f64 %fd63, %fd64; setp.lt.u32 %p17, %r9, 4; - @%p17 bra BB19_31; + @%p17 bra BB21_31; ld.volatile.shared.f64 %fd40, [%rd1+16]; mul.f64 %fd63, %fd63, %fd40; st.volatile.shared.f64 [%rd1], %fd63; -BB19_31: +BB21_31: setp.lt.u32 %p18, %r9, 2; - @%p18 bra BB19_33; + @%p18 bra BB21_33; ld.volatile.shared.f64 %fd41, [%rd1+8]; mul.f64 %fd42, %fd63, %fd41; st.volatile.shared.f64 [%rd1], %fd42; -BB19_33: +BB21_33: setp.ne.s32 %p19, %r6, 0; - @%p19 bra BB19_35; + @%p19 bra BB21_35; ld.shared.f64 %fd43, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -3314,7 +3459,7 @@ BB19_33: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd43; -BB19_35: +BB21_35: ret; } @@ -3338,17 +3483,17 @@ BB19_35: ld.param.u32 %r4, [reduce_row_mean_param_3]; mov.u32 %r6, %ctaid.x; setp.ge.u32 %p1, %r6, %r5; - @%p1 bra BB20_35; + @%p1 bra BB22_35; mov.u32 %r38, %tid.x; mov.f64 %fd74, 0d0000000000000000; mov.f64 %fd75, %fd74; setp.ge.u32 %p2, %r38, %r4; - @%p2 bra BB20_4; + @%p2 bra BB22_4; cvta.to.global.u64 %rd3, %rd1; -BB20_3: +BB22_3: mad.lo.s32 %r8, %r6, %r4, %r38; mul.wide.u32 %rd4, %r8, 8; add.s64 %rd5, %rd3, %rd4; @@ -3358,9 +3503,9 @@ BB20_3: add.s32 
%r38, %r9, %r38; setp.lt.u32 %p3, %r38, %r4; mov.f64 %fd74, %fd75; - @%p3 bra BB20_3; + @%p3 bra BB22_3; -BB20_4: +BB22_4: mov.f64 %fd72, %fd74; mov.u32 %r10, %tid.x; mul.wide.u32 %rd6, %r10, 8; @@ -3370,130 +3515,130 @@ BB20_4: bar.sync 0; mov.u32 %r11, %ntid.x; setp.lt.u32 %p4, %r11, 1024; - @%p4 bra BB20_8; + @%p4 bra BB22_8; setp.gt.u32 %p5, %r10, 511; mov.f64 %fd73, %fd72; - @%p5 bra BB20_7; + @%p5 bra BB22_7; ld.shared.f64 %fd29, [%rd8+4096]; add.f64 %fd73, %fd72, %fd29; st.shared.f64 [%rd8], %fd73; -BB20_7: +BB22_7: mov.f64 %fd72, %fd73; bar.sync 0; -BB20_8: +BB22_8: mov.f64 %fd70, %fd72; setp.lt.u32 %p6, %r11, 512; - @%p6 bra BB20_12; + @%p6 bra BB22_12; setp.gt.u32 %p7, %r10, 255; mov.f64 %fd71, %fd70; - @%p7 bra BB20_11; + @%p7 bra BB22_11; ld.shared.f64 %fd30, [%rd8+2048]; add.f64 %fd71, %fd70, %fd30; st.shared.f64 [%rd8], %fd71; -BB20_11: +BB22_11: mov.f64 %fd70, %fd71; bar.sync 0; -BB20_12: +BB22_12: mov.f64 %fd68, %fd70; setp.lt.u32 %p8, %r11, 256; - @%p8 bra BB20_16; + @%p8 bra BB22_16; setp.gt.u32 %p9, %r10, 127; mov.f64 %fd69, %fd68; - @%p9 bra BB20_15; + @%p9 bra BB22_15; ld.shared.f64 %fd31, [%rd8+1024]; add.f64 %fd69, %fd68, %fd31; st.shared.f64 [%rd8], %fd69; -BB20_15: +BB22_15: mov.f64 %fd68, %fd69; bar.sync 0; -BB20_16: +BB22_16: mov.f64 %fd66, %fd68; setp.lt.u32 %p10, %r11, 128; - @%p10 bra BB20_20; + @%p10 bra BB22_20; setp.gt.u32 %p11, %r10, 63; mov.f64 %fd67, %fd66; - @%p11 bra BB20_19; + @%p11 bra BB22_19; ld.shared.f64 %fd32, [%rd8+512]; add.f64 %fd67, %fd66, %fd32; st.shared.f64 [%rd8], %fd67; -BB20_19: +BB22_19: mov.f64 %fd66, %fd67; bar.sync 0; -BB20_20: +BB22_20: mov.f64 %fd65, %fd66; setp.gt.u32 %p12, %r10, 31; - @%p12 bra BB20_33; + @%p12 bra BB22_33; setp.lt.u32 %p13, %r11, 64; - @%p13 bra BB20_23; + @%p13 bra BB22_23; ld.volatile.shared.f64 %fd33, [%rd8+256]; add.f64 %fd65, %fd65, %fd33; st.volatile.shared.f64 [%rd8], %fd65; -BB20_23: +BB22_23: mov.f64 %fd64, %fd65; setp.lt.u32 %p14, %r11, 32; - @%p14 bra BB20_25; + @%p14 bra 
BB22_25; ld.volatile.shared.f64 %fd34, [%rd8+128]; add.f64 %fd64, %fd64, %fd34; st.volatile.shared.f64 [%rd8], %fd64; -BB20_25: +BB22_25: mov.f64 %fd63, %fd64; setp.lt.u32 %p15, %r11, 16; - @%p15 bra BB20_27; + @%p15 bra BB22_27; ld.volatile.shared.f64 %fd35, [%rd8+64]; add.f64 %fd63, %fd63, %fd35; st.volatile.shared.f64 [%rd8], %fd63; -BB20_27: +BB22_27: mov.f64 %fd62, %fd63; setp.lt.u32 %p16, %r11, 8; - @%p16 bra BB20_29; + @%p16 bra BB22_29; ld.volatile.shared.f64 %fd36, [%rd8+32]; add.f64 %fd62, %fd62, %fd36; st.volatile.shared.f64 [%rd8], %fd62; -BB20_29: +BB22_29: mov.f64 %fd61, %fd62; setp.lt.u32 %p17, %r11, 4; - @%p17 bra BB20_31; + @%p17 bra BB22_31; ld.volatile.shared.f64 %fd37, [%rd8+16]; add.f64 %fd61, %fd61, %fd37; st.volatile.shared.f64 [%rd8], %fd61; -BB20_31: +BB22_31: setp.lt.u32 %p18, %r11, 2; - @%p18 bra BB20_33; + @%p18 bra BB22_33; ld.volatile.shared.f64 %fd38, [%rd8+8]; add.f64 %fd39, %fd61, %fd38; st.volatile.shared.f64 [%rd8], %fd39; -BB20_33: +BB22_33: setp.ne.s32 %p19, %r10, 0; - @%p19 bra BB20_35; + @%p19 bra BB22_35; ld.shared.f64 %fd40, [sdata]; cvt.u64.u32 %rd39, %r4; @@ -3504,7 +3649,7 @@ BB20_33: add.s64 %rd42, %rd40, %rd41; st.global.f64 [%rd42], %fd42; -BB20_35: +BB22_35: ret; } @@ -3531,18 +3676,18 @@ BB20_35: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB21_5; + @%p1 bra BB23_5; cvta.to.global.u64 %rd1, %rd2; mul.lo.s32 %r2, %r6, %r5; mov.f64 %fd10, 0d0000000000000000; mov.f64 %fd11, %fd10; setp.ge.u32 %p2, %r1, %r2; - @%p2 bra BB21_4; + @%p2 bra BB23_4; mov.u32 %r10, %r1; -BB21_3: +BB23_3: mov.u32 %r3, %r10; mul.wide.u32 %rd4, %r3, 8; add.s64 %rd5, %rd1, %rd4; @@ -3552,9 +3697,9 @@ BB21_3: setp.lt.u32 %p3, %r4, %r2; mov.u32 %r10, %r4; mov.f64 %fd10, %fd11; - @%p3 bra BB21_3; + @%p3 bra BB23_3; -BB21_4: +BB23_4: cvta.to.global.u64 %rd6, %rd3; cvt.u64.u32 %rd7, %r5; cvt.rn.f64.s64 %fd7, %rd7; @@ -3563,7 +3708,7 @@ BB21_4: add.s64 %rd9, %rd6, %rd8; st.global.f64 [%rd9], %fd8; -BB21_5: 
+BB23_5: ret; } @@ -3589,7 +3734,7 @@ BB21_5: mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r7, %r6, %r8; setp.ge.u32 %p1, %r1, %r5; - @%p1 bra BB22_5; + @%p1 bra BB24_5; cvta.to.global.u64 %rd4, %rd2; cvt.s64.s32 %rd1, %r1; @@ -3649,13 +3794,13 @@ BB21_5: mov.b32 %f2, %r11; abs.f32 %f1, %f2; setp.lt.f32 %p2, %f1, 0f4086232B; - @%p2 bra BB22_4; + @%p2 bra BB24_4; setp.lt.f64 %p3, %fd1, 0d0000000000000000; add.f64 %fd37, %fd1, 0d7FF0000000000000; selp.f64 %fd40, 0d0000000000000000, %fd37, %p3; setp.geu.f32 %p4, %f1, 0f40874800; - @%p4 bra BB22_4; + @%p4 bra BB24_4; shr.u32 %r12, %r2, 31; add.s32 %r13, %r2, %r12; @@ -3670,13 +3815,13 @@ BB21_5: mov.b64 %fd39, {%r20, %r19}; mul.f64 %fd40, %fd38, %fd39; -BB22_4: +BB24_4: cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 3; add.s64 %rd9, %rd7, %rd8; st.global.f64 [%rd9], %fd40; -BB22_5: +BB24_5: ret; } @@ -3701,7 +3846,7 @@ BB22_5: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.u32 %p1, %r1, %r2; - @%p1 bra BB23_2; + @%p1 bra BB25_2; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 8; @@ -3712,7 +3857,7 @@ BB22_5: add.s64 %rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd2; -BB23_2: +BB25_2: ret; } @@ -3737,7 +3882,7 @@ BB23_2: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.u32 %p1, %r1, %r2; - @%p1 bra BB24_4; + @%p1 bra BB26_4; cvta.to.global.u64 %rd4, %rd2; cvt.s64.s32 %rd1, %r1; @@ -3746,7 +3891,7 @@ BB23_2: ld.global.f64 %fd9, [%rd6]; abs.f64 %fd2, %fd9; setp.ge.f64 %p2, %fd2, 0d4330000000000000; - @%p2 bra BB24_3; + @%p2 bra BB26_3; add.f64 %fd5, %fd2, 0d3FE0000000000000; cvt.rzi.f64.f64 %fd6, %fd5; @@ -3768,7 +3913,7 @@ BB23_2: or.b32 %r10, %r7, %r9; mov.b64 %fd9, {%r6, %r10}; -BB24_3: +BB26_3: cvta.to.global.u64 %rd7, %rd3; cvt.rzi.s64.f64 %rd8, %fd9; cvt.rn.f64.s64 %fd8, %rd8; @@ -3776,7 +3921,7 @@ BB24_3: add.s64 %rd10, %rd7, %rd9; st.global.f64 [%rd10], %fd8; -BB24_4: +BB26_4: ret; } @@ -3801,7 +3946,7 @@ BB24_4: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.u32 %p1, %r1, %r2; 
- @%p1 bra BB25_2; + @%p1 bra BB27_2; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 8; @@ -3812,7 +3957,7 @@ BB24_4: add.s64 %rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd2; -BB25_2: +BB27_2: ret; } @@ -3838,7 +3983,7 @@ BB25_2: mov.u32 %r15, %tid.x; mad.lo.s32 %r1, %r14, %r13, %r15; setp.ge.u32 %p1, %r1, %r12; - @%p1 bra BB26_9; + @%p1 bra BB28_9; cvta.to.global.u64 %rd4, %rd2; cvt.s64.s32 %rd1, %r1; @@ -3855,7 +4000,7 @@ BB25_2: } mov.u32 %r31, -1023; setp.gt.s32 %p2, %r29, 1048575; - @%p2 bra BB26_3; + @%p2 bra BB28_3; mul.f64 %fd56, %fd56, 0d4350000000000000; { @@ -3868,20 +4013,20 @@ BB25_2: } mov.u32 %r31, -1077; -BB26_3: +BB28_3: add.s32 %r18, %r29, -1; setp.lt.u32 %p3, %r18, 2146435071; - @%p3 bra BB26_5; - bra.uni BB26_4; + @%p3 bra BB28_5; + bra.uni BB28_4; -BB26_5: +BB28_5: shr.u32 %r20, %r29, 20; add.s32 %r32, %r31, %r20; and.b32 %r21, %r29, -2146435073; or.b32 %r22, %r21, 1072693248; mov.b64 %fd57, {%r30, %r22}; setp.lt.s32 %p5, %r22, 1073127583; - @%p5 bra BB26_7; + @%p5 bra BB28_7; { .reg .b32 %temp; @@ -3895,7 +4040,7 @@ BB26_5: mov.b64 %fd57, {%r23, %r25}; add.s32 %r32, %r32, 1; -BB26_7: +BB28_7: add.f64 %fd13, %fd57, 0d3FF0000000000000; // inline asm rcp.approx.ftz.f64 %fd12,%fd13; @@ -3946,9 +4091,9 @@ BB26_7: mov.f64 %fd54, 0d3C7ABC9E3B39803F; fma.rn.f64 %fd55, %fd47, %fd54, %fd53; add.f64 %fd58, %fd49, %fd55; - bra.uni BB26_8; + bra.uni BB28_8; -BB26_4: +BB28_4: mov.f64 %fd10, 0d7FF0000000000000; fma.rn.f64 %fd11, %fd56, %fd10, %fd10; { @@ -3959,13 +4104,13 @@ BB26_4: setp.eq.f32 %p4, %f1, 0f00000000; selp.f64 %fd58, 0dFFF0000000000000, %fd11, %p4; -BB26_8: +BB28_8: cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 3; add.s64 %rd9, %rd7, %rd8; st.global.f64 [%rd9], %fd58; -BB26_9: +BB28_9: ret; } @@ -3990,7 +4135,7 @@ BB26_9: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.u32 %p1, %r1, %r2; - @%p1 bra BB27_2; + @%p1 bra BB29_2; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 8; @@ -4001,7 +4146,7 @@ BB26_9: add.s64 
%rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd2; -BB27_2: +BB29_2: ret; } @@ -4026,7 +4171,7 @@ BB27_2: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.u32 %p1, %r1, %r2; - @%p1 bra BB28_2; + @%p1 bra BB30_2; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 8; @@ -4037,7 +4182,7 @@ BB27_2: add.s64 %rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd2; -BB28_2: +BB30_2: ret; } @@ -4048,7 +4193,7 @@ BB28_2: .param .u32 matrix_sin_param_2 ) { - .local .align 4 .b8 __local_depot29[4]; + .local .align 4 .b8 __local_depot31[4]; .reg .b64 %SP; .reg .b64 %SPL; .reg .pred %p<7>; @@ -4057,7 +4202,7 @@ BB28_2: .reg .b64 %rd<17>; - mov.u64 %rd16, __local_depot29; + mov.u64 %rd16, __local_depot31; cvta.local.u64 %SP, %rd16; ld.param.u64 %rd3, [matrix_sin_param_0]; ld.param.u64 %rd4, [matrix_sin_param_1]; @@ -4069,7 +4214,7 @@ BB28_2: mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.u32 %p1, %r1, %r5; - @%p1 bra BB29_11; + @%p1 bra BB31_11; cvta.to.global.u64 %rd6, %rd3; cvt.s64.s32 %rd2, %r1; @@ -4082,19 +4227,19 @@ BB28_2: } and.b32 %r10, %r9, 2147483647; setp.ne.s32 %p2, %r10, 2146435072; - @%p2 bra BB29_4; + @%p2 bra BB31_4; { .reg .b32 %temp; mov.b64 {%r11, %temp}, %fd38; } setp.ne.s32 %p3, %r11, 0; - @%p3 bra BB29_4; + @%p3 bra BB31_4; mov.f64 %fd14, 0d0000000000000000; mul.rn.f64 %fd38, %fd38, %fd14; -BB29_4: +BB31_4: mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883; cvt.rni.s32.f64 %r17, %fd15; st.local.u32 [%rd1], %r17; @@ -4112,7 +4257,7 @@ BB29_4: } and.b32 %r13, %r12, 2145386496; setp.lt.u32 %p4, %r13, 1105199104; - @%p4 bra BB29_6; + @%p4 bra BB31_6; // Callseq Start 3 { @@ -4135,7 +4280,7 @@ BB29_4: }// Callseq End 3 ld.local.u32 %r17, [%rd1]; -BB29_6: +BB31_6: and.b32 %r14, %r17, 1; shl.b32 %r15, %r14, 3; setp.eq.s32 %p5, %r14, 0; @@ -4157,27 +4302,27 @@ BB29_6: ld.const.f64 %fd34, [%rd12+48]; fma.rn.f64 %fd8, %fd33, %fd7, %fd34; fma.rn.f64 %fd40, %fd8, %fd39, %fd39; - @%p5 bra BB29_8; + @%p5 bra BB31_8; mov.f64 %fd35, 0d3FF0000000000000; fma.rn.f64 
%fd40, %fd8, %fd7, %fd35; -BB29_8: +BB31_8: and.b32 %r16, %r17, 2; setp.eq.s32 %p6, %r16, 0; - @%p6 bra BB29_10; + @%p6 bra BB31_10; mov.f64 %fd36, 0d0000000000000000; mov.f64 %fd37, 0dBFF0000000000000; fma.rn.f64 %fd40, %fd40, %fd37, %fd36; -BB29_10: +BB31_10: cvta.to.global.u64 %rd13, %rd4; shl.b64 %rd14, %rd2, 3; add.s64 %rd15, %rd13, %rd14; st.global.f64 [%rd15], %fd40; -BB29_11: +BB31_11: ret; } @@ -4188,7 +4333,7 @@ BB29_11: .param .u32 matrix_cos_param_2 ) { - .local .align 4 .b8 __local_depot30[4]; + .local .align 4 .b8 __local_depot32[4]; .reg .b64 %SP; .reg .b64 %SPL; .reg .pred %p<7>; @@ -4197,7 +4342,7 @@ BB29_11: .reg .b64 %rd<17>; - mov.u64 %rd16, __local_depot30; + mov.u64 %rd16, __local_depot32; cvta.local.u64 %SP, %rd16; ld.param.u64 %rd3, [matrix_cos_param_0]; ld.param.u64 %rd4, [matrix_cos_param_1]; @@ -4209,7 +4354,7 @@ BB29_11: mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; setp.ge.u32 %p1, %r1, %r6; - @%p1 bra BB30_11; + @%p1 bra BB32_11; cvta.to.global.u64 %rd6, %rd3; cvt.s64.s32 %rd2, %r1; @@ -4222,19 +4367,19 @@ BB29_11: } and.b32 %r11, %r10, 2147483647; setp.ne.s32 %p2, %r11, 2146435072; - @%p2 bra BB30_4; + @%p2 bra BB32_4; { .reg .b32 %temp; mov.b64 {%r12, %temp}, %fd38; } setp.ne.s32 %p3, %r12, 0; - @%p3 bra BB30_4; + @%p3 bra BB32_4; mov.f64 %fd14, 0d0000000000000000; mul.rn.f64 %fd38, %fd38, %fd14; -BB30_4: +BB32_4: mul.f64 %fd15, %fd38, 0d3FE45F306DC9C883; cvt.rni.s32.f64 %r18, %fd15; st.local.u32 [%rd1], %r18; @@ -4252,7 +4397,7 @@ BB30_4: } and.b32 %r14, %r13, 2145386496; setp.lt.u32 %p4, %r14, 1105199104; - @%p4 bra BB30_6; + @%p4 bra BB32_6; // Callseq Start 4 { @@ -4275,7 +4420,7 @@ BB30_4: }// Callseq End 4 ld.local.u32 %r18, [%rd1]; -BB30_6: +BB32_6: add.s32 %r5, %r18, 1; and.b32 %r15, %r5, 1; shl.b32 %r16, %r15, 3; @@ -4298,27 +4443,27 @@ BB30_6: ld.const.f64 %fd34, [%rd12+48]; fma.rn.f64 %fd8, %fd33, %fd7, %fd34; fma.rn.f64 %fd40, %fd8, %fd39, %fd39; - @%p5 bra BB30_8; + @%p5 bra BB32_8; mov.f64 %fd35, 
0d3FF0000000000000; fma.rn.f64 %fd40, %fd8, %fd7, %fd35; -BB30_8: +BB32_8: and.b32 %r17, %r5, 2; setp.eq.s32 %p6, %r17, 0; - @%p6 bra BB30_10; + @%p6 bra BB32_10; mov.f64 %fd36, 0d0000000000000000; mov.f64 %fd37, 0dBFF0000000000000; fma.rn.f64 %fd40, %fd40, %fd37, %fd36; -BB30_10: +BB32_10: cvta.to.global.u64 %rd13, %rd4; shl.b64 %rd14, %rd2, 3; add.s64 %rd15, %rd13, %rd14; st.global.f64 [%rd15], %fd40; -BB30_11: +BB32_11: ret; } @@ -4329,7 +4474,7 @@ BB30_11: .param .u32 matrix_tan_param_2 ) { - .local .align 4 .b8 __local_depot31[4]; + .local .align 4 .b8 __local_depot33[4]; .reg .b64 %SP; .reg .b64 %SPL; .reg .pred %p<6>; @@ -4338,7 +4483,7 @@ BB30_11: .reg .b64 %rd<14>; - mov.u64 %rd13, __local_depot31; + mov.u64 %rd13, __local_depot33; cvta.local.u64 %SP, %rd13; ld.param.u64 %rd3, [matrix_tan_param_0]; ld.param.u64 %rd4, [matrix_tan_param_1]; @@ -4350,7 +4495,7 @@ BB30_11: mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.u32 %p1, %r1, %r5; - @%p1 bra BB31_9; + @%p1 bra BB33_9; cvta.to.global.u64 %rd6, %rd3; cvt.s64.s32 %rd2, %r1; @@ -4363,19 +4508,19 @@ BB30_11: } and.b32 %r10, %r9, 2147483647; setp.ne.s32 %p2, %r10, 2146435072; - @%p2 bra BB31_4; + @%p2 bra BB33_4; { .reg .b32 %temp; mov.b64 {%r11, %temp}, %fd63; } setp.ne.s32 %p3, %r11, 0; - @%p3 bra BB31_4; + @%p3 bra BB33_4; mov.f64 %fd11, 0d0000000000000000; mul.rn.f64 %fd63, %fd63, %fd11; -BB31_4: +BB33_4: mul.f64 %fd12, %fd63, 0d3FE45F306DC9C883; cvt.rni.s32.f64 %r15, %fd12; st.local.u32 [%rd1], %r15; @@ -4393,7 +4538,7 @@ BB31_4: } and.b32 %r13, %r12, 2145386496; setp.lt.u32 %p4, %r13, 1105199104; - @%p4 bra BB31_6; + @%p4 bra BB33_6; // Callseq Start 5 { @@ -4416,7 +4561,7 @@ BB31_4: }// Callseq End 5 ld.local.u32 %r15, [%rd1]; -BB31_6: +BB33_6: mul.f64 %fd20, %fd64, %fd64; mov.f64 %fd21, 0dBEF9757C5B27EBB1; mov.f64 %fd22, 0d3EE48DAC2799BCB9; @@ -4451,10 +4596,10 @@ BB31_6: fma.rn.f64 %fd65, %fd7, %fd64, %fd64; and.b32 %r14, %r15, 1; setp.eq.b32 %p5, %r14, 1; - @!%p5 bra BB31_8; - bra.uni 
BB31_7; + @!%p5 bra BB33_8; + bra.uni BB33_7; -BB31_7: +BB33_7: sub.f64 %fd52, %fd65, %fd64; neg.f64 %fd53, %fd52; fma.rn.f64 %fd54, %fd7, %fd64, %fd53; @@ -4471,13 +4616,13 @@ BB31_7: fma.rn.f64 %fd62, %fd60, %fd54, %fd61; fma.rn.f64 %fd65, %fd62, %fd60, %fd60; -BB31_8: +BB33_8: cvta.to.global.u64 %rd10, %rd4; shl.b64 %rd11, %rd2, 3; add.s64 %rd12, %rd10, %rd11; st.global.f64 [%rd12], %fd65; -BB31_9: +BB33_9: ret; } @@ -4503,7 +4648,7 @@ BB31_9: mov.u32 %r6, %tid.x; mad.lo.s32 %r1, %r5, %r4, %r6; setp.ge.u32 %p1, %r1, %r3; - @%p1 bra BB32_5; + @%p1 bra BB34_5; cvta.to.global.u64 %rd4, %rd2; cvt.s64.s32 %rd1, %r1; @@ -4517,10 +4662,10 @@ BB31_9: mov.b32 %f1, %r2; abs.f32 %f2, %f1; setp.lt.f32 %p2, %f2, 0f3FE26666; - @%p2 bra BB32_3; - bra.uni BB32_2; + @%p2 bra BB34_3; + bra.uni BB34_2; -BB32_3: +BB34_3: mul.f64 %fd55, %fd1, %fd1; mov.f64 %fd56, 0dBFB3823B180754AF; mov.f64 %fd57, 0d3FB0066BDC1895E9; @@ -4549,9 +4694,9 @@ BB32_3: fma.rn.f64 %fd80, %fd78, %fd55, %fd79; mul.f64 %fd81, %fd55, %fd80; fma.rn.f64 %fd82, %fd81, %fd1, %fd1; - bra.uni BB32_4; + bra.uni BB34_4; -BB32_2: +BB34_2: abs.f64 %fd7, %fd1; mov.f64 %fd8, 0d3FE0000000000000; mov.f64 %fd9, 0dBFE0000000000000; @@ -4631,13 +4776,13 @@ BB32_2: or.b32 %r14, %r12, %r13; mov.b64 %fd82, {%r11, %r14}; -BB32_4: +BB34_4: cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 3; add.s64 %rd9, %rd7, %rd8; st.global.f64 [%rd9], %fd82; -BB32_5: +BB34_5: ret; } @@ -4662,7 +4807,7 @@ BB32_5: mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r6, %r5, %r7; setp.ge.u32 %p1, %r1, %r4; - @%p1 bra BB33_14; + @%p1 bra BB35_14; cvta.to.global.u64 %rd4, %rd2; cvt.s64.s32 %rd1, %r1; @@ -4679,10 +4824,10 @@ BB32_5: mov.b64 {%temp, %r8}, %fd1; } setp.lt.s32 %p2, %r8, 1071801958; - @%p2 bra BB33_9; - bra.uni BB33_2; + @%p2 bra BB35_9; + bra.uni BB35_2; -BB33_9: +BB35_9: mul.f64 %fd62, %fd1, %fd1; mov.f64 %fd63, 0dBFB3823B180754AF; mov.f64 %fd64, 0d3FB0066BDC1895E9; @@ -4712,14 +4857,14 @@ BB33_9: mul.f64 %fd88, %fd62, %fd87; fma.rn.f64 %fd10, 
%fd88, %fd1, %fd1; setp.lt.s32 %p6, %r2, 0; - @%p6 bra BB33_11; + @%p6 bra BB35_11; mov.f64 %fd89, 0dBC91A62633145C07; add.rn.f64 %fd90, %fd10, %fd89; neg.f64 %fd93, %fd90; - bra.uni BB33_12; + bra.uni BB35_12; -BB33_2: +BB35_2: mov.f64 %fd19, 0d3FF0000000000000; sub.f64 %fd2, %fd19, %fd1; { @@ -4755,7 +4900,7 @@ BB33_2: fma.rn.f64 %fd28, %fd24, %fd25, %fd18; fma.rn.f64 %fd3, %fd28, %fd27, %fd24; setp.lt.s32 %p3, %r3, 1; - @%p3 bra BB33_4; + @%p3 bra BB35_4; { .reg .b32 %temp; @@ -4794,31 +4939,31 @@ BB33_2: fma.rn.f64 %fd54, %fd52, %fd2, %fd53; mul.f64 %fd55, %fd2, %fd54; fma.rn.f64 %fd94, %fd55, %fd29, %fd29; - bra.uni BB33_5; + bra.uni BB35_5; -BB33_11: +BB35_11: mov.f64 %fd91, 0d3C91A62633145C07; add.rn.f64 %fd93, %fd10, %fd91; -BB33_12: +BB35_12: mov.f64 %fd92, 0d3FF921FB54442D18; add.rn.f64 %fd94, %fd92, %fd93; - bra.uni BB33_13; + bra.uni BB35_13; -BB33_4: +BB35_4: mov.f64 %fd56, 0d0000000000000000; mul.rn.f64 %fd94, %fd1, %fd56; -BB33_5: +BB35_5: setp.gt.s32 %p4, %r3, -1; - @%p4 bra BB33_7; + @%p4 bra BB35_7; mov.f64 %fd57, 0d7FF0000000000000; mul.rn.f64 %fd94, %fd94, %fd57; -BB33_7: +BB35_7: setp.gt.s32 %p5, %r2, -1; - @%p5 bra BB33_13; + @%p5 bra BB35_13; mov.f64 %fd58, 0dBCA1A62633145C07; add.rn.f64 %fd59, %fd94, %fd58; @@ -4826,13 +4971,13 @@ BB33_7: mov.f64 %fd61, 0d400921FB54442D18; add.rn.f64 %fd94, %fd61, %fd60; -BB33_13: +BB35_13: cvta.to.global.u64 %rd7, %rd3; shl.b64 %rd8, %rd1, 3; add.s64 %rd9, %rd7, %rd8; st.global.f64 [%rd9], %fd94; -BB33_14: +BB35_14: ret; } @@ -4857,7 +5002,7 @@ BB33_14: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.u32 %p1, %r1, %r2; - @%p1 bra BB34_4; + @%p1 bra BB36_4; cvta.to.global.u64 %rd4, %rd2; cvt.s64.s32 %rd1, %r1; @@ -4867,7 +5012,7 @@ BB33_14: abs.f64 %fd2, %fd1; setp.leu.f64 %p2, %fd2, 0d3FF0000000000000; mov.f64 %fd56, %fd2; - @%p2 bra BB34_3; + @%p2 bra BB36_3; // inline asm rcp.approx.ftz.f64 %fd5,%fd2; @@ -4881,7 +5026,7 @@ BB33_14: selp.f64 %fd3, 0d0000000000000000, %fd11, %p3; mov.f64 %fd56, 
%fd3; -BB34_3: +BB36_3: mov.f64 %fd4, %fd56; cvta.to.global.u64 %rd7, %rd3; mul.f64 %fd12, %fd4, %fd4; @@ -4947,7 +5092,7 @@ BB34_3: add.s64 %rd9, %rd7, %rd8; st.global.f64 [%rd9], %fd55; -BB34_4: +BB36_4: ret; } @@ -4972,7 +5117,7 @@ BB34_4: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.u32 %p1, %r1, %r2; - @%p1 bra BB35_4; + @%p1 bra BB37_4; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 8; @@ -4981,15 +5126,15 @@ BB34_4: setp.eq.f64 %p2, %fd1, 0d0000000000000000; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd1, %rd7, %rd5; - @%p2 bra BB35_3; - bra.uni BB35_2; + @%p2 bra BB37_3; + bra.uni BB37_2; -BB35_3: +BB37_3: mov.u64 %rd8, 0; st.global.u64 [%rd1], %rd8; - bra.uni BB35_4; + bra.uni BB37_4; -BB35_2: +BB37_2: { .reg .b32 %temp; mov.b64 {%temp, %r6}, %fd1; @@ -5009,7 +5154,7 @@ BB35_2: mov.b64 %fd3, {%r11, %r10}; st.global.f64 [%rd1], %fd3; -BB35_4: +BB37_4: ret; } @@ -5018,7 +5163,7 @@ BB35_4: .param .b64 __internal_trig_reduction_slowpathd_param_1 ) { - .local .align 8 .b8 __local_depot36[40]; + .local .align 8 .b8 __local_depot38[40]; .reg .b64 %SP; .reg .b64 %SPL; .reg .pred %p<9>; @@ -5027,7 +5172,7 @@ BB35_4: .reg .b64 %rd<101>; - mov.u64 %rd100, __local_depot36; + mov.u64 %rd100, __local_depot38; cvta.local.u64 %SP, %rd100; ld.param.f64 %fd4, [__internal_trig_reduction_slowpathd_param_0]; ld.param.u64 %rd37, [__internal_trig_reduction_slowpathd_param_1]; @@ -5041,7 +5186,7 @@ BB35_4: shr.u32 %r3, %r1, 20; bfe.u32 %r4, %r1, 20, 11; setp.eq.s32 %p1, %r4, 2047; - @%p1 bra BB36_13; + @%p1 bra BB38_13; add.s32 %r16, %r4, -1024; shr.u32 %r17, %r16, 6; @@ -5054,7 +5199,7 @@ BB35_4: setp.gt.s32 %p2, %r5, %r6; mov.u64 %rd94, 0; mov.u64 %rd93, %rd1; - @%p2 bra BB36_4; + @%p2 bra BB38_4; mov.b64 %rd41, %fd4; shl.b64 %rd42, %rd41, 11; @@ -5073,7 +5218,7 @@ BB35_4: mov.u64 %rd91, %rd1; mov.u32 %r39, %r7; -BB36_3: +BB38_3: .pragma "nounroll"; mov.u32 %r8, %r39; mov.u64 %rd7, %rd91; @@ -5108,15 +5253,15 @@ BB36_3: setp.lt.s32 %p3, %r9, %r6; mov.u64 
%rd91, %rd13; mov.u32 %r39, %r9; - @%p3 bra BB36_3; + @%p3 bra BB38_3; -BB36_4: +BB38_4: st.local.u64 [%rd93], %rd94; ld.local.u64 %rd95, [%rd1+16]; ld.local.u64 %rd96, [%rd1+24]; and.b32 %r10, %r3, 63; setp.eq.s32 %p4, %r10, 0; - @%p4 bra BB36_6; + @%p4 bra BB38_6; mov.u32 %r27, 64; sub.s32 %r28, %r27, %r10; @@ -5128,7 +5273,7 @@ BB36_4: shr.u64 %rd56, %rd55, %r28; or.b64 %rd95, %rd56, %rd54; -BB36_6: +BB38_6: cvta.to.local.u64 %rd57, %rd37; shr.u64 %rd58, %rd96, 62; cvt.u32.u64 %r29, %rd58; @@ -5145,7 +5290,7 @@ BB36_6: selp.b32 %r34, %r32, %r33, %p5; st.local.u32 [%rd57], %r34; setp.eq.s32 %p6, %r31, 0; - @%p6 bra BB36_8; + @%p6 bra BB38_8; mov.u64 %rd65, 0; // inline asm @@ -5165,10 +5310,10 @@ BB36_6: // inline asm xor.b32 %r40, %r40, -2147483648; -BB36_8: +BB38_8: clz.b64 %r41, %rd98; setp.eq.s32 %p7, %r41, 0; - @%p7 bra BB36_10; + @%p7 bra BB38_10; shl.b64 %rd68, %rd98, %r41; mov.u32 %r35, 64; @@ -5176,7 +5321,7 @@ BB36_8: shr.u64 %rd69, %rd97, %r36; or.b64 %rd98, %rd69, %rd68; -BB36_10: +BB38_10: mov.u64 %rd73, -3958705157555305931; // inline asm { @@ -5197,7 +5342,7 @@ BB36_10: } // inline asm setp.lt.s64 %p8, %rd99, 1; - @%p8 bra BB36_12; + @%p8 bra BB38_12; // inline asm { @@ -5216,7 +5361,7 @@ BB36_10: // inline asm add.s32 %r41, %r41, 1; -BB36_12: +BB38_12: cvt.u64.u32 %rd80, %r40; shl.b64 %rd81, %rd80, 32; mov.u32 %r37, 1022; @@ -5231,7 +5376,7 @@ BB36_12: or.b64 %rd89, %rd88, %rd81; mov.b64 %fd4, %rd89; -BB36_13: +BB38_13: st.param.f64 [func_retval0+0], %fd4; ret; } @@ -5259,7 +5404,7 @@ BB36_13: } shr.u32 %r50, %r49, 20; setp.ne.s32 %p1, %r50, 0; - @%p1 bra BB37_2; + @%p1 bra BB39_2; mul.f64 %fd14, %fd12, 0d4350000000000000; { @@ -5273,13 +5418,13 @@ BB36_13: shr.u32 %r16, %r49, 20; add.s32 %r50, %r16, -54; -BB37_2: +BB39_2: add.s32 %r51, %r50, -1023; and.b32 %r17, %r49, -2146435073; or.b32 %r18, %r17, 1072693248; mov.b64 %fd132, {%r48, %r18}; setp.lt.u32 %p2, %r18, 1073127583; - @%p2 bra BB37_4; + @%p2 bra BB39_4; { .reg .b32 %temp; @@ -5293,7 
+5438,7 @@ BB37_2: mov.b64 %fd132, {%r19, %r21}; add.s32 %r51, %r50, -1022; -BB37_4: +BB39_4: add.f64 %fd16, %fd132, 0d3FF0000000000000; // inline asm rcp.approx.ftz.f64 %fd15,%fd16; @@ -5458,13 +5603,13 @@ BB37_4: mov.b32 %f2, %r35; abs.f32 %f1, %f2; setp.lt.f32 %p4, %f1, 0f4086232B; - @%p4 bra BB37_7; + @%p4 bra BB39_7; setp.lt.f64 %p5, %fd4, 0d0000000000000000; add.f64 %fd129, %fd4, 0d7FF0000000000000; selp.f64 %fd133, 0d0000000000000000, %fd129, %p5; setp.geu.f32 %p6, %f1, 0f40874800; - @%p6 bra BB37_7; + @%p6 bra BB39_7; shr.u32 %r36, %r13, 31; add.s32 %r37, %r13, %r36; @@ -5479,26 +5624,26 @@ BB37_4: mov.b64 %fd131, {%r44, %r43}; mul.f64 %fd133, %fd130, %fd131; -BB37_7: +BB39_7: { .reg .b32 %temp; mov.b64 {%temp, %r45}, %fd133; } and.b32 %r46, %r45, 2147483647; setp.ne.s32 %p7, %r46, 2146435072; - @%p7 bra BB37_9; + @%p7 bra BB39_9; { .reg .b32 %temp; mov.b64 {%r47, %temp}, %fd133; } setp.eq.s32 %p8, %r47, 0; - @%p8 bra BB37_10; + @%p8 bra BB39_10; -BB37_9: +BB39_9: fma.rn.f64 %fd133, %fd133, %fd5, %fd133; -BB37_10: +BB39_10: st.param.f64 [func_retval0+0], %fd133; ret; } http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/hops/BinaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/BinaryOp.java b/src/main/java/org/apache/sysml/hops/BinaryOp.java index 6175621..9155203 100644 --- a/src/main/java/org/apache/sysml/hops/BinaryOp.java +++ b/src/main/java/org/apache/sysml/hops/BinaryOp.java @@ -23,24 +23,26 @@ import org.apache.sysml.api.DMLScript; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.hops.rewrite.HopRewriteUtils; import org.apache.sysml.lops.Aggregate; +import org.apache.sysml.lops.Append; +import org.apache.sysml.lops.AppendG; import org.apache.sysml.lops.AppendGAlignedSP; import org.apache.sysml.lops.AppendM; -import org.apache.sysml.lops.AppendCP; -import org.apache.sysml.lops.AppendG; import 
org.apache.sysml.lops.AppendR; import org.apache.sysml.lops.Binary; -import org.apache.sysml.lops.BinaryScalar; import org.apache.sysml.lops.BinaryM; +import org.apache.sysml.lops.BinaryScalar; import org.apache.sysml.lops.BinaryUAggChain; import org.apache.sysml.lops.CentralMoment; import org.apache.sysml.lops.CoVariance; import org.apache.sysml.lops.CombineBinary; +import org.apache.sysml.lops.CombineBinary.OperationTypes; import org.apache.sysml.lops.CombineUnary; import org.apache.sysml.lops.ConvolutionTransform; import org.apache.sysml.lops.Data; import org.apache.sysml.lops.DataPartition; import org.apache.sysml.lops.Group; import org.apache.sysml.lops.Lop; +import org.apache.sysml.lops.LopProperties.ExecType; import org.apache.sysml.lops.LopsException; import org.apache.sysml.lops.PartialAggregate; import org.apache.sysml.lops.PickByCount; @@ -48,8 +50,6 @@ import org.apache.sysml.lops.RepMat; import org.apache.sysml.lops.SortKeys; import org.apache.sysml.lops.Unary; import org.apache.sysml.lops.UnaryCP; -import org.apache.sysml.lops.CombineBinary.OperationTypes; -import org.apache.sysml.lops.LopProperties.ExecType; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat; @@ -527,15 +527,20 @@ public class BinaryOp extends Hop } else //CP { + if (DMLScript.USE_ACCELERATOR && dt1 == DataType.MATRIX && (DMLScript.FORCE_ACCELERATOR + || getMemEstimate() < GPUContextPool.initialGPUMemBudget())) { + et = ExecType.GPU; + } + Lop offset = createOffsetLop( getInput().get(0), cbind ); //offset 1st input - append = new AppendCP(getInput().get(0).constructLops(), getInput().get(1).constructLops(), offset, getDataType(), getValueType(), cbind); + append = new Append(getInput().get(0).constructLops(), getInput().get(1).constructLops(), offset, getDataType(), getValueType(), cbind, et); append.getOutputParameters().setDimensions(rlen, 
clen, getRowsInBlock(), getColsInBlock(), getNnz()); } } else //SCALAR-STRING and SCALAR-STRING (always CP) { - append = new AppendCP(getInput().get(0).constructLops(), getInput().get(1).constructLops(), - Data.createLiteralLop(ValueType.INT, "-1"), getDataType(), getValueType(), cbind); + append = new Append(getInput().get(0).constructLops(), getInput().get(1).constructLops(), + Data.createLiteralLop(ValueType.INT, "-1"), getDataType(), getValueType(), cbind, ExecType.CP); append.getOutputParameters().setDimensions(0,0,-1,-1,-1); } http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/lops/Append.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/lops/Append.java b/src/main/java/org/apache/sysml/lops/Append.java new file mode 100644 index 0000000..e224e51 --- /dev/null +++ b/src/main/java/org/apache/sysml/lops/Append.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysml.lops; + +import org.apache.sysml.lops.LopProperties.ExecLocation; +import org.apache.sysml.lops.LopProperties.ExecType; +import org.apache.sysml.lops.compile.JobType; +import org.apache.sysml.parser.Expression.*; + + +public class Append extends Lop +{ + public static final String OPCODE = "append"; + + private boolean _cbind = true; + private ExecType _et; + + public Append(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt, boolean cbind, ExecType et) + { + super(Lop.Type.Append, dt, vt); + _et = et; + init(input1, input2, input3, dt, vt); + + _cbind = cbind; + } + + public void init(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt) + { + addInput(input1); + input1.addOutput(this); + + addInput(input2); + input2.addOutput(this); + + addInput(input3); + input3.addOutput(this); + + boolean breaksAlignment = false; + boolean aligner = false; + boolean definesMRJob = false; + + lps.addCompatibility(JobType.INVALID); + lps.setProperties( inputs, _et, ExecLocation.ControlProgram, breaksAlignment, aligner, definesMRJob ); + } + + @Override + public String toString() { + + return " Append: "; + } + + //called when append executes in CP + public String getInstructions(String input1, String input2, String input3, String output) + throws LopsException + { + StringBuilder sb = new StringBuilder(); + sb.append( getExecType() ); + sb.append( OPERAND_DELIMITOR ); + sb.append( "append" ); + + sb.append( OPERAND_DELIMITOR ); + sb.append( getInputs().get(0).prepInputOperand(input1)); + + sb.append( OPERAND_DELIMITOR ); + sb.append( getInputs().get(1).prepInputOperand(input2)); + + sb.append( OPERAND_DELIMITOR ); + sb.append( getInputs().get(2).prepScalarInputOperand(getExecType())); + + sb.append( OPERAND_DELIMITOR ); + sb.append( prepOutputOperand(output) ); + + sb.append( OPERAND_DELIMITOR ); + sb.append( _cbind ); + + return sb.toString(); + } +} 
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/lops/AppendCP.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/lops/AppendCP.java b/src/main/java/org/apache/sysml/lops/AppendCP.java deleted file mode 100644 index e76f21e..0000000 --- a/src/main/java/org/apache/sysml/lops/AppendCP.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysml.lops; - -import org.apache.sysml.lops.LopProperties.ExecLocation; -import org.apache.sysml.lops.LopProperties.ExecType; -import org.apache.sysml.lops.compile.JobType; -import org.apache.sysml.parser.Expression.*; - - -public class AppendCP extends Lop -{ - public static final String OPCODE = "append"; - - private boolean _cbind = true; - - public AppendCP(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt, boolean cbind) - { - super(Lop.Type.Append, dt, vt); - init(input1, input2, input3, dt, vt); - - _cbind = cbind; - } - - public void init(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt) - { - addInput(input1); - input1.addOutput(this); - - addInput(input2); - input2.addOutput(this); - - addInput(input3); - input3.addOutput(this); - - boolean breaksAlignment = false; - boolean aligner = false; - boolean definesMRJob = false; - - lps.addCompatibility(JobType.INVALID); - lps.setProperties( inputs, ExecType.CP, ExecLocation.ControlProgram, breaksAlignment, aligner, definesMRJob ); - } - - @Override - public String toString() { - - return " AppendCP: "; - } - - //called when append executes in CP - public String getInstructions(String input1, String input2, String input3, String output) - throws LopsException - { - StringBuilder sb = new StringBuilder(); - sb.append( getExecType() ); - sb.append( OPERAND_DELIMITOR ); - sb.append( "append" ); - - sb.append( OPERAND_DELIMITOR ); - sb.append( getInputs().get(0).prepInputOperand(input1)); - - sb.append( OPERAND_DELIMITOR ); - sb.append( getInputs().get(1).prepInputOperand(input2)); - - sb.append( OPERAND_DELIMITOR ); - sb.append( getInputs().get(2).prepScalarInputOperand(getExecType())); - - sb.append( OPERAND_DELIMITOR ); - sb.append( prepOutputOperand(output) ); - - sb.append( OPERAND_DELIMITOR ); - sb.append( _cbind ); - - return sb.toString(); - } -} 
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java index 7088c50..e755fa0 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java @@ -22,7 +22,7 @@ package org.apache.sysml.runtime.instructions; import java.util.HashMap; -import org.apache.sysml.lops.AppendCP; +import org.apache.sysml.lops.Append; import org.apache.sysml.lops.DataGen; import org.apache.sysml.lops.LopProperties.ExecType; import org.apache.sysml.lops.UnaryCP; @@ -239,7 +239,7 @@ public class CPInstructionParser extends InstructionParser // User-defined function Opcodes String2CPInstructionType.put( "extfunct" , CPINSTRUCTION_TYPE.External); - String2CPInstructionType.put( AppendCP.OPCODE, CPINSTRUCTION_TYPE.Append); + String2CPInstructionType.put( Append.OPCODE, CPINSTRUCTION_TYPE.Append); // data generation opcodes String2CPInstructionType.put( DataGen.RAND_OPCODE , CPINSTRUCTION_TYPE.Rand); http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java index 17b1578..36f57b4 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java @@ -33,6 +33,7 @@ import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction; import 
org.apache.sysml.runtime.instructions.gpu.RelationalBinaryGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.ReorgGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.AggregateUnaryGPUInstruction; +import org.apache.sysml.runtime.instructions.gpu.MatrixAppendGPUInstruction; public class GPUInstructionParser extends InstructionParser { @@ -52,12 +53,15 @@ public class GPUInstructionParser extends InstructionParser String2GPUInstructionType.put( "bias_multiply", GPUINSTRUCTION_TYPE.Convolution); // Matrix Multiply Operators - String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary); - String2GPUInstructionType.put( "tsmm", GPUINSTRUCTION_TYPE.MMTSJ); + String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary); + String2GPUInstructionType.put( "tsmm", GPUINSTRUCTION_TYPE.MMTSJ); // Reorg/Transpose - String2GPUInstructionType.put( "r'", GPUINSTRUCTION_TYPE.Reorg); - + String2GPUInstructionType.put( "r'", GPUINSTRUCTION_TYPE.Reorg); + + // Matrix Manipulation + String2GPUInstructionType.put( "append", GPUINSTRUCTION_TYPE.Append); + // Binary Cellwise String2GPUInstructionType.put( "+", GPUINSTRUCTION_TYPE.ArithmeticBinary); String2GPUInstructionType.put( "-", GPUINSTRUCTION_TYPE.ArithmeticBinary); @@ -161,7 +165,10 @@ public class GPUInstructionParser extends InstructionParser case BuiltinBinary: return BuiltinBinaryGPUInstruction.parseInstruction(str); - + + case Append: + return MatrixAppendGPUInstruction.parseInstruction(str); + case Convolution: return ConvolutionGPUInstruction.parseInstruction(str); http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java 
index 7529b05..e1c163d 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinUnaryGPUInstruction.java @@ -43,7 +43,7 @@ public abstract class BuiltinUnaryGPUInstruction extends GPUInstruction { _gputype = GPUINSTRUCTION_TYPE.BuiltinUnary; this._arity = _arity; _input = in; - _output = out; + _output = out; } public int getArity() { http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java index 7f981eb..a5388cb 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java @@ -39,6 +39,7 @@ public abstract class GPUInstruction extends Instruction Convolution, MMTSJ, Reorg, + Append, ArithmeticBinary, BuiltinUnary, BuiltinBinary, @@ -101,6 +102,8 @@ public abstract class GPUInstruction extends Instruction public final static String MISC_TIMER_ACOS_KERNEL = "acosk"; // time spent in the acos kernel public final static String MISC_TIMER_ATAN_KERNEL = "atank"; // time spent in the atan kernel public final static String MISC_TIMER_SIGN_KERNEL = "signk"; // time spent in the sign kernel + public final static String MISC_TIMER_CBIND_KERNEL = "cbindk"; // time spent in the cbind kernel + public final static String MISC_TIMER_RBIND_KERNEL = "rbindk"; // time spent in the rbind kernel public final static String MISC_TIMER_DAXPY_MV_KERNEL = "daxpymv";// time spent in the daxpy_matrix_vector kernel public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL = "u2lk"; // time spent in the copy_u2l_dense kernel 
http://git-wip-us.apache.org/repos/asf/systemml/blob/4e47b5e1/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixAppendGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixAppendGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixAppendGPUInstruction.java new file mode 100644 index 0000000..7671d7d --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixAppendGPUInstruction.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysml.runtime.instructions.gpu; + +import org.apache.sysml.parser.Expression; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysml.runtime.functionobjects.OffsetColumnIndex; +import org.apache.sysml.runtime.instructions.InstructionUtils; +import org.apache.sysml.runtime.instructions.cp.AppendCPInstruction; +import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.instructions.cp.FrameAppendCPInstruction; +import org.apache.sysml.runtime.instructions.cp.MatrixAppendCPInstruction; +import org.apache.sysml.runtime.instructions.cp.ScalarAppendCPInstruction; +import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; +import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.operators.Operator; +import org.apache.sysml.runtime.matrix.operators.ReorgOperator; +import org.apache.sysml.utils.GPUStatistics; + +/** + * Implements the cbind and rbind functions for matrices + */ +public class MatrixAppendGPUInstruction extends GPUInstruction { + + CPOperand output; + CPOperand input1, input2; + AppendCPInstruction.AppendType type; + + public MatrixAppendGPUInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand out, AppendCPInstruction.AppendType type, String opcode, String istr) { + super(op, opcode, istr); + this.output = out; + this.input1 = in1; + this.input2 = in2; + this.type = type; + } + + public static MatrixAppendGPUInstruction parseInstruction ( String str ) + throws DMLRuntimeException + { + String[] parts = InstructionUtils.getInstructionPartsWithValueType(str); + InstructionUtils.checkNumFields (parts, 5); + + String opcode = parts[0]; + CPOperand in1 = new CPOperand(parts[1]); + CPOperand in2 = new CPOperand(parts[2]); + CPOperand in3 = new CPOperand(parts[3]); + CPOperand out = 
new CPOperand(parts[4]); + boolean cbind = Boolean.parseBoolean(parts[5]); + + AppendCPInstruction.AppendType type = (in1.getDataType()!= Expression.DataType.MATRIX && in1.getDataType()!= Expression.DataType.FRAME) ? + AppendCPInstruction.AppendType.STRING : cbind ? AppendCPInstruction.AppendType.CBIND : AppendCPInstruction.AppendType.RBIND; + + if (in1.getDataType()!= Expression.DataType.MATRIX || in2.getDataType()!= Expression.DataType.MATRIX){ + throw new DMLRuntimeException("GPU : Error in internal state - Append was called on data other than matrices"); + } + + if(!opcode.equalsIgnoreCase("append")) + throw new DMLRuntimeException("Unknown opcode while parsing a AppendCPInstruction: " + str); + + Operator op = new ReorgOperator(OffsetColumnIndex.getOffsetColumnIndexFnObject(-1)); + return new MatrixAppendGPUInstruction(op, in1, in2, out, type, opcode, str); + } + + @Override + public void processInstruction(ExecutionContext ec) throws DMLRuntimeException { + GPUStatistics.incrementNoOfExecutedGPUInst(); + + String opcode = getOpcode(); + MatrixObject mat1 = getMatrixInputForGPUInstruction(ec, input1.getName()); + MatrixObject mat2 = getMatrixInputForGPUInstruction(ec, input2.getName()); + + if(type == AppendCPInstruction.AppendType.CBIND) { + LibMatrixCUDA.cbind(ec, ec.getGPUContext(0), getExtendedOpcode(), mat1, mat2, output.getName()); + } else if (type == AppendCPInstruction.AppendType.RBIND ) { + LibMatrixCUDA.rbind(ec, ec.getGPUContext(0), getExtendedOpcode(), mat1, mat2, output.getName()); + } else { + throw new DMLRuntimeException("Unsupported GPU operator:" + opcode); + } + ec.releaseMatrixInputForGPUInstruction(input1.getName()); + ec.releaseMatrixInputForGPUInstruction(input2.getName()); + ec.releaseMatrixOutputForGPUInstruction(output.getName()); + } +}
