Repository: incubator-systemml Updated Branches: refs/heads/master 6963f5e10 -> 41c513151
[SYSTEMML-1039] Added uark+/uar+ Closes #326. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/41c51315 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/41c51315 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/41c51315 Branch: refs/heads/master Commit: 41c513151941942e785ae882adb4ed92f72fd471 Parents: 6963f5e Author: Nakul Jindal <[email protected]> Authored: Thu Jan 5 15:56:25 2017 -0800 Committer: Niketan Pansare <[email protected]> Committed: Thu Jan 5 15:56:24 2017 -0800
----------------------------------------------------------------------
 src/main/cpp/kernels/SystemML.cu | 77 +-
 src/main/cpp/kernels/SystemML.ptx | 1824 ++++++++++++------
 .../java/org/apache/sysml/hops/AggUnaryOp.java | 2 +-
 .../instructions/GPUInstructionParser.java | 13 +-
 .../context/AggregateUnaryGPUInstruction.java | 8 +-
 .../instructions/gpu/context/GPUContext.java | 18 +-
 .../instructions/gpu/context/JCudaContext.java | 38 +-
 .../runtime/matrix/data/LibMatrixCUDA.java | 278 +--
 8 files changed, 1489 insertions(+), 769 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 5e5fd5e..11a337c 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -183,7 +183,20 @@ __global__ void fill(double* A, double scalar, int lenA) {
 }
 
-
+/**
+ * Does a reduce (sum) over all elements of the array.
+ * This method has been adapted from the Reduction sample in the NVIDIA CUDA Samples (v8.0)
+ * and the Reduction example available through jcuda.org.
+ * When invoked initially, all blocks partly compute the reduction operation over the entire array
+ * and write their partial results to the output/temporary array. A second invocation needs to
+ * happen to get the final reduced value.
+ * The number of threads and blocks and the amount of shared memory are calculated in a specific way.
+ * Please refer to the NVIDIA CUDA Sample or the SystemML code that invokes this method to see
+ * how it is done.
+ * @param g_idata input data stored in device memory (of size n)
+ * @param g_odata output/temporary array stored in device memory (of size n)
+ * @param n size of the input and temporary/output arrays
+ */
 extern "C"
 __global__ void reduce(double *g_idata, double *g_odata, unsigned int n)
 {
@@ -237,3 +250,65 @@ __global__ void reduce(double *g_idata, double *g_odata, unsigned int n)
     if (tid == 0)
         g_odata[blockIdx.x] = sdata[0];
 }
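As a concrete illustration of the two-pass protocol described in the comment above, here is a minimal host-side sketch in CUDA C++. It is not taken from this commit: the names hostReduceSketch, d_in, d_tmp and the 512-thread configuration are assumptions (SystemML's actual launch logic lives in LibMatrixCUDA.java).

// Illustrative two-pass driver for the reduce kernel above (a sketch, not the patch's code).
// Pass 1: every block folds a grid-stride chunk of d_in into one partial sum per block.
// Pass 2: a single block folds those per-block partials into d_tmp[0].
extern "C" __global__ void reduce(double *g_idata, double *g_odata, unsigned int n);

void hostReduceSketch(double *d_in, double *d_tmp, unsigned int n) {
    const unsigned int threads = 512;                            // power of two, at most 1024
    unsigned int blocks = (n + threads * 2 - 1) / (threads * 2); // each thread starts on two elements
    size_t shmem = threads * sizeof(double);                     // backs extern __shared__ double sdata[]
    reduce<<<blocks, threads, shmem>>>(d_in, d_tmp, n);          // first invocation: one partial sum per block
    reduce<<<1, threads, shmem>>>(d_tmp, d_tmp, blocks);         // second invocation: final sum in d_tmp[0]
}

The reduce_row kernel that follows uses the same shared-memory tree but a simpler launch: one block per row, so no second pass is needed.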
+
+
+/**
+ * Does a reduce (sum) over each row of the array.
+ * The intuition for this kernel is that each block does a reduction over a single row.
+ * The maximum number of blocks per grid dimension (2^31 - 1 as of compute capability 3.0)
+ * therefore bounds the number of rows that can be reduced in a single launch.
+ * @param g_idata input matrix stored in device memory
+ * @param g_odata output vector of size [rows * 1] in device memory
+ * @param rows number of rows in input matrix
+ * @param cols number of columns in input matrix
+ */
+extern "C"
+__global__ void reduce_row(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols)
+{
+    extern __shared__ double sdata[];
+
+    // one block per row
+    if (blockIdx.x >= rows) {
+        return;
+    }
+
+    unsigned int block = blockIdx.x;
+    unsigned int tid = threadIdx.x;
+    unsigned int i = tid;
+    unsigned int block_offset = block * cols;
+
+    double mySum = 0;
+    while (i < cols){
+        mySum += g_idata[block_offset + i];
+        i += blockDim.x;
+    }
+
+    // each thread puts its local sum into shared memory
+    sdata[tid] = mySum;
+    __syncthreads();
+
+
+    // do reduction in shared mem
+    if (blockDim.x >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
+    if (blockDim.x >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
+    if (blockDim.x >= 128) { if (tid <  64) { sdata[tid] = mySum = mySum + sdata[tid +  64]; } __syncthreads(); }
+
+    if (tid < 32)
+    {
+        // now that we are using warp-synchronous programming (below)
+        // we need to declare our shared memory volatile so that the compiler
+        // doesn't reorder stores to it and induce incorrect behavior.
+        volatile double* smem = sdata;
+        if (blockDim.x >= 64) { smem[tid] = mySum = mySum + smem[tid + 32]; }
+        if (blockDim.x >= 32) { smem[tid] = mySum = mySum + smem[tid + 16]; }
+        if (blockDim.x >= 16) { smem[tid] = mySum = mySum + smem[tid +  8]; }
+        if (blockDim.x >=  8) { smem[tid] = mySum = mySum + smem[tid +  4]; }
+        if (blockDim.x >=  4) { smem[tid] = mySum = mySum + smem[tid +  2]; }
+        if (blockDim.x >=  2) { smem[tid] = mySum = mySum + smem[tid +  1]; }
+    }
+
+    // write result for this block to global mem
+    if (tid == 0)
+        g_odata[block] = sdata[0];
+}
+
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index ea27ac0..0683492 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,16 +1,16 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21124049
-// Cuda compilation tools, release 8.0, V8.0.44
+// Compiler Build ID: CL-19856038
+// Cuda compilation tools, release 7.5, V7.5.17
 // Based on LLVM 3.4svn
 //
 
-.version 5.0
-.target sm_20
+.version 4.3
+.target sm_30
 .address_size 64
 
-	// .globl	copyUpperToLowerTriangleDense
+	// .globl	getBoolean
 .func  (.param .b64 func_retval0) __internal_accurate_pow
 (
 	.param .b64 __internal_accurate_pow_param_0,
 	.param .b64 __internal_accurate_pow_param_1
@@ -19,6 +19,307 @@
 ;
 .extern .shared .align 8 .b8 sdata[];
 
+.visible .func  (.param .b64 func_retval0) getBoolean(
+	.param .b32 getBoolean_param_0
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .b32 	%r<2>;
+	.reg .f64 	%fd<2>;
+
+
+	ld.param.u32 	%r1, [getBoolean_param_0];
+	setp.eq.s32	%p1, %r1, 0;
+	selp.f64	%fd1, 0d0000000000000000, 0d3FF0000000000000, %p1;
+	st.param.f64	[func_retval0+0], %fd1;
+	ret;
+}
+
+	// .globl	binaryOp
+.visible .func  (.param .b64 func_retval0) binaryOp(
+	.param .b64 binaryOp_param_0,
+	.param .b64 binaryOp_param_1,
+	.param .b32 binaryOp_param_2
+)
+{
+	.reg .pred 	%p<41>;
+	.reg .b32 	%r<30>;
+	.reg .f64 	%fd<40>;
+	.reg .b64 	%rd<3>;
+
+
+	ld.param.f64 	%fd26, [binaryOp_param_0];
+	ld.param.f64 
%fd27, [binaryOp_param_1]; + ld.param.u32 %r3, [binaryOp_param_2]; + setp.eq.s32 %p2, %r3, 0; + @%p2 bra BB1_40; + + setp.eq.s32 %p3, %r3, 1; + @%p3 bra BB1_39; + bra.uni BB1_2; + +BB1_39: + sub.f64 %fd39, %fd26, %fd27; + bra.uni BB1_41; + +BB1_40: + add.f64 %fd39, %fd26, %fd27; + bra.uni BB1_41; + +BB1_2: + setp.eq.s32 %p4, %r3, 2; + @%p4 bra BB1_38; + bra.uni BB1_3; + +BB1_38: + mul.f64 %fd39, %fd26, %fd27; + bra.uni BB1_41; + +BB1_3: + setp.eq.s32 %p5, %r3, 3; + @%p5 bra BB1_37; + bra.uni BB1_4; + +BB1_37: + div.rn.f64 %fd39, %fd26, %fd27; + bra.uni BB1_41; + +BB1_4: + setp.eq.s32 %p6, %r3, 4; + @%p6 bra BB1_21; + bra.uni BB1_5; + +BB1_21: + { + .reg .b32 %temp; + mov.b64 {%temp, %r1}, %fd26; + } + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd27; + } + bfe.u32 %r4, %r2, 20, 11; + add.s32 %r5, %r4, -1012; + mov.b64 %rd2, %fd27; + shl.b64 %rd1, %rd2, %r5; + setp.eq.s64 %p21, %rd1, -9223372036854775808; + abs.f64 %fd9, %fd26; + // Callseq Start 0 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.f64 [param0+0], %fd9; + .param .b64 param1; + st.param.f64 [param1+0], %fd27; + .param .b64 retval0; + call.uni (retval0), + __internal_accurate_pow, + ( + param0, + param1 + ); + ld.param.f64 %fd38, [retval0+0]; + + //{ + }// Callseq End 0 + setp.lt.s32 %p22, %r1, 0; + and.pred %p1, %p22, %p21; + @!%p1 bra BB1_23; + bra.uni BB1_22; + +BB1_22: + { + .reg .b32 %temp; + mov.b64 {%temp, %r6}, %fd38; + } + xor.b32 %r7, %r6, -2147483648; + { + .reg .b32 %temp; + mov.b64 {%r8, %temp}, %fd38; + } + mov.b64 %fd38, {%r8, %r7}; + +BB1_23: + mov.f64 %fd37, %fd38; + setp.eq.f64 %p23, %fd26, 0d0000000000000000; + @%p23 bra BB1_26; + bra.uni BB1_24; + +BB1_26: + selp.b32 %r9, %r1, 0, %p21; + or.b32 %r10, %r9, 2146435072; + setp.lt.s32 %p27, %r2, 0; + selp.b32 %r11, %r10, %r9, %p27; + mov.u32 %r12, 0; + mov.b64 %fd37, {%r12, %r11}; + bra.uni BB1_27; + +BB1_5: + setp.eq.s32 %p7, %r3, 5; + @%p7 bra BB1_20; + bra.uni BB1_6; + +BB1_20: + setp.lt.f64 %p20, %fd26, %fd27; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p20; + bra.uni BB1_41; + +BB1_6: + setp.eq.s32 %p8, %r3, 6; + @%p8 bra BB1_19; + bra.uni BB1_7; + +BB1_19: + setp.le.f64 %p19, %fd26, %fd27; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p19; + bra.uni BB1_41; + +BB1_24: + setp.gt.s32 %p24, %r1, -1; + @%p24 bra BB1_27; + + cvt.rzi.f64.f64 %fd29, %fd27; + setp.neu.f64 %p25, %fd29, %fd27; + selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p25; + +BB1_27: + mov.f64 %fd15, %fd37; + add.f64 %fd16, %fd26, %fd27; + { + .reg .b32 %temp; + mov.b64 {%temp, %r13}, %fd16; + } + and.b32 %r14, %r13, 2146435072; + setp.ne.s32 %p28, %r14, 2146435072; + mov.f64 %fd36, %fd15; + @%p28 bra BB1_36; + + setp.gtu.f64 %p29, %fd9, 0d7FF0000000000000; + mov.f64 %fd36, %fd16; + @%p29 bra BB1_36; + + abs.f64 %fd30, %fd27; + setp.gtu.f64 %p30, %fd30, 0d7FF0000000000000; + mov.f64 %fd35, %fd16; + mov.f64 %fd36, %fd35; + @%p30 bra BB1_36; + + and.b32 %r15, %r2, 2147483647; + setp.ne.s32 %p31, %r15, 2146435072; + @%p31 bra BB1_32; + + { + .reg .b32 %temp; + mov.b64 {%r16, %temp}, %fd27; + } + setp.eq.s32 %p32, %r16, 0; + @%p32 bra BB1_35; + +BB1_32: + and.b32 %r17, %r1, 2147483647; + setp.ne.s32 %p33, %r17, 2146435072; + mov.f64 %fd33, %fd15; + mov.f64 %fd36, %fd33; + @%p33 bra BB1_36; + + { + .reg .b32 %temp; + mov.b64 {%r18, %temp}, %fd26; + } + setp.ne.s32 %p34, %r18, 0; + mov.f64 %fd36, %fd15; + @%p34 bra BB1_36; + + shr.s32 %r19, %r2, 31; + and.b32 %r20, %r19, -2146435072; + add.s32 %r21, %r20, 2146435072; + or.b32 %r22, %r21, 
-2147483648; + selp.b32 %r23, %r22, %r21, %p1; + mov.u32 %r24, 0; + mov.b64 %fd36, {%r24, %r23}; + bra.uni BB1_36; + +BB1_7: + setp.eq.s32 %p9, %r3, 7; + @%p9 bra BB1_18; + bra.uni BB1_8; + +BB1_18: + setp.gt.f64 %p18, %fd26, %fd27; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p18; + bra.uni BB1_41; + +BB1_8: + setp.eq.s32 %p10, %r3, 8; + @%p10 bra BB1_17; + bra.uni BB1_9; + +BB1_17: + setp.ge.f64 %p17, %fd26, %fd27; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p17; + bra.uni BB1_41; + +BB1_9: + setp.eq.s32 %p11, %r3, 9; + @%p11 bra BB1_16; + bra.uni BB1_10; + +BB1_16: + setp.eq.f64 %p16, %fd26, %fd27; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p16; + bra.uni BB1_41; + +BB1_10: + setp.eq.s32 %p12, %r3, 10; + @%p12 bra BB1_15; + bra.uni BB1_11; + +BB1_15: + setp.neu.f64 %p15, %fd26, %fd27; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p15; + bra.uni BB1_41; + +BB1_35: + setp.gt.f64 %p35, %fd9, 0d3FF0000000000000; + selp.b32 %r25, 2146435072, 0, %p35; + xor.b32 %r26, %r25, 2146435072; + setp.lt.s32 %p36, %r2, 0; + selp.b32 %r27, %r26, %r25, %p36; + setp.eq.f64 %p37, %fd26, 0dBFF0000000000000; + selp.b32 %r28, 1072693248, %r27, %p37; + mov.u32 %r29, 0; + mov.b64 %fd36, {%r29, %r28}; + +BB1_36: + setp.eq.f64 %p38, %fd27, 0d0000000000000000; + setp.eq.f64 %p39, %fd26, 0d3FF0000000000000; + or.pred %p40, %p39, %p38; + selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p40; + +BB1_41: + st.param.f64 [func_retval0+0], %fd39; + ret; + +BB1_11: + setp.eq.s32 %p13, %r3, 11; + @%p13 bra BB1_14; + bra.uni BB1_12; + +BB1_14: + min.f64 %fd39, %fd26, %fd27; + bra.uni BB1_41; + +BB1_12: + mov.f64 %fd39, 0dC08F380000000000; + setp.ne.s32 %p14, %r3, 12; + @%p14 bra BB1_41; + + max.f64 %fd39, %fd26, %fd27; + bra.uni BB1_41; +} + + // .globl copyUpperToLowerTriangleDense .visible .entry copyUpperToLowerTriangleDense( .param .u64 copyUpperToLowerTriangleDense_param_0, .param .u32 copyUpperToLowerTriangleDense_param_1, @@ -46,10 +347,10 @@ setp.gt.s32 %p1, %r2, %r1; setp.lt.s32 %p2, %r3, %r5; and.pred %p3, %p1, %p2; - @!%p3 bra BB0_2; - bra.uni BB0_1; + @!%p3 bra BB2_2; + bra.uni BB2_1; -BB0_1: +BB2_1: cvta.to.global.u64 %rd2, %rd1; mad.lo.s32 %r12, %r1, %r4, %r2; mul.wide.s32 %rd3, %r12, 8; @@ -59,7 +360,7 @@ BB0_1: add.s64 %rd6, %rd2, %rd5; st.global.f64 [%rd6], %fd1; -BB0_2: +BB2_2: ret; } @@ -92,14 +393,14 @@ BB0_2: mad.lo.s32 %r1, %r8, %r9, %r11; mul.lo.s32 %r12, %r3, %r2; setp.ge.s32 %p1, %r1, %r12; - @%p1 bra BB1_2; + @%p1 bra BB3_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 8; add.s64 %rd4, %rd2, %rd3; st.global.f64 [%rd4], %fd1; -BB1_2: +BB3_2: ret; } @@ -133,10 +434,10 @@ BB1_2: setp.lt.s32 %p1, %r7, %r2; setp.lt.s32 %p2, %r11, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB2_2; - bra.uni BB2_1; + @!%p3 bra BB4_2; + bra.uni BB4_1; -BB2_1: +BB4_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 8; add.s64 %rd5, %rd3, %rd4; @@ -145,7 +446,7 @@ BB2_1: add.s64 %rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd1; -BB2_2: +BB4_2: ret; } @@ -178,10 +479,10 @@ BB2_2: setp.lt.s32 %p1, %r1, %r4; setp.lt.s32 %p2, %r2, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB3_2; - bra.uni BB3_1; + @!%p3 bra BB5_2; + bra.uni BB5_1; -BB3_1: +BB5_1: cvta.to.global.u64 %rd3, %rd1; mad.lo.s32 %r11, %r1, %r3, %r2; mul.wide.s32 %rd4, %r11, 8; @@ -193,7 +494,7 @@ BB3_1: add.s64 %rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd3; -BB3_2: +BB5_2: ret; } @@ -237,10 +538,10 @@ BB3_2: setp.lt.s32 %p1, %r7, %r2; setp.lt.s32 %p2, %r11, %r3; and.pred %p3, %p1, %p2; - @!%p3 bra BB4_6; - 
bra.uni BB4_1; + @!%p3 bra BB6_6; + bra.uni BB6_1; -BB4_1: +BB6_1: cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 8; add.s64 %rd6, %rd4, %rd5; @@ -250,26 +551,26 @@ BB4_1: setp.lt.f64 %p4, %fd8, %fd3; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd1, %rd7, %rd5; - @%p4 bra BB4_5; - bra.uni BB4_2; + @%p4 bra BB6_5; + bra.uni BB6_2; -BB4_5: +BB6_5: st.global.f64 [%rd1], %fd4; - bra.uni BB4_6; + bra.uni BB6_6; -BB4_2: +BB6_2: setp.lt.f64 %p5, %fd1, %fd2; - @%p5 bra BB4_4; - bra.uni BB4_3; + @%p5 bra BB6_4; + bra.uni BB6_3; -BB4_4: +BB6_4: st.global.f64 [%rd1], %fd5; - bra.uni BB4_6; + bra.uni BB6_6; -BB4_3: +BB6_3: st.global.f64 [%rd1], %fd6; -BB4_6: +BB6_6: ret; } @@ -285,9 +586,9 @@ BB4_6: .param .u32 binCellOp_param_7 ) { - .reg .pred %p<50>; - .reg .b32 %r<51>; - .reg .f64 %fd<39>; + .reg .pred %p<52>; + .reg .b32 %r<56>; + .reg .f64 %fd<40>; .reg .b64 %rd<15>; @@ -310,93 +611,93 @@ BB4_6: setp.lt.s32 %p2, %r1, %r14; setp.lt.s32 %p3, %r2, %r10; and.pred %p4, %p2, %p3; - @!%p4 bra BB5_53; - bra.uni BB5_1; + @!%p4 bra BB7_55; + bra.uni BB7_1; -BB5_1: +BB7_1: mad.lo.s32 %r3, %r1, %r10, %r2; setp.eq.s32 %p5, %r11, 1; - mov.u32 %r49, %r1; - @%p5 bra BB5_5; + mov.u32 %r54, %r1; + @%p5 bra BB7_5; setp.ne.s32 %p6, %r11, 2; - mov.u32 %r50, %r3; - @%p6 bra BB5_4; + mov.u32 %r55, %r3; + @%p6 bra BB7_4; - mov.u32 %r50, %r2; + mov.u32 %r55, %r2; -BB5_4: - mov.u32 %r44, %r50; - mov.u32 %r4, %r44; - mov.u32 %r49, %r4; +BB7_4: + mov.u32 %r49, %r55; + mov.u32 %r4, %r49; + mov.u32 %r54, %r4; -BB5_5: - mov.u32 %r5, %r49; +BB7_5: + mov.u32 %r5, %r54; setp.eq.s32 %p7, %r12, 1; - mov.u32 %r47, %r1; - @%p7 bra BB5_9; + mov.u32 %r52, %r1; + @%p7 bra BB7_9; setp.ne.s32 %p8, %r12, 2; - mov.u32 %r48, %r3; - @%p8 bra BB5_8; + mov.u32 %r53, %r3; + @%p8 bra BB7_8; - mov.u32 %r48, %r2; + mov.u32 %r53, %r2; -BB5_8: - mov.u32 %r47, %r48; +BB7_8: + mov.u32 %r52, %r53; -BB5_9: +BB7_9: cvta.to.global.u64 %rd5, %rd3; cvta.to.global.u64 %rd6, %rd2; mul.wide.s32 %rd7, %r5, 8; add.s64 %rd8, %rd6, %rd7; ld.global.f64 %fd1, [%rd8]; - mul.wide.s32 %rd9, %r47, 8; + mul.wide.s32 %rd9, %r52, 8; add.s64 %rd10, %rd5, %rd9; ld.global.f64 %fd2, [%rd10]; - mov.f64 %fd38, 0dC08F380000000000; + mov.f64 %fd39, 0dC08F380000000000; setp.gt.s32 %p9, %r13, 5; - @%p9 bra BB5_19; + @%p9 bra BB7_19; setp.gt.s32 %p19, %r13, 2; - @%p19 bra BB5_15; + @%p19 bra BB7_15; setp.eq.s32 %p23, %r13, 0; - @%p23 bra BB5_51; + @%p23 bra BB7_53; setp.eq.s32 %p24, %r13, 1; - @%p24 bra BB5_50; - bra.uni BB5_13; + @%p24 bra BB7_52; + bra.uni BB7_13; -BB5_50: - sub.f64 %fd38, %fd1, %fd2; - bra.uni BB5_52; +BB7_52: + sub.f64 %fd39, %fd1, %fd2; + bra.uni BB7_54; -BB5_19: +BB7_19: setp.gt.s32 %p10, %r13, 8; - @%p10 bra BB5_24; + @%p10 bra BB7_24; setp.eq.s32 %p16, %r13, 6; - @%p16 bra BB5_34; + @%p16 bra BB7_34; setp.eq.s32 %p17, %r13, 7; - @%p17 bra BB5_33; - bra.uni BB5_22; + @%p17 bra BB7_33; + bra.uni BB7_22; -BB5_33: +BB7_33: setp.gt.f64 %p29, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29; - bra.uni BB5_52; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29; + bra.uni BB7_54; -BB5_15: +BB7_15: setp.eq.s32 %p20, %r13, 3; - @%p20 bra BB5_49; + @%p20 bra BB7_51; setp.eq.s32 %p21, %r13, 4; - @%p21 bra BB5_35; - bra.uni BB5_17; + @%p21 bra BB7_35; + bra.uni BB7_17; -BB5_35: +BB7_35: { .reg .b32 %temp; mov.b64 {%temp, %r8}, %fd1; @@ -411,7 +712,7 @@ BB5_35: shl.b64 %rd1, %rd11, %r22; setp.eq.s64 %p32, %rd1, -9223372036854775808; abs.f64 %fd11, %fd1; - // Callseq Start 0 + // Callseq Start 1 { .reg .b32 temp_param_reg; // <end>} 
@@ -426,133 +727,133 @@ BB5_35: param0, param1 ); - ld.param.f64 %fd37, [retval0+0]; + ld.param.f64 %fd38, [retval0+0]; //{ - }// Callseq End 0 + }// Callseq End 1 setp.lt.s32 %p33, %r8, 0; and.pred %p1, %p33, %p32; - @!%p1 bra BB5_37; - bra.uni BB5_36; + @!%p1 bra BB7_37; + bra.uni BB7_36; -BB5_36: +BB7_36: { .reg .b32 %temp; - mov.b64 {%temp, %r23}, %fd37; + mov.b64 {%temp, %r23}, %fd38; } xor.b32 %r24, %r23, -2147483648; { .reg .b32 %temp; - mov.b64 {%r25, %temp}, %fd37; + mov.b64 {%r25, %temp}, %fd38; } - mov.b64 %fd37, {%r25, %r24}; + mov.b64 %fd38, {%r25, %r24}; -BB5_37: - mov.f64 %fd36, %fd37; +BB7_37: + mov.f64 %fd37, %fd38; setp.eq.f64 %p34, %fd1, 0d0000000000000000; - @%p34 bra BB5_40; - bra.uni BB5_38; + @%p34 bra BB7_40; + bra.uni BB7_38; -BB5_40: +BB7_40: selp.b32 %r26, %r8, 0, %p32; or.b32 %r27, %r26, 2146435072; setp.lt.s32 %p38, %r9, 0; selp.b32 %r28, %r27, %r26, %p38; mov.u32 %r29, 0; - mov.b64 %fd36, {%r29, %r28}; - bra.uni BB5_41; + mov.b64 %fd37, {%r29, %r28}; + bra.uni BB7_41; -BB5_24: +BB7_24: setp.gt.s32 %p11, %r13, 10; - @%p11 bra BB5_28; + @%p11 bra BB7_28; setp.eq.s32 %p14, %r13, 9; - @%p14 bra BB5_32; - bra.uni BB5_26; + @%p14 bra BB7_32; + bra.uni BB7_26; -BB5_32: +BB7_32: setp.eq.f64 %p27, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27; - bra.uni BB5_52; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27; + bra.uni BB7_54; -BB5_28: +BB7_28: setp.eq.s32 %p12, %r13, 11; - @%p12 bra BB5_31; - bra.uni BB5_29; + @%p12 bra BB7_31; + bra.uni BB7_29; -BB5_31: - min.f64 %fd38, %fd1, %fd2; - bra.uni BB5_52; +BB7_31: + min.f64 %fd39, %fd1, %fd2; + bra.uni BB7_54; -BB5_51: - add.f64 %fd38, %fd1, %fd2; - bra.uni BB5_52; +BB7_53: + add.f64 %fd39, %fd1, %fd2; + bra.uni BB7_54; -BB5_13: +BB7_13: setp.eq.s32 %p25, %r13, 2; - @%p25 bra BB5_14; - bra.uni BB5_52; + @%p25 bra BB7_14; + bra.uni BB7_54; -BB5_14: - mul.f64 %fd38, %fd1, %fd2; - bra.uni BB5_52; +BB7_14: + mul.f64 %fd39, %fd1, %fd2; + bra.uni BB7_54; -BB5_34: +BB7_34: setp.le.f64 %p30, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30; - bra.uni BB5_52; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30; + bra.uni BB7_54; -BB5_22: +BB7_22: setp.eq.s32 %p18, %r13, 8; - @%p18 bra BB5_23; - bra.uni BB5_52; + @%p18 bra BB7_23; + bra.uni BB7_54; -BB5_23: +BB7_23: setp.ge.f64 %p28, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28; - bra.uni BB5_52; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28; + bra.uni BB7_54; -BB5_49: - div.rn.f64 %fd38, %fd1, %fd2; - bra.uni BB5_52; +BB7_51: + div.rn.f64 %fd39, %fd1, %fd2; + bra.uni BB7_54; -BB5_17: +BB7_17: setp.eq.s32 %p22, %r13, 5; - @%p22 bra BB5_18; - bra.uni BB5_52; + @%p22 bra BB7_18; + bra.uni BB7_54; -BB5_18: +BB7_18: setp.lt.f64 %p31, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31; - bra.uni BB5_52; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31; + bra.uni BB7_54; -BB5_26: +BB7_26: setp.eq.s32 %p15, %r13, 10; - @%p15 bra BB5_27; - bra.uni BB5_52; + @%p15 bra BB7_27; + bra.uni BB7_54; -BB5_27: +BB7_27: setp.neu.f64 %p26, %fd1, %fd2; - selp.f64 %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB5_52; + selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26; + bra.uni BB7_54; -BB5_29: +BB7_29: setp.ne.s32 %p13, %r13, 12; - @%p13 bra BB5_52; + @%p13 bra BB7_54; - max.f64 %fd38, %fd1, %fd2; - bra.uni BB5_52; + max.f64 %fd39, %fd1, %fd2; + bra.uni BB7_54; -BB5_38: +BB7_38: setp.gt.s32 %p35, %r8, -1; - 
@%p35 bra BB5_41; + @%p35 bra BB7_41; - cvt.rzi.f64.f64 %fd30, %fd2; - setp.neu.f64 %p36, %fd30, %fd2; - selp.f64 %fd36, 0dFFF8000000000000, %fd36, %p36; + cvt.rzi.f64.f64 %fd29, %fd2; + setp.neu.f64 %p36, %fd29, %fd2; + selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p36; -BB5_41: - mov.f64 %fd17, %fd36; +BB7_41: + mov.f64 %fd17, %fd37; add.f64 %fd18, %fd1, %fd2; { .reg .b32 %temp; @@ -560,60 +861,78 @@ BB5_41: } and.b32 %r31, %r30, 2146435072; setp.ne.s32 %p39, %r31, 2146435072; - mov.f64 %fd35, %fd17; - @%p39 bra BB5_48; + mov.f64 %fd36, %fd17; + @%p39 bra BB7_50; setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000; + mov.f64 %fd36, %fd18; + @%p40 bra BB7_50; + + abs.f64 %fd30, %fd2; + setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000; mov.f64 %fd35, %fd18; - @%p40 bra BB5_48; - - abs.f64 %fd19, %fd2; - setp.gtu.f64 %p41, %fd19, 0d7FF0000000000000; - mov.f64 %fd34, %fd18; - mov.f64 %fd35, %fd34; - @%p41 bra BB5_48; - - setp.eq.f64 %p42, %fd19, 0d7FF0000000000000; - @%p42 bra BB5_47; - bra.uni BB5_45; - -BB5_47: - setp.gt.f64 %p44, %fd11, 0d3FF0000000000000; - selp.b32 %r37, 2146435072, 0, %p44; - xor.b32 %r38, %r37, 2146435072; - setp.lt.s32 %p45, %r9, 0; - selp.b32 %r39, %r38, %r37, %p45; - setp.eq.f64 %p46, %fd1, 0dBFF0000000000000; - selp.b32 %r40, 1072693248, %r39, %p46; + mov.f64 %fd36, %fd35; + @%p41 bra BB7_50; + + and.b32 %r32, %r9, 2147483647; + setp.ne.s32 %p42, %r32, 2146435072; + @%p42 bra BB7_46; + + { + .reg .b32 %temp; + mov.b64 {%r33, %temp}, %fd2; + } + setp.eq.s32 %p43, %r33, 0; + @%p43 bra BB7_49; + +BB7_46: + and.b32 %r34, %r8, 2147483647; + setp.ne.s32 %p44, %r34, 2146435072; + mov.f64 %fd33, %fd17; + mov.f64 %fd36, %fd33; + @%p44 bra BB7_50; + + { + .reg .b32 %temp; + mov.b64 {%r35, %temp}, %fd1; + } + setp.ne.s32 %p45, %r35, 0; + mov.f64 %fd36, %fd17; + @%p45 bra BB7_50; + + shr.s32 %r36, %r9, 31; + and.b32 %r37, %r36, -2146435072; + add.s32 %r38, %r37, 2146435072; + or.b32 %r39, %r38, -2147483648; + selp.b32 %r40, %r39, %r38, %p1; mov.u32 %r41, 0; - mov.b64 %fd35, {%r41, %r40}; - bra.uni BB5_48; - -BB5_45: - setp.neu.f64 %p43, %fd11, 0d7FF0000000000000; - mov.f64 %fd35, %fd17; - @%p43 bra BB5_48; - - shr.s32 %r32, %r9, 31; - and.b32 %r33, %r32, -2146435072; - selp.b32 %r34, -1048576, 2146435072, %p1; - add.s32 %r35, %r34, %r33; - mov.u32 %r36, 0; - mov.b64 %fd35, {%r36, %r35}; - -BB5_48: - setp.eq.f64 %p47, %fd2, 0d0000000000000000; - setp.eq.f64 %p48, %fd1, 0d3FF0000000000000; - or.pred %p49, %p48, %p47; - selp.f64 %fd38, 0d3FF0000000000000, %fd35, %p49; - -BB5_52: + mov.b64 %fd36, {%r41, %r40}; + bra.uni BB7_50; + +BB7_49: + setp.gt.f64 %p46, %fd11, 0d3FF0000000000000; + selp.b32 %r42, 2146435072, 0, %p46; + xor.b32 %r43, %r42, 2146435072; + setp.lt.s32 %p47, %r9, 0; + selp.b32 %r44, %r43, %r42, %p47; + setp.eq.f64 %p48, %fd1, 0dBFF0000000000000; + selp.b32 %r45, 1072693248, %r44, %p48; + mov.u32 %r46, 0; + mov.b64 %fd36, {%r46, %r45}; + +BB7_50: + setp.eq.f64 %p49, %fd2, 0d0000000000000000; + setp.eq.f64 %p50, %fd1, 0d3FF0000000000000; + or.pred %p51, %p50, %p49; + selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p51; + +BB7_54: cvta.to.global.u64 %rd12, %rd4; mul.wide.s32 %rd13, %r3, 8; add.s64 %rd14, %rd12, %rd13; - st.global.f64 [%rd14], %fd38; + st.global.f64 [%rd14], %fd39; -BB5_53: +BB7_55: ret; } @@ -628,14 +947,14 @@ BB5_53: .param .u32 binCellScalarOp_param_6 ) { - .reg .pred %p<85>; - .reg .b32 %r<61>; - .reg .f64 %fd<75>; + .reg .pred %p<89>; + .reg .b32 %r<71>; + .reg .f64 %fd<77>; .reg .b64 %rd<12>; ld.param.u64 %rd4, [binCellScalarOp_param_0]; - ld.param.f64 %fd54, 
[binCellScalarOp_param_1]; + ld.param.f64 %fd52, [binCellScalarOp_param_1]; ld.param.u64 %rd5, [binCellScalarOp_param_2]; ld.param.u32 %r8, [binCellScalarOp_param_3]; ld.param.u32 %r9, [binCellScalarOp_param_4]; @@ -652,7 +971,7 @@ BB5_53: mad.lo.s32 %r1, %r14, %r15, %r17; mul.lo.s32 %r18, %r9, %r8; setp.ge.s32 %p3, %r1, %r18; - @%p3 bra BB6_88; + @%p3 bra BB8_92; cvta.to.global.u64 %rd6, %rd5; cvta.to.global.u64 %rd7, %rd4; @@ -661,181 +980,181 @@ BB5_53: ld.global.f64 %fd1, [%rd9]; add.s64 %rd1, %rd6, %rd8; setp.eq.s32 %p4, %r7, 0; - @%p4 bra BB6_45; + @%p4 bra BB8_47; setp.eq.s32 %p5, %r6, 0; - @%p5 bra BB6_43; + @%p5 bra BB8_45; - mov.f64 %fd66, 0dC08F380000000000; + mov.f64 %fd67, 0dC08F380000000000; setp.gt.s32 %p6, %r6, 6; - @%p6 bra BB6_13; + @%p6 bra BB8_13; setp.gt.s32 %p14, %r6, 3; - @%p14 bra BB6_9; + @%p14 bra BB8_9; setp.eq.s32 %p18, %r6, 1; - @%p18 bra BB6_42; + @%p18 bra BB8_44; setp.eq.s32 %p19, %r6, 2; - @%p19 bra BB6_41; - bra.uni BB6_7; + @%p19 bra BB8_43; + bra.uni BB8_7; -BB6_41: - mul.f64 %fd66, %fd1, %fd54; - bra.uni BB6_44; +BB8_43: + mul.f64 %fd67, %fd1, %fd52; + bra.uni BB8_46; -BB6_45: - setp.eq.s32 %p45, %r6, 0; - @%p45 bra BB6_86; +BB8_47: + setp.eq.s32 %p47, %r6, 0; + @%p47 bra BB8_90; - mov.f64 %fd74, 0dC08F380000000000; - setp.gt.s32 %p46, %r6, 6; - @%p46 bra BB6_56; + mov.f64 %fd76, 0dC08F380000000000; + setp.gt.s32 %p48, %r6, 6; + @%p48 bra BB8_58; - setp.gt.s32 %p54, %r6, 3; - @%p54 bra BB6_52; + setp.gt.s32 %p56, %r6, 3; + @%p56 bra BB8_54; - setp.eq.s32 %p58, %r6, 1; - @%p58 bra BB6_85; + setp.eq.s32 %p60, %r6, 1; + @%p60 bra BB8_89; - setp.eq.s32 %p59, %r6, 2; - @%p59 bra BB6_84; - bra.uni BB6_50; + setp.eq.s32 %p61, %r6, 2; + @%p61 bra BB8_88; + bra.uni BB8_52; -BB6_84: - mul.f64 %fd74, %fd1, %fd54; - bra.uni BB6_87; +BB8_88: + mul.f64 %fd76, %fd1, %fd52; + bra.uni BB8_91; -BB6_43: - add.f64 %fd66, %fd1, %fd54; +BB8_45: + add.f64 %fd67, %fd1, %fd52; -BB6_44: - st.global.f64 [%rd1], %fd66; - bra.uni BB6_88; +BB8_46: + st.global.f64 [%rd1], %fd67; + bra.uni BB8_92; -BB6_13: +BB8_13: setp.gt.s32 %p7, %r6, 9; - @%p7 bra BB6_18; + @%p7 bra BB8_18; setp.eq.s32 %p11, %r6, 7; - @%p11 bra BB6_25; + @%p11 bra BB8_25; setp.eq.s32 %p12, %r6, 8; - @%p12 bra BB6_24; - bra.uni BB6_16; + @%p12 bra BB8_24; + bra.uni BB8_16; -BB6_24: - setp.le.f64 %p23, %fd1, %fd54; - selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23; - bra.uni BB6_44; +BB8_24: + setp.le.f64 %p23, %fd1, %fd52; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23; + bra.uni BB8_46; -BB6_86: - add.f64 %fd74, %fd1, %fd54; +BB8_90: + add.f64 %fd76, %fd1, %fd52; -BB6_87: - st.global.f64 [%rd1], %fd74; +BB8_91: + st.global.f64 [%rd1], %fd76; -BB6_88: +BB8_92: ret; -BB6_56: - setp.gt.s32 %p47, %r6, 9; - @%p47 bra BB6_61; +BB8_58: + setp.gt.s32 %p49, %r6, 9; + @%p49 bra BB8_63; - setp.eq.s32 %p51, %r6, 7; - @%p51 bra BB6_68; + setp.eq.s32 %p53, %r6, 7; + @%p53 bra BB8_70; - setp.eq.s32 %p52, %r6, 8; - @%p52 bra BB6_67; - bra.uni BB6_59; + setp.eq.s32 %p54, %r6, 8; + @%p54 bra BB8_69; + bra.uni BB8_61; -BB6_67: - setp.ge.f64 %p63, %fd1, %fd54; - selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p63; - bra.uni BB6_87; +BB8_69: + setp.ge.f64 %p65, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65; + bra.uni BB8_91; -BB6_9: +BB8_9: setp.eq.s32 %p15, %r6, 4; - @%p15 bra BB6_27; + @%p15 bra BB8_27; setp.eq.s32 %p16, %r6, 5; - @%p16 bra BB6_26; - bra.uni BB6_11; + @%p16 bra BB8_26; + bra.uni BB8_11; -BB6_26: - setp.gt.f64 %p26, %fd1, %fd54; - selp.f64 %fd66, 
0d3FF0000000000000, 0d0000000000000000, %p26; - bra.uni BB6_44; +BB8_26: + setp.gt.f64 %p26, %fd1, %fd52; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26; + bra.uni BB8_46; -BB6_18: +BB8_18: setp.eq.s32 %p8, %r6, 10; - @%p8 bra BB6_23; + @%p8 bra BB8_23; setp.eq.s32 %p9, %r6, 11; - @%p9 bra BB6_22; - bra.uni BB6_20; + @%p9 bra BB8_22; + bra.uni BB8_20; -BB6_22: - min.f64 %fd66, %fd54, %fd1; - bra.uni BB6_44; +BB8_22: + min.f64 %fd67, %fd52, %fd1; + bra.uni BB8_46; -BB6_52: - setp.eq.s32 %p55, %r6, 4; - @%p55 bra BB6_70; +BB8_54: + setp.eq.s32 %p57, %r6, 4; + @%p57 bra BB8_72; - setp.eq.s32 %p56, %r6, 5; - @%p56 bra BB6_69; - bra.uni BB6_54; + setp.eq.s32 %p58, %r6, 5; + @%p58 bra BB8_71; + bra.uni BB8_56; -BB6_69: - setp.lt.f64 %p66, %fd1, %fd54; - selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p66; - bra.uni BB6_87; +BB8_71: + setp.lt.f64 %p68, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68; + bra.uni BB8_91; -BB6_61: - setp.eq.s32 %p48, %r6, 10; - @%p48 bra BB6_66; +BB8_63: + setp.eq.s32 %p50, %r6, 10; + @%p50 bra BB8_68; - setp.eq.s32 %p49, %r6, 11; - @%p49 bra BB6_65; - bra.uni BB6_63; + setp.eq.s32 %p51, %r6, 11; + @%p51 bra BB8_67; + bra.uni BB8_65; -BB6_65: - min.f64 %fd74, %fd1, %fd54; - bra.uni BB6_87; +BB8_67: + min.f64 %fd76, %fd1, %fd52; + bra.uni BB8_91; -BB6_42: - sub.f64 %fd66, %fd54, %fd1; - bra.uni BB6_44; +BB8_44: + sub.f64 %fd67, %fd52, %fd1; + bra.uni BB8_46; -BB6_7: +BB8_7: setp.eq.s32 %p20, %r6, 3; - @%p20 bra BB6_8; - bra.uni BB6_44; + @%p20 bra BB8_8; + bra.uni BB8_46; -BB6_8: - div.rn.f64 %fd66, %fd54, %fd1; - bra.uni BB6_44; +BB8_8: + div.rn.f64 %fd67, %fd52, %fd1; + bra.uni BB8_46; -BB6_25: - setp.lt.f64 %p24, %fd1, %fd54; - selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24; - bra.uni BB6_44; +BB8_25: + setp.lt.f64 %p24, %fd1, %fd52; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24; + bra.uni BB8_46; -BB6_16: +BB8_16: setp.eq.s32 %p13, %r6, 9; - @%p13 bra BB6_17; - bra.uni BB6_44; + @%p13 bra BB8_17; + bra.uni BB8_46; -BB6_17: - setp.eq.f64 %p22, %fd1, %fd54; - selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22; - bra.uni BB6_44; +BB8_17: + setp.eq.f64 %p22, %fd1, %fd52; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22; + bra.uni BB8_46; -BB6_27: +BB8_27: { .reg .b32 %temp; - mov.b64 {%temp, %r2}, %fd54; + mov.b64 {%temp, %r2}, %fd52; } { .reg .b32 %temp; @@ -846,8 +1165,8 @@ BB6_27: mov.b64 %rd10, %fd1; shl.b64 %rd2, %rd10, %r20; setp.eq.s64 %p27, %rd2, -9223372036854775808; - abs.f64 %fd10, %fd54; - // Callseq Start 1 + abs.f64 %fd10, %fd52; + // Callseq Start 2 { .reg .b32 temp_param_reg; // <end>} @@ -862,115 +1181,115 @@ BB6_27: param0, param1 ); - ld.param.f64 %fd65, [retval0+0]; + ld.param.f64 %fd66, [retval0+0]; //{ - }// Callseq End 1 + }// Callseq End 2 setp.lt.s32 %p28, %r2, 0; and.pred %p1, %p28, %p27; - @!%p1 bra BB6_29; - bra.uni BB6_28; + @!%p1 bra BB8_29; + bra.uni BB8_28; -BB6_28: +BB8_28: { .reg .b32 %temp; - mov.b64 {%temp, %r21}, %fd65; + mov.b64 {%temp, %r21}, %fd66; } xor.b32 %r22, %r21, -2147483648; { .reg .b32 %temp; - mov.b64 {%r23, %temp}, %fd65; + mov.b64 {%r23, %temp}, %fd66; } - mov.b64 %fd65, {%r23, %r22}; + mov.b64 %fd66, {%r23, %r22}; -BB6_29: - mov.f64 %fd64, %fd65; - setp.eq.f64 %p29, %fd54, 0d0000000000000000; - @%p29 bra BB6_32; - bra.uni BB6_30; +BB8_29: + mov.f64 %fd65, %fd66; + setp.eq.f64 %p29, %fd52, 0d0000000000000000; + @%p29 bra BB8_32; + bra.uni BB8_30; -BB6_32: +BB8_32: selp.b32 %r24, %r2, 0, %p27; or.b32 %r25, %r24, 
2146435072; setp.lt.s32 %p33, %r3, 0; selp.b32 %r26, %r25, %r24, %p33; mov.u32 %r27, 0; - mov.b64 %fd64, {%r27, %r26}; - bra.uni BB6_33; + mov.b64 %fd65, {%r27, %r26}; + bra.uni BB8_33; -BB6_11: +BB8_11: setp.eq.s32 %p17, %r6, 6; - @%p17 bra BB6_12; - bra.uni BB6_44; + @%p17 bra BB8_12; + bra.uni BB8_46; -BB6_12: - setp.ge.f64 %p25, %fd1, %fd54; - selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25; - bra.uni BB6_44; +BB8_12: + setp.ge.f64 %p25, %fd1, %fd52; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25; + bra.uni BB8_46; -BB6_23: - setp.neu.f64 %p21, %fd1, %fd54; - selp.f64 %fd66, 0d3FF0000000000000, 0d0000000000000000, %p21; - bra.uni BB6_44; +BB8_23: + setp.neu.f64 %p21, %fd1, %fd52; + selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p21; + bra.uni BB8_46; -BB6_20: +BB8_20: setp.ne.s32 %p10, %r6, 12; - @%p10 bra BB6_44; + @%p10 bra BB8_46; - max.f64 %fd66, %fd54, %fd1; - bra.uni BB6_44; + max.f64 %fd67, %fd52, %fd1; + bra.uni BB8_46; -BB6_85: - sub.f64 %fd74, %fd1, %fd54; - bra.uni BB6_87; +BB8_89: + sub.f64 %fd76, %fd1, %fd52; + bra.uni BB8_91; -BB6_50: - setp.eq.s32 %p60, %r6, 3; - @%p60 bra BB6_51; - bra.uni BB6_87; +BB8_52: + setp.eq.s32 %p62, %r6, 3; + @%p62 bra BB8_53; + bra.uni BB8_91; -BB6_51: - div.rn.f64 %fd74, %fd1, %fd54; - bra.uni BB6_87; +BB8_53: + div.rn.f64 %fd76, %fd1, %fd52; + bra.uni BB8_91; -BB6_68: - setp.gt.f64 %p64, %fd1, %fd54; - selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p64; - bra.uni BB6_87; +BB8_70: + setp.gt.f64 %p66, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66; + bra.uni BB8_91; -BB6_59: - setp.eq.s32 %p53, %r6, 9; - @%p53 bra BB6_60; - bra.uni BB6_87; +BB8_61: + setp.eq.s32 %p55, %r6, 9; + @%p55 bra BB8_62; + bra.uni BB8_91; -BB6_60: - setp.eq.f64 %p62, %fd1, %fd54; - selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p62; - bra.uni BB6_87; +BB8_62: + setp.eq.f64 %p64, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p64; + bra.uni BB8_91; -BB6_70: +BB8_72: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; } { .reg .b32 %temp; - mov.b64 {%temp, %r5}, %fd54; + mov.b64 {%temp, %r5}, %fd52; } - bfe.u32 %r40, %r5, 20, 11; - add.s32 %r41, %r40, -1012; - mov.b64 %rd11, %fd54; - shl.b64 %rd3, %rd11, %r41; - setp.eq.s64 %p67, %rd3, -9223372036854775808; - abs.f64 %fd36, %fd1; - // Callseq Start 2 + bfe.u32 %r45, %r5, 20, 11; + add.s32 %r46, %r45, -1012; + mov.b64 %rd11, %fd52; + shl.b64 %rd3, %rd11, %r46; + setp.eq.s64 %p69, %rd3, -9223372036854775808; + abs.f64 %fd35, %fd1; + // Callseq Start 3 { .reg .b32 temp_param_reg; // <end>} .param .b64 param0; - st.param.f64 [param0+0], %fd36; + st.param.f64 [param0+0], %fd35; .param .b64 param1; - st.param.f64 [param1+0], %fd54; + st.param.f64 [param1+0], %fd52; .param .b64 retval0; call.uni (retval0), __internal_accurate_pow, @@ -978,193 +1297,229 @@ BB6_70: param0, param1 ); - ld.param.f64 %fd73, [retval0+0]; + ld.param.f64 %fd75, [retval0+0]; //{ - }// Callseq End 2 - setp.lt.s32 %p68, %r4, 0; - and.pred %p2, %p68, %p67; - @!%p2 bra BB6_72; - bra.uni BB6_71; + }// Callseq End 3 + setp.lt.s32 %p70, %r4, 0; + and.pred %p2, %p70, %p69; + @!%p2 bra BB8_74; + bra.uni BB8_73; -BB6_71: +BB8_73: { .reg .b32 %temp; - mov.b64 {%temp, %r42}, %fd73; + mov.b64 {%temp, %r47}, %fd75; } - xor.b32 %r43, %r42, -2147483648; + xor.b32 %r48, %r47, -2147483648; { .reg .b32 %temp; - mov.b64 {%r44, %temp}, %fd73; + mov.b64 {%r49, %temp}, %fd75; } - mov.b64 %fd73, {%r44, %r43}; - -BB6_72: - mov.f64 %fd72, %fd73; - setp.eq.f64 %p69, %fd1, 
0d0000000000000000; - @%p69 bra BB6_75; - bra.uni BB6_73; - -BB6_75: - selp.b32 %r45, %r4, 0, %p67; - or.b32 %r46, %r45, 2146435072; - setp.lt.s32 %p73, %r5, 0; - selp.b32 %r47, %r46, %r45, %p73; - mov.u32 %r48, 0; - mov.b64 %fd72, {%r48, %r47}; - bra.uni BB6_76; - -BB6_54: - setp.eq.s32 %p57, %r6, 6; - @%p57 bra BB6_55; - bra.uni BB6_87; - -BB6_55: - setp.le.f64 %p65, %fd1, %fd54; - selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p65; - bra.uni BB6_87; - -BB6_66: - setp.neu.f64 %p61, %fd1, %fd54; - selp.f64 %fd74, 0d3FF0000000000000, 0d0000000000000000, %p61; - bra.uni BB6_87; - -BB6_63: - setp.ne.s32 %p50, %r6, 12; - @%p50 bra BB6_87; - - max.f64 %fd74, %fd1, %fd54; - bra.uni BB6_87; - -BB6_30: + mov.b64 %fd75, {%r49, %r48}; + +BB8_74: + mov.f64 %fd74, %fd75; + setp.eq.f64 %p71, %fd1, 0d0000000000000000; + @%p71 bra BB8_77; + bra.uni BB8_75; + +BB8_77: + selp.b32 %r50, %r4, 0, %p69; + or.b32 %r51, %r50, 2146435072; + setp.lt.s32 %p75, %r5, 0; + selp.b32 %r52, %r51, %r50, %p75; + mov.u32 %r53, 0; + mov.b64 %fd74, {%r53, %r52}; + bra.uni BB8_78; + +BB8_56: + setp.eq.s32 %p59, %r6, 6; + @%p59 bra BB8_57; + bra.uni BB8_91; + +BB8_57: + setp.le.f64 %p67, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67; + bra.uni BB8_91; + +BB8_68: + setp.neu.f64 %p63, %fd1, %fd52; + selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p63; + bra.uni BB8_91; + +BB8_65: + setp.ne.s32 %p52, %r6, 12; + @%p52 bra BB8_91; + + max.f64 %fd76, %fd1, %fd52; + bra.uni BB8_91; + +BB8_30: setp.gt.s32 %p30, %r2, -1; - @%p30 bra BB6_33; + @%p30 bra BB8_33; - cvt.rzi.f64.f64 %fd56, %fd1; - setp.neu.f64 %p31, %fd56, %fd1; - selp.f64 %fd64, 0dFFF8000000000000, %fd64, %p31; + cvt.rzi.f64.f64 %fd54, %fd1; + setp.neu.f64 %p31, %fd54, %fd1; + selp.f64 %fd65, 0dFFF8000000000000, %fd65, %p31; -BB6_33: - mov.f64 %fd16, %fd64; - add.f64 %fd17, %fd1, %fd54; +BB8_33: + mov.f64 %fd16, %fd65; + add.f64 %fd17, %fd1, %fd52; { .reg .b32 %temp; mov.b64 {%temp, %r28}, %fd17; } and.b32 %r29, %r28, 2146435072; setp.ne.s32 %p34, %r29, 2146435072; - mov.f64 %fd63, %fd16; - @%p34 bra BB6_40; + mov.f64 %fd64, %fd16; + @%p34 bra BB8_42; setp.gtu.f64 %p35, %fd10, 0d7FF0000000000000; + mov.f64 %fd64, %fd17; + @%p35 bra BB8_42; + + abs.f64 %fd55, %fd1; + setp.gtu.f64 %p36, %fd55, 0d7FF0000000000000; mov.f64 %fd63, %fd17; - @%p35 bra BB6_40; - - abs.f64 %fd18, %fd1; - setp.gtu.f64 %p36, %fd18, 0d7FF0000000000000; - mov.f64 %fd62, %fd17; - mov.f64 %fd63, %fd62; - @%p36 bra BB6_40; - - setp.eq.f64 %p37, %fd18, 0d7FF0000000000000; - @%p37 bra BB6_39; - bra.uni BB6_37; - -BB6_39: - setp.gt.f64 %p39, %fd10, 0d3FF0000000000000; - selp.b32 %r35, 2146435072, 0, %p39; - xor.b32 %r36, %r35, 2146435072; - setp.lt.s32 %p40, %r3, 0; - selp.b32 %r37, %r36, %r35, %p40; - setp.eq.f64 %p41, %fd54, 0dBFF0000000000000; - selp.b32 %r38, 1072693248, %r37, %p41; + mov.f64 %fd64, %fd63; + @%p36 bra BB8_42; + + and.b32 %r30, %r3, 2147483647; + setp.ne.s32 %p37, %r30, 2146435072; + @%p37 bra BB8_38; + + { + .reg .b32 %temp; + mov.b64 {%r31, %temp}, %fd1; + } + setp.eq.s32 %p38, %r31, 0; + @%p38 bra BB8_41; + +BB8_38: + and.b32 %r32, %r2, 2147483647; + setp.ne.s32 %p39, %r32, 2146435072; + mov.f64 %fd61, %fd16; + mov.f64 %fd64, %fd61; + @%p39 bra BB8_42; + + { + .reg .b32 %temp; + mov.b64 {%r33, %temp}, %fd52; + } + setp.ne.s32 %p40, %r33, 0; + mov.f64 %fd64, %fd16; + @%p40 bra BB8_42; + + shr.s32 %r34, %r3, 31; + and.b32 %r35, %r34, -2146435072; + add.s32 %r36, %r35, 2146435072; + or.b32 %r37, %r36, -2147483648; + selp.b32 %r38, %r37, 
%r36, %p1; mov.u32 %r39, 0; - mov.b64 %fd63, {%r39, %r38}; - bra.uni BB6_40; + mov.b64 %fd64, {%r39, %r38}; + bra.uni BB8_42; -BB6_73: - setp.gt.s32 %p70, %r4, -1; - @%p70 bra BB6_76; +BB8_75: + setp.gt.s32 %p72, %r4, -1; + @%p72 bra BB8_78; - cvt.rzi.f64.f64 %fd58, %fd54; - setp.neu.f64 %p71, %fd58, %fd54; - selp.f64 %fd72, 0dFFF8000000000000, %fd72, %p71; + cvt.rzi.f64.f64 %fd57, %fd52; + setp.neu.f64 %p73, %fd57, %fd52; + selp.f64 %fd74, 0dFFF8000000000000, %fd74, %p73; -BB6_76: - mov.f64 %fd42, %fd72; - add.f64 %fd43, %fd1, %fd54; +BB8_78: + mov.f64 %fd41, %fd74; + add.f64 %fd42, %fd1, %fd52; { .reg .b32 %temp; - mov.b64 {%temp, %r49}, %fd43; + mov.b64 {%temp, %r54}, %fd42; } - and.b32 %r50, %r49, 2146435072; - setp.ne.s32 %p74, %r50, 2146435072; - mov.f64 %fd71, %fd42; - @%p74 bra BB6_83; - - setp.gtu.f64 %p75, %fd36, 0d7FF0000000000000; - mov.f64 %fd71, %fd43; - @%p75 bra BB6_83; - - abs.f64 %fd44, %fd54; - setp.gtu.f64 %p76, %fd44, 0d7FF0000000000000; - mov.f64 %fd70, %fd43; - mov.f64 %fd71, %fd70; - @%p76 bra BB6_83; - - setp.eq.f64 %p77, %fd44, 0d7FF0000000000000; - @%p77 bra BB6_82; - bra.uni BB6_80; - -BB6_82: - setp.gt.f64 %p79, %fd36, 0d3FF0000000000000; - selp.b32 %r56, 2146435072, 0, %p79; - xor.b32 %r57, %r56, 2146435072; - setp.lt.s32 %p80, %r5, 0; - selp.b32 %r58, %r57, %r56, %p80; - setp.eq.f64 %p81, %fd1, 0dBFF0000000000000; - selp.b32 %r59, 1072693248, %r58, %p81; - mov.u32 %r60, 0; - mov.b64 %fd71, {%r60, %r59}; - bra.uni BB6_83; - -BB6_37: - setp.neu.f64 %p38, %fd10, 0d7FF0000000000000; - mov.f64 %fd63, %fd16; - @%p38 bra BB6_40; - - shr.s32 %r30, %r3, 31; - and.b32 %r31, %r30, -2146435072; - selp.b32 %r32, -1048576, 2146435072, %p1; - add.s32 %r33, %r32, %r31; - mov.u32 %r34, 0; - mov.b64 %fd63, {%r34, %r33}; - -BB6_40: - setp.eq.f64 %p42, %fd1, 0d0000000000000000; - setp.eq.f64 %p43, %fd54, 0d3FF0000000000000; - or.pred %p44, %p43, %p42; - selp.f64 %fd66, 0d3FF0000000000000, %fd63, %p44; - bra.uni BB6_44; - -BB6_80: - setp.neu.f64 %p78, %fd36, 0d7FF0000000000000; - mov.f64 %fd71, %fd42; - @%p78 bra BB6_83; - - shr.s32 %r51, %r5, 31; - and.b32 %r52, %r51, -2146435072; - selp.b32 %r53, -1048576, 2146435072, %p2; - add.s32 %r54, %r53, %r52; - mov.u32 %r55, 0; - mov.b64 %fd71, {%r55, %r54}; - -BB6_83: - setp.eq.f64 %p82, %fd54, 0d0000000000000000; - setp.eq.f64 %p83, %fd1, 0d3FF0000000000000; - or.pred %p84, %p83, %p82; - selp.f64 %fd74, 0d3FF0000000000000, %fd71, %p84; - bra.uni BB6_87; + and.b32 %r55, %r54, 2146435072; + setp.ne.s32 %p76, %r55, 2146435072; + mov.f64 %fd73, %fd41; + @%p76 bra BB8_87; + + setp.gtu.f64 %p77, %fd35, 0d7FF0000000000000; + mov.f64 %fd73, %fd42; + @%p77 bra BB8_87; + + abs.f64 %fd58, %fd52; + setp.gtu.f64 %p78, %fd58, 0d7FF0000000000000; + mov.f64 %fd72, %fd42; + mov.f64 %fd73, %fd72; + @%p78 bra BB8_87; + + and.b32 %r56, %r5, 2147483647; + setp.ne.s32 %p79, %r56, 2146435072; + @%p79 bra BB8_83; + + { + .reg .b32 %temp; + mov.b64 {%r57, %temp}, %fd52; + } + setp.eq.s32 %p80, %r57, 0; + @%p80 bra BB8_86; + +BB8_83: + and.b32 %r58, %r4, 2147483647; + setp.ne.s32 %p81, %r58, 2146435072; + mov.f64 %fd70, %fd41; + mov.f64 %fd73, %fd70; + @%p81 bra BB8_87; + + { + .reg .b32 %temp; + mov.b64 {%r59, %temp}, %fd1; + } + setp.ne.s32 %p82, %r59, 0; + mov.f64 %fd73, %fd41; + @%p82 bra BB8_87; + + shr.s32 %r60, %r5, 31; + and.b32 %r61, %r60, -2146435072; + add.s32 %r62, %r61, 2146435072; + or.b32 %r63, %r62, -2147483648; + selp.b32 %r64, %r63, %r62, %p2; + mov.u32 %r65, 0; + mov.b64 %fd73, {%r65, %r64}; + bra.uni BB8_87; + +BB8_41: + setp.gt.f64 %p41, 
%fd10, 0d3FF0000000000000; + selp.b32 %r40, 2146435072, 0, %p41; + xor.b32 %r41, %r40, 2146435072; + setp.lt.s32 %p42, %r3, 0; + selp.b32 %r42, %r41, %r40, %p42; + setp.eq.f64 %p43, %fd52, 0dBFF0000000000000; + selp.b32 %r43, 1072693248, %r42, %p43; + mov.u32 %r44, 0; + mov.b64 %fd64, {%r44, %r43}; + +BB8_42: + setp.eq.f64 %p44, %fd1, 0d0000000000000000; + setp.eq.f64 %p45, %fd52, 0d3FF0000000000000; + or.pred %p46, %p45, %p44; + selp.f64 %fd67, 0d3FF0000000000000, %fd64, %p46; + bra.uni BB8_46; + +BB8_86: + setp.gt.f64 %p83, %fd35, 0d3FF0000000000000; + selp.b32 %r66, 2146435072, 0, %p83; + xor.b32 %r67, %r66, 2146435072; + setp.lt.s32 %p84, %r5, 0; + selp.b32 %r68, %r67, %r66, %p84; + setp.eq.f64 %p85, %fd1, 0dBFF0000000000000; + selp.b32 %r69, 1072693248, %r68, %p85; + mov.u32 %r70, 0; + mov.b64 %fd73, {%r70, %r69}; + +BB8_87: + setp.eq.f64 %p86, %fd52, 0d0000000000000000; + setp.eq.f64 %p87, %fd1, 0d3FF0000000000000; + or.pred %p88, %p87, %p86; + selp.f64 %fd76, 0d3FF0000000000000, %fd73, %p88; + bra.uni BB8_91; } // .globl fill @@ -1188,14 +1543,14 @@ BB6_83: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.s32 %p1, %r1, %r2; - @%p1 bra BB7_2; + @%p1 bra BB9_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 8; add.s64 %rd4, %rd2, %rd3; st.global.f64 [%rd4], %fd1; -BB7_2: +BB9_2: ret; } @@ -1223,9 +1578,9 @@ BB7_2: mov.f64 %fd67, 0d0000000000000000; mov.f64 %fd68, %fd67; setp.ge.u32 %p1, %r30, %r5; - @%p1 bra BB8_4; + @%p1 bra BB10_4; -BB8_1: +BB10_1: mov.f64 %fd1, %fd68; cvta.to.global.u64 %rd4, %rd2; mul.wide.u32 %rd5, %r30, 8; @@ -1234,23 +1589,23 @@ BB8_1: add.f64 %fd69, %fd1, %fd27; add.s32 %r3, %r30, %r9; setp.ge.u32 %p2, %r3, %r5; - @%p2 bra BB8_3; + @%p2 bra BB10_3; mul.wide.u32 %rd8, %r3, 8; add.s64 %rd9, %rd4, %rd8; ld.global.f64 %fd28, [%rd9]; add.f64 %fd69, %fd69, %fd28; -BB8_3: +BB10_3: mov.f64 %fd68, %fd69; shl.b32 %r12, %r9, 1; mov.u32 %r13, %nctaid.x; mad.lo.s32 %r30, %r12, %r13, %r30; setp.lt.u32 %p3, %r30, %r5; mov.f64 %fd67, %fd68; - @%p3 bra BB8_1; + @%p3 bra BB10_1; -BB8_4: +BB10_4: mov.f64 %fd65, %fd67; mul.wide.u32 %rd10, %r6, 8; mov.u64 %rd11, sdata; @@ -1258,113 +1613,113 @@ BB8_4: st.shared.f64 [%rd1], %fd65; bar.sync 0; setp.lt.u32 %p4, %r9, 512; - @%p4 bra BB8_8; + @%p4 bra BB10_8; setp.gt.u32 %p5, %r6, 255; mov.f64 %fd66, %fd65; - @%p5 bra BB8_7; + @%p5 bra BB10_7; ld.shared.f64 %fd29, [%rd1+2048]; add.f64 %fd66, %fd65, %fd29; st.shared.f64 [%rd1], %fd66; -BB8_7: +BB10_7: mov.f64 %fd65, %fd66; bar.sync 0; -BB8_8: +BB10_8: mov.f64 %fd63, %fd65; setp.lt.u32 %p6, %r9, 256; - @%p6 bra BB8_12; + @%p6 bra BB10_12; setp.gt.u32 %p7, %r6, 127; mov.f64 %fd64, %fd63; - @%p7 bra BB8_11; + @%p7 bra BB10_11; ld.shared.f64 %fd30, [%rd1+1024]; add.f64 %fd64, %fd63, %fd30; st.shared.f64 [%rd1], %fd64; -BB8_11: +BB10_11: mov.f64 %fd63, %fd64; bar.sync 0; -BB8_12: +BB10_12: mov.f64 %fd61, %fd63; setp.lt.u32 %p8, %r9, 128; - @%p8 bra BB8_16; + @%p8 bra BB10_16; setp.gt.u32 %p9, %r6, 63; mov.f64 %fd62, %fd61; - @%p9 bra BB8_15; + @%p9 bra BB10_15; ld.shared.f64 %fd31, [%rd1+512]; add.f64 %fd62, %fd61, %fd31; st.shared.f64 [%rd1], %fd62; -BB8_15: +BB10_15: mov.f64 %fd61, %fd62; bar.sync 0; -BB8_16: +BB10_16: mov.f64 %fd60, %fd61; setp.gt.u32 %p10, %r6, 31; - @%p10 bra BB8_29; + @%p10 bra BB10_29; setp.lt.u32 %p11, %r9, 64; - @%p11 bra BB8_19; + @%p11 bra BB10_19; ld.volatile.shared.f64 %fd32, [%rd1+256]; add.f64 %fd60, %fd60, %fd32; st.volatile.shared.f64 [%rd1], %fd60; -BB8_19: +BB10_19: mov.f64 %fd59, %fd60; setp.lt.u32 %p12, %r9, 32; - @%p12 bra BB8_21; + 
@%p12 bra BB10_21; ld.volatile.shared.f64 %fd33, [%rd1+128]; add.f64 %fd59, %fd59, %fd33; st.volatile.shared.f64 [%rd1], %fd59; -BB8_21: +BB10_21: mov.f64 %fd58, %fd59; setp.lt.u32 %p13, %r9, 16; - @%p13 bra BB8_23; + @%p13 bra BB10_23; ld.volatile.shared.f64 %fd34, [%rd1+64]; add.f64 %fd58, %fd58, %fd34; st.volatile.shared.f64 [%rd1], %fd58; -BB8_23: +BB10_23: mov.f64 %fd57, %fd58; setp.lt.u32 %p14, %r9, 8; - @%p14 bra BB8_25; + @%p14 bra BB10_25; ld.volatile.shared.f64 %fd35, [%rd1+32]; add.f64 %fd57, %fd57, %fd35; st.volatile.shared.f64 [%rd1], %fd57; -BB8_25: +BB10_25: mov.f64 %fd56, %fd57; setp.lt.u32 %p15, %r9, 4; - @%p15 bra BB8_27; + @%p15 bra BB10_27; ld.volatile.shared.f64 %fd36, [%rd1+16]; add.f64 %fd56, %fd56, %fd36; st.volatile.shared.f64 [%rd1], %fd56; -BB8_27: +BB10_27: setp.lt.u32 %p16, %r9, 2; - @%p16 bra BB8_29; + @%p16 bra BB10_29; ld.volatile.shared.f64 %fd37, [%rd1+8]; add.f64 %fd38, %fd56, %fd37; st.volatile.shared.f64 [%rd1], %fd38; -BB8_29: +BB10_29: setp.ne.s32 %p17, %r6, 0; - @%p17 bra BB8_31; + @%p17 bra BB10_31; ld.shared.f64 %fd39, [sdata]; cvta.to.global.u64 %rd12, %rd3; @@ -1372,7 +1727,177 @@ BB8_29: add.s64 %rd14, %rd12, %rd13; st.global.f64 [%rd14], %fd39; -BB8_31: +BB10_31: + ret; +} + + // .globl reduce_row +.visible .entry reduce_row( + .param .u64 reduce_row_param_0, + .param .u64 reduce_row_param_1, + .param .u32 reduce_row_param_2, + .param .u32 reduce_row_param_3 +) +{ + .reg .pred %p<18>; + .reg .b32 %r<36>; + .reg .f64 %fd<65>; + .reg .b64 %rd<39>; + + + ld.param.u64 %rd1, [reduce_row_param_0]; + ld.param.u64 %rd2, [reduce_row_param_1]; + ld.param.u32 %r5, [reduce_row_param_2]; + ld.param.u32 %r4, [reduce_row_param_3]; + mov.u32 %r6, %ctaid.x; + setp.ge.u32 %p1, %r6, %r5; + @%p1 bra BB11_31; + + mov.u32 %r35, %tid.x; + mov.f64 %fd63, 0d0000000000000000; + mov.f64 %fd64, %fd63; + setp.ge.u32 %p2, %r35, %r4; + @%p2 bra BB11_4; + + cvta.to.global.u64 %rd3, %rd1; + +BB11_3: + mad.lo.s32 %r8, %r6, %r4, %r35; + mul.wide.u32 %rd4, %r8, 8; + add.s64 %rd5, %rd3, %rd4; + ld.global.f64 %fd25, [%rd5]; + add.f64 %fd64, %fd64, %fd25; + mov.u32 %r9, %ntid.x; + add.s32 %r35, %r9, %r35; + setp.lt.u32 %p3, %r35, %r4; + mov.f64 %fd63, %fd64; + @%p3 bra BB11_3; + +BB11_4: + mov.f64 %fd61, %fd63; + mov.u32 %r10, %tid.x; + mul.wide.u32 %rd6, %r10, 8; + mov.u64 %rd7, sdata; + add.s64 %rd8, %rd7, %rd6; + st.shared.f64 [%rd8], %fd61; + bar.sync 0; + mov.u32 %r11, %ntid.x; + setp.lt.u32 %p4, %r11, 512; + @%p4 bra BB11_8; + + setp.gt.u32 %p5, %r10, 255; + mov.f64 %fd62, %fd61; + @%p5 bra BB11_7; + + ld.shared.f64 %fd26, [%rd8+2048]; + add.f64 %fd62, %fd61, %fd26; + st.shared.f64 [%rd8], %fd62; + +BB11_7: + mov.f64 %fd61, %fd62; + bar.sync 0; + +BB11_8: + mov.f64 %fd59, %fd61; + setp.lt.u32 %p6, %r11, 256; + @%p6 bra BB11_12; + + setp.gt.u32 %p7, %r10, 127; + mov.f64 %fd60, %fd59; + @%p7 bra BB11_11; + + ld.shared.f64 %fd27, [%rd8+1024]; + add.f64 %fd60, %fd59, %fd27; + st.shared.f64 [%rd8], %fd60; + +BB11_11: + mov.f64 %fd59, %fd60; + bar.sync 0; + +BB11_12: + mov.f64 %fd57, %fd59; + setp.lt.u32 %p8, %r11, 128; + @%p8 bra BB11_16; + + setp.gt.u32 %p9, %r10, 63; + mov.f64 %fd58, %fd57; + @%p9 bra BB11_15; + + ld.shared.f64 %fd28, [%rd8+512]; + add.f64 %fd58, %fd57, %fd28; + st.shared.f64 [%rd8], %fd58; + +BB11_15: + mov.f64 %fd57, %fd58; + bar.sync 0; + +BB11_16: + mov.f64 %fd56, %fd57; + setp.gt.u32 %p10, %r10, 31; + @%p10 bra BB11_29; + + setp.lt.u32 %p11, %r11, 64; + @%p11 bra BB11_19; + + ld.volatile.shared.f64 %fd29, [%rd8+256]; + add.f64 %fd56, %fd56, %fd29; + 
st.volatile.shared.f64 [%rd8], %fd56; + +BB11_19: + mov.f64 %fd55, %fd56; + setp.lt.u32 %p12, %r11, 32; + @%p12 bra BB11_21; + + ld.volatile.shared.f64 %fd30, [%rd8+128]; + add.f64 %fd55, %fd55, %fd30; + st.volatile.shared.f64 [%rd8], %fd55; + +BB11_21: + mov.f64 %fd54, %fd55; + setp.lt.u32 %p13, %r11, 16; + @%p13 bra BB11_23; + + ld.volatile.shared.f64 %fd31, [%rd8+64]; + add.f64 %fd54, %fd54, %fd31; + st.volatile.shared.f64 [%rd8], %fd54; + +BB11_23: + mov.f64 %fd53, %fd54; + setp.lt.u32 %p14, %r11, 8; + @%p14 bra BB11_25; + + ld.volatile.shared.f64 %fd32, [%rd8+32]; + add.f64 %fd53, %fd53, %fd32; + st.volatile.shared.f64 [%rd8], %fd53; + +BB11_25: + mov.f64 %fd52, %fd53; + setp.lt.u32 %p15, %r11, 4; + @%p15 bra BB11_27; + + ld.volatile.shared.f64 %fd33, [%rd8+16]; + add.f64 %fd52, %fd52, %fd33; + st.volatile.shared.f64 [%rd8], %fd52; + +BB11_27: + setp.lt.u32 %p16, %r11, 2; + @%p16 bra BB11_29; + + ld.volatile.shared.f64 %fd34, [%rd8+8]; + add.f64 %fd35, %fd52, %fd34; + st.volatile.shared.f64 [%rd8], %fd35; + +BB11_29: + setp.ne.s32 %p17, %r10, 0; + @%p17 bra BB11_31; + + ld.shared.f64 %fd36, [sdata]; + cvta.to.global.u64 %rd36, %rd2; + mul.wide.u32 %rd37, %r6, 8; + add.s64 %rd38, %rd36, %rd37; + st.global.f64 [%rd38], %fd36; + +BB11_31: ret; } @@ -1381,9 +1906,9 @@ BB8_31: .param .b64 __internal_accurate_pow_param_1 ) { - .reg .pred %p<8>; + .reg .pred %p<9>; .reg .f32 %f<3>; - .reg .b32 %r<49>; + .reg .b32 %r<52>; .reg .f64 %fd<135>; @@ -1391,35 +1916,35 @@ BB8_31: ld.param.f64 %fd13, [__internal_accurate_pow_param_1]; { .reg .b32 %temp; - mov.b64 {%temp, %r46}, %fd12; + mov.b64 {%temp, %r49}, %fd12; } { .reg .b32 %temp; - mov.b64 {%r45, %temp}, %fd12; + mov.b64 {%r48, %temp}, %fd12; } - shr.u32 %r47, %r46, 20; - setp.ne.s32 %p1, %r47, 0; - @%p1 bra BB9_2; + shr.u32 %r50, %r49, 20; + setp.ne.s32 %p1, %r50, 0; + @%p1 bra BB12_2; mul.f64 %fd14, %fd12, 0d4350000000000000; { .reg .b32 %temp; - mov.b64 {%temp, %r46}, %fd14; + mov.b64 {%temp, %r49}, %fd14; } { .reg .b32 %temp; - mov.b64 {%r45, %temp}, %fd14; + mov.b64 {%r48, %temp}, %fd14; } - shr.u32 %r16, %r46, 20; - add.s32 %r47, %r16, -54; + shr.u32 %r16, %r49, 20; + add.s32 %r50, %r16, -54; -BB9_2: - add.s32 %r48, %r47, -1023; - and.b32 %r17, %r46, -2146435073; +BB12_2: + add.s32 %r51, %r50, -1023; + and.b32 %r17, %r49, -2146435073; or.b32 %r18, %r17, 1072693248; - mov.b64 %fd133, {%r45, %r18}; + mov.b64 %fd133, {%r48, %r18}; setp.lt.u32 %p2, %r18, 1073127583; - @%p2 bra BB9_4; + @%p2 bra BB12_4; { .reg .b32 %temp; @@ -1431,9 +1956,9 @@ BB9_2: } add.s32 %r21, %r20, -1048576; mov.b64 %fd133, {%r19, %r21}; - add.s32 %r48, %r47, -1022; + add.s32 %r51, %r50, -1022; -BB9_4: +BB12_4: add.f64 %fd16, %fd133, 0d3FF0000000000000; // inline asm rcp.approx.ftz.f64 %fd15,%fd16; @@ -1509,7 +2034,7 @@ BB9_4: add.f64 %fd76, %fd71, %fd75; sub.f64 %fd77, %fd71, %fd76; add.f64 %fd78, %fd75, %fd77; - xor.b32 %r25, %r48, -2147483648; + xor.b32 %r25, %r51, -2147483648; mov.u32 %r26, 1127219200; mov.b64 %fd79, {%r25, %r26}; mov.u32 %r27, -2147483648; @@ -1546,47 +2071,48 @@ BB9_4: add.f64 %fd4, %fd94, %fd97; sub.f64 %fd98, %fd94, %fd4; add.f64 %fd5, %fd97, %fd98; - mov.f64 %fd99, 0d4338000000000000; - mov.f64 %fd100, 0d3FF71547652B82FE; - fma.rn.f64 %fd101, %fd4, %fd100, %fd99; + mov.f64 %fd99, 0d3FF71547652B82FE; + mul.rn.f64 %fd100, %fd4, %fd99; + mov.f64 %fd101, 0d4338000000000000; + add.rn.f64 %fd102, %fd100, %fd101; { .reg .b32 %temp; - mov.b64 {%r13, %temp}, %fd101; + mov.b64 {%r13, %temp}, %fd102; } - mov.f64 %fd102, 0dC338000000000000; - add.rn.f64 
%fd103, %fd101, %fd102; - mov.f64 %fd104, 0dBFE62E42FEFA39EF; - fma.rn.f64 %fd105, %fd103, %fd104, %fd4; - mov.f64 %fd106, 0dBC7ABC9E3B39803F; - fma.rn.f64 %fd107, %fd103, %fd106, %fd105; - mov.f64 %fd108, 0d3E928AF3FCA213EA; - mov.f64 %fd109, 0d3E5ADE1569CE2BDF; - fma.rn.f64 %fd110, %fd109, %fd107, %fd108; - mov.f64 %fd111, 0d3EC71DEE62401315; - fma.rn.f64 %fd112, %fd110, %fd107, %fd111; - mov.f64 %fd113, 0d3EFA01997C89EB71; - fma.rn.f64 %fd114, %fd112, %fd107, %fd113; - mov.f64 %fd115, 0d3F2A01A014761F65; - fma.rn.f64 %fd116, %fd114, %fd107, %fd115; - mov.f64 %fd117, 0d3F56C16C1852B7AF; - fma.rn.f64 %fd118, %fd116, %fd107, %fd117; - mov.f64 %fd119, 0d3F81111111122322; - fma.rn.f64 %fd120, %fd118, %fd107, %fd119; - mov.f64 %fd121, 0d3FA55555555502A1; - fma.rn.f64 %fd122, %fd120, %fd107, %fd121; - mov.f64 %fd123, 0d3FC5555555555511; - fma.rn.f64 %fd124, %fd122, %fd107, %fd123; - mov.f64 %fd125, 0d3FE000000000000B; - fma.rn.f64 %fd126, %fd124, %fd107, %fd125; - fma.rn.f64 %fd127, %fd126, %fd107, %fd18; - fma.rn.f64 %fd128, %fd127, %fd107, %fd18; + mov.f64 %fd103, 0dC338000000000000; + add.rn.f64 %fd104, %fd102, %fd103; + mov.f64 %fd105, 0dBFE62E42FEFA39EF; + fma.rn.f64 %fd106, %fd104, %fd105, %fd4; + mov.f64 %fd107, 0dBC7ABC9E3B39803F; + fma.rn.f64 %fd108, %fd104, %fd107, %fd106; + mov.f64 %fd109, 0d3E928AF3FCA213EA; + mov.f64 %fd110, 0d3E5ADE1569CE2BDF; + fma.rn.f64 %fd111, %fd110, %fd108, %fd109; + mov.f64 %fd112, 0d3EC71DEE62401315; + fma.rn.f64 %fd113, %fd111, %fd108, %fd112; + mov.f64 %fd114, 0d3EFA01997C89EB71; + fma.rn.f64 %fd115, %fd113, %fd108, %fd114; + mov.f64 %fd116, 0d3F2A01A014761F65; + fma.rn.f64 %fd117, %fd115, %fd108, %fd116; + mov.f64 %fd118, 0d3F56C16C1852B7AF; + fma.rn.f64 %fd119, %fd117, %fd108, %fd118; + mov.f64 %fd120, 0d3F81111111122322; + fma.rn.f64 %fd121, %fd119, %fd108, %fd120; + mov.f64 %fd122, 0d3FA55555555502A1; + fma.rn.f64 %fd123, %fd121, %fd108, %fd122; + mov.f64 %fd124, 0d3FC5555555555511; + fma.rn.f64 %fd125, %fd123, %fd108, %fd124; + mov.f64 %fd126, 0d3FE000000000000B; + fma.rn.f64 %fd127, %fd125, %fd108, %fd126; + fma.rn.f64 %fd128, %fd127, %fd108, %fd18; + fma.rn.f64 %fd129, %fd128, %fd108, %fd18; { .reg .b32 %temp; - mov.b64 {%r14, %temp}, %fd128; + mov.b64 {%r14, %temp}, %fd129; } { .reg .b32 %temp; - mov.b64 {%temp, %r15}, %fd128; + mov.b64 {%temp, %r15}, %fd129; } shl.b32 %r33, %r13, 20; add.s32 %r34, %r15, %r33; @@ -1598,35 +2124,47 @@ BB9_4: mov.b32 %f2, %r35; abs.f32 %f1, %f2; setp.lt.f32 %p4, %f1, 0f4086232B; - @%p4 bra BB9_7; + @%p4 bra BB12_7; setp.lt.f64 %p5, %fd4, 0d0000000000000000; - add.f64 %fd129, %fd4, 0d7FF0000000000000; - selp.f64 %fd134, 0d0000000000000000, %fd129, %p5; + add.f64 %fd130, %fd4, 0d7FF0000000000000; + selp.f64 %fd134, 0d0000000000000000, %fd130, %p5; setp.geu.f32 %p6, %f1, 0f40874800; - @%p6 bra BB9_7; + @%p6 bra BB12_7; shr.u32 %r36, %r13, 31; add.s32 %r37, %r13, %r36; shr.s32 %r38, %r37, 1; shl.b32 %r39, %r38, 20; add.s32 %r40, %r39, %r15; - mov.b64 %fd130, {%r14, %r40}; + mov.b64 %fd131, {%r14, %r40}; sub.s32 %r41, %r13, %r38; shl.b32 %r42, %r41, 20; add.s32 %r43, %r42, 1072693248; mov.u32 %r44, 0; - mov.b64 %fd131, {%r44, %r43}; - mul.f64 %fd134, %fd130, %fd131; + mov.b64 %fd132, {%r44, %r43}; + mul.f64 %fd134, %fd131, %fd132; -BB9_7: - abs.f64 %fd132, %fd134; - setp.eq.f64 %p7, %fd132, 0d7FF0000000000000; - @%p7 bra BB9_9; +BB12_7: + { + .reg .b32 %temp; + mov.b64 {%temp, %r45}, %fd134; + } + and.b32 %r46, %r45, 2147483647; + setp.ne.s32 %p7, %r46, 2146435072; + @%p7 bra BB12_9; + + { + .reg .b32 %temp; + mov.b64 
{%r47, %temp}, %fd134; + } + setp.eq.s32 %p8, %r47, 0; + @%p8 bra BB12_10; +BB12_9: fma.rn.f64 %fd134, %fd134, %fd5, %fd134; -BB9_9: +BB12_10: st.param.f64 [func_retval0+0], %fd134; ret; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/java/org/apache/sysml/hops/AggUnaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java index 4d991f4..99aef40 100644 --- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java +++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java @@ -146,7 +146,7 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET) && (_op == AggOp.SUM)) { // Only implemented methods for GPU - if (_op == AggOp.SUM && _direction == Direction.RowCol) { + if (_op == AggOp.SUM && (_direction == Direction.RowCol || _direction == Direction.Row)){ et = ExecType.GPU; k = 1; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java index f988e5f..76d900d 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java @@ -36,17 +36,23 @@ public class GPUInstructionParser extends InstructionParser public static final HashMap<String, GPUINSTRUCTION_TYPE> String2GPUInstructionType; static { String2GPUInstructionType = new HashMap<String, GPUINSTRUCTION_TYPE>(); + + // Neural Network Operators String2GPUInstructionType.put( "conv2d", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "conv2d_backward_filter", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "conv2d_backward_data", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "maxpooling", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "maxpooling_backward", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "bias_add", GPUINSTRUCTION_TYPE.Convolution); + + // Matrix Multiply Operators String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary); String2GPUInstructionType.put( "tsmm", GPUINSTRUCTION_TYPE.MMTSJ); + + // Reorg/Transpose String2GPUInstructionType.put( "r'", GPUINSTRUCTION_TYPE.Reorg); - // + // Binary Cellwise String2GPUInstructionType.put( "+" , GPUINSTRUCTION_TYPE.ArithmeticBinary); String2GPUInstructionType.put( "-" , GPUINSTRUCTION_TYPE.ArithmeticBinary); String2GPUInstructionType.put( "*" , GPUINSTRUCTION_TYPE.ArithmeticBinary); @@ -64,7 +70,12 @@ public class GPUInstructionParser extends InstructionParser String2GPUInstructionType.put( "sel+" , GPUINSTRUCTION_TYPE.BuiltinUnary); + // Aggregate Unary + String2GPUInstructionType.put( "ua+" , GPUINSTRUCTION_TYPE.AggregateUnary); String2GPUInstructionType.put( "uak+" , GPUINSTRUCTION_TYPE.AggregateUnary); + String2GPUInstructionType.put( "uar+" , GPUINSTRUCTION_TYPE.AggregateUnary); + String2GPUInstructionType.put( "uark+" , GPUINSTRUCTION_TYPE.AggregateUnary); + } public static 
GPUInstruction parseSingleInstruction (String str ) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java index c506b64..2ab1b89 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java @@ -87,6 +87,13 @@ public class AggregateUnaryGPUInstruction extends GPUInstruction { int rlen = (int)in1.getNumRows(); int clen = (int)in1.getNumColumns(); + IndexFunction indexFunction = ((AggregateUnaryOperator) _optr).indexFn; + if (indexFunction instanceof ReduceRow){ // COL{SUM, MAX...} + ec.setMetaData(_output.getName(), 1, clen); + } else if (indexFunction instanceof ReduceCol) { // ROW{SUM, MAX,...} + ec.setMetaData(_output.getName(), rlen, 1); + } + LibMatrixCUDA.unaryAggregate(ec, in1, _output.getName(), (AggregateUnaryOperator)_optr); //release inputs/outputs @@ -95,7 +102,6 @@ public class AggregateUnaryGPUInstruction extends GPUInstruction { // If the unary aggregate is a row reduction or a column reduction, it results in a vector // which needs to be released. Otherwise a scalar is produced and it is copied back to the host // and set in the execution context by invoking setScalarOutput - IndexFunction indexFunction = ((AggregateUnaryOperator) _optr).indexFn; if (indexFunction instanceof ReduceRow || indexFunction instanceof ReduceCol) { ec.releaseMatrixOutputForGPUInstruction(_output.getName()); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index efe3a4f..86bd732 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -30,11 +30,22 @@ public abstract class GPUContext { public static ArrayList<GPUObject> allocatedPointers = new ArrayList<GPUObject>(); protected static GPUContext currContext; - protected GPUContext() { } - public static volatile Boolean isGPUContextCreated = false; - + + protected GPUContext() {} + + /** + * Gets the device memory available for SystemML operations. + * @return the available device memory in bytes + */ public abstract long getAvailableMemory(); + + /** + * Ensures that all the CUDA cards on the current system are + * of the minimum required compute capability. + * (The minimum required compute capability is hard coded in {@link JCudaContext}.)
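+ * For example, a device of compute capability 3.0 (Kepler) or newer passes this check.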
+ */ + public abstract void ensureComputeCapability() throws DMLRuntimeException; /** * Creation / Destruction of GPUContext and related handles @@ -46,6 +57,7 @@ public abstract class GPUContext { if(currContext == null && DMLScript.USE_ACCELERATOR) { synchronized(isGPUContextCreated) { currContext = new JCudaContext(); + currContext.ensureComputeCapability(); OptimizerUtils.GPU_MEMORY_BUDGET = ((JCudaContext)currContext).getAvailableMemory(); isGPUContextCreated = true; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java index d94532c..893f416 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java @@ -36,6 +36,7 @@ import jcuda.runtime.JCuda; import jcuda.jcudnn.cudnnHandle; import jcuda.jcusparse.JCusparse; import jcuda.jcusparse.cusparseHandle; +import jcuda.runtime.cudaDeviceProp; import static jcuda.jcudnn.JCudnn.cudnnCreate; import static jcuda.jcublas.JCublas2.cublasCreate; import static jcuda.jcublas.JCublas2.cublasDestroy; @@ -44,6 +45,8 @@ import static jcuda.jcusparse.JCusparse.cusparseDestroy; import static jcuda.jcusparse.JCusparse.cusparseCreate; import static jcuda.driver.JCudaDriver.cuInit; import static jcuda.driver.JCudaDriver.cuDeviceGetCount; +import static jcuda.runtime.JCuda.cudaGetDeviceProperties; +import static jcuda.runtime.JCuda.cudaGetDeviceCount; import static jcuda.runtime.JCuda.cudaMemGetInfo; import static jcuda.runtime.cudaError.cudaSuccess; @@ -55,7 +58,13 @@ import static jcuda.runtime.cudaError.cudaSuccess; * */ public class JCudaContext extends GPUContext { - + + // The minimum CUDA Compute capability needed for SystemML. + // After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported. + // If SystemML needs to run on an older card, this logic can be revisited. 
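+ // (For example, a Kepler-class card such as a GTX 680 reports 3.0 and passes; an older Fermi card reports 2.x and is rejected.)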
+ final int MAJOR_REQUIRED = 3; + final int MINOR_REQUIRED = 0; + private static final Log LOG = LogFactory.getLog(JCudaContext.class.getName()); public static boolean DEBUG = false; @@ -82,7 +91,8 @@ public class JCudaContext extends GPUContext { LOG.info("Total number of GPUs on the machine: " + deviceCount); Statistics.cudaInitTime = System.nanoTime() - start; } - + + @Override public long getAvailableMemory() { if(REFRESH_AVAILABLE_MEMORY_EVERY_TIME) { long free [] = { 0 }; @@ -97,6 +107,30 @@ public class JCudaContext extends GPUContext { } return (long) (availableNumBytesWithoutUtilFactor.get()*GPU_MEMORY_UTILIZATION_FACTOR); } + + @Override + public void ensureComputeCapability() throws DMLRuntimeException { + int[] devices = {-1}; + cudaGetDeviceCount(devices); + if (devices[0] == -1){ + throw new DMLRuntimeException("Call to cudaGetDeviceCount returned 0 devices"); + } + boolean isComputeCapable = true; + for (int i=0; i<devices[0]; i++) { + cudaDeviceProp properties = new cudaDeviceProp(); + cudaGetDeviceProperties(properties, i); + int major = properties.major; + int minor = properties.minor; + if (major < MAJOR_REQUIRED) { + isComputeCapable = false; + } else if (major == MAJOR_REQUIRED && minor < MINOR_REQUIRED) { + isComputeCapable = false; + } + } + if (!isComputeCapable) { + throw new DMLRuntimeException("One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." + MINOR_REQUIRED); + } + } public JCudaContext() throws DMLRuntimeException { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index 5426a30..ca3ccd3 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -90,6 +90,10 @@ import jcuda.jcusparse.cusparseHandle; //FIXME move code to respective instructions, this is not a block library public class LibMatrixCUDA { + // Assume Compute Capability 3.0 + public static final int MAX_THREADS = 1024; // For compute capability >= 3.0 + public static final int MAX_BLOCKS = 2147483647; // 2^31 - 1, for compute capability >= 3.0 + public static cudnnHandle cudnnHandle; public static cublasHandle cublasHandle; public static cusparseHandle cusparseHandle; @@ -992,138 +996,143 @@ public class LibMatrixCUDA { assert opIndex != -1 : "Internal Error - Incorrect type of operation set for aggregate unary GPU instruction"; - //TODO - care about reductionDirection & opIndex - int rlen = (int)in1.getNumRows(); int clen = (int)in1.getNumColumns(); if (isSparse){ - long nnz = in1.getNnz(); - assert nnz > 0 : "Internal Error - number of non zeroes set to " + nnz + " in Aggregate Binary for GPU"; - MatrixObject out = ec.getSparseMatrixOutputForGPUInstruction(output, nnz); - throw new DMLRuntimeException("Internal Error - Not implemented"); + // The strategy for the time being is to convert sparse to dense + // until a sparse-specific kernel is written.
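+ // Note: sparseToDense() allocates a dense copy of the input on the device, so memory use grows to rows*cols doubles.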
+ ((JCudaObject)in1.getGPUObject()).sparseToDense(); + // long nnz = in1.getNnz(); + // assert nnz > 0 : "Internal Error - number of non zeroes set to " + nnz + " in Aggregate Binary for GPU"; + // MatrixObject out = ec.getSparseMatrixOutputForGPUInstruction(output, nnz); + // throw new DMLRuntimeException("Internal Error - Not implemented"); - } else { - Pointer out = null; - if (reductionDirection == REDUCTION_ALL || reductionDirection == REDUCTION_DIAG) { - // Scalar output - out = new Pointer(); - cudaMalloc(out, Sizeof.DOUBLE); - } else { - // Matrix output - MatrixObject out1 = ec.getDenseMatrixOutputForGPUInstruction(output); - out = ((JCudaObject) out1.getGPUObject()).jcudaDenseMatrixPtr; - } + } - Pointer in = ((JCudaObject)in1.getGPUObject()).jcudaDenseMatrixPtr; - int size = rlen * clen; - - // For scalars, set the scalar output in the Execution Context object - switch (opIndex){ - case OP_PLUS: { - switch(reductionDirection) { - case REDUCTION_ALL : { - double result = reduce_single(in, size); - ec.setScalarOutput(output, new DoubleObject(result)); - break; - } - case REDUCTION_DIAG : - case REDUCTION_COL : - case REDUCTION_ROW : - throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet"); + Pointer out = null; + if (reductionDirection == REDUCTION_COL || reductionDirection == REDUCTION_ROW) { + // Matrix output + MatrixObject out1 = ec.getDenseMatrixOutputForGPUInstruction(output); + out = ((JCudaObject) out1.getGPUObject()).jcudaDenseMatrixPtr; + } + + Pointer in = ((JCudaObject)in1.getGPUObject()).jcudaDenseMatrixPtr; + int size = rlen * clen; + + // For scalars, set the scalar output in the Execution Context object + switch (opIndex){ + case OP_PLUS: { + switch(reductionDirection) { + case REDUCTION_ALL : { + double result = reduceAll(in, size); + ec.setScalarOutput(output, new DoubleObject(result)); + break; } - break; - } - case OP_PLUS_SQ : { - switch(reductionDirection) { - case REDUCTION_ALL: - case REDUCTION_COL: - case REDUCTION_ROW: - throw new DMLRuntimeException("Internal Error - All, Row & Column summation square of matrix not implemented yet for GPU"); - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared"); + case REDUCTION_COL : { + reduceRow(in, out, rlen, clen); + break; } - // break; + case REDUCTION_DIAG : + case REDUCTION_ROW : + throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet"); } - case OP_MEAN:{ - switch(reductionDirection) { - case REDUCTION_ALL: - case REDUCTION_COL: - case REDUCTION_ROW: - throw new DMLRuntimeException("Internal Error - All, Row & Column mean of matrix not implemented yet for GPU "); - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean"); - } - // break; + break; + } + case OP_PLUS_SQ : { + switch(reductionDirection) { + case REDUCTION_ALL: + case REDUCTION_COL: + case REDUCTION_ROW: + throw new DMLRuntimeException("Internal Error - All, Row & Column summation square of matrix not implemented yet for GPU"); + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared"); } - case OP_VARIANCE : { - switch(reductionDirection) { - case REDUCTION_ALL: - case REDUCTION_COL: - case REDUCTION_ROW: - throw new DMLRuntimeException("Internal Error - All, Row & Column variance of matrix not implemented yet for GPU "); - default: - throw new DMLRuntimeException("Internal Error - Unsupported 
reduction direction for variance"); - } - // break; + // break; + } + case OP_MEAN:{ + switch(reductionDirection) { + case REDUCTION_ALL: + case REDUCTION_COL: + case REDUCTION_ROW: + throw new DMLRuntimeException("Internal Error - All, Row & Column mean of matrix not implemented yet for GPU "); + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean"); } - case OP_MULTIPLY : { - switch (reductionDirection) { - case REDUCTION_ALL: - throw new DMLRuntimeException("Internal Error - All element multiplication of matrix not implemented yet for GPU "); - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication"); - } - // break; + // break; + } + case OP_VARIANCE : { + switch(reductionDirection) { + case REDUCTION_ALL: + case REDUCTION_COL: + case REDUCTION_ROW: + throw new DMLRuntimeException("Internal Error - All, Row & Column variance of matrix not implemented yet for GPU "); + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance"); } - case OP_MAX :{ - switch(reductionDirection) { - case REDUCTION_ALL: - case REDUCTION_COL: - case REDUCTION_ROW: - throw new DMLRuntimeException("Internal Error - All, Row & Column max of matrix not implemented yet for GPU "); - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max"); - } - // break; + // break; + } + case OP_MULTIPLY : { + switch (reductionDirection) { + case REDUCTION_ALL: + throw new DMLRuntimeException("Internal Error - All element multiplication of matrix not implemented yet for GPU "); + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication"); } - case OP_MIN :{ - switch(reductionDirection) { - case REDUCTION_ALL: - case REDUCTION_COL: - case REDUCTION_ROW: - throw new DMLRuntimeException("Internal Error - All, Row & Column min of matrix not implemented yet for GPU "); - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min"); - } - // break; + // break; + } + case OP_MAX :{ + switch(reductionDirection) { + case REDUCTION_ALL: + case REDUCTION_COL: + case REDUCTION_ROW: + throw new DMLRuntimeException("Internal Error - All, Row & Column max of matrix not implemented yet for GPU "); + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max"); } - case OP_MAXINDEX : { - switch(reductionDirection) { - case REDUCTION_COL: - throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU "); - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex"); - } - // break; + // break; + } + case OP_MIN :{ + switch(reductionDirection) { + case REDUCTION_ALL: + case REDUCTION_COL: + case REDUCTION_ROW: + throw new DMLRuntimeException("Internal Error - All, Row & Column min of matrix not implemented yet for GPU "); + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min"); } - case OP_MININDEX : { - switch(reductionDirection) { - case REDUCTION_COL: - throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU "); - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex"); - } - // break; + // break; + } + case OP_MAXINDEX : { + switch(reductionDirection) { + case REDUCTION_COL: + throw new 
DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU "); + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex"); } - default : throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!"); + // break; } - + case OP_MININDEX : { + switch(reductionDirection) { + case REDUCTION_COL: + throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU "); + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex"); + } + // break; + } + default : throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!"); } } - - private static double reduce_single(Pointer in, int n) throws DMLRuntimeException { - int[] tmp = getThreadsBlocksAndSharedMem(n); + /** + * Do a simple reduction, the output of which is a single value + * @param in {@link Pointer} to matrix in device memory + * @param n size of array + * @return the reduced value + * @throws DMLRuntimeException + */ + private static double reduceAll(Pointer in, int n) throws DMLRuntimeException { + int[] tmp = getKernelParamsForReduceAll(n); int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; Pointer tempOut = JCudaObject.allocate(n * Sizeof.DOUBLE); @@ -1132,7 +1141,7 @@ public class LibMatrixCUDA { cudaDeviceSynchronize(); int s = n; while (s > 1) { - tmp = getThreadsBlocksAndSharedMem(n); + tmp = getKernelParamsForReduceAll(n); blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2]; kernels.launchKernel("reduce", new ExecutionConfig(blocks, threads, sharedMem), tempOut, tempOut, s); @@ -1145,10 +1154,29 @@ public class LibMatrixCUDA { return result[0]; } + /** + * Do a reduction by row. Data is reduced per row and the + * resulting vector is calculated. + * @param in {@link Pointer} to input matrix in device memory (size - rows * columns) + * @param out {@link Pointer} to output matrix in device memory (size - rows * 1) + * @param rows number of rows in input matrix + * @param cols number of columns in input matrix + * @throws DMLRuntimeException + */ + private static void reduceRow(Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException { + int[] tmp = getKernelParamsForReduceByRow(rows, cols); + int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; + kernels.launchKernel("reduce_row", new ExecutionConfig(blocks, threads, sharedMem), + in, out, rows, cols); + cudaDeviceSynchronize(); + } - private static int[] getThreadsBlocksAndSharedMem(int n){ - final int MAX_THREADS = 1024; - final int MAX_BLOCKS = 65535; + /** + * Get threads, blocks and shared memory for a reduce all operation + * @param n size of input array + * @return integer array containing {blocks, threads, shared memory} + */ + private static int[] getKernelParamsForReduceAll(int n){ int threads = (n < MAX_THREADS*2) ? 
nextPow2((n + 1)/ 2) : MAX_THREADS; int blocks = (n + (threads * 2 - 1)) / (threads * 2); @@ -1161,6 +1189,22 @@ public class LibMatrixCUDA { return new int[] {blocks, threads, sharedMemSize}; } + /** + * Get threads, blocks and shared memory for a reduce by row operation + * @param rows number of rows in input matrix + * @param cols number of columns in input matrix + * @return integer array containing {blocks, threads, shared memory} + */ + private static int[] getKernelParamsForReduceByRow(int rows, int cols) { + final int WARP_SIZE = 32; + int threads = Math.min(cols, WARP_SIZE); + int blocks = rows; + int sharedMemSize = threads * Sizeof.DOUBLE; + if (threads <= 32){ + sharedMemSize *=2; + } + return new int[] {blocks, threads, sharedMemSize}; + } private static int nextPow2(int x) {
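// Rounds x up to the next power of two; the tree-reduction phase of the kernels assumes a power-of-two thread count.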

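As a quick usage illustration (not part of the commit, and only a sketch): with the accelerator enabled (DMLScript.USE_ACCELERATOR), AggUnaryOp now compiles a full sum to the uak+ GPU instruction and a row-wise sum to the newly added uar+/uark+ instructions, so a DML script along the following lines exercises both the two-pass "reduce" kernel and the one-block-per-row "reduce_row" kernel:

# hypothetical DML example; names and sizes are illustrative
X = rand(rows=1024, cols=1024)
s = sum(X)        # full reduction -> uak+ ("reduce" kernel, launched repeatedly until one value remains)
r = rowSums(X)    # row reduction  -> uar+ ("reduce_row" kernel, one block per row)
print("total: " + s)

Whether these operators actually execute on the GPU also depends on the memory-estimate check against OptimizerUtils.GPU_MEMORY_BUDGET shown in the AggUnaryOp hunk above.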