Repository: incubator-systemml
Updated Branches:
  refs/heads/master 6963f5e10 -> 41c513151


[SYSTEMML-1039] Added uark+/uar+

Closes #326.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/41c51315
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/41c51315
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/41c51315

Branch: refs/heads/master
Commit: 41c513151941942e785ae882adb4ed92f72fd471
Parents: 6963f5e
Author: Nakul Jindal <[email protected]>
Authored: Thu Jan 5 15:56:25 2017 -0800
Committer: Niketan Pansare <[email protected]>
Committed: Thu Jan 5 15:56:24 2017 -0800

----------------------------------------------------------------------
 src/main/cpp/kernels/SystemML.cu                |   77 +-
 src/main/cpp/kernels/SystemML.ptx               | 1824 ++++++++++++------
 .../java/org/apache/sysml/hops/AggUnaryOp.java  |    2 +-
 .../instructions/GPUInstructionParser.java      |   13 +-
 .../context/AggregateUnaryGPUInstruction.java   |    8 +-
 .../instructions/gpu/context/GPUContext.java    |   18 +-
 .../instructions/gpu/context/JCudaContext.java  |   38 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  278 +--
 8 files changed, 1489 insertions(+), 769 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 5e5fd5e..11a337c 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -183,7 +183,20 @@ __global__ void fill(double* A, double scalar, int lenA) {
 }
 
 
-
+/**
+ * Does a reduce (sum) over all elements of the array.
+ * This method has been adapted from the Reduction sample in the NVIDIA CUDA Samples (v8.0)
+ * and the Reduction example available through jcuda.org.
+ * When invoked initially, all blocks partly compute the reduction operation over the entire array
+ * and write it to the output/temporary array. A second invocation needs to happen to get the
+ * reduced value.
+ * The number of threads and blocks and the amount of shared memory are calculated in a specific way.
+ * Please refer to the NVIDIA CUDA Sample or the SystemML code that invokes this method to see
+ * how it is done.
+ * @param g_idata   input data stored in device memory (of size n)
+ * @param g_odata   output/temporary array stored in device memory (of size n)
+ * @param n         size of the input and temporary/output arrays
+ */
 extern "C"
 __global__ void reduce(double *g_idata, double *g_odata, unsigned int n)
 {
@@ -237,3 +250,65 @@ __global__ void reduce(double *g_idata, double *g_odata, unsigned int n)
     if (tid == 0)
         g_odata[blockIdx.x] = sdata[0];
 }
+
+
+/**
+ * Does a reduce (sum) over each row of the array.
+ * The intuition for this kernel is that each block does a reduction over a single row.
+ * The kernel should therefore be launched with as many blocks as there are rows; extra blocks return immediately.
+ * @param g_idata   input matrix stored in device memory
+ * @param g_odata   output vector of size [rows * 1] in device memory
+ * @param rows      number of rows in input matrix
+ * @param cols      number of columns in input matrix
+ */
+extern "C"
+__global__ void reduce_row(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols)
+{
+    extern __shared__ double sdata[];
+
+    // one block per row
+    if (blockIdx.x >= rows) {
+        return;
+    }
+
+    unsigned int block = blockIdx.x;
+    unsigned int tid = threadIdx.x;
+    unsigned int i = tid;
+    unsigned int block_offset = block * cols;
+
+    double mySum = 0;
+    while (i < cols){
+        mySum += g_idata[block_offset + i];
+        i += blockDim.x;
+    }
+
+    // each thread puts its local sum into shared memory
+    sdata[tid] = mySum;
+    __syncthreads();
+
+
+    // do reduction in shared mem
+    if (blockDim.x >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
+    if (blockDim.x >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
+    if (blockDim.x >= 128) { if (tid <  64) { sdata[tid] = mySum = mySum + sdata[tid +  64]; } __syncthreads(); }
+
+    if (tid < 32)
+    {
+        // now that we are using warp-synchronous programming (below)
+        // we need to declare our shared memory volatile so that the compiler
+        // doesn't reorder stores to it and induce incorrect behavior.
+        volatile double* smem = sdata;
+        if (blockDim.x >=  64) { smem[tid] = mySum = mySum + smem[tid + 32]; }
+        if (blockDim.x >=  32) { smem[tid] = mySum = mySum + smem[tid + 16]; }
+        if (blockDim.x >=  16) { smem[tid] = mySum = mySum + smem[tid +  8]; }
+        if (blockDim.x >=   8) { smem[tid] = mySum = mySum + smem[tid +  4]; }
+        if (blockDim.x >=   4) { smem[tid] = mySum = mySum + smem[tid +  2]; }
+        if (blockDim.x >=   2) { smem[tid] = mySum = mySum + smem[tid +  1]; }
+    }
+
+    // write result for this block to global mem
+    if (tid == 0)
+        g_odata[block] = sdata[0];
+}
+
+
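
For reference, the sketch below shows the host-side launch pattern that the comments on reduce() and reduce_row() describe: a two-pass launch of reduce() for a full sum, and a one-block-per-row launch of reduce_row() for row sums. It is plain CUDA C, not the actual JCuda invocation in LibMatrixCUDA.java; the helper names (sum_all, row_sums) and the launch parameters (256 threads per block, one double of shared memory per thread) are illustrative assumptions modelled on the NVIDIA reduction sample, not the exact values computed by SystemML.

#include <cuda_runtime.h>

// Kernels defined in SystemML.cu above; assumes compilation/linking with nvcc
// alongside that file.
extern "C" __global__ void reduce(double *g_idata, double *g_odata, unsigned int n);
extern "C" __global__ void reduce_row(double *g_idata, double *g_odata,
                                      unsigned int rows, unsigned int cols);

// Full sum of d_in[0..n): two passes of the same kernel.
double sum_all(double *d_in, double *d_tmp, unsigned int n) {
    unsigned int threads = 256;                      // assumed block size
    unsigned int blocks  = (n + threads - 1) / threads;
    size_t shmem = threads * sizeof(double);         // one double per thread

    // Pass 1: each block writes its partial sum to d_tmp[blockIdx.x].
    reduce<<<blocks, threads, shmem>>>(d_in, d_tmp, n);
    // Pass 2: a single block reduces the "blocks" partial sums into d_tmp[0].
    reduce<<<1, threads, shmem>>>(d_tmp, d_tmp, blocks);

    double result;
    cudaMemcpy(&result, d_tmp, sizeof(double), cudaMemcpyDeviceToHost);
    return result;
}

// Row sums of a rows x cols matrix: one block per row, single pass.
void row_sums(double *d_in, double *d_out, unsigned int rows, unsigned int cols) {
    unsigned int threads = 256;                      // assumed block size
    size_t shmem = threads * sizeof(double);
    reduce_row<<<rows, threads, shmem>>>(d_in, d_out, rows, cols);
}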

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index ea27ac0..0683492 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,16 +1,16 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21124049
-// Cuda compilation tools, release 8.0, V8.0.44
+// Compiler Build ID: CL-19856038
+// Cuda compilation tools, release 7.5, V7.5.17
 // Based on LLVM 3.4svn
 //
 
-.version 5.0
-.target sm_20
+.version 4.3
+.target sm_30
 .address_size 64
 
-       // .globl       copyUpperToLowerTriangleDense
+       // .globl       getBoolean
 .func  (.param .b64 func_retval0) __internal_accurate_pow
 (
        .param .b64 __internal_accurate_pow_param_0,
@@ -19,6 +19,307 @@
 ;
 .extern .shared .align 8 .b8 sdata[];
 
+.visible .func  (.param .b64 func_retval0) getBoolean(
+       .param .b32 getBoolean_param_0
+)
+{
+       .reg .pred      %p<2>;
+       .reg .b32       %r<2>;
+       .reg .f64       %fd<2>;
+
+
+       ld.param.u32    %r1, [getBoolean_param_0];
+       setp.eq.s32     %p1, %r1, 0;
+       selp.f64        %fd1, 0d0000000000000000, 0d3FF0000000000000, %p1;
+       st.param.f64    [func_retval0+0], %fd1;
+       ret;
+}
+
+       // .globl       binaryOp
+.visible .func  (.param .b64 func_retval0) binaryOp(
+       .param .b64 binaryOp_param_0,
+       .param .b64 binaryOp_param_1,
+       .param .b32 binaryOp_param_2
+)
+{
+       .reg .pred      %p<41>;
+       .reg .b32       %r<30>;
+       .reg .f64       %fd<40>;
+       .reg .b64       %rd<3>;
+
+
+       ld.param.f64    %fd26, [binaryOp_param_0];
+       ld.param.f64    %fd27, [binaryOp_param_1];
+       ld.param.u32    %r3, [binaryOp_param_2];
+       setp.eq.s32     %p2, %r3, 0;
+       @%p2 bra        BB1_40;
+
+       setp.eq.s32     %p3, %r3, 1;
+       @%p3 bra        BB1_39;
+       bra.uni         BB1_2;
+
+BB1_39:
+       sub.f64         %fd39, %fd26, %fd27;
+       bra.uni         BB1_41;
+
+BB1_40:
+       add.f64         %fd39, %fd26, %fd27;
+       bra.uni         BB1_41;
+
+BB1_2:
+       setp.eq.s32     %p4, %r3, 2;
+       @%p4 bra        BB1_38;
+       bra.uni         BB1_3;
+
+BB1_38:
+       mul.f64         %fd39, %fd26, %fd27;
+       bra.uni         BB1_41;
+
+BB1_3:
+       setp.eq.s32     %p5, %r3, 3;
+       @%p5 bra        BB1_37;
+       bra.uni         BB1_4;
+
+BB1_37:
+       div.rn.f64      %fd39, %fd26, %fd27;
+       bra.uni         BB1_41;
+
+BB1_4:
+       setp.eq.s32     %p6, %r3, 4;
+       @%p6 bra        BB1_21;
+       bra.uni         BB1_5;
+
+BB1_21:
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r1}, %fd26;
+       }
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r2}, %fd27;
+       }
+       bfe.u32         %r4, %r2, 20, 11;
+       add.s32         %r5, %r4, -1012;
+       mov.b64          %rd2, %fd27;
+       shl.b64         %rd1, %rd2, %r5;
+       setp.eq.s64     %p21, %rd1, -9223372036854775808;
+       abs.f64         %fd9, %fd26;
+       // Callseq Start 0
+       {
+       .reg .b32 temp_param_reg;
+       // <end>}
+       .param .b64 param0;
+       st.param.f64    [param0+0], %fd9;
+       .param .b64 param1;
+       st.param.f64    [param1+0], %fd27;
+       .param .b64 retval0;
+       call.uni (retval0), 
+       __internal_accurate_pow, 
+       (
+       param0, 
+       param1
+       );
+       ld.param.f64    %fd38, [retval0+0];
+       
+       //{
+       }// Callseq End 0
+       setp.lt.s32     %p22, %r1, 0;
+       and.pred        %p1, %p22, %p21;
+       @!%p1 bra       BB1_23;
+       bra.uni         BB1_22;
+
+BB1_22:
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r6}, %fd38;
+       }
+       xor.b32         %r7, %r6, -2147483648;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r8, %temp}, %fd38;
+       }
+       mov.b64         %fd38, {%r8, %r7};
+
+BB1_23:
+       mov.f64         %fd37, %fd38;
+       setp.eq.f64     %p23, %fd26, 0d0000000000000000;
+       @%p23 bra       BB1_26;
+       bra.uni         BB1_24;
+
+BB1_26:
+       selp.b32        %r9, %r1, 0, %p21;
+       or.b32          %r10, %r9, 2146435072;
+       setp.lt.s32     %p27, %r2, 0;
+       selp.b32        %r11, %r10, %r9, %p27;
+       mov.u32         %r12, 0;
+       mov.b64         %fd37, {%r12, %r11};
+       bra.uni         BB1_27;
+
+BB1_5:
+       setp.eq.s32     %p7, %r3, 5;
+       @%p7 bra        BB1_20;
+       bra.uni         BB1_6;
+
+BB1_20:
+       setp.lt.f64     %p20, %fd26, %fd27;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p20;
+       bra.uni         BB1_41;
+
+BB1_6:
+       setp.eq.s32     %p8, %r3, 6;
+       @%p8 bra        BB1_19;
+       bra.uni         BB1_7;
+
+BB1_19:
+       setp.le.f64     %p19, %fd26, %fd27;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p19;
+       bra.uni         BB1_41;
+
+BB1_24:
+       setp.gt.s32     %p24, %r1, -1;
+       @%p24 bra       BB1_27;
+
+       cvt.rzi.f64.f64 %fd29, %fd27;
+       setp.neu.f64    %p25, %fd29, %fd27;
+       selp.f64        %fd37, 0dFFF8000000000000, %fd37, %p25;
+
+BB1_27:
+       mov.f64         %fd15, %fd37;
+       add.f64         %fd16, %fd26, %fd27;
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r13}, %fd16;
+       }
+       and.b32         %r14, %r13, 2146435072;
+       setp.ne.s32     %p28, %r14, 2146435072;
+       mov.f64         %fd36, %fd15;
+       @%p28 bra       BB1_36;
+
+       setp.gtu.f64    %p29, %fd9, 0d7FF0000000000000;
+       mov.f64         %fd36, %fd16;
+       @%p29 bra       BB1_36;
+
+       abs.f64         %fd30, %fd27;
+       setp.gtu.f64    %p30, %fd30, 0d7FF0000000000000;
+       mov.f64         %fd35, %fd16;
+       mov.f64         %fd36, %fd35;
+       @%p30 bra       BB1_36;
+
+       and.b32         %r15, %r2, 2147483647;
+       setp.ne.s32     %p31, %r15, 2146435072;
+       @%p31 bra       BB1_32;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r16, %temp}, %fd27;
+       }
+       setp.eq.s32     %p32, %r16, 0;
+       @%p32 bra       BB1_35;
+
+BB1_32:
+       and.b32         %r17, %r1, 2147483647;
+       setp.ne.s32     %p33, %r17, 2146435072;
+       mov.f64         %fd33, %fd15;
+       mov.f64         %fd36, %fd33;
+       @%p33 bra       BB1_36;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r18, %temp}, %fd26;
+       }
+       setp.ne.s32     %p34, %r18, 0;
+       mov.f64         %fd36, %fd15;
+       @%p34 bra       BB1_36;
+
+       shr.s32         %r19, %r2, 31;
+       and.b32         %r20, %r19, -2146435072;
+       add.s32         %r21, %r20, 2146435072;
+       or.b32          %r22, %r21, -2147483648;
+       selp.b32        %r23, %r22, %r21, %p1;
+       mov.u32         %r24, 0;
+       mov.b64         %fd36, {%r24, %r23};
+       bra.uni         BB1_36;
+
+BB1_7:
+       setp.eq.s32     %p9, %r3, 7;
+       @%p9 bra        BB1_18;
+       bra.uni         BB1_8;
+
+BB1_18:
+       setp.gt.f64     %p18, %fd26, %fd27;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p18;
+       bra.uni         BB1_41;
+
+BB1_8:
+       setp.eq.s32     %p10, %r3, 8;
+       @%p10 bra       BB1_17;
+       bra.uni         BB1_9;
+
+BB1_17:
+       setp.ge.f64     %p17, %fd26, %fd27;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p17;
+       bra.uni         BB1_41;
+
+BB1_9:
+       setp.eq.s32     %p11, %r3, 9;
+       @%p11 bra       BB1_16;
+       bra.uni         BB1_10;
+
+BB1_16:
+       setp.eq.f64     %p16, %fd26, %fd27;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p16;
+       bra.uni         BB1_41;
+
+BB1_10:
+       setp.eq.s32     %p12, %r3, 10;
+       @%p12 bra       BB1_15;
+       bra.uni         BB1_11;
+
+BB1_15:
+       setp.neu.f64    %p15, %fd26, %fd27;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p15;
+       bra.uni         BB1_41;
+
+BB1_35:
+       setp.gt.f64     %p35, %fd9, 0d3FF0000000000000;
+       selp.b32        %r25, 2146435072, 0, %p35;
+       xor.b32         %r26, %r25, 2146435072;
+       setp.lt.s32     %p36, %r2, 0;
+       selp.b32        %r27, %r26, %r25, %p36;
+       setp.eq.f64     %p37, %fd26, 0dBFF0000000000000;
+       selp.b32        %r28, 1072693248, %r27, %p37;
+       mov.u32         %r29, 0;
+       mov.b64         %fd36, {%r29, %r28};
+
+BB1_36:
+       setp.eq.f64     %p38, %fd27, 0d0000000000000000;
+       setp.eq.f64     %p39, %fd26, 0d3FF0000000000000;
+       or.pred         %p40, %p39, %p38;
+       selp.f64        %fd39, 0d3FF0000000000000, %fd36, %p40;
+
+BB1_41:
+       st.param.f64    [func_retval0+0], %fd39;
+       ret;
+
+BB1_11:
+       setp.eq.s32     %p13, %r3, 11;
+       @%p13 bra       BB1_14;
+       bra.uni         BB1_12;
+
+BB1_14:
+       min.f64         %fd39, %fd26, %fd27;
+       bra.uni         BB1_41;
+
+BB1_12:
+       mov.f64         %fd39, 0dC08F380000000000;
+       setp.ne.s32     %p14, %r3, 12;
+       @%p14 bra       BB1_41;
+
+       max.f64         %fd39, %fd26, %fd27;
+       bra.uni         BB1_41;
+}
+
+       // .globl       copyUpperToLowerTriangleDense
 .visible .entry copyUpperToLowerTriangleDense(
        .param .u64 copyUpperToLowerTriangleDense_param_0,
        .param .u32 copyUpperToLowerTriangleDense_param_1,
@@ -46,10 +347,10 @@
        setp.gt.s32     %p1, %r2, %r1;
        setp.lt.s32     %p2, %r3, %r5;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB0_2;
-       bra.uni         BB0_1;
+       @!%p3 bra       BB2_2;
+       bra.uni         BB2_1;
 
-BB0_1:
+BB2_1:
        cvta.to.global.u64      %rd2, %rd1;
        mad.lo.s32      %r12, %r1, %r4, %r2;
        mul.wide.s32    %rd3, %r12, 8;
@@ -59,7 +360,7 @@ BB0_1:
        add.s64         %rd6, %rd2, %rd5;
        st.global.f64   [%rd6], %fd1;
 
-BB0_2:
+BB2_2:
        ret;
 }
 
@@ -92,14 +393,14 @@ BB0_2:
        mad.lo.s32      %r1, %r8, %r9, %r11;
        mul.lo.s32      %r12, %r3, %r2;
        setp.ge.s32     %p1, %r1, %r12;
-       @%p1 bra        BB1_2;
+       @%p1 bra        BB3_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        st.global.f64   [%rd4], %fd1;
 
-BB1_2:
+BB3_2:
        ret;
 }
 
@@ -133,10 +434,10 @@ BB1_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.lt.s32     %p2, %r11, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB2_2;
-       bra.uni         BB2_1;
+       @!%p3 bra       BB4_2;
+       bra.uni         BB4_1;
 
-BB2_1:
+BB4_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.s32    %rd4, %r1, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -145,7 +446,7 @@ BB2_1:
        add.s64         %rd7, %rd6, %rd4;
        st.global.f64   [%rd7], %fd1;
 
-BB2_2:
+BB4_2:
        ret;
 }
 
@@ -178,10 +479,10 @@ BB2_2:
        setp.lt.s32     %p1, %r1, %r4;
        setp.lt.s32     %p2, %r2, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB3_2;
-       bra.uni         BB3_1;
+       @!%p3 bra       BB5_2;
+       bra.uni         BB5_1;
 
-BB3_1:
+BB5_1:
        cvta.to.global.u64      %rd3, %rd1;
        mad.lo.s32      %r11, %r1, %r3, %r2;
        mul.wide.s32    %rd4, %r11, 8;
@@ -193,7 +494,7 @@ BB3_1:
        add.s64         %rd7, %rd6, %rd4;
        st.global.f64   [%rd7], %fd3;
 
-BB3_2:
+BB5_2:
        ret;
 }
 
@@ -237,10 +538,10 @@ BB3_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.lt.s32     %p2, %r11, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB4_6;
-       bra.uni         BB4_1;
+       @!%p3 bra       BB6_6;
+       bra.uni         BB6_1;
 
-BB4_1:
+BB6_1:
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.s32    %rd5, %r1, 8;
        add.s64         %rd6, %rd4, %rd5;
@@ -250,26 +551,26 @@ BB4_1:
        setp.lt.f64     %p4, %fd8, %fd3;
        cvta.to.global.u64      %rd7, %rd3;
        add.s64         %rd1, %rd7, %rd5;
-       @%p4 bra        BB4_5;
-       bra.uni         BB4_2;
+       @%p4 bra        BB6_5;
+       bra.uni         BB6_2;
 
-BB4_5:
+BB6_5:
        st.global.f64   [%rd1], %fd4;
-       bra.uni         BB4_6;
+       bra.uni         BB6_6;
 
-BB4_2:
+BB6_2:
        setp.lt.f64     %p5, %fd1, %fd2;
-       @%p5 bra        BB4_4;
-       bra.uni         BB4_3;
+       @%p5 bra        BB6_4;
+       bra.uni         BB6_3;
 
-BB4_4:
+BB6_4:
        st.global.f64   [%rd1], %fd5;
-       bra.uni         BB4_6;
+       bra.uni         BB6_6;
 
-BB4_3:
+BB6_3:
        st.global.f64   [%rd1], %fd6;
 
-BB4_6:
+BB6_6:
        ret;
 }
 
@@ -285,9 +586,9 @@ BB4_6:
        .param .u32 binCellOp_param_7
 )
 {
-       .reg .pred      %p<50>;
-       .reg .b32       %r<51>;
-       .reg .f64       %fd<39>;
+       .reg .pred      %p<52>;
+       .reg .b32       %r<56>;
+       .reg .f64       %fd<40>;
        .reg .b64       %rd<15>;
 
 
@@ -310,93 +611,93 @@ BB4_6:
        setp.lt.s32     %p2, %r1, %r14;
        setp.lt.s32     %p3, %r2, %r10;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB5_53;
-       bra.uni         BB5_1;
+       @!%p4 bra       BB7_55;
+       bra.uni         BB7_1;
 
-BB5_1:
+BB7_1:
        mad.lo.s32      %r3, %r1, %r10, %r2;
        setp.eq.s32     %p5, %r11, 1;
-       mov.u32         %r49, %r1;
-       @%p5 bra        BB5_5;
+       mov.u32         %r54, %r1;
+       @%p5 bra        BB7_5;
 
        setp.ne.s32     %p6, %r11, 2;
-       mov.u32         %r50, %r3;
-       @%p6 bra        BB5_4;
+       mov.u32         %r55, %r3;
+       @%p6 bra        BB7_4;
 
-       mov.u32         %r50, %r2;
+       mov.u32         %r55, %r2;
 
-BB5_4:
-       mov.u32         %r44, %r50;
-       mov.u32         %r4, %r44;
-       mov.u32         %r49, %r4;
+BB7_4:
+       mov.u32         %r49, %r55;
+       mov.u32         %r4, %r49;
+       mov.u32         %r54, %r4;
 
-BB5_5:
-       mov.u32         %r5, %r49;
+BB7_5:
+       mov.u32         %r5, %r54;
        setp.eq.s32     %p7, %r12, 1;
-       mov.u32         %r47, %r1;
-       @%p7 bra        BB5_9;
+       mov.u32         %r52, %r1;
+       @%p7 bra        BB7_9;
 
        setp.ne.s32     %p8, %r12, 2;
-       mov.u32         %r48, %r3;
-       @%p8 bra        BB5_8;
+       mov.u32         %r53, %r3;
+       @%p8 bra        BB7_8;
 
-       mov.u32         %r48, %r2;
+       mov.u32         %r53, %r2;
 
-BB5_8:
-       mov.u32         %r47, %r48;
+BB7_8:
+       mov.u32         %r52, %r53;
 
-BB5_9:
+BB7_9:
        cvta.to.global.u64      %rd5, %rd3;
        cvta.to.global.u64      %rd6, %rd2;
        mul.wide.s32    %rd7, %r5, 8;
        add.s64         %rd8, %rd6, %rd7;
        ld.global.f64   %fd1, [%rd8];
-       mul.wide.s32    %rd9, %r47, 8;
+       mul.wide.s32    %rd9, %r52, 8;
        add.s64         %rd10, %rd5, %rd9;
        ld.global.f64   %fd2, [%rd10];
-       mov.f64         %fd38, 0dC08F380000000000;
+       mov.f64         %fd39, 0dC08F380000000000;
        setp.gt.s32     %p9, %r13, 5;
-       @%p9 bra        BB5_19;
+       @%p9 bra        BB7_19;
 
        setp.gt.s32     %p19, %r13, 2;
-       @%p19 bra       BB5_15;
+       @%p19 bra       BB7_15;
 
        setp.eq.s32     %p23, %r13, 0;
-       @%p23 bra       BB5_51;
+       @%p23 bra       BB7_53;
 
        setp.eq.s32     %p24, %r13, 1;
-       @%p24 bra       BB5_50;
-       bra.uni         BB5_13;
+       @%p24 bra       BB7_52;
+       bra.uni         BB7_13;
 
-BB5_50:
-       sub.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB5_52;
+BB7_52:
+       sub.f64         %fd39, %fd1, %fd2;
+       bra.uni         BB7_54;
 
-BB5_19:
+BB7_19:
        setp.gt.s32     %p10, %r13, 8;
-       @%p10 bra       BB5_24;
+       @%p10 bra       BB7_24;
 
        setp.eq.s32     %p16, %r13, 6;
-       @%p16 bra       BB5_34;
+       @%p16 bra       BB7_34;
 
        setp.eq.s32     %p17, %r13, 7;
-       @%p17 bra       BB5_33;
-       bra.uni         BB5_22;
+       @%p17 bra       BB7_33;
+       bra.uni         BB7_22;
 
-BB5_33:
+BB7_33:
        setp.gt.f64     %p29, %fd1, %fd2;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29;
-       bra.uni         BB5_52;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29;
+       bra.uni         BB7_54;
 
-BB5_15:
+BB7_15:
        setp.eq.s32     %p20, %r13, 3;
-       @%p20 bra       BB5_49;
+       @%p20 bra       BB7_51;
 
        setp.eq.s32     %p21, %r13, 4;
-       @%p21 bra       BB5_35;
-       bra.uni         BB5_17;
+       @%p21 bra       BB7_35;
+       bra.uni         BB7_17;
 
-BB5_35:
+BB7_35:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r8}, %fd1;
@@ -411,7 +712,7 @@ BB5_35:
        shl.b64         %rd1, %rd11, %r22;
        setp.eq.s64     %p32, %rd1, -9223372036854775808;
        abs.f64         %fd11, %fd1;
-       // Callseq Start 0
+       // Callseq Start 1
        {
        .reg .b32 temp_param_reg;
        // <end>}
@@ -426,133 +727,133 @@ BB5_35:
        param0, 
        param1
        );
-       ld.param.f64    %fd37, [retval0+0];
+       ld.param.f64    %fd38, [retval0+0];
        
        //{
-       }// Callseq End 0
+       }// Callseq End 1
        setp.lt.s32     %p33, %r8, 0;
        and.pred        %p1, %p33, %p32;
-       @!%p1 bra       BB5_37;
-       bra.uni         BB5_36;
+       @!%p1 bra       BB7_37;
+       bra.uni         BB7_36;
 
-BB5_36:
+BB7_36:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r23}, %fd37;
+       mov.b64         {%temp, %r23}, %fd38;
        }
        xor.b32         %r24, %r23, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r25, %temp}, %fd37;
+       mov.b64         {%r25, %temp}, %fd38;
        }
-       mov.b64         %fd37, {%r25, %r24};
+       mov.b64         %fd38, {%r25, %r24};
 
-BB5_37:
-       mov.f64         %fd36, %fd37;
+BB7_37:
+       mov.f64         %fd37, %fd38;
        setp.eq.f64     %p34, %fd1, 0d0000000000000000;
-       @%p34 bra       BB5_40;
-       bra.uni         BB5_38;
+       @%p34 bra       BB7_40;
+       bra.uni         BB7_38;
 
-BB5_40:
+BB7_40:
        selp.b32        %r26, %r8, 0, %p32;
        or.b32          %r27, %r26, 2146435072;
        setp.lt.s32     %p38, %r9, 0;
        selp.b32        %r28, %r27, %r26, %p38;
        mov.u32         %r29, 0;
-       mov.b64         %fd36, {%r29, %r28};
-       bra.uni         BB5_41;
+       mov.b64         %fd37, {%r29, %r28};
+       bra.uni         BB7_41;
 
-BB5_24:
+BB7_24:
        setp.gt.s32     %p11, %r13, 10;
-       @%p11 bra       BB5_28;
+       @%p11 bra       BB7_28;
 
        setp.eq.s32     %p14, %r13, 9;
-       @%p14 bra       BB5_32;
-       bra.uni         BB5_26;
+       @%p14 bra       BB7_32;
+       bra.uni         BB7_26;
 
-BB5_32:
+BB7_32:
        setp.eq.f64     %p27, %fd1, %fd2;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27;
-       bra.uni         BB5_52;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27;
+       bra.uni         BB7_54;
 
-BB5_28:
+BB7_28:
        setp.eq.s32     %p12, %r13, 11;
-       @%p12 bra       BB5_31;
-       bra.uni         BB5_29;
+       @%p12 bra       BB7_31;
+       bra.uni         BB7_29;
 
-BB5_31:
-       min.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB5_52;
+BB7_31:
+       min.f64         %fd39, %fd1, %fd2;
+       bra.uni         BB7_54;
 
-BB5_51:
-       add.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB5_52;
+BB7_53:
+       add.f64         %fd39, %fd1, %fd2;
+       bra.uni         BB7_54;
 
-BB5_13:
+BB7_13:
        setp.eq.s32     %p25, %r13, 2;
-       @%p25 bra       BB5_14;
-       bra.uni         BB5_52;
+       @%p25 bra       BB7_14;
+       bra.uni         BB7_54;
 
-BB5_14:
-       mul.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB5_52;
+BB7_14:
+       mul.f64         %fd39, %fd1, %fd2;
+       bra.uni         BB7_54;
 
-BB5_34:
+BB7_34:
        setp.le.f64     %p30, %fd1, %fd2;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30;
-       bra.uni         BB5_52;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30;
+       bra.uni         BB7_54;
 
-BB5_22:
+BB7_22:
        setp.eq.s32     %p18, %r13, 8;
-       @%p18 bra       BB5_23;
-       bra.uni         BB5_52;
+       @%p18 bra       BB7_23;
+       bra.uni         BB7_54;
 
-BB5_23:
+BB7_23:
        setp.ge.f64     %p28, %fd1, %fd2;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28;
-       bra.uni         BB5_52;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28;
+       bra.uni         BB7_54;
 
-BB5_49:
-       div.rn.f64      %fd38, %fd1, %fd2;
-       bra.uni         BB5_52;
+BB7_51:
+       div.rn.f64      %fd39, %fd1, %fd2;
+       bra.uni         BB7_54;
 
-BB5_17:
+BB7_17:
        setp.eq.s32     %p22, %r13, 5;
-       @%p22 bra       BB5_18;
-       bra.uni         BB5_52;
+       @%p22 bra       BB7_18;
+       bra.uni         BB7_54;
 
-BB5_18:
+BB7_18:
        setp.lt.f64     %p31, %fd1, %fd2;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31;
-       bra.uni         BB5_52;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31;
+       bra.uni         BB7_54;
 
-BB5_26:
+BB7_26:
        setp.eq.s32     %p15, %r13, 10;
-       @%p15 bra       BB5_27;
-       bra.uni         BB5_52;
+       @%p15 bra       BB7_27;
+       bra.uni         BB7_54;
 
-BB5_27:
+BB7_27:
        setp.neu.f64    %p26, %fd1, %fd2;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB5_52;
+       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26;
+       bra.uni         BB7_54;
 
-BB5_29:
+BB7_29:
        setp.ne.s32     %p13, %r13, 12;
-       @%p13 bra       BB5_52;
+       @%p13 bra       BB7_54;
 
-       max.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB5_52;
+       max.f64         %fd39, %fd1, %fd2;
+       bra.uni         BB7_54;
 
-BB5_38:
+BB7_38:
        setp.gt.s32     %p35, %r8, -1;
-       @%p35 bra       BB5_41;
+       @%p35 bra       BB7_41;
 
-       cvt.rzi.f64.f64 %fd30, %fd2;
-       setp.neu.f64    %p36, %fd30, %fd2;
-       selp.f64        %fd36, 0dFFF8000000000000, %fd36, %p36;
+       cvt.rzi.f64.f64 %fd29, %fd2;
+       setp.neu.f64    %p36, %fd29, %fd2;
+       selp.f64        %fd37, 0dFFF8000000000000, %fd37, %p36;
 
-BB5_41:
-       mov.f64         %fd17, %fd36;
+BB7_41:
+       mov.f64         %fd17, %fd37;
        add.f64         %fd18, %fd1, %fd2;
        {
        .reg .b32 %temp; 
@@ -560,60 +861,78 @@ BB5_41:
        }
        and.b32         %r31, %r30, 2146435072;
        setp.ne.s32     %p39, %r31, 2146435072;
-       mov.f64         %fd35, %fd17;
-       @%p39 bra       BB5_48;
+       mov.f64         %fd36, %fd17;
+       @%p39 bra       BB7_50;
 
        setp.gtu.f64    %p40, %fd11, 0d7FF0000000000000;
+       mov.f64         %fd36, %fd18;
+       @%p40 bra       BB7_50;
+
+       abs.f64         %fd30, %fd2;
+       setp.gtu.f64    %p41, %fd30, 0d7FF0000000000000;
        mov.f64         %fd35, %fd18;
-       @%p40 bra       BB5_48;
-
-       abs.f64         %fd19, %fd2;
-       setp.gtu.f64    %p41, %fd19, 0d7FF0000000000000;
-       mov.f64         %fd34, %fd18;
-       mov.f64         %fd35, %fd34;
-       @%p41 bra       BB5_48;
-
-       setp.eq.f64     %p42, %fd19, 0d7FF0000000000000;
-       @%p42 bra       BB5_47;
-       bra.uni         BB5_45;
-
-BB5_47:
-       setp.gt.f64     %p44, %fd11, 0d3FF0000000000000;
-       selp.b32        %r37, 2146435072, 0, %p44;
-       xor.b32         %r38, %r37, 2146435072;
-       setp.lt.s32     %p45, %r9, 0;
-       selp.b32        %r39, %r38, %r37, %p45;
-       setp.eq.f64     %p46, %fd1, 0dBFF0000000000000;
-       selp.b32        %r40, 1072693248, %r39, %p46;
+       mov.f64         %fd36, %fd35;
+       @%p41 bra       BB7_50;
+
+       and.b32         %r32, %r9, 2147483647;
+       setp.ne.s32     %p42, %r32, 2146435072;
+       @%p42 bra       BB7_46;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r33, %temp}, %fd2;
+       }
+       setp.eq.s32     %p43, %r33, 0;
+       @%p43 bra       BB7_49;
+
+BB7_46:
+       and.b32         %r34, %r8, 2147483647;
+       setp.ne.s32     %p44, %r34, 2146435072;
+       mov.f64         %fd33, %fd17;
+       mov.f64         %fd36, %fd33;
+       @%p44 bra       BB7_50;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r35, %temp}, %fd1;
+       }
+       setp.ne.s32     %p45, %r35, 0;
+       mov.f64         %fd36, %fd17;
+       @%p45 bra       BB7_50;
+
+       shr.s32         %r36, %r9, 31;
+       and.b32         %r37, %r36, -2146435072;
+       add.s32         %r38, %r37, 2146435072;
+       or.b32          %r39, %r38, -2147483648;
+       selp.b32        %r40, %r39, %r38, %p1;
        mov.u32         %r41, 0;
-       mov.b64         %fd35, {%r41, %r40};
-       bra.uni         BB5_48;
-
-BB5_45:
-       setp.neu.f64    %p43, %fd11, 0d7FF0000000000000;
-       mov.f64         %fd35, %fd17;
-       @%p43 bra       BB5_48;
-
-       shr.s32         %r32, %r9, 31;
-       and.b32         %r33, %r32, -2146435072;
-       selp.b32        %r34, -1048576, 2146435072, %p1;
-       add.s32         %r35, %r34, %r33;
-       mov.u32         %r36, 0;
-       mov.b64         %fd35, {%r36, %r35};
-
-BB5_48:
-       setp.eq.f64     %p47, %fd2, 0d0000000000000000;
-       setp.eq.f64     %p48, %fd1, 0d3FF0000000000000;
-       or.pred         %p49, %p48, %p47;
-       selp.f64        %fd38, 0d3FF0000000000000, %fd35, %p49;
-
-BB5_52:
+       mov.b64         %fd36, {%r41, %r40};
+       bra.uni         BB7_50;
+
+BB7_49:
+       setp.gt.f64     %p46, %fd11, 0d3FF0000000000000;
+       selp.b32        %r42, 2146435072, 0, %p46;
+       xor.b32         %r43, %r42, 2146435072;
+       setp.lt.s32     %p47, %r9, 0;
+       selp.b32        %r44, %r43, %r42, %p47;
+       setp.eq.f64     %p48, %fd1, 0dBFF0000000000000;
+       selp.b32        %r45, 1072693248, %r44, %p48;
+       mov.u32         %r46, 0;
+       mov.b64         %fd36, {%r46, %r45};
+
+BB7_50:
+       setp.eq.f64     %p49, %fd2, 0d0000000000000000;
+       setp.eq.f64     %p50, %fd1, 0d3FF0000000000000;
+       or.pred         %p51, %p50, %p49;
+       selp.f64        %fd39, 0d3FF0000000000000, %fd36, %p51;
+
+BB7_54:
        cvta.to.global.u64      %rd12, %rd4;
        mul.wide.s32    %rd13, %r3, 8;
        add.s64         %rd14, %rd12, %rd13;
-       st.global.f64   [%rd14], %fd38;
+       st.global.f64   [%rd14], %fd39;
 
-BB5_53:
+BB7_55:
        ret;
 }
 
@@ -628,14 +947,14 @@ BB5_53:
        .param .u32 binCellScalarOp_param_6
 )
 {
-       .reg .pred      %p<85>;
-       .reg .b32       %r<61>;
-       .reg .f64       %fd<75>;
+       .reg .pred      %p<89>;
+       .reg .b32       %r<71>;
+       .reg .f64       %fd<77>;
        .reg .b64       %rd<12>;
 
 
        ld.param.u64    %rd4, [binCellScalarOp_param_0];
-       ld.param.f64    %fd54, [binCellScalarOp_param_1];
+       ld.param.f64    %fd52, [binCellScalarOp_param_1];
        ld.param.u64    %rd5, [binCellScalarOp_param_2];
        ld.param.u32    %r8, [binCellScalarOp_param_3];
        ld.param.u32    %r9, [binCellScalarOp_param_4];
@@ -652,7 +971,7 @@ BB5_53:
        mad.lo.s32      %r1, %r14, %r15, %r17;
        mul.lo.s32      %r18, %r9, %r8;
        setp.ge.s32     %p3, %r1, %r18;
-       @%p3 bra        BB6_88;
+       @%p3 bra        BB8_92;
 
        cvta.to.global.u64      %rd6, %rd5;
        cvta.to.global.u64      %rd7, %rd4;
@@ -661,181 +980,181 @@ BB5_53:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd1, %rd6, %rd8;
        setp.eq.s32     %p4, %r7, 0;
-       @%p4 bra        BB6_45;
+       @%p4 bra        BB8_47;
 
        setp.eq.s32     %p5, %r6, 0;
-       @%p5 bra        BB6_43;
+       @%p5 bra        BB8_45;
 
-       mov.f64         %fd66, 0dC08F380000000000;
+       mov.f64         %fd67, 0dC08F380000000000;
        setp.gt.s32     %p6, %r6, 6;
-       @%p6 bra        BB6_13;
+       @%p6 bra        BB8_13;
 
        setp.gt.s32     %p14, %r6, 3;
-       @%p14 bra       BB6_9;
+       @%p14 bra       BB8_9;
 
        setp.eq.s32     %p18, %r6, 1;
-       @%p18 bra       BB6_42;
+       @%p18 bra       BB8_44;
 
        setp.eq.s32     %p19, %r6, 2;
-       @%p19 bra       BB6_41;
-       bra.uni         BB6_7;
+       @%p19 bra       BB8_43;
+       bra.uni         BB8_7;
 
-BB6_41:
-       mul.f64         %fd66, %fd1, %fd54;
-       bra.uni         BB6_44;
+BB8_43:
+       mul.f64         %fd67, %fd1, %fd52;
+       bra.uni         BB8_46;
 
-BB6_45:
-       setp.eq.s32     %p45, %r6, 0;
-       @%p45 bra       BB6_86;
+BB8_47:
+       setp.eq.s32     %p47, %r6, 0;
+       @%p47 bra       BB8_90;
 
-       mov.f64         %fd74, 0dC08F380000000000;
-       setp.gt.s32     %p46, %r6, 6;
-       @%p46 bra       BB6_56;
+       mov.f64         %fd76, 0dC08F380000000000;
+       setp.gt.s32     %p48, %r6, 6;
+       @%p48 bra       BB8_58;
 
-       setp.gt.s32     %p54, %r6, 3;
-       @%p54 bra       BB6_52;
+       setp.gt.s32     %p56, %r6, 3;
+       @%p56 bra       BB8_54;
 
-       setp.eq.s32     %p58, %r6, 1;
-       @%p58 bra       BB6_85;
+       setp.eq.s32     %p60, %r6, 1;
+       @%p60 bra       BB8_89;
 
-       setp.eq.s32     %p59, %r6, 2;
-       @%p59 bra       BB6_84;
-       bra.uni         BB6_50;
+       setp.eq.s32     %p61, %r6, 2;
+       @%p61 bra       BB8_88;
+       bra.uni         BB8_52;
 
-BB6_84:
-       mul.f64         %fd74, %fd1, %fd54;
-       bra.uni         BB6_87;
+BB8_88:
+       mul.f64         %fd76, %fd1, %fd52;
+       bra.uni         BB8_91;
 
-BB6_43:
-       add.f64         %fd66, %fd1, %fd54;
+BB8_45:
+       add.f64         %fd67, %fd1, %fd52;
 
-BB6_44:
-       st.global.f64   [%rd1], %fd66;
-       bra.uni         BB6_88;
+BB8_46:
+       st.global.f64   [%rd1], %fd67;
+       bra.uni         BB8_92;
 
-BB6_13:
+BB8_13:
        setp.gt.s32     %p7, %r6, 9;
-       @%p7 bra        BB6_18;
+       @%p7 bra        BB8_18;
 
        setp.eq.s32     %p11, %r6, 7;
-       @%p11 bra       BB6_25;
+       @%p11 bra       BB8_25;
 
        setp.eq.s32     %p12, %r6, 8;
-       @%p12 bra       BB6_24;
-       bra.uni         BB6_16;
+       @%p12 bra       BB8_24;
+       bra.uni         BB8_16;
 
-BB6_24:
-       setp.le.f64     %p23, %fd1, %fd54;
-       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23;
-       bra.uni         BB6_44;
+BB8_24:
+       setp.le.f64     %p23, %fd1, %fd52;
+       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
+       bra.uni         BB8_46;
 
-BB6_86:
-       add.f64         %fd74, %fd1, %fd54;
+BB8_90:
+       add.f64         %fd76, %fd1, %fd52;
 
-BB6_87:
-       st.global.f64   [%rd1], %fd74;
+BB8_91:
+       st.global.f64   [%rd1], %fd76;
 
-BB6_88:
+BB8_92:
        ret;
 
-BB6_56:
-       setp.gt.s32     %p47, %r6, 9;
-       @%p47 bra       BB6_61;
+BB8_58:
+       setp.gt.s32     %p49, %r6, 9;
+       @%p49 bra       BB8_63;
 
-       setp.eq.s32     %p51, %r6, 7;
-       @%p51 bra       BB6_68;
+       setp.eq.s32     %p53, %r6, 7;
+       @%p53 bra       BB8_70;
 
-       setp.eq.s32     %p52, %r6, 8;
-       @%p52 bra       BB6_67;
-       bra.uni         BB6_59;
+       setp.eq.s32     %p54, %r6, 8;
+       @%p54 bra       BB8_69;
+       bra.uni         BB8_61;
 
-BB6_67:
-       setp.ge.f64     %p63, %fd1, %fd54;
-       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p63;
-       bra.uni         BB6_87;
+BB8_69:
+       setp.ge.f64     %p65, %fd1, %fd52;
+       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
+       bra.uni         BB8_91;
 
-BB6_9:
+BB8_9:
        setp.eq.s32     %p15, %r6, 4;
-       @%p15 bra       BB6_27;
+       @%p15 bra       BB8_27;
 
        setp.eq.s32     %p16, %r6, 5;
-       @%p16 bra       BB6_26;
-       bra.uni         BB6_11;
+       @%p16 bra       BB8_26;
+       bra.uni         BB8_11;
 
-BB6_26:
-       setp.gt.f64     %p26, %fd1, %fd54;
-       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB6_44;
+BB8_26:
+       setp.gt.f64     %p26, %fd1, %fd52;
+       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
+       bra.uni         BB8_46;
 
-BB6_18:
+BB8_18:
        setp.eq.s32     %p8, %r6, 10;
-       @%p8 bra        BB6_23;
+       @%p8 bra        BB8_23;
 
        setp.eq.s32     %p9, %r6, 11;
-       @%p9 bra        BB6_22;
-       bra.uni         BB6_20;
+       @%p9 bra        BB8_22;
+       bra.uni         BB8_20;
 
-BB6_22:
-       min.f64         %fd66, %fd54, %fd1;
-       bra.uni         BB6_44;
+BB8_22:
+       min.f64         %fd67, %fd52, %fd1;
+       bra.uni         BB8_46;
 
-BB6_52:
-       setp.eq.s32     %p55, %r6, 4;
-       @%p55 bra       BB6_70;
+BB8_54:
+       setp.eq.s32     %p57, %r6, 4;
+       @%p57 bra       BB8_72;
 
-       setp.eq.s32     %p56, %r6, 5;
-       @%p56 bra       BB6_69;
-       bra.uni         BB6_54;
+       setp.eq.s32     %p58, %r6, 5;
+       @%p58 bra       BB8_71;
+       bra.uni         BB8_56;
 
-BB6_69:
-       setp.lt.f64     %p66, %fd1, %fd54;
-       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p66;
-       bra.uni         BB6_87;
+BB8_71:
+       setp.lt.f64     %p68, %fd1, %fd52;
+       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68;
+       bra.uni         BB8_91;
 
-BB6_61:
-       setp.eq.s32     %p48, %r6, 10;
-       @%p48 bra       BB6_66;
+BB8_63:
+       setp.eq.s32     %p50, %r6, 10;
+       @%p50 bra       BB8_68;
 
-       setp.eq.s32     %p49, %r6, 11;
-       @%p49 bra       BB6_65;
-       bra.uni         BB6_63;
+       setp.eq.s32     %p51, %r6, 11;
+       @%p51 bra       BB8_67;
+       bra.uni         BB8_65;
 
-BB6_65:
-       min.f64         %fd74, %fd1, %fd54;
-       bra.uni         BB6_87;
+BB8_67:
+       min.f64         %fd76, %fd1, %fd52;
+       bra.uni         BB8_91;
 
-BB6_42:
-       sub.f64         %fd66, %fd54, %fd1;
-       bra.uni         BB6_44;
+BB8_44:
+       sub.f64         %fd67, %fd52, %fd1;
+       bra.uni         BB8_46;
 
-BB6_7:
+BB8_7:
        setp.eq.s32     %p20, %r6, 3;
-       @%p20 bra       BB6_8;
-       bra.uni         BB6_44;
+       @%p20 bra       BB8_8;
+       bra.uni         BB8_46;
 
-BB6_8:
-       div.rn.f64      %fd66, %fd54, %fd1;
-       bra.uni         BB6_44;
+BB8_8:
+       div.rn.f64      %fd67, %fd52, %fd1;
+       bra.uni         BB8_46;
 
-BB6_25:
-       setp.lt.f64     %p24, %fd1, %fd54;
-       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24;
-       bra.uni         BB6_44;
+BB8_25:
+       setp.lt.f64     %p24, %fd1, %fd52;
+       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
+       bra.uni         BB8_46;
 
-BB6_16:
+BB8_16:
        setp.eq.s32     %p13, %r6, 9;
-       @%p13 bra       BB6_17;
-       bra.uni         BB6_44;
+       @%p13 bra       BB8_17;
+       bra.uni         BB8_46;
 
-BB6_17:
-       setp.eq.f64     %p22, %fd1, %fd54;
-       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22;
-       bra.uni         BB6_44;
+BB8_17:
+       setp.eq.f64     %p22, %fd1, %fd52;
+       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
+       bra.uni         BB8_46;
 
-BB6_27:
+BB8_27:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r2}, %fd54;
+       mov.b64         {%temp, %r2}, %fd52;
        }
        {
        .reg .b32 %temp; 
@@ -846,8 +1165,8 @@ BB6_27:
        mov.b64          %rd10, %fd1;
        shl.b64         %rd2, %rd10, %r20;
        setp.eq.s64     %p27, %rd2, -9223372036854775808;
-       abs.f64         %fd10, %fd54;
-       // Callseq Start 1
+       abs.f64         %fd10, %fd52;
+       // Callseq Start 2
        {
        .reg .b32 temp_param_reg;
        // <end>}
@@ -862,115 +1181,115 @@ BB6_27:
        param0, 
        param1
        );
-       ld.param.f64    %fd65, [retval0+0];
+       ld.param.f64    %fd66, [retval0+0];
        
        //{
-       }// Callseq End 1
+       }// Callseq End 2
        setp.lt.s32     %p28, %r2, 0;
        and.pred        %p1, %p28, %p27;
-       @!%p1 bra       BB6_29;
-       bra.uni         BB6_28;
+       @!%p1 bra       BB8_29;
+       bra.uni         BB8_28;
 
-BB6_28:
+BB8_28:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r21}, %fd65;
+       mov.b64         {%temp, %r21}, %fd66;
        }
        xor.b32         %r22, %r21, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r23, %temp}, %fd65;
+       mov.b64         {%r23, %temp}, %fd66;
        }
-       mov.b64         %fd65, {%r23, %r22};
+       mov.b64         %fd66, {%r23, %r22};
 
-BB6_29:
-       mov.f64         %fd64, %fd65;
-       setp.eq.f64     %p29, %fd54, 0d0000000000000000;
-       @%p29 bra       BB6_32;
-       bra.uni         BB6_30;
+BB8_29:
+       mov.f64         %fd65, %fd66;
+       setp.eq.f64     %p29, %fd52, 0d0000000000000000;
+       @%p29 bra       BB8_32;
+       bra.uni         BB8_30;
 
-BB6_32:
+BB8_32:
        selp.b32        %r24, %r2, 0, %p27;
        or.b32          %r25, %r24, 2146435072;
        setp.lt.s32     %p33, %r3, 0;
        selp.b32        %r26, %r25, %r24, %p33;
        mov.u32         %r27, 0;
-       mov.b64         %fd64, {%r27, %r26};
-       bra.uni         BB6_33;
+       mov.b64         %fd65, {%r27, %r26};
+       bra.uni         BB8_33;
 
-BB6_11:
+BB8_11:
        setp.eq.s32     %p17, %r6, 6;
-       @%p17 bra       BB6_12;
-       bra.uni         BB6_44;
+       @%p17 bra       BB8_12;
+       bra.uni         BB8_46;
 
-BB6_12:
-       setp.ge.f64     %p25, %fd1, %fd54;
-       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25;
-       bra.uni         BB6_44;
+BB8_12:
+       setp.ge.f64     %p25, %fd1, %fd52;
+       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
+       bra.uni         BB8_46;
 
-BB6_23:
-       setp.neu.f64    %p21, %fd1, %fd54;
-       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p21;
-       bra.uni         BB6_44;
+BB8_23:
+       setp.neu.f64    %p21, %fd1, %fd52;
+       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p21;
+       bra.uni         BB8_46;
 
-BB6_20:
+BB8_20:
        setp.ne.s32     %p10, %r6, 12;
-       @%p10 bra       BB6_44;
+       @%p10 bra       BB8_46;
 
-       max.f64         %fd66, %fd54, %fd1;
-       bra.uni         BB6_44;
+       max.f64         %fd67, %fd52, %fd1;
+       bra.uni         BB8_46;
 
-BB6_85:
-       sub.f64         %fd74, %fd1, %fd54;
-       bra.uni         BB6_87;
+BB8_89:
+       sub.f64         %fd76, %fd1, %fd52;
+       bra.uni         BB8_91;
 
-BB6_50:
-       setp.eq.s32     %p60, %r6, 3;
-       @%p60 bra       BB6_51;
-       bra.uni         BB6_87;
+BB8_52:
+       setp.eq.s32     %p62, %r6, 3;
+       @%p62 bra       BB8_53;
+       bra.uni         BB8_91;
 
-BB6_51:
-       div.rn.f64      %fd74, %fd1, %fd54;
-       bra.uni         BB6_87;
+BB8_53:
+       div.rn.f64      %fd76, %fd1, %fd52;
+       bra.uni         BB8_91;
 
-BB6_68:
-       setp.gt.f64     %p64, %fd1, %fd54;
-       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p64;
-       bra.uni         BB6_87;
+BB8_70:
+       setp.gt.f64     %p66, %fd1, %fd52;
+       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
+       bra.uni         BB8_91;
 
-BB6_59:
-       setp.eq.s32     %p53, %r6, 9;
-       @%p53 bra       BB6_60;
-       bra.uni         BB6_87;
+BB8_61:
+       setp.eq.s32     %p55, %r6, 9;
+       @%p55 bra       BB8_62;
+       bra.uni         BB8_91;
 
-BB6_60:
-       setp.eq.f64     %p62, %fd1, %fd54;
-       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p62;
-       bra.uni         BB6_87;
+BB8_62:
+       setp.eq.f64     %p64, %fd1, %fd52;
+       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p64;
+       bra.uni         BB8_91;
 
-BB6_70:
+BB8_72:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r4}, %fd1;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r5}, %fd54;
+       mov.b64         {%temp, %r5}, %fd52;
        }
-       bfe.u32         %r40, %r5, 20, 11;
-       add.s32         %r41, %r40, -1012;
-       mov.b64          %rd11, %fd54;
-       shl.b64         %rd3, %rd11, %r41;
-       setp.eq.s64     %p67, %rd3, -9223372036854775808;
-       abs.f64         %fd36, %fd1;
-       // Callseq Start 2
+       bfe.u32         %r45, %r5, 20, 11;
+       add.s32         %r46, %r45, -1012;
+       mov.b64          %rd11, %fd52;
+       shl.b64         %rd3, %rd11, %r46;
+       setp.eq.s64     %p69, %rd3, -9223372036854775808;
+       abs.f64         %fd35, %fd1;
+       // Callseq Start 3
        {
        .reg .b32 temp_param_reg;
        // <end>}
        .param .b64 param0;
-       st.param.f64    [param0+0], %fd36;
+       st.param.f64    [param0+0], %fd35;
        .param .b64 param1;
-       st.param.f64    [param1+0], %fd54;
+       st.param.f64    [param1+0], %fd52;
        .param .b64 retval0;
        call.uni (retval0), 
        __internal_accurate_pow, 
@@ -978,193 +1297,229 @@ BB6_70:
        param0, 
        param1
        );
-       ld.param.f64    %fd73, [retval0+0];
+       ld.param.f64    %fd75, [retval0+0];
        
        //{
-       }// Callseq End 2
-       setp.lt.s32     %p68, %r4, 0;
-       and.pred        %p2, %p68, %p67;
-       @!%p2 bra       BB6_72;
-       bra.uni         BB6_71;
+       }// Callseq End 3
+       setp.lt.s32     %p70, %r4, 0;
+       and.pred        %p2, %p70, %p69;
+       @!%p2 bra       BB8_74;
+       bra.uni         BB8_73;
 
-BB6_71:
+BB8_73:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r42}, %fd73;
+       mov.b64         {%temp, %r47}, %fd75;
        }
-       xor.b32         %r43, %r42, -2147483648;
+       xor.b32         %r48, %r47, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r44, %temp}, %fd73;
+       mov.b64         {%r49, %temp}, %fd75;
        }
-       mov.b64         %fd73, {%r44, %r43};
-
-BB6_72:
-       mov.f64         %fd72, %fd73;
-       setp.eq.f64     %p69, %fd1, 0d0000000000000000;
-       @%p69 bra       BB6_75;
-       bra.uni         BB6_73;
-
-BB6_75:
-       selp.b32        %r45, %r4, 0, %p67;
-       or.b32          %r46, %r45, 2146435072;
-       setp.lt.s32     %p73, %r5, 0;
-       selp.b32        %r47, %r46, %r45, %p73;
-       mov.u32         %r48, 0;
-       mov.b64         %fd72, {%r48, %r47};
-       bra.uni         BB6_76;
-
-BB6_54:
-       setp.eq.s32     %p57, %r6, 6;
-       @%p57 bra       BB6_55;
-       bra.uni         BB6_87;
-
-BB6_55:
-       setp.le.f64     %p65, %fd1, %fd54;
-       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p65;
-       bra.uni         BB6_87;
-
-BB6_66:
-       setp.neu.f64    %p61, %fd1, %fd54;
-       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p61;
-       bra.uni         BB6_87;
-
-BB6_63:
-       setp.ne.s32     %p50, %r6, 12;
-       @%p50 bra       BB6_87;
-
-       max.f64         %fd74, %fd1, %fd54;
-       bra.uni         BB6_87;
-
-BB6_30:
+       mov.b64         %fd75, {%r49, %r48};
+
+BB8_74:
+       mov.f64         %fd74, %fd75;
+       setp.eq.f64     %p71, %fd1, 0d0000000000000000;
+       @%p71 bra       BB8_77;
+       bra.uni         BB8_75;
+
+BB8_77:
+       selp.b32        %r50, %r4, 0, %p69;
+       or.b32          %r51, %r50, 2146435072;
+       setp.lt.s32     %p75, %r5, 0;
+       selp.b32        %r52, %r51, %r50, %p75;
+       mov.u32         %r53, 0;
+       mov.b64         %fd74, {%r53, %r52};
+       bra.uni         BB8_78;
+
+BB8_56:
+       setp.eq.s32     %p59, %r6, 6;
+       @%p59 bra       BB8_57;
+       bra.uni         BB8_91;
+
+BB8_57:
+       setp.le.f64     %p67, %fd1, %fd52;
+       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67;
+       bra.uni         BB8_91;
+
+BB8_68:
+       setp.neu.f64    %p63, %fd1, %fd52;
+       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p63;
+       bra.uni         BB8_91;
+
+BB8_65:
+       setp.ne.s32     %p52, %r6, 12;
+       @%p52 bra       BB8_91;
+
+       max.f64         %fd76, %fd1, %fd52;
+       bra.uni         BB8_91;
+
+BB8_30:
        setp.gt.s32     %p30, %r2, -1;
-       @%p30 bra       BB6_33;
+       @%p30 bra       BB8_33;
 
-       cvt.rzi.f64.f64 %fd56, %fd1;
-       setp.neu.f64    %p31, %fd56, %fd1;
-       selp.f64        %fd64, 0dFFF8000000000000, %fd64, %p31;
+       cvt.rzi.f64.f64 %fd54, %fd1;
+       setp.neu.f64    %p31, %fd54, %fd1;
+       selp.f64        %fd65, 0dFFF8000000000000, %fd65, %p31;
 
-BB6_33:
-       mov.f64         %fd16, %fd64;
-       add.f64         %fd17, %fd1, %fd54;
+BB8_33:
+       mov.f64         %fd16, %fd65;
+       add.f64         %fd17, %fd1, %fd52;
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r28}, %fd17;
        }
        and.b32         %r29, %r28, 2146435072;
        setp.ne.s32     %p34, %r29, 2146435072;
-       mov.f64         %fd63, %fd16;
-       @%p34 bra       BB6_40;
+       mov.f64         %fd64, %fd16;
+       @%p34 bra       BB8_42;
 
        setp.gtu.f64    %p35, %fd10, 0d7FF0000000000000;
+       mov.f64         %fd64, %fd17;
+       @%p35 bra       BB8_42;
+
+       abs.f64         %fd55, %fd1;
+       setp.gtu.f64    %p36, %fd55, 0d7FF0000000000000;
        mov.f64         %fd63, %fd17;
-       @%p35 bra       BB6_40;
-
-       abs.f64         %fd18, %fd1;
-       setp.gtu.f64    %p36, %fd18, 0d7FF0000000000000;
-       mov.f64         %fd62, %fd17;
-       mov.f64         %fd63, %fd62;
-       @%p36 bra       BB6_40;
-
-       setp.eq.f64     %p37, %fd18, 0d7FF0000000000000;
-       @%p37 bra       BB6_39;
-       bra.uni         BB6_37;
-
-BB6_39:
-       setp.gt.f64     %p39, %fd10, 0d3FF0000000000000;
-       selp.b32        %r35, 2146435072, 0, %p39;
-       xor.b32         %r36, %r35, 2146435072;
-       setp.lt.s32     %p40, %r3, 0;
-       selp.b32        %r37, %r36, %r35, %p40;
-       setp.eq.f64     %p41, %fd54, 0dBFF0000000000000;
-       selp.b32        %r38, 1072693248, %r37, %p41;
+       mov.f64         %fd64, %fd63;
+       @%p36 bra       BB8_42;
+
+       and.b32         %r30, %r3, 2147483647;
+       setp.ne.s32     %p37, %r30, 2146435072;
+       @%p37 bra       BB8_38;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r31, %temp}, %fd1;
+       }
+       setp.eq.s32     %p38, %r31, 0;
+       @%p38 bra       BB8_41;
+
+BB8_38:
+       and.b32         %r32, %r2, 2147483647;
+       setp.ne.s32     %p39, %r32, 2146435072;
+       mov.f64         %fd61, %fd16;
+       mov.f64         %fd64, %fd61;
+       @%p39 bra       BB8_42;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r33, %temp}, %fd52;
+       }
+       setp.ne.s32     %p40, %r33, 0;
+       mov.f64         %fd64, %fd16;
+       @%p40 bra       BB8_42;
+
+       shr.s32         %r34, %r3, 31;
+       and.b32         %r35, %r34, -2146435072;
+       add.s32         %r36, %r35, 2146435072;
+       or.b32          %r37, %r36, -2147483648;
+       selp.b32        %r38, %r37, %r36, %p1;
        mov.u32         %r39, 0;
-       mov.b64         %fd63, {%r39, %r38};
-       bra.uni         BB6_40;
+       mov.b64         %fd64, {%r39, %r38};
+       bra.uni         BB8_42;
 
-BB6_73:
-       setp.gt.s32     %p70, %r4, -1;
-       @%p70 bra       BB6_76;
+BB8_75:
+       setp.gt.s32     %p72, %r4, -1;
+       @%p72 bra       BB8_78;
 
-       cvt.rzi.f64.f64 %fd58, %fd54;
-       setp.neu.f64    %p71, %fd58, %fd54;
-       selp.f64        %fd72, 0dFFF8000000000000, %fd72, %p71;
+       cvt.rzi.f64.f64 %fd57, %fd52;
+       setp.neu.f64    %p73, %fd57, %fd52;
+       selp.f64        %fd74, 0dFFF8000000000000, %fd74, %p73;
 
-BB6_76:
-       mov.f64         %fd42, %fd72;
-       add.f64         %fd43, %fd1, %fd54;
+BB8_78:
+       mov.f64         %fd41, %fd74;
+       add.f64         %fd42, %fd1, %fd52;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r49}, %fd43;
+       mov.b64         {%temp, %r54}, %fd42;
        }
-       and.b32         %r50, %r49, 2146435072;
-       setp.ne.s32     %p74, %r50, 2146435072;
-       mov.f64         %fd71, %fd42;
-       @%p74 bra       BB6_83;
-
-       setp.gtu.f64    %p75, %fd36, 0d7FF0000000000000;
-       mov.f64         %fd71, %fd43;
-       @%p75 bra       BB6_83;
-
-       abs.f64         %fd44, %fd54;
-       setp.gtu.f64    %p76, %fd44, 0d7FF0000000000000;
-       mov.f64         %fd70, %fd43;
-       mov.f64         %fd71, %fd70;
-       @%p76 bra       BB6_83;
-
-       setp.eq.f64     %p77, %fd44, 0d7FF0000000000000;
-       @%p77 bra       BB6_82;
-       bra.uni         BB6_80;
-
-BB6_82:
-       setp.gt.f64     %p79, %fd36, 0d3FF0000000000000;
-       selp.b32        %r56, 2146435072, 0, %p79;
-       xor.b32         %r57, %r56, 2146435072;
-       setp.lt.s32     %p80, %r5, 0;
-       selp.b32        %r58, %r57, %r56, %p80;
-       setp.eq.f64     %p81, %fd1, 0dBFF0000000000000;
-       selp.b32        %r59, 1072693248, %r58, %p81;
-       mov.u32         %r60, 0;
-       mov.b64         %fd71, {%r60, %r59};
-       bra.uni         BB6_83;
-
-BB6_37:
-       setp.neu.f64    %p38, %fd10, 0d7FF0000000000000;
-       mov.f64         %fd63, %fd16;
-       @%p38 bra       BB6_40;
-
-       shr.s32         %r30, %r3, 31;
-       and.b32         %r31, %r30, -2146435072;
-       selp.b32        %r32, -1048576, 2146435072, %p1;
-       add.s32         %r33, %r32, %r31;
-       mov.u32         %r34, 0;
-       mov.b64         %fd63, {%r34, %r33};
-
-BB6_40:
-       setp.eq.f64     %p42, %fd1, 0d0000000000000000;
-       setp.eq.f64     %p43, %fd54, 0d3FF0000000000000;
-       or.pred         %p44, %p43, %p42;
-       selp.f64        %fd66, 0d3FF0000000000000, %fd63, %p44;
-       bra.uni         BB6_44;
-
-BB6_80:
-       setp.neu.f64    %p78, %fd36, 0d7FF0000000000000;
-       mov.f64         %fd71, %fd42;
-       @%p78 bra       BB6_83;
-
-       shr.s32         %r51, %r5, 31;
-       and.b32         %r52, %r51, -2146435072;
-       selp.b32        %r53, -1048576, 2146435072, %p2;
-       add.s32         %r54, %r53, %r52;
-       mov.u32         %r55, 0;
-       mov.b64         %fd71, {%r55, %r54};
-
-BB6_83:
-       setp.eq.f64     %p82, %fd54, 0d0000000000000000;
-       setp.eq.f64     %p83, %fd1, 0d3FF0000000000000;
-       or.pred         %p84, %p83, %p82;
-       selp.f64        %fd74, 0d3FF0000000000000, %fd71, %p84;
-       bra.uni         BB6_87;
+       and.b32         %r55, %r54, 2146435072;
+       setp.ne.s32     %p76, %r55, 2146435072;
+       mov.f64         %fd73, %fd41;
+       @%p76 bra       BB8_87;
+
+       setp.gtu.f64    %p77, %fd35, 0d7FF0000000000000;
+       mov.f64         %fd73, %fd42;
+       @%p77 bra       BB8_87;
+
+       abs.f64         %fd58, %fd52;
+       setp.gtu.f64    %p78, %fd58, 0d7FF0000000000000;
+       mov.f64         %fd72, %fd42;
+       mov.f64         %fd73, %fd72;
+       @%p78 bra       BB8_87;
+
+       and.b32         %r56, %r5, 2147483647;
+       setp.ne.s32     %p79, %r56, 2146435072;
+       @%p79 bra       BB8_83;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r57, %temp}, %fd52;
+       }
+       setp.eq.s32     %p80, %r57, 0;
+       @%p80 bra       BB8_86;
+
+BB8_83:
+       and.b32         %r58, %r4, 2147483647;
+       setp.ne.s32     %p81, %r58, 2146435072;
+       mov.f64         %fd70, %fd41;
+       mov.f64         %fd73, %fd70;
+       @%p81 bra       BB8_87;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r59, %temp}, %fd1;
+       }
+       setp.ne.s32     %p82, %r59, 0;
+       mov.f64         %fd73, %fd41;
+       @%p82 bra       BB8_87;
+
+       shr.s32         %r60, %r5, 31;
+       and.b32         %r61, %r60, -2146435072;
+       add.s32         %r62, %r61, 2146435072;
+       or.b32          %r63, %r62, -2147483648;
+       selp.b32        %r64, %r63, %r62, %p2;
+       mov.u32         %r65, 0;
+       mov.b64         %fd73, {%r65, %r64};
+       bra.uni         BB8_87;
+
+BB8_41:
+       setp.gt.f64     %p41, %fd10, 0d3FF0000000000000;
+       selp.b32        %r40, 2146435072, 0, %p41;
+       xor.b32         %r41, %r40, 2146435072;
+       setp.lt.s32     %p42, %r3, 0;
+       selp.b32        %r42, %r41, %r40, %p42;
+       setp.eq.f64     %p43, %fd52, 0dBFF0000000000000;
+       selp.b32        %r43, 1072693248, %r42, %p43;
+       mov.u32         %r44, 0;
+       mov.b64         %fd64, {%r44, %r43};
+
+BB8_42:
+       setp.eq.f64     %p44, %fd1, 0d0000000000000000;
+       setp.eq.f64     %p45, %fd52, 0d3FF0000000000000;
+       or.pred         %p46, %p45, %p44;
+       selp.f64        %fd67, 0d3FF0000000000000, %fd64, %p46;
+       bra.uni         BB8_46;
+
+BB8_86:
+       setp.gt.f64     %p83, %fd35, 0d3FF0000000000000;
+       selp.b32        %r66, 2146435072, 0, %p83;
+       xor.b32         %r67, %r66, 2146435072;
+       setp.lt.s32     %p84, %r5, 0;
+       selp.b32        %r68, %r67, %r66, %p84;
+       setp.eq.f64     %p85, %fd1, 0dBFF0000000000000;
+       selp.b32        %r69, 1072693248, %r68, %p85;
+       mov.u32         %r70, 0;
+       mov.b64         %fd73, {%r70, %r69};
+
+BB8_87:
+       setp.eq.f64     %p86, %fd52, 0d0000000000000000;
+       setp.eq.f64     %p87, %fd1, 0d3FF0000000000000;
+       or.pred         %p88, %p87, %p86;
+       selp.f64        %fd76, 0d3FF0000000000000, %fd73, %p88;
+       bra.uni         BB8_91;
 }
 
        // .globl       fill
@@ -1188,14 +1543,14 @@ BB6_83:
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r4, %r3, %r5;
        setp.ge.s32     %p1, %r1, %r2;
-       @%p1 bra        BB7_2;
+       @%p1 bra        BB9_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        st.global.f64   [%rd4], %fd1;
 
-BB7_2:
+BB9_2:
        ret;
 }
 
@@ -1223,9 +1578,9 @@ BB7_2:
        mov.f64         %fd67, 0d0000000000000000;
        mov.f64         %fd68, %fd67;
        setp.ge.u32     %p1, %r30, %r5;
-       @%p1 bra        BB8_4;
+       @%p1 bra        BB10_4;
 
-BB8_1:
+BB10_1:
        mov.f64         %fd1, %fd68;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r30, 8;
@@ -1234,23 +1589,23 @@ BB8_1:
        add.f64         %fd69, %fd1, %fd27;
        add.s32         %r3, %r30, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB8_3;
+       @%p2 bra        BB10_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd28, [%rd9];
        add.f64         %fd69, %fd69, %fd28;
 
-BB8_3:
+BB10_3:
        mov.f64         %fd68, %fd69;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r30, %r12, %r13, %r30;
        setp.lt.u32     %p3, %r30, %r5;
        mov.f64         %fd67, %fd68;
-       @%p3 bra        BB8_1;
+       @%p3 bra        BB10_1;
 
-BB8_4:
+BB10_4:
        mov.f64         %fd65, %fd67;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -1258,113 +1613,113 @@ BB8_4:
        st.shared.f64   [%rd1], %fd65;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 512;
-       @%p4 bra        BB8_8;
+       @%p4 bra        BB10_8;
 
        setp.gt.u32     %p5, %r6, 255;
        mov.f64         %fd66, %fd65;
-       @%p5 bra        BB8_7;
+       @%p5 bra        BB10_7;
 
        ld.shared.f64   %fd29, [%rd1+2048];
        add.f64         %fd66, %fd65, %fd29;
        st.shared.f64   [%rd1], %fd66;
 
-BB8_7:
+BB10_7:
        mov.f64         %fd65, %fd66;
        bar.sync        0;
 
-BB8_8:
+BB10_8:
        mov.f64         %fd63, %fd65;
        setp.lt.u32     %p6, %r9, 256;
-       @%p6 bra        BB8_12;
+       @%p6 bra        BB10_12;
 
        setp.gt.u32     %p7, %r6, 127;
        mov.f64         %fd64, %fd63;
-       @%p7 bra        BB8_11;
+       @%p7 bra        BB10_11;
 
        ld.shared.f64   %fd30, [%rd1+1024];
        add.f64         %fd64, %fd63, %fd30;
        st.shared.f64   [%rd1], %fd64;
 
-BB8_11:
+BB10_11:
        mov.f64         %fd63, %fd64;
        bar.sync        0;
 
-BB8_12:
+BB10_12:
        mov.f64         %fd61, %fd63;
        setp.lt.u32     %p8, %r9, 128;
-       @%p8 bra        BB8_16;
+       @%p8 bra        BB10_16;
 
        setp.gt.u32     %p9, %r6, 63;
        mov.f64         %fd62, %fd61;
-       @%p9 bra        BB8_15;
+       @%p9 bra        BB10_15;
 
        ld.shared.f64   %fd31, [%rd1+512];
        add.f64         %fd62, %fd61, %fd31;
        st.shared.f64   [%rd1], %fd62;
 
-BB8_15:
+BB10_15:
        mov.f64         %fd61, %fd62;
        bar.sync        0;
 
-BB8_16:
+BB10_16:
        mov.f64         %fd60, %fd61;
        setp.gt.u32     %p10, %r6, 31;
-       @%p10 bra       BB8_29;
+       @%p10 bra       BB10_29;
 
        setp.lt.u32     %p11, %r9, 64;
-       @%p11 bra       BB8_19;
+       @%p11 bra       BB10_19;
 
        ld.volatile.shared.f64  %fd32, [%rd1+256];
        add.f64         %fd60, %fd60, %fd32;
        st.volatile.shared.f64  [%rd1], %fd60;
 
-BB8_19:
+BB10_19:
        mov.f64         %fd59, %fd60;
        setp.lt.u32     %p12, %r9, 32;
-       @%p12 bra       BB8_21;
+       @%p12 bra       BB10_21;
 
        ld.volatile.shared.f64  %fd33, [%rd1+128];
        add.f64         %fd59, %fd59, %fd33;
        st.volatile.shared.f64  [%rd1], %fd59;
 
-BB8_21:
+BB10_21:
        mov.f64         %fd58, %fd59;
        setp.lt.u32     %p13, %r9, 16;
-       @%p13 bra       BB8_23;
+       @%p13 bra       BB10_23;
 
        ld.volatile.shared.f64  %fd34, [%rd1+64];
        add.f64         %fd58, %fd58, %fd34;
        st.volatile.shared.f64  [%rd1], %fd58;
 
-BB8_23:
+BB10_23:
        mov.f64         %fd57, %fd58;
        setp.lt.u32     %p14, %r9, 8;
-       @%p14 bra       BB8_25;
+       @%p14 bra       BB10_25;
 
        ld.volatile.shared.f64  %fd35, [%rd1+32];
        add.f64         %fd57, %fd57, %fd35;
        st.volatile.shared.f64  [%rd1], %fd57;
 
-BB8_25:
+BB10_25:
        mov.f64         %fd56, %fd57;
        setp.lt.u32     %p15, %r9, 4;
-       @%p15 bra       BB8_27;
+       @%p15 bra       BB10_27;
 
        ld.volatile.shared.f64  %fd36, [%rd1+16];
        add.f64         %fd56, %fd56, %fd36;
        st.volatile.shared.f64  [%rd1], %fd56;
 
-BB8_27:
+BB10_27:
        setp.lt.u32     %p16, %r9, 2;
-       @%p16 bra       BB8_29;
+       @%p16 bra       BB10_29;
 
        ld.volatile.shared.f64  %fd37, [%rd1+8];
        add.f64         %fd38, %fd56, %fd37;
        st.volatile.shared.f64  [%rd1], %fd38;
 
-BB8_29:
+BB10_29:
        setp.ne.s32     %p17, %r6, 0;
-       @%p17 bra       BB8_31;
+       @%p17 bra       BB10_31;
 
        ld.shared.f64   %fd39, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -1372,7 +1727,177 @@ BB8_29:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd39;
 
-BB8_31:
+BB10_31:
+       ret;
+}
+
+       // .globl       reduce_row
+.visible .entry reduce_row(
+       .param .u64 reduce_row_param_0,
+       .param .u64 reduce_row_param_1,
+       .param .u32 reduce_row_param_2,
+       .param .u32 reduce_row_param_3
+)
+{
+       .reg .pred      %p<18>;
+       .reg .b32       %r<36>;
+       .reg .f64       %fd<65>;
+       .reg .b64       %rd<39>;
+
+
+       ld.param.u64    %rd1, [reduce_row_param_0];
+       ld.param.u64    %rd2, [reduce_row_param_1];
+       ld.param.u32    %r5, [reduce_row_param_2];
+       ld.param.u32    %r4, [reduce_row_param_3];
+       mov.u32         %r6, %ctaid.x;
+       setp.ge.u32     %p1, %r6, %r5;
+       @%p1 bra        BB11_31;
+
+       mov.u32         %r35, %tid.x;
+       mov.f64         %fd63, 0d0000000000000000;
+       mov.f64         %fd64, %fd63;
+       setp.ge.u32     %p2, %r35, %r4;
+       @%p2 bra        BB11_4;
+
+       cvta.to.global.u64      %rd3, %rd1;
+
+BB11_3:
+       mad.lo.s32      %r8, %r6, %r4, %r35;
+       mul.wide.u32    %rd4, %r8, 8;
+       add.s64         %rd5, %rd3, %rd4;
+       ld.global.f64   %fd25, [%rd5];
+       add.f64         %fd64, %fd64, %fd25;
+       mov.u32         %r9, %ntid.x;
+       add.s32         %r35, %r9, %r35;
+       setp.lt.u32     %p3, %r35, %r4;
+       mov.f64         %fd63, %fd64;
+       @%p3 bra        BB11_3;
+
+BB11_4:
+       mov.f64         %fd61, %fd63;
+       mov.u32         %r10, %tid.x;
+       mul.wide.u32    %rd6, %r10, 8;
+       mov.u64         %rd7, sdata;
+       add.s64         %rd8, %rd7, %rd6;
+       st.shared.f64   [%rd8], %fd61;
+       bar.sync        0;
+       mov.u32         %r11, %ntid.x;
+       setp.lt.u32     %p4, %r11, 512;
+       @%p4 bra        BB11_8;
+
+       setp.gt.u32     %p5, %r10, 255;
+       mov.f64         %fd62, %fd61;
+       @%p5 bra        BB11_7;
+
+       ld.shared.f64   %fd26, [%rd8+2048];
+       add.f64         %fd62, %fd61, %fd26;
+       st.shared.f64   [%rd8], %fd62;
+
+BB11_7:
+       mov.f64         %fd61, %fd62;
+       bar.sync        0;
+
+BB11_8:
+       mov.f64         %fd59, %fd61;
+       setp.lt.u32     %p6, %r11, 256;
+       @%p6 bra        BB11_12;
+
+       setp.gt.u32     %p7, %r10, 127;
+       mov.f64         %fd60, %fd59;
+       @%p7 bra        BB11_11;
+
+       ld.shared.f64   %fd27, [%rd8+1024];
+       add.f64         %fd60, %fd59, %fd27;
+       st.shared.f64   [%rd8], %fd60;
+
+BB11_11:
+       mov.f64         %fd59, %fd60;
+       bar.sync        0;
+
+BB11_12:
+       mov.f64         %fd57, %fd59;
+       setp.lt.u32     %p8, %r11, 128;
+       @%p8 bra        BB11_16;
+
+       setp.gt.u32     %p9, %r10, 63;
+       mov.f64         %fd58, %fd57;
+       @%p9 bra        BB11_15;
+
+       ld.shared.f64   %fd28, [%rd8+512];
+       add.f64         %fd58, %fd57, %fd28;
+       st.shared.f64   [%rd8], %fd58;
+
+BB11_15:
+       mov.f64         %fd57, %fd58;
+       bar.sync        0;
+
+BB11_16:
+       mov.f64         %fd56, %fd57;
+       setp.gt.u32     %p10, %r10, 31;
+       @%p10 bra       BB11_29;
+
+       setp.lt.u32     %p11, %r11, 64;
+       @%p11 bra       BB11_19;
+
+       ld.volatile.shared.f64  %fd29, [%rd8+256];
+       add.f64         %fd56, %fd56, %fd29;
+       st.volatile.shared.f64  [%rd8], %fd56;
+
+BB11_19:
+       mov.f64         %fd55, %fd56;
+       setp.lt.u32     %p12, %r11, 32;
+       @%p12 bra       BB11_21;
+
+       ld.volatile.shared.f64  %fd30, [%rd8+128];
+       add.f64         %fd55, %fd55, %fd30;
+       st.volatile.shared.f64  [%rd8], %fd55;
+
+BB11_21:
+       mov.f64         %fd54, %fd55;
+       setp.lt.u32     %p13, %r11, 16;
+       @%p13 bra       BB11_23;
+
+       ld.volatile.shared.f64  %fd31, [%rd8+64];
+       add.f64         %fd54, %fd54, %fd31;
+       st.volatile.shared.f64  [%rd8], %fd54;
+
+BB11_23:
+       mov.f64         %fd53, %fd54;
+       setp.lt.u32     %p14, %r11, 8;
+       @%p14 bra       BB11_25;
+
+       ld.volatile.shared.f64  %fd32, [%rd8+32];
+       add.f64         %fd53, %fd53, %fd32;
+       st.volatile.shared.f64  [%rd8], %fd53;
+
+BB11_25:
+       mov.f64         %fd52, %fd53;
+       setp.lt.u32     %p15, %r11, 4;
+       @%p15 bra       BB11_27;
+
+       ld.volatile.shared.f64  %fd33, [%rd8+16];
+       add.f64         %fd52, %fd52, %fd33;
+       st.volatile.shared.f64  [%rd8], %fd52;
+
+BB11_27:
+       setp.lt.u32     %p16, %r11, 2;
+       @%p16 bra       BB11_29;
+
+       ld.volatile.shared.f64  %fd34, [%rd8+8];
+       add.f64         %fd35, %fd52, %fd34;
+       st.volatile.shared.f64  [%rd8], %fd35;
+
+BB11_29:
+       setp.ne.s32     %p17, %r10, 0;
+       @%p17 bra       BB11_31;
+
+       ld.shared.f64   %fd36, [sdata];
+       cvta.to.global.u64      %rd36, %rd2;
+       mul.wide.u32    %rd37, %r6, 8;
+       add.s64         %rd38, %rd36, %rd37;
+       st.global.f64   [%rd38], %fd36;
+
+BB11_31:
        ret;
 }
 
@@ -1381,9 +1906,9 @@ BB8_31:
        .param .b64 __internal_accurate_pow_param_1
 )
 {
-       .reg .pred      %p<8>;
+       .reg .pred      %p<9>;
        .reg .f32       %f<3>;
-       .reg .b32       %r<49>;
+       .reg .b32       %r<52>;
        .reg .f64       %fd<135>;
 
 
@@ -1391,35 +1916,35 @@ BB8_31:
        ld.param.f64    %fd13, [__internal_accurate_pow_param_1];
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r46}, %fd12;
+       mov.b64         {%temp, %r49}, %fd12;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%r45, %temp}, %fd12;
+       mov.b64         {%r48, %temp}, %fd12;
        }
-       shr.u32         %r47, %r46, 20;
-       setp.ne.s32     %p1, %r47, 0;
-       @%p1 bra        BB9_2;
+       shr.u32         %r50, %r49, 20;
+       setp.ne.s32     %p1, %r50, 0;
+       @%p1 bra        BB12_2;
 
        mul.f64         %fd14, %fd12, 0d4350000000000000;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r46}, %fd14;
+       mov.b64         {%temp, %r49}, %fd14;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%r45, %temp}, %fd14;
+       mov.b64         {%r48, %temp}, %fd14;
        }
-       shr.u32         %r16, %r46, 20;
-       add.s32         %r47, %r16, -54;
+       shr.u32         %r16, %r49, 20;
+       add.s32         %r50, %r16, -54;
 
-BB9_2:
-       add.s32         %r48, %r47, -1023;
-       and.b32         %r17, %r46, -2146435073;
+BB12_2:
+       add.s32         %r51, %r50, -1023;
+       and.b32         %r17, %r49, -2146435073;
        or.b32          %r18, %r17, 1072693248;
-       mov.b64         %fd133, {%r45, %r18};
+       mov.b64         %fd133, {%r48, %r18};
        setp.lt.u32     %p2, %r18, 1073127583;
-       @%p2 bra        BB9_4;
+       @%p2 bra        BB12_4;
 
        {
        .reg .b32 %temp; 
@@ -1431,9 +1956,9 @@ BB9_2:
        }
        add.s32         %r21, %r20, -1048576;
        mov.b64         %fd133, {%r19, %r21};
-       add.s32         %r48, %r47, -1022;
+       add.s32         %r51, %r50, -1022;
 
-BB9_4:
+BB12_4:
        add.f64         %fd16, %fd133, 0d3FF0000000000000;
        // inline asm
        rcp.approx.ftz.f64 %fd15,%fd16;
@@ -1509,7 +2034,7 @@ BB9_4:
        add.f64         %fd76, %fd71, %fd75;
        sub.f64         %fd77, %fd71, %fd76;
        add.f64         %fd78, %fd75, %fd77;
-       xor.b32         %r25, %r48, -2147483648;
+       xor.b32         %r25, %r51, -2147483648;
        mov.u32         %r26, 1127219200;
        mov.b64         %fd79, {%r25, %r26};
        mov.u32         %r27, -2147483648;
@@ -1546,47 +2071,48 @@ BB9_4:
        add.f64         %fd4, %fd94, %fd97;
        sub.f64         %fd98, %fd94, %fd4;
        add.f64         %fd5, %fd97, %fd98;
-       mov.f64         %fd99, 0d4338000000000000;
-       mov.f64         %fd100, 0d3FF71547652B82FE;
-       fma.rn.f64      %fd101, %fd4, %fd100, %fd99;
+       mov.f64         %fd99, 0d3FF71547652B82FE;
+       mul.rn.f64      %fd100, %fd4, %fd99;
+       mov.f64         %fd101, 0d4338000000000000;
+       add.rn.f64      %fd102, %fd100, %fd101;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r13, %temp}, %fd101;
+       mov.b64         {%r13, %temp}, %fd102;
        }
-       mov.f64         %fd102, 0dC338000000000000;
-       add.rn.f64      %fd103, %fd101, %fd102;
-       mov.f64         %fd104, 0dBFE62E42FEFA39EF;
-       fma.rn.f64      %fd105, %fd103, %fd104, %fd4;
-       mov.f64         %fd106, 0dBC7ABC9E3B39803F;
-       fma.rn.f64      %fd107, %fd103, %fd106, %fd105;
-       mov.f64         %fd108, 0d3E928AF3FCA213EA;
-       mov.f64         %fd109, 0d3E5ADE1569CE2BDF;
-       fma.rn.f64      %fd110, %fd109, %fd107, %fd108;
-       mov.f64         %fd111, 0d3EC71DEE62401315;
-       fma.rn.f64      %fd112, %fd110, %fd107, %fd111;
-       mov.f64         %fd113, 0d3EFA01997C89EB71;
-       fma.rn.f64      %fd114, %fd112, %fd107, %fd113;
-       mov.f64         %fd115, 0d3F2A01A014761F65;
-       fma.rn.f64      %fd116, %fd114, %fd107, %fd115;
-       mov.f64         %fd117, 0d3F56C16C1852B7AF;
-       fma.rn.f64      %fd118, %fd116, %fd107, %fd117;
-       mov.f64         %fd119, 0d3F81111111122322;
-       fma.rn.f64      %fd120, %fd118, %fd107, %fd119;
-       mov.f64         %fd121, 0d3FA55555555502A1;
-       fma.rn.f64      %fd122, %fd120, %fd107, %fd121;
-       mov.f64         %fd123, 0d3FC5555555555511;
-       fma.rn.f64      %fd124, %fd122, %fd107, %fd123;
-       mov.f64         %fd125, 0d3FE000000000000B;
-       fma.rn.f64      %fd126, %fd124, %fd107, %fd125;
-       fma.rn.f64      %fd127, %fd126, %fd107, %fd18;
-       fma.rn.f64      %fd128, %fd127, %fd107, %fd18;
+       mov.f64         %fd103, 0dC338000000000000;
+       add.rn.f64      %fd104, %fd102, %fd103;
+       mov.f64         %fd105, 0dBFE62E42FEFA39EF;
+       fma.rn.f64      %fd106, %fd104, %fd105, %fd4;
+       mov.f64         %fd107, 0dBC7ABC9E3B39803F;
+       fma.rn.f64      %fd108, %fd104, %fd107, %fd106;
+       mov.f64         %fd109, 0d3E928AF3FCA213EA;
+       mov.f64         %fd110, 0d3E5ADE1569CE2BDF;
+       fma.rn.f64      %fd111, %fd110, %fd108, %fd109;
+       mov.f64         %fd112, 0d3EC71DEE62401315;
+       fma.rn.f64      %fd113, %fd111, %fd108, %fd112;
+       mov.f64         %fd114, 0d3EFA01997C89EB71;
+       fma.rn.f64      %fd115, %fd113, %fd108, %fd114;
+       mov.f64         %fd116, 0d3F2A01A014761F65;
+       fma.rn.f64      %fd117, %fd115, %fd108, %fd116;
+       mov.f64         %fd118, 0d3F56C16C1852B7AF;
+       fma.rn.f64      %fd119, %fd117, %fd108, %fd118;
+       mov.f64         %fd120, 0d3F81111111122322;
+       fma.rn.f64      %fd121, %fd119, %fd108, %fd120;
+       mov.f64         %fd122, 0d3FA55555555502A1;
+       fma.rn.f64      %fd123, %fd121, %fd108, %fd122;
+       mov.f64         %fd124, 0d3FC5555555555511;
+       fma.rn.f64      %fd125, %fd123, %fd108, %fd124;
+       mov.f64         %fd126, 0d3FE000000000000B;
+       fma.rn.f64      %fd127, %fd125, %fd108, %fd126;
+       fma.rn.f64      %fd128, %fd127, %fd108, %fd18;
+       fma.rn.f64      %fd129, %fd128, %fd108, %fd18;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r14, %temp}, %fd128;
+       mov.b64         {%r14, %temp}, %fd129;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r15}, %fd128;
+       mov.b64         {%temp, %r15}, %fd129;
        }
        shl.b32         %r33, %r13, 20;
        add.s32         %r34, %r15, %r33;
@@ -1598,35 +2124,47 @@ BB9_4:
        mov.b32          %f2, %r35;
        abs.f32         %f1, %f2;
        setp.lt.f32     %p4, %f1, 0f4086232B;
-       @%p4 bra        BB9_7;
+       @%p4 bra        BB12_7;
 
        setp.lt.f64     %p5, %fd4, 0d0000000000000000;
-       add.f64         %fd129, %fd4, 0d7FF0000000000000;
-       selp.f64        %fd134, 0d0000000000000000, %fd129, %p5;
+       add.f64         %fd130, %fd4, 0d7FF0000000000000;
+       selp.f64        %fd134, 0d0000000000000000, %fd130, %p5;
        setp.geu.f32    %p6, %f1, 0f40874800;
-       @%p6 bra        BB9_7;
+       @%p6 bra        BB12_7;
 
        shr.u32         %r36, %r13, 31;
        add.s32         %r37, %r13, %r36;
        shr.s32         %r38, %r37, 1;
        shl.b32         %r39, %r38, 20;
        add.s32         %r40, %r39, %r15;
-       mov.b64         %fd130, {%r14, %r40};
+       mov.b64         %fd131, {%r14, %r40};
        sub.s32         %r41, %r13, %r38;
        shl.b32         %r42, %r41, 20;
        add.s32         %r43, %r42, 1072693248;
        mov.u32         %r44, 0;
-       mov.b64         %fd131, {%r44, %r43};
-       mul.f64         %fd134, %fd130, %fd131;
+       mov.b64         %fd132, {%r44, %r43};
+       mul.f64         %fd134, %fd131, %fd132;
 
-BB9_7:
-       abs.f64         %fd132, %fd134;
-       setp.eq.f64     %p7, %fd132, 0d7FF0000000000000;
-       @%p7 bra        BB9_9;
+BB12_7:
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%temp, %r45}, %fd134;
+       }
+       and.b32         %r46, %r45, 2147483647;
+       setp.ne.s32     %p7, %r46, 2146435072;
+       @%p7 bra        BB12_9;
+
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r47, %temp}, %fd134;
+       }
+       setp.eq.s32     %p8, %r47, 0;
+       @%p8 bra        BB12_10;
 
+BB12_9:
        fma.rn.f64      %fd134, %fd134, %fd5, %fd134;
 
-BB9_9:
+BB12_10:
        st.param.f64    [func_retval0+0], %fd134;
        ret;
 }

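Note on the PTX hunks above: they are the output of recompiling SystemML.cu, so most of the churn is renumbered basic blocks and registers in the pre-existing kernels; the substantive addition is the generated code for the new reduce_row kernel. As a reference-only sketch (not part of this patch), the plain-Java method below shows the result reduce_row is expected to produce for a dense, row-major input, with one CUDA block per row and that block's threads striding over the row's columns; the class and method names are illustrative.

public class RowSumReference {
    // 'in' holds a rows x cols matrix in row-major order, as the kernel assumes.
    public static double[] rowSums(double[] in, int rows, int cols) {
        double[] out = new double[rows];
        for (int r = 0; r < rows; r++) {          // one CUDA block per row
            double sum = 0;
            for (int c = 0; c < cols; c++) {      // threads of the block stride over the columns
                sum += in[r * cols + c];
            }
            out[r] = sum;                         // thread 0 writes the per-row result
        }
        return out;
    }
}
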
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java 
b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
index 4d991f4..99aef40 100644
--- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
@@ -146,7 +146,7 @@ public class AggUnaryOp extends Hop implements 
MultiThreadedHop
                                        int k = 
OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
                                        if(DMLScript.USE_ACCELERATOR && 
(DMLScript.FORCE_ACCELERATOR || getMemEstimate() < 
OptimizerUtils.GPU_MEMORY_BUDGET) && (_op == AggOp.SUM)) {
                                                // Only implemented methods for 
GPU
-                                               if (_op == AggOp.SUM && 
_direction == Direction.RowCol) {
+                                               if (_op == AggOp.SUM && 
(_direction == Direction.RowCol || _direction == Direction.Row)){
                                                        et = ExecType.GPU;
                                                        k = 1;
                                                }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index f988e5f..76d900d 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -36,17 +36,23 @@ public class GPUInstructionParser  extends InstructionParser
        public static final HashMap<String, GPUINSTRUCTION_TYPE> 
String2GPUInstructionType;
        static {
                String2GPUInstructionType = new HashMap<String, 
GPUINSTRUCTION_TYPE>();
+
+               // Neural Network Operators
                String2GPUInstructionType.put( "conv2d",                 
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "conv2d_backward_filter", 
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "conv2d_backward_data",   
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "maxpooling",             
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "maxpooling_backward",    
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "bias_add",                      
 GPUINSTRUCTION_TYPE.Convolution);
+
+               // Matrix Multiply Operators
                String2GPUInstructionType.put( "ba+*",                   
GPUINSTRUCTION_TYPE.AggregateBinary);
                String2GPUInstructionType.put( "tsmm",                   
GPUINSTRUCTION_TYPE.MMTSJ);
+
+               // Reorg/Transpose
                String2GPUInstructionType.put( "r'",                     
GPUINSTRUCTION_TYPE.Reorg);
        
-               // 
+               // Binary Cellwise
                String2GPUInstructionType.put( "+"    , 
GPUINSTRUCTION_TYPE.ArithmeticBinary);
                String2GPUInstructionType.put( "-"    , 
GPUINSTRUCTION_TYPE.ArithmeticBinary);
                String2GPUInstructionType.put( "*"    , 
GPUINSTRUCTION_TYPE.ArithmeticBinary);
@@ -64,7 +70,12 @@ public class GPUInstructionParser  extends InstructionParser
                
                String2GPUInstructionType.put( "sel+"  , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
 
+               // Aggregate Unary
+               String2GPUInstructionType.put( "ua+"     , 
GPUINSTRUCTION_TYPE.AggregateUnary);
                String2GPUInstructionType.put( "uak+"    , 
GPUINSTRUCTION_TYPE.AggregateUnary);
+               String2GPUInstructionType.put( "uar+"    , 
GPUINSTRUCTION_TYPE.AggregateUnary);
+               String2GPUInstructionType.put( "uark+"   , 
GPUINSTRUCTION_TYPE.AggregateUnary);
+
        }
        
        public static GPUInstruction parseSingleInstruction (String str ) 

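For context, the new entries mean the parser now recognizes the row-aggregate opcodes alongside the existing full aggregate. A minimal lookup sketch follows (assumed to run inside the parser class, so the map and the GPUINSTRUCTION_TYPE enum are in scope; the opcode-to-operation pairing in the comments is the usual SystemML convention, not something introduced by this patch):

// Illustration only: the new opcodes resolve to the AggregateUnary type.
GPUINSTRUCTION_TYPE t1 = String2GPUInstructionType.get("uark+"); // rowSums (Kahan)
GPUINSTRUCTION_TYPE t2 = String2GPUInstructionType.get("uar+");  // row sum
GPUINSTRUCTION_TYPE t3 = String2GPUInstructionType.get("uak+");  // sum (full aggregate)
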
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java
index c506b64..2ab1b89 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java
@@ -87,6 +87,13 @@ public class AggregateUnaryGPUInstruction extends 
GPUInstruction {
     int rlen = (int)in1.getNumRows();
     int clen = (int)in1.getNumColumns();
 
+    IndexFunction indexFunction = ((AggregateUnaryOperator) _optr).indexFn;
+    if (indexFunction instanceof ReduceRow){  // COL{SUM, MAX...}
+      ec.setMetaData(_output.getName(), 1, clen);
+    } else if (indexFunction instanceof ReduceCol) { // ROW{SUM, MAX,...}
+      ec.setMetaData(_output.getName(), rlen, 1);
+    }
+
     LibMatrixCUDA.unaryAggregate(ec, in1, _output.getName(), 
(AggregateUnaryOperator)_optr);
 
     //release inputs/outputs
@@ -95,7 +102,6 @@ public class AggregateUnaryGPUInstruction extends 
GPUInstruction {
     // If the unary aggregate is a row reduction or a column reduction, it 
results in a vector
    // which needs to be released. Otherwise a scalar is produced and it is 
copied back to the host
     // and set in the execution context by invoking the setScalarOutput
-    IndexFunction indexFunction = ((AggregateUnaryOperator) _optr).indexFn;
     if (indexFunction instanceof ReduceRow || indexFunction instanceof 
ReduceCol) {
       ec.releaseMatrixOutputForGPUInstruction(_output.getName());
     }

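The dimensions set above follow the usual convention for unary aggregates. A minimal sketch of that mapping, with hypothetical helper names mirroring the lines added above: a column aggregate such as colSums (ReduceRow) yields a 1 x clen row vector, a row aggregate such as rowSums (ReduceCol) yields an rlen x 1 column vector, and a full aggregate yields a scalar.

// Illustration only; names here are hypothetical and not part of the patch.
static long[] unaryAggOutputDims(boolean reducesAcrossRows /* e.g. colSums */,
                                 boolean reducesAcrossCols /* e.g. rowSums */,
                                 long rlen, long clen) {
    if (reducesAcrossRows) return new long[] {1, clen};  // 1 x clen row vector
    if (reducesAcrossCols) return new long[] {rlen, 1};  // rlen x 1 column vector
    return new long[] {1, 1};                            // full aggregate: scalar
}

In the full-aggregate case the result is copied back to the host and set as a scalar output, which is why only the row/column cases need a device-side output buffer to be allocated and released.
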
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index efe3a4f..86bd732 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -30,11 +30,22 @@ public abstract class GPUContext {
 
        public static ArrayList<GPUObject> allocatedPointers = new 
ArrayList<GPUObject>(); 
        protected static GPUContext currContext;
-       protected GPUContext() { }
-       
        public static volatile Boolean isGPUContextCreated = false;
-       
+
+       protected GPUContext() {}
+
+       /**
+        * Gets the device memory available for SystemML operations.
+        * @return the available device memory in bytes
+        */
        public abstract long getAvailableMemory();
+
+       /**
+        * Ensures that all the CUDA cards on the current system meet
+        * the minimum required compute capability.
+        * (The minimum required compute capability is hard coded in {@link 
JCudaContext}.)
+        */
+       public abstract void ensureComputeCapability() throws 
DMLRuntimeException;
        
        /**
         * Creation / Destruction of GPUContext and related handles
@@ -46,6 +57,7 @@ public abstract class GPUContext {
                if(currContext == null && DMLScript.USE_ACCELERATOR) {
                        synchronized(isGPUContextCreated) {
                                currContext = new JCudaContext();
+                               currContext.ensureComputeCapability();
                                OptimizerUtils.GPU_MEMORY_BUDGET = 
((JCudaContext)currContext).getAvailableMemory();
                                isGPUContextCreated = true;
                        }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
index d94532c..893f416 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
@@ -36,6 +36,7 @@ import jcuda.runtime.JCuda;
 import jcuda.jcudnn.cudnnHandle;
 import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.cudaDeviceProp;
 import static jcuda.jcudnn.JCudnn.cudnnCreate;
 import static jcuda.jcublas.JCublas2.cublasCreate;
 import static jcuda.jcublas.JCublas2.cublasDestroy;
@@ -44,6 +45,8 @@ import static jcuda.jcusparse.JCusparse.cusparseDestroy;
 import static jcuda.jcusparse.JCusparse.cusparseCreate;
 import static jcuda.driver.JCudaDriver.cuInit;
 import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
+import static jcuda.runtime.JCuda.cudaGetDeviceProperties;
+import static jcuda.runtime.JCuda.cudaGetDeviceCount;
 import static jcuda.runtime.JCuda.cudaMemGetInfo;
 import static jcuda.runtime.cudaError.cudaSuccess;
 
@@ -55,7 +58,13 @@ import static jcuda.runtime.cudaError.cudaSuccess;
  *
  */
 public class JCudaContext extends GPUContext {
-       
+
+       // The minimum CUDA Compute capability needed for SystemML.
+       // From compute capability 3.0 onwards, 2^31 - 1 blocks and 1024 threads per 
block are supported.
+       // If SystemML needs to run on an older card, this logic can be 
revisited.
+       final int MAJOR_REQUIRED = 3;
+       final int MINOR_REQUIRED = 0;
+
        private static final Log LOG = 
LogFactory.getLog(JCudaContext.class.getName());
        
        public static boolean DEBUG = false;
@@ -82,7 +91,8 @@ public class JCudaContext extends GPUContext {
         LOG.info("Total number of GPUs on the machine: " + deviceCount);
         Statistics.cudaInitTime = System.nanoTime() - start;
        }
-       
+
+       @Override
        public long getAvailableMemory() {
                if(REFRESH_AVAILABLE_MEMORY_EVERY_TIME) {
                        long free [] = { 0 };
@@ -97,6 +107,30 @@ public class JCudaContext extends GPUContext {
                }
                return (long) 
(availableNumBytesWithoutUtilFactor.get()*GPU_MEMORY_UTILIZATION_FACTOR);
        }
+
+       @Override
+       public void ensureComputeCapability() throws DMLRuntimeException {
+               int[] devices =  {-1};
+               cudaGetDeviceCount(devices);
+               if (devices[0] == -1){
+                       throw new DMLRuntimeException("Call to 
cudaGetDeviceCount returned 0 devices");
+               }
+               boolean isComputeCapable = true;
+               for (int i=0; i<devices[0]; i++) {
+                       cudaDeviceProp properties = new cudaDeviceProp();
+                       cudaGetDeviceProperties(properties, i);
+                       int major = properties.major;
+                       int minor = properties.minor;
+                       if (major < MAJOR_REQUIRED) {
+                               isComputeCapable = false;
+                       } else if (major == MAJOR_REQUIRED && minor < 
MINOR_REQUIRED) {
+                               isComputeCapable = false;
+                       }
+               }
+               if (!isComputeCapable) {
+                       throw new DMLRuntimeException("One of the CUDA cards on 
the system has compute capability lower than " + MAJOR_REQUIRED + "." + 
MINOR_REQUIRED);
+               }
+       }
        
        
        public JCudaContext() throws DMLRuntimeException {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/41c51315/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 5426a30..ca3ccd3 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -90,6 +90,10 @@ import jcuda.jcusparse.cusparseHandle;
 //FIXME move could to respective instructions, this is not a block library
 public class LibMatrixCUDA {
 
+       // Assume Compute Capability 3.0
+       public static final int MAX_THREADS = 1024;                             
// For compute capability >= 3.0
+       public static final int MAX_BLOCKS = 2147483647;        // 2^31 - 1 For 
compute capability >= 3.0
+
        public static cudnnHandle cudnnHandle;
        public static cublasHandle cublasHandle;
        public static cusparseHandle cusparseHandle;
@@ -992,138 +996,143 @@ public class LibMatrixCUDA {
                assert opIndex != -1 : "Internal Error - Incorrect type of 
operation set for aggregate unary GPU instruction";
 
 
-               //TODO - care about reductionDirection & opIndex
-
                int rlen = (int)in1.getNumRows();
                int clen = (int)in1.getNumColumns();
                if (isSparse){
-                       long nnz = in1.getNnz();
-                       assert nnz > 0 : "Internal Error - number of non zeroes 
set to " + nnz + " in Aggregate Binary for GPU";
-                       MatrixObject out = 
ec.getSparseMatrixOutputForGPUInstruction(output, nnz);
-                       throw new DMLRuntimeException("Internal Error - Not 
implemented");
+                       // The strategy for the time being is to convert sparse 
to dense
+                       // until a sparse specific kernel is written.
+                       ((JCudaObject)in1.getGPUObject()).sparseToDense();
+                       // long nnz = in1.getNnz();
+                       // assert nnz > 0 : "Internal Error - number of non 
zeroes set to " + nnz + " in Aggregate Binary for GPU";
+                       // MatrixObject out = 
ec.getSparseMatrixOutputForGPUInstruction(output, nnz);
+                       // throw new DMLRuntimeException("Internal Error - Not 
implemented");
 
-               } else {
-                       Pointer out = null;
-                       if (reductionDirection == REDUCTION_ALL || 
reductionDirection == REDUCTION_DIAG) {
-                               // Scalar output
-                               out = new Pointer();
-                               cudaMalloc(out, Sizeof.DOUBLE);
-                       } else {
-                               // Matrix output
-                               MatrixObject out1 = 
ec.getDenseMatrixOutputForGPUInstruction(output);
-                               out = ((JCudaObject) 
out1.getGPUObject()).jcudaDenseMatrixPtr;
-                       }
+               }
 
-                       Pointer in = 
((JCudaObject)in1.getGPUObject()).jcudaDenseMatrixPtr;
-                       int size = rlen * clen;
-
-                       // For scalars, set the scalar output in the Execution 
Context object
-                       switch (opIndex){
-                               case OP_PLUS: {
-                                       switch(reductionDirection) {
-                                               case REDUCTION_ALL : {
-                                                       double result = 
reduce_single(in, size);
-                                                       
ec.setScalarOutput(output, new DoubleObject(result));
-                                                       break;
-                                               }
-                                               case REDUCTION_DIAG :
-                                               case REDUCTION_COL :
-                                               case REDUCTION_ROW :
-                                                       throw new 
DMLRuntimeException("Internal Error - Row, Column and Diag summation not 
implemented yet");
+               Pointer out = null;
+               if (reductionDirection == REDUCTION_COL || reductionDirection 
== REDUCTION_ROW) {
+                       // Matrix output
+                       MatrixObject out1 = 
ec.getDenseMatrixOutputForGPUInstruction(output);
+                       out = ((JCudaObject) 
out1.getGPUObject()).jcudaDenseMatrixPtr;
+               }
+
+               Pointer in = 
((JCudaObject)in1.getGPUObject()).jcudaDenseMatrixPtr;
+               int size = rlen * clen;
+
+               // For scalars, set the scalar output in the Execution Context 
object
+               switch (opIndex){
+                       case OP_PLUS: {
+                               switch(reductionDirection) {
+                                       case REDUCTION_ALL : {
+                                               double result = reduceAll(in, 
size);
+                                               ec.setScalarOutput(output, new 
DoubleObject(result));
+                                               break;
                                        }
-                                       break;
-                               }
-                               case OP_PLUS_SQ : {
-                                       switch(reductionDirection) {
-                                               case REDUCTION_ALL:
-                                               case REDUCTION_COL:
-                                               case REDUCTION_ROW:
-                                                       throw new 
DMLRuntimeException("Internal Error - All, Row & Column summation square of 
matrix not implemented yet for GPU");
-                                               default:
-                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
summation squared");
+                                       case REDUCTION_COL : {
+                                               reduceRow(in, out, rlen, clen);
+                                               break;
                                        }
-                                       // break;
+                                       case REDUCTION_DIAG :
+                                       case REDUCTION_ROW :
+                                               throw new 
DMLRuntimeException("Internal Error - Row, Column and Diag summation not 
implemented yet");
                                }
-                               case OP_MEAN:{
-                                       switch(reductionDirection) {
-                                               case REDUCTION_ALL:
-                                               case REDUCTION_COL:
-                                               case REDUCTION_ROW:
-                                                       throw new 
DMLRuntimeException("Internal Error - All, Row & Column mean of matrix not 
implemented yet for GPU ");
-                                               default:
-                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
mean");
-                                       }
-                                       // break;
+                               break;
+                       }
+                       case OP_PLUS_SQ : {
+                               switch(reductionDirection) {
+                                       case REDUCTION_ALL:
+                                       case REDUCTION_COL:
+                                       case REDUCTION_ROW:
+                                               throw new 
DMLRuntimeException("Internal Error - All, Row & Column summation square of 
matrix not implemented yet for GPU");
+                                       default:
+                                               throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
summation squared");
                                }
-                               case OP_VARIANCE : {
-                                       switch(reductionDirection) {
-                                               case REDUCTION_ALL:
-                                               case REDUCTION_COL:
-                                               case REDUCTION_ROW:
-                                                       throw new 
DMLRuntimeException("Internal Error - All, Row & Column variance of matrix not 
implemented yet for GPU ");
-                                               default:
-                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
variance");
-                                       }
-                                       // break;
+                               // break;
+                       }
+                       case OP_MEAN:{
+                               switch(reductionDirection) {
+                                       case REDUCTION_ALL:
+                                       case REDUCTION_COL:
+                                       case REDUCTION_ROW:
+                                               throw new 
DMLRuntimeException("Internal Error - All, Row & Column mean of matrix not 
implemented yet for GPU ");
+                                       default:
+                                               throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
mean");
                                }
-                               case OP_MULTIPLY : {
-                                       switch (reductionDirection) {
-                                               case REDUCTION_ALL:
-                                                       throw new 
DMLRuntimeException("Internal Error - All element multiplication of matrix not 
implemented yet for GPU ");
-                                               default:
-                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
multiplication");
-                                       }
-                                       // break;
+                               // break;
+                       }
+                       case OP_VARIANCE : {
+                               switch(reductionDirection) {
+                                       case REDUCTION_ALL:
+                                       case REDUCTION_COL:
+                                       case REDUCTION_ROW:
+                                               throw new 
DMLRuntimeException("Internal Error - All, Row & Column variance of matrix not 
implemented yet for GPU ");
+                                       default:
+                                               throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
variance");
                                }
-                               case OP_MAX :{
-                                       switch(reductionDirection) {
-                                               case REDUCTION_ALL:
-                                               case REDUCTION_COL:
-                                               case REDUCTION_ROW:
-                                                       throw new 
DMLRuntimeException("Internal Error - All, Row & Column max of matrix not 
implemented yet for GPU ");
-                                               default:
-                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
-                                       }
-                                       // break;
+                               // break;
+                       }
+                       case OP_MULTIPLY : {
+                               switch (reductionDirection) {
+                                       case REDUCTION_ALL:
+                                               throw new 
DMLRuntimeException("Internal Error - All element multiplication of matrix not 
implemented yet for GPU ");
+                                       default:
+                                               throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
multiplication");
                                }
-                               case OP_MIN :{
-                                       switch(reductionDirection) {
-                                               case REDUCTION_ALL:
-                                               case REDUCTION_COL:
-                                               case REDUCTION_ROW:
-                                                       throw new 
DMLRuntimeException("Internal Error - All, Row & Column min of matrix not 
implemented yet for GPU ");
-                                               default:
-                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
-                                       }
-                                       // break;
+                               // break;
+                       }
+                       case OP_MAX :{
+                               switch(reductionDirection) {
+                                       case REDUCTION_ALL:
+                                       case REDUCTION_COL:
+                                       case REDUCTION_ROW:
+                                               throw new 
DMLRuntimeException("Internal Error - All, Row & Column max of matrix not 
implemented yet for GPU ");
+                                       default:
+                                               throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
                                }
-                               case OP_MAXINDEX : {
-                                       switch(reductionDirection) {
-                                               case REDUCTION_COL:
-                                                       throw new 
DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented 
yet for GPU ");
-                                               default:
-                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
maxindex");
-                                       }
-                                       // break;
+                               // break;
+                       }
+                       case OP_MIN :{
+                               switch(reductionDirection) {
+                                       case REDUCTION_ALL:
+                                       case REDUCTION_COL:
+                                       case REDUCTION_ROW:
+                                               throw new 
DMLRuntimeException("Internal Error - All, Row & Column min of matrix not 
implemented yet for GPU ");
+                                       default:
+                                               throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
                                }
-                               case OP_MININDEX : {
-                                       switch(reductionDirection) {
-                                               case REDUCTION_COL:
-                                                       throw new 
DMLRuntimeException("Internal Error - Column minindex of matrix not implemented 
yet for GPU ");
-                                               default:
-                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
minindex");
-                                       }
-                                       // break;
+                               // break;
+                       }
+                       case OP_MAXINDEX : {
+                               switch(reductionDirection) {
+                                       case REDUCTION_COL:
+                                               throw new 
DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented 
yet for GPU ");
+                                       default:
+                                               throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
maxindex");
                                }
-                               default : throw new 
DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
+                               // break;
                        }
-
+                       case OP_MININDEX : {
+                               switch(reductionDirection) {
+                                       case REDUCTION_COL:
+                                               throw new 
DMLRuntimeException("Internal Error - Column minindex of matrix not implemented 
yet for GPU ");
+                                       default:
+                                               throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
minindex");
+                               }
+                               // break;
+                       }
+                       default : throw new DMLRuntimeException("Internal Error 
- Invalid GPU Unary aggregate function!");
                }
        }
 
-
-       private static double reduce_single(Pointer in, int n) throws 
DMLRuntimeException {
-               int[] tmp = getThreadsBlocksAndSharedMem(n);
+       /**
+        * Do a simple reduction, the output of which is a single value
+        * @param in    {@link Pointer} to matrix in device memory
+        * @param n             size of array
+        * @return      the reduced value
+        * @throws DMLRuntimeException
+        */
+       private static double reduceAll(Pointer in, int n) throws 
DMLRuntimeException {
+               int[] tmp = getKernelParamsForReduceAll(n);
                int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
                Pointer tempOut = JCudaObject.allocate(n * Sizeof.DOUBLE);
@@ -1132,7 +1141,7 @@ public class LibMatrixCUDA {
                cudaDeviceSynchronize();
                int s = n;
                while (s > 1) {
-                       tmp = getThreadsBlocksAndSharedMem(n);
+                       tmp = getKernelParamsForReduceAll(n);
                        blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2];
                        kernels.launchKernel("reduce", new 
ExecutionConfig(blocks, threads, sharedMem),
                                                        tempOut, tempOut, s);
@@ -1145,10 +1154,29 @@ public class LibMatrixCUDA {
                return result[0];
        }
 
+       /**
+        * Do a reduction by row. Data is reduced per row and the
+        * resulting vector is calculated.
+        * @param in            {@link Pointer} to input matrix in device 
memory (size - rows * columns)
+        * @param out           {@link Pointer} to output matrix in device 
memory (size - rows * 1)
+        * @param rows  number of rows in input matrix
+        * @param cols  number of columns in input matrix
+        * @throws DMLRuntimeException
+        */
+       private static void reduceRow(Pointer in, Pointer out, int rows, int 
cols) throws DMLRuntimeException {
+               int[] tmp = getKernelParamsForReduceByRow(rows, cols);
+               int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
+               kernels.launchKernel("reduce_row", new ExecutionConfig(blocks, 
threads, sharedMem),
+                                               in, out, rows, cols);
+               cudaDeviceSynchronize();
+       }
 
-       private static int[] getThreadsBlocksAndSharedMem(int n){
-               final int MAX_THREADS = 1024;
-               final int MAX_BLOCKS = 65535;
+       /**
+        * Get threads, blocks and shared memory for a reduce all operation
+        * @param n size of input array
+        * @return integer array containing {blocks, threads, shared memory}
+        */
+       private static int[] getKernelParamsForReduceAll(int n){
                int threads = (n < MAX_THREADS*2) ? nextPow2((n + 1)/ 2) : 
MAX_THREADS;
 
                int blocks = (n + (threads * 2 - 1)) / (threads * 2);
@@ -1161,6 +1189,22 @@ public class LibMatrixCUDA {
                return new int[] {blocks, threads, sharedMemSize};
        }
 
+       /**
+        * Get threads, blocks and shared memory for a reduce by row operation
+        * @param rows number of rows in input matrix
+        * @param cols number of columns in input matrix
+        * @return integer array containing {blocks, threads, shared memory}
+        */
+       private static int[] getKernelParamsForReduceByRow(int rows, int cols) {
+               final int WARP_SIZE = 32;
+               int threads = Math.min(cols, WARP_SIZE);
+               int blocks = rows;
+               int sharedMemSize = threads * Sizeof.DOUBLE;
+               if (threads <= 32){
+                       sharedMemSize *=2;
+               }
+               return new int[] {blocks, threads, sharedMemSize};
+       }
 
        private static int nextPow2(int x)
        {

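As a worked example of the new launch-parameter helper (illustration only, for a hypothetical 1000 x 500 dense input), getKernelParamsForReduceByRow picks one block per row and at most a warp's worth of threads per block, so the numbers come out as follows.

// Hypothetical shape, mirroring getKernelParamsForReduceByRow above.
int rows = 1000, cols = 500;
final int WARP_SIZE = 32;
int threads = Math.min(cols, WARP_SIZE);     // 32 threads (one warp) per block
int blocks  = rows;                          // 1000 blocks, one per row
int sharedMemSize = threads * Sizeof.DOUBLE; // 32 * 8 = 256 bytes
if (threads <= 32)
    sharedMemSize *= 2;                      // doubled to 512 bytes as slack for the unrolled warp-level steps
// reduce_row is then launched with these values, as in reduceRow():
// kernels.launchKernel("reduce_row", new ExecutionConfig(blocks, threads, sharedMemSize),
//                      in, out, rows, cols);

Capping the block at a single warp keeps the shared-memory reduction simple at the cost of per-row parallelism; rows wider than 32 columns are still covered, because the kernel's initial loop strides each thread across the remaining columns before the tree reduction.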