[SYSTEMML-1038] Implemented the uark+ op for CUDA.

Closes #319.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/3caae271
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/3caae271
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/3caae271

Branch: refs/heads/master
Commit: 3caae2718359b2004ba7acabe35386f5c5417fb3
Parents: 154f077
Author: Nakul Jindal <naku...@gmail.com>
Authored: Sun Dec 18 11:08:57 2016 -0800
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Sun Dec 18 11:08:56 2016 -0800

----------------------------------------------------------------------
 src/main/cpp/kernels/SystemML.cu                |   83 +-
 src/main/cpp/kernels/SystemML.ptx               | 1302 ++++++++----------
 .../java/org/apache/sysml/hops/AggUnaryOp.java  |    5 +
 .../instructions/GPUInstructionParser.java      |    6 +
 .../gpu/AggregateBinaryGPUInstruction.java      |   16 +-
 .../instructions/gpu/GPUInstruction.java        |    2 +-
 .../context/AggregateUnaryGPUInstruction.java   |   85 ++
 .../gpu/context/ExecutionConfig.java            |   17 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  714 +++++++---
 9 files changed, 1274 insertions(+), 956 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 0c78045..5e5fd5e 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -16,13 +16,14 @@
  * specific language governing permissions and limitations
  * under the License.
  */
- 
+
 /**********************************
-When updating a kernel or adding a new one, 
+When updating a kernel or adding a new one,
 please compile the ptx file and commit it:
-nvcc -ptx SystemML.cu 
+nvcc -ptx SystemML.cu
 ***********************************/
 
+
 // dim => rlen (Assumption: rlen == clen)
 // N = length of dense array
 extern "C"
@@ -45,8 +46,8 @@ __device__ double getBoolean(int val) {
                return 1.0;
 }
 
-// op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power, 
-// 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal, 
+// op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power,
+// 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
 // 11=min, 12=max, 13=and, 14=or, 15=log}
 extern "C"
 __device__ double binaryOp(double x, double y, int op) {
@@ -61,8 +62,8 @@ __device__ double binaryOp(double x, double y, int op) {
                return x / y;
        else if(op == 4)
                return pow(x, y);
-       // 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,        
-       else if(op == 5) 
+       // 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
+       else if(op == 5)
                return getBoolean(x < y);
        else if(op == 6)
                return getBoolean(x <= y);
@@ -91,7 +92,7 @@ __global__ void dense_matrix_set(double* A,  double scalar, int rlen, int clen)
        int index = ix * clen + iy;
        if(index < rlen*clen) {
                A[index] = scalar;
-       }       
+       }
 }
 
 extern "C"
@@ -125,17 +126,17 @@ __global__ void compareAndSet(double* A,  double* ret, int rlen, int clen, doubl
                        ret[index] = ifEqualsVal;
                else if(A[index] < compareVal)
                        ret[index] = ifLessThanVal;
-               else            
+               else
                        ret[index] = ifGreaterThanVal;
        }
 }
 
 extern "C"
-__global__ void binCellOp(double* A, double* B, double* C, 
+__global__ void binCellOp(double* A, double* B, double* C,
        int maxRlen, int maxClen, int vectorAStatus, int vectorBStatus, int op) {
        int ix = blockIdx.x * blockDim.x + threadIdx.x;
        int iy = blockIdx.y * blockDim.y + threadIdx.y;
-       
+
        if(ix < maxRlen && iy < maxClen) {
                int outIndex = ix * maxClen + iy;
                int aIndex = outIndex;
@@ -180,3 +181,59 @@ __global__ void fill(double* A, double scalar, int lenA) {
            A[index] = scalar;
        }
 }
+
+
+
+extern "C"
+__global__ void reduce(double *g_idata, double *g_odata, unsigned int n)
+{
+    extern __shared__ double sdata[];
+
+    // perform first level of reduction,
+    // reading from global memory, writing to shared memory
+    unsigned int tid = threadIdx.x;
+    unsigned int i = blockIdx.x*blockDim.x*2 + threadIdx.x;
+    unsigned int gridSize = blockDim.x*2*gridDim.x;
+
+    double mySum = 0;
+
+    // we reduce multiple elements per thread.  The number is determined by the
+    // number of active thread blocks (via gridDim).  More blocks will result
+    // in a larger gridSize and therefore fewer elements per thread
+    while (i < n)
+    {
+        mySum += g_idata[i];
+        // ensure we don't read out of bounds
+        if (i + blockDim.x < n)
+            mySum += g_idata[i+blockDim.x];
+        i += gridSize;
+    }
+
+    // each thread puts its local sum into shared memory
+    sdata[tid] = mySum;
+    __syncthreads();
+
+
+    // do reduction in shared mem
+    if (blockDim.x >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
+    if (blockDim.x >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
+    if (blockDim.x >= 128) { if (tid <  64) { sdata[tid] = mySum = mySum + sdata[tid +  64]; } __syncthreads(); }
+
+    if (tid < 32)
+    {
+        // now that we are using warp-synchronous programming (below)
+        // we need to declare our shared memory volatile so that the compiler
+        // doesn't reorder stores to it and induce incorrect behavior.
+        volatile double* smem = sdata;
+        if (blockDim.x >=  64) { smem[tid] = mySum = mySum + smem[tid + 32]; }
+        if (blockDim.x >=  32) { smem[tid] = mySum = mySum + smem[tid + 16]; }
+        if (blockDim.x >=  16) { smem[tid] = mySum = mySum + smem[tid +  8]; }
+        if (blockDim.x >=   8) { smem[tid] = mySum = mySum + smem[tid +  4]; }
+        if (blockDim.x >=   4) { smem[tid] = mySum = mySum + smem[tid +  2]; }
+        if (blockDim.x >=   2) { smem[tid] = mySum = mySum + smem[tid +  1]; }
+    }
+
+    // write result for this block to global mem
+    if (tid == 0)
+        g_odata[blockIdx.x] = sdata[0];
+}
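
For reference, a minimal host-side sketch of how the new reduce kernel could be driven to produce a full sum. This is illustrative only: SystemML launches the kernel from LibMatrixCUDA.java via JCuda, and the names sumOnDevice/d_partial below are hypothetical, not part of this commit.

// Hypothetical CUDA host driver for the reduce kernel above (sketch, not committed code).
#include <cuda_runtime.h>

extern "C" __global__ void reduce(double *g_idata, double *g_odata, unsigned int n);

double sumOnDevice(double *d_in, unsigned int n) {
    const unsigned int threads = 512;                             // power of two, matches the unrolled stages
    unsigned int blocks = (n + threads * 2 - 1) / (threads * 2);  // each thread reads two elements per stride
    if (blocks == 0) blocks = 1;
    size_t smem = threads * sizeof(double);                       // dynamic shared memory backing sdata[]

    double *d_partial;
    cudaMalloc((void **)&d_partial, blocks * sizeof(double));

    // Pass 1: every block writes one partial sum to d_partial.
    reduce<<<blocks, threads, smem>>>(d_in, d_partial, n);
    // Pass 2: a single block folds the partial sums; the grid-stride loop in
    // the kernel lets one block consume any number of partials.
    reduce<<<1, threads, smem>>>(d_partial, d_partial, blocks);

    double result = 0.0;
    cudaMemcpy(&result, d_partial, sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(d_partial);
    return result;
}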

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index b21e18c..ea27ac0 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,323 +1,24 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-// 
-//   http://www.apache.org/licenses/LICENSE-2.0
-// 
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-19805474
-// Cuda compilation tools, release 7.5, V7.5.16
+// Compiler Build ID: CL-21124049
+// Cuda compilation tools, release 8.0, V8.0.44
 // Based on LLVM 3.4svn
 //
 
-.version 4.3
+.version 5.0
 .target sm_20
 .address_size 64
 
-       // .globl       getBoolean
+       // .globl       copyUpperToLowerTriangleDense
 .func  (.param .b64 func_retval0) __internal_accurate_pow
 (
        .param .b64 __internal_accurate_pow_param_0,
        .param .b64 __internal_accurate_pow_param_1
 )
 ;
+.extern .shared .align 8 .b8 sdata[];
 
-.visible .func  (.param .b64 func_retval0) getBoolean(
-       .param .b32 getBoolean_param_0
-)
-{
-       .reg .pred      %p<2>;
-       .reg .b32       %r<2>;
-       .reg .f64       %fd<2>;
-
-
-       ld.param.u32    %r1, [getBoolean_param_0];
-       setp.eq.s32     %p1, %r1, 0;
-       selp.f64        %fd1, 0d0000000000000000, 0d3FF0000000000000, %p1;
-       st.param.f64    [func_retval0+0], %fd1;
-       ret;
-}
-
-       // .globl       binaryOp
-.visible .func  (.param .b64 func_retval0) binaryOp(
-       .param .b64 binaryOp_param_0,
-       .param .b64 binaryOp_param_1,
-       .param .b32 binaryOp_param_2
-)
-{
-       .reg .pred      %p<39>;
-       .reg .b32       %r<26>;
-       .reg .f64       %fd<39>;
-       .reg .b64       %rd<3>;
-
-
-       ld.param.f64    %fd27, [binaryOp_param_0];
-       ld.param.f64    %fd28, [binaryOp_param_1];
-       ld.param.u32    %r3, [binaryOp_param_2];
-       setp.eq.s32     %p2, %r3, 0;
-       @%p2 bra        BB1_38;
-
-       setp.eq.s32     %p3, %r3, 1;
-       @%p3 bra        BB1_37;
-       bra.uni         BB1_2;
-
-BB1_37:
-       sub.f64         %fd38, %fd27, %fd28;
-       bra.uni         BB1_39;
-
-BB1_38:
-       add.f64         %fd38, %fd27, %fd28;
-       bra.uni         BB1_39;
-
-BB1_2:
-       setp.eq.s32     %p4, %r3, 2;
-       @%p4 bra        BB1_36;
-       bra.uni         BB1_3;
-
-BB1_36:
-       mul.f64         %fd38, %fd27, %fd28;
-       bra.uni         BB1_39;
-
-BB1_3:
-       setp.eq.s32     %p5, %r3, 3;
-       @%p5 bra        BB1_35;
-       bra.uni         BB1_4;
-
-BB1_35:
-       div.rn.f64      %fd38, %fd27, %fd28;
-       bra.uni         BB1_39;
-
-BB1_4:
-       setp.eq.s32     %p6, %r3, 4;
-       @%p6 bra        BB1_21;
-       bra.uni         BB1_5;
-
-BB1_21:
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%temp, %r1}, %fd27;
-       }
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%temp, %r2}, %fd28;
-       }
-       bfe.u32         %r4, %r2, 20, 11;
-       add.s32         %r5, %r4, -1012;
-       mov.b64          %rd2, %fd28;
-       shl.b64         %rd1, %rd2, %r5;
-       setp.eq.s64     %p21, %rd1, -9223372036854775808;
-       abs.f64         %fd9, %fd27;
-       // Callseq Start 0
-       {
-       .reg .b32 temp_param_reg;
-       // <end>}
-       .param .b64 param0;
-       st.param.f64    [param0+0], %fd9;
-       .param .b64 param1;
-       st.param.f64    [param1+0], %fd28;
-       .param .b64 retval0;
-       call.uni (retval0), 
-       __internal_accurate_pow, 
-       (
-       param0, 
-       param1
-       );
-       ld.param.f64    %fd37, [retval0+0];
-       
-       //{
-       }// Callseq End 0
-       setp.lt.s32     %p22, %r1, 0;
-       and.pred        %p1, %p22, %p21;
-       @!%p1 bra       BB1_23;
-       bra.uni         BB1_22;
-
-BB1_22:
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%temp, %r6}, %fd37;
-       }
-       xor.b32         %r7, %r6, -2147483648;
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%r8, %temp}, %fd37;
-       }
-       mov.b64         %fd37, {%r8, %r7};
-
-BB1_23:
-       mov.f64         %fd36, %fd37;
-       setp.eq.f64     %p23, %fd27, 0d0000000000000000;
-       @%p23 bra       BB1_26;
-       bra.uni         BB1_24;
-
-BB1_26:
-       selp.b32        %r9, %r1, 0, %p21;
-       or.b32          %r10, %r9, 2146435072;
-       setp.lt.s32     %p27, %r2, 0;
-       selp.b32        %r11, %r10, %r9, %p27;
-       mov.u32         %r12, 0;
-       mov.b64         %fd36, {%r12, %r11};
-       bra.uni         BB1_27;
-
-BB1_5:
-       setp.eq.s32     %p7, %r3, 5;
-       @%p7 bra        BB1_20;
-       bra.uni         BB1_6;
-
-BB1_20:
-       setp.lt.f64     %p20, %fd27, %fd28;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p20;
-       bra.uni         BB1_39;
-
-BB1_6:
-       setp.eq.s32     %p8, %r3, 6;
-       @%p8 bra        BB1_19;
-       bra.uni         BB1_7;
-
-BB1_19:
-       setp.le.f64     %p19, %fd27, %fd28;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p19;
-       bra.uni         BB1_39;
-
-BB1_24:
-       setp.gt.s32     %p24, %r1, -1;
-       @%p24 bra       BB1_27;
-
-       cvt.rzi.f64.f64 %fd30, %fd28;
-       setp.neu.f64    %p25, %fd30, %fd28;
-       selp.f64        %fd36, 0dFFF8000000000000, %fd36, %p25;
-
-BB1_27:
-       mov.f64         %fd15, %fd36;
-       add.f64         %fd16, %fd27, %fd28;
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%temp, %r13}, %fd16;
-       }
-       and.b32         %r14, %r13, 2146435072;
-       setp.ne.s32     %p28, %r14, 2146435072;
-       mov.f64         %fd35, %fd15;
-       @%p28 bra       BB1_34;
-
-       setp.gtu.f64    %p29, %fd9, 0d7FF0000000000000;
-       mov.f64         %fd35, %fd16;
-       @%p29 bra       BB1_34;
-
-       abs.f64         %fd17, %fd28;
-       setp.gtu.f64    %p30, %fd17, 0d7FF0000000000000;
-       mov.f64         %fd34, %fd16;
-       mov.f64         %fd35, %fd34;
-       @%p30 bra       BB1_34;
-
-       setp.eq.f64     %p31, %fd17, 0d7FF0000000000000;
-       @%p31 bra       BB1_33;
-       bra.uni         BB1_31;
-
-BB1_33:
-       setp.gt.f64     %p33, %fd9, 0d3FF0000000000000;
-       selp.b32        %r21, 2146435072, 0, %p33;
-       xor.b32         %r22, %r21, 2146435072;
-       setp.lt.s32     %p34, %r2, 0;
-       selp.b32        %r23, %r22, %r21, %p34;
-       setp.eq.f64     %p35, %fd27, 0dBFF0000000000000;
-       selp.b32        %r24, 1072693248, %r23, %p35;
-       mov.u32         %r25, 0;
-       mov.b64         %fd35, {%r25, %r24};
-       bra.uni         BB1_34;
-
-BB1_7:
-       setp.eq.s32     %p9, %r3, 7;
-       @%p9 bra        BB1_18;
-       bra.uni         BB1_8;
-
-BB1_18:
-       setp.gt.f64     %p18, %fd27, %fd28;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p18;
-       bra.uni         BB1_39;
-
-BB1_8:
-       setp.eq.s32     %p10, %r3, 8;
-       @%p10 bra       BB1_17;
-       bra.uni         BB1_9;
-
-BB1_17:
-       setp.ge.f64     %p17, %fd27, %fd28;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p17;
-       bra.uni         BB1_39;
-
-BB1_9:
-       setp.eq.s32     %p11, %r3, 9;
-       @%p11 bra       BB1_16;
-       bra.uni         BB1_10;
-
-BB1_16:
-       setp.eq.f64     %p16, %fd27, %fd28;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p16;
-       bra.uni         BB1_39;
-
-BB1_31:
-       setp.neu.f64    %p32, %fd9, 0d7FF0000000000000;
-       mov.f64         %fd35, %fd15;
-       @%p32 bra       BB1_34;
-
-       shr.s32         %r15, %r2, 31;
-       and.b32         %r16, %r15, -2146435072;
-       add.s32         %r17, %r16, 2146435072;
-       or.b32          %r18, %r17, -2147483648;
-       selp.b32        %r19, %r18, %r17, %p1;
-       mov.u32         %r20, 0;
-       mov.b64         %fd35, {%r20, %r19};
-
-BB1_34:
-       setp.eq.f64     %p36, %fd28, 0d0000000000000000;
-       setp.eq.f64     %p37, %fd27, 0d3FF0000000000000;
-       or.pred         %p38, %p37, %p36;
-       selp.f64        %fd38, 0d3FF0000000000000, %fd35, %p38;
-
-BB1_39:
-       st.param.f64    [func_retval0+0], %fd38;
-       ret;
-
-BB1_10:
-       setp.eq.s32     %p12, %r3, 10;
-       @%p12 bra       BB1_15;
-       bra.uni         BB1_11;
-
-BB1_15:
-       setp.neu.f64    %p15, %fd27, %fd28;
-       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p15;
-       bra.uni         BB1_39;
-
-BB1_11:
-       setp.eq.s32     %p13, %r3, 11;
-       @%p13 bra       BB1_14;
-       bra.uni         BB1_12;
-
-BB1_14:
-       min.f64         %fd38, %fd27, %fd28;
-       bra.uni         BB1_39;
-
-BB1_12:
-       mov.f64         %fd38, 0dC08F380000000000;
-       setp.ne.s32     %p14, %r3, 12;
-       @%p14 bra       BB1_39;
-
-       max.f64         %fd38, %fd27, %fd28;
-       bra.uni         BB1_39;
-}
-
-       // .globl       copyUpperToLowerTriangleDense
 .visible .entry copyUpperToLowerTriangleDense(
        .param .u64 copyUpperToLowerTriangleDense_param_0,
        .param .u32 copyUpperToLowerTriangleDense_param_1,
@@ -345,10 +46,10 @@ BB1_12:
        setp.gt.s32     %p1, %r2, %r1;
        setp.lt.s32     %p2, %r3, %r5;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB2_2;
-       bra.uni         BB2_1;
+       @!%p3 bra       BB0_2;
+       bra.uni         BB0_1;
 
-BB2_1:
+BB0_1:
        cvta.to.global.u64      %rd2, %rd1;
        mad.lo.s32      %r12, %r1, %r4, %r2;
        mul.wide.s32    %rd3, %r12, 8;
@@ -358,7 +59,7 @@ BB2_1:
        add.s64         %rd6, %rd2, %rd5;
        st.global.f64   [%rd6], %fd1;
 
-BB2_2:
+BB0_2:
        ret;
 }
 
@@ -391,14 +92,14 @@ BB2_2:
        mad.lo.s32      %r1, %r8, %r9, %r11;
        mul.lo.s32      %r12, %r3, %r2;
        setp.ge.s32     %p1, %r1, %r12;
-       @%p1 bra        BB3_2;
+       @%p1 bra        BB1_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        st.global.f64   [%rd4], %fd1;
 
-BB3_2:
+BB1_2:
        ret;
 }
 
@@ -432,10 +133,10 @@ BB3_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.lt.s32     %p2, %r11, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB4_2;
-       bra.uni         BB4_1;
+       @!%p3 bra       BB2_2;
+       bra.uni         BB2_1;
 
-BB4_1:
+BB2_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.s32    %rd4, %r1, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -444,7 +145,7 @@ BB4_1:
        add.s64         %rd7, %rd6, %rd4;
        st.global.f64   [%rd7], %fd1;
 
-BB4_2:
+BB2_2:
        ret;
 }
 
@@ -477,10 +178,10 @@ BB4_2:
        setp.lt.s32     %p1, %r1, %r4;
        setp.lt.s32     %p2, %r2, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB5_2;
-       bra.uni         BB5_1;
+       @!%p3 bra       BB3_2;
+       bra.uni         BB3_1;
 
-BB5_1:
+BB3_1:
        cvta.to.global.u64      %rd3, %rd1;
        mad.lo.s32      %r11, %r1, %r3, %r2;
        mul.wide.s32    %rd4, %r11, 8;
@@ -492,7 +193,7 @@ BB5_1:
        add.s64         %rd7, %rd6, %rd4;
        st.global.f64   [%rd7], %fd3;
 
-BB5_2:
+BB3_2:
        ret;
 }
 
@@ -536,10 +237,10 @@ BB5_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.lt.s32     %p2, %r11, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB6_6;
-       bra.uni         BB6_1;
+       @!%p3 bra       BB4_6;
+       bra.uni         BB4_1;
 
-BB6_1:
+BB4_1:
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.s32    %rd5, %r1, 8;
        add.s64         %rd6, %rd4, %rd5;
@@ -549,26 +250,26 @@ BB6_1:
        setp.lt.f64     %p4, %fd8, %fd3;
        cvta.to.global.u64      %rd7, %rd3;
        add.s64         %rd1, %rd7, %rd5;
-       @%p4 bra        BB6_5;
-       bra.uni         BB6_2;
+       @%p4 bra        BB4_5;
+       bra.uni         BB4_2;
 
-BB6_5:
+BB4_5:
        st.global.f64   [%rd1], %fd4;
-       bra.uni         BB6_6;
+       bra.uni         BB4_6;
 
-BB6_2:
+BB4_2:
        setp.lt.f64     %p5, %fd1, %fd2;
-       @%p5 bra        BB6_4;
-       bra.uni         BB6_3;
+       @%p5 bra        BB4_4;
+       bra.uni         BB4_3;
 
-BB6_4:
+BB4_4:
        st.global.f64   [%rd1], %fd5;
-       bra.uni         BB6_6;
+       bra.uni         BB4_6;
 
-BB6_3:
+BB4_3:
        st.global.f64   [%rd1], %fd6;
 
-BB6_6:
+BB4_6:
        ret;
 }
 
@@ -585,7 +286,7 @@ BB6_6:
 )
 {
        .reg .pred      %p<50>;
-       .reg .b32       %r<52>;
+       .reg .b32       %r<51>;
        .reg .f64       %fd<39>;
        .reg .b64       %rd<15>;
 
@@ -609,93 +310,93 @@ BB6_6:
        setp.lt.s32     %p2, %r1, %r14;
        setp.lt.s32     %p3, %r2, %r10;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB7_53;
-       bra.uni         BB7_1;
+       @!%p4 bra       BB5_53;
+       bra.uni         BB5_1;
 
-BB7_1:
+BB5_1:
        mad.lo.s32      %r3, %r1, %r10, %r2;
        setp.eq.s32     %p5, %r11, 1;
-       mov.u32         %r50, %r1;
-       @%p5 bra        BB7_5;
+       mov.u32         %r49, %r1;
+       @%p5 bra        BB5_5;
 
        setp.ne.s32     %p6, %r11, 2;
-       mov.u32         %r51, %r3;
-       @%p6 bra        BB7_4;
+       mov.u32         %r50, %r3;
+       @%p6 bra        BB5_4;
 
-       mov.u32         %r51, %r2;
+       mov.u32         %r50, %r2;
 
-BB7_4:
-       mov.u32         %r45, %r51;
-       mov.u32         %r4, %r45;
-       mov.u32         %r50, %r4;
+BB5_4:
+       mov.u32         %r44, %r50;
+       mov.u32         %r4, %r44;
+       mov.u32         %r49, %r4;
 
-BB7_5:
-       mov.u32         %r5, %r50;
+BB5_5:
+       mov.u32         %r5, %r49;
        setp.eq.s32     %p7, %r12, 1;
-       mov.u32         %r48, %r1;
-       @%p7 bra        BB7_9;
+       mov.u32         %r47, %r1;
+       @%p7 bra        BB5_9;
 
        setp.ne.s32     %p8, %r12, 2;
-       mov.u32         %r49, %r3;
-       @%p8 bra        BB7_8;
+       mov.u32         %r48, %r3;
+       @%p8 bra        BB5_8;
 
-       mov.u32         %r49, %r2;
+       mov.u32         %r48, %r2;
 
-BB7_8:
-       mov.u32         %r48, %r49;
+BB5_8:
+       mov.u32         %r47, %r48;
 
-BB7_9:
+BB5_9:
        cvta.to.global.u64      %rd5, %rd3;
        cvta.to.global.u64      %rd6, %rd2;
        mul.wide.s32    %rd7, %r5, 8;
        add.s64         %rd8, %rd6, %rd7;
        ld.global.f64   %fd1, [%rd8];
-       mul.wide.s32    %rd9, %r48, 8;
+       mul.wide.s32    %rd9, %r47, 8;
        add.s64         %rd10, %rd5, %rd9;
        ld.global.f64   %fd2, [%rd10];
        mov.f64         %fd38, 0dC08F380000000000;
        setp.gt.s32     %p9, %r13, 5;
-       @%p9 bra        BB7_19;
+       @%p9 bra        BB5_19;
 
        setp.gt.s32     %p19, %r13, 2;
-       @%p19 bra       BB7_15;
+       @%p19 bra       BB5_15;
 
        setp.eq.s32     %p23, %r13, 0;
-       @%p23 bra       BB7_51;
+       @%p23 bra       BB5_51;
 
        setp.eq.s32     %p24, %r13, 1;
-       @%p24 bra       BB7_50;
-       bra.uni         BB7_13;
+       @%p24 bra       BB5_50;
+       bra.uni         BB5_13;
 
-BB7_50:
+BB5_50:
        sub.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB7_52;
+       bra.uni         BB5_52;
 
-BB7_19:
+BB5_19:
        setp.gt.s32     %p10, %r13, 8;
-       @%p10 bra       BB7_24;
+       @%p10 bra       BB5_24;
 
        setp.eq.s32     %p16, %r13, 6;
-       @%p16 bra       BB7_34;
+       @%p16 bra       BB5_34;
 
        setp.eq.s32     %p17, %r13, 7;
-       @%p17 bra       BB7_33;
-       bra.uni         BB7_22;
+       @%p17 bra       BB5_33;
+       bra.uni         BB5_22;
 
-BB7_33:
+BB5_33:
        setp.gt.f64     %p29, %fd1, %fd2;
        selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29;
-       bra.uni         BB7_52;
+       bra.uni         BB5_52;
 
-BB7_15:
+BB5_15:
        setp.eq.s32     %p20, %r13, 3;
-       @%p20 bra       BB7_49;
+       @%p20 bra       BB5_49;
 
        setp.eq.s32     %p21, %r13, 4;
-       @%p21 bra       BB7_35;
-       bra.uni         BB7_17;
+       @%p21 bra       BB5_35;
+       bra.uni         BB5_17;
 
-BB7_35:
+BB5_35:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r8}, %fd1;
@@ -710,7 +411,7 @@ BB7_35:
        shl.b64         %rd1, %rd11, %r22;
        setp.eq.s64     %p32, %rd1, -9223372036854775808;
        abs.f64         %fd11, %fd1;
-       // Callseq Start 1
+       // Callseq Start 0
        {
        .reg .b32 temp_param_reg;
        // <end>}
@@ -728,13 +429,13 @@ BB7_35:
        ld.param.f64    %fd37, [retval0+0];
        
        //{
-       }// Callseq End 1
+       }// Callseq End 0
        setp.lt.s32     %p33, %r8, 0;
        and.pred        %p1, %p33, %p32;
-       @!%p1 bra       BB7_37;
-       bra.uni         BB7_36;
+       @!%p1 bra       BB5_37;
+       bra.uni         BB5_36;
 
-BB7_36:
+BB5_36:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r23}, %fd37;
@@ -746,111 +447,111 @@ BB7_36:
        }
        mov.b64         %fd37, {%r25, %r24};
 
-BB7_37:
+BB5_37:
        mov.f64         %fd36, %fd37;
        setp.eq.f64     %p34, %fd1, 0d0000000000000000;
-       @%p34 bra       BB7_40;
-       bra.uni         BB7_38;
+       @%p34 bra       BB5_40;
+       bra.uni         BB5_38;
 
-BB7_40:
+BB5_40:
        selp.b32        %r26, %r8, 0, %p32;
        or.b32          %r27, %r26, 2146435072;
        setp.lt.s32     %p38, %r9, 0;
        selp.b32        %r28, %r27, %r26, %p38;
        mov.u32         %r29, 0;
        mov.b64         %fd36, {%r29, %r28};
-       bra.uni         BB7_41;
+       bra.uni         BB5_41;
 
-BB7_24:
+BB5_24:
        setp.gt.s32     %p11, %r13, 10;
-       @%p11 bra       BB7_28;
+       @%p11 bra       BB5_28;
 
        setp.eq.s32     %p14, %r13, 9;
-       @%p14 bra       BB7_32;
-       bra.uni         BB7_26;
+       @%p14 bra       BB5_32;
+       bra.uni         BB5_26;
 
-BB7_32:
+BB5_32:
        setp.eq.f64     %p27, %fd1, %fd2;
        selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27;
-       bra.uni         BB7_52;
+       bra.uni         BB5_52;
 
-BB7_28:
+BB5_28:
        setp.eq.s32     %p12, %r13, 11;
-       @%p12 bra       BB7_31;
-       bra.uni         BB7_29;
+       @%p12 bra       BB5_31;
+       bra.uni         BB5_29;
 
-BB7_31:
+BB5_31:
        min.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB7_52;
+       bra.uni         BB5_52;
 
-BB7_51:
+BB5_51:
        add.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB7_52;
+       bra.uni         BB5_52;
 
-BB7_13:
+BB5_13:
        setp.eq.s32     %p25, %r13, 2;
-       @%p25 bra       BB7_14;
-       bra.uni         BB7_52;
+       @%p25 bra       BB5_14;
+       bra.uni         BB5_52;
 
-BB7_14:
+BB5_14:
        mul.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB7_52;
+       bra.uni         BB5_52;
 
-BB7_34:
+BB5_34:
        setp.le.f64     %p30, %fd1, %fd2;
        selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30;
-       bra.uni         BB7_52;
+       bra.uni         BB5_52;
 
-BB7_22:
+BB5_22:
        setp.eq.s32     %p18, %r13, 8;
-       @%p18 bra       BB7_23;
-       bra.uni         BB7_52;
+       @%p18 bra       BB5_23;
+       bra.uni         BB5_52;
 
-BB7_23:
+BB5_23:
        setp.ge.f64     %p28, %fd1, %fd2;
        selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28;
-       bra.uni         BB7_52;
+       bra.uni         BB5_52;
 
-BB7_49:
+BB5_49:
        div.rn.f64      %fd38, %fd1, %fd2;
-       bra.uni         BB7_52;
+       bra.uni         BB5_52;
 
-BB7_17:
+BB5_17:
        setp.eq.s32     %p22, %r13, 5;
-       @%p22 bra       BB7_18;
-       bra.uni         BB7_52;
+       @%p22 bra       BB5_18;
+       bra.uni         BB5_52;
 
-BB7_18:
+BB5_18:
        setp.lt.f64     %p31, %fd1, %fd2;
        selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31;
-       bra.uni         BB7_52;
+       bra.uni         BB5_52;
 
-BB7_26:
+BB5_26:
        setp.eq.s32     %p15, %r13, 10;
-       @%p15 bra       BB7_27;
-       bra.uni         BB7_52;
+       @%p15 bra       BB5_27;
+       bra.uni         BB5_52;
 
-BB7_27:
+BB5_27:
        setp.neu.f64    %p26, %fd1, %fd2;
        selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB7_52;
+       bra.uni         BB5_52;
 
-BB7_29:
+BB5_29:
        setp.ne.s32     %p13, %r13, 12;
-       @%p13 bra       BB7_52;
+       @%p13 bra       BB5_52;
 
        max.f64         %fd38, %fd1, %fd2;
-       bra.uni         BB7_52;
+       bra.uni         BB5_52;
 
-BB7_38:
+BB5_38:
        setp.gt.s32     %p35, %r8, -1;
-       @%p35 bra       BB7_41;
+       @%p35 bra       BB5_41;
 
        cvt.rzi.f64.f64 %fd30, %fd2;
        setp.neu.f64    %p36, %fd30, %fd2;
        selp.f64        %fd36, 0dFFF8000000000000, %fd36, %p36;
 
-BB7_41:
+BB5_41:
        mov.f64         %fd17, %fd36;
        add.f64         %fd18, %fd1, %fd2;
        {
@@ -860,60 +561,59 @@ BB7_41:
        and.b32         %r31, %r30, 2146435072;
        setp.ne.s32     %p39, %r31, 2146435072;
        mov.f64         %fd35, %fd17;
-       @%p39 bra       BB7_48;
+       @%p39 bra       BB5_48;
 
        setp.gtu.f64    %p40, %fd11, 0d7FF0000000000000;
        mov.f64         %fd35, %fd18;
-       @%p40 bra       BB7_48;
+       @%p40 bra       BB5_48;
 
        abs.f64         %fd19, %fd2;
        setp.gtu.f64    %p41, %fd19, 0d7FF0000000000000;
        mov.f64         %fd34, %fd18;
        mov.f64         %fd35, %fd34;
-       @%p41 bra       BB7_48;
+       @%p41 bra       BB5_48;
 
        setp.eq.f64     %p42, %fd19, 0d7FF0000000000000;
-       @%p42 bra       BB7_47;
-       bra.uni         BB7_45;
+       @%p42 bra       BB5_47;
+       bra.uni         BB5_45;
 
-BB7_47:
+BB5_47:
        setp.gt.f64     %p44, %fd11, 0d3FF0000000000000;
-       selp.b32        %r38, 2146435072, 0, %p44;
-       xor.b32         %r39, %r38, 2146435072;
+       selp.b32        %r37, 2146435072, 0, %p44;
+       xor.b32         %r38, %r37, 2146435072;
        setp.lt.s32     %p45, %r9, 0;
-       selp.b32        %r40, %r39, %r38, %p45;
+       selp.b32        %r39, %r38, %r37, %p45;
        setp.eq.f64     %p46, %fd1, 0dBFF0000000000000;
-       selp.b32        %r41, 1072693248, %r40, %p46;
-       mov.u32         %r42, 0;
-       mov.b64         %fd35, {%r42, %r41};
-       bra.uni         BB7_48;
+       selp.b32        %r40, 1072693248, %r39, %p46;
+       mov.u32         %r41, 0;
+       mov.b64         %fd35, {%r41, %r40};
+       bra.uni         BB5_48;
 
-BB7_45:
+BB5_45:
        setp.neu.f64    %p43, %fd11, 0d7FF0000000000000;
        mov.f64         %fd35, %fd17;
-       @%p43 bra       BB7_48;
+       @%p43 bra       BB5_48;
 
        shr.s32         %r32, %r9, 31;
        and.b32         %r33, %r32, -2146435072;
-       add.s32         %r34, %r33, 2146435072;
-       or.b32          %r35, %r34, -2147483648;
-       selp.b32        %r36, %r35, %r34, %p1;
-       mov.u32         %r37, 0;
-       mov.b64         %fd35, {%r37, %r36};
+       selp.b32        %r34, -1048576, 2146435072, %p1;
+       add.s32         %r35, %r34, %r33;
+       mov.u32         %r36, 0;
+       mov.b64         %fd35, {%r36, %r35};
 
-BB7_48:
+BB5_48:
        setp.eq.f64     %p47, %fd2, 0d0000000000000000;
        setp.eq.f64     %p48, %fd1, 0d3FF0000000000000;
        or.pred         %p49, %p48, %p47;
        selp.f64        %fd38, 0d3FF0000000000000, %fd35, %p49;
 
-BB7_52:
+BB5_52:
        cvta.to.global.u64      %rd12, %rd4;
        mul.wide.s32    %rd13, %r3, 8;
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd38;
 
-BB7_53:
+BB5_53:
        ret;
 }
 
@@ -929,7 +629,7 @@ BB7_53:
 )
 {
        .reg .pred      %p<85>;
-       .reg .b32       %r<63>;
+       .reg .b32       %r<61>;
        .reg .f64       %fd<75>;
        .reg .b64       %rd<12>;
 
@@ -952,7 +652,7 @@ BB7_53:
        mad.lo.s32      %r1, %r14, %r15, %r17;
        mul.lo.s32      %r18, %r9, %r8;
        setp.ge.s32     %p3, %r1, %r18;
-       @%p3 bra        BB8_88;
+       @%p3 bra        BB6_88;
 
        cvta.to.global.u64      %rd6, %rd5;
        cvta.to.global.u64      %rd7, %rd4;
@@ -961,178 +661,178 @@ BB7_53:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd1, %rd6, %rd8;
        setp.eq.s32     %p4, %r7, 0;
-       @%p4 bra        BB8_45;
+       @%p4 bra        BB6_45;
 
        setp.eq.s32     %p5, %r6, 0;
-       @%p5 bra        BB8_43;
+       @%p5 bra        BB6_43;
 
        mov.f64         %fd66, 0dC08F380000000000;
        setp.gt.s32     %p6, %r6, 6;
-       @%p6 bra        BB8_13;
+       @%p6 bra        BB6_13;
 
        setp.gt.s32     %p14, %r6, 3;
-       @%p14 bra       BB8_9;
+       @%p14 bra       BB6_9;
 
        setp.eq.s32     %p18, %r6, 1;
-       @%p18 bra       BB8_42;
+       @%p18 bra       BB6_42;
 
        setp.eq.s32     %p19, %r6, 2;
-       @%p19 bra       BB8_41;
-       bra.uni         BB8_7;
+       @%p19 bra       BB6_41;
+       bra.uni         BB6_7;
 
-BB8_41:
+BB6_41:
        mul.f64         %fd66, %fd1, %fd54;
-       bra.uni         BB8_44;
+       bra.uni         BB6_44;
 
-BB8_45:
+BB6_45:
        setp.eq.s32     %p45, %r6, 0;
-       @%p45 bra       BB8_86;
+       @%p45 bra       BB6_86;
 
        mov.f64         %fd74, 0dC08F380000000000;
        setp.gt.s32     %p46, %r6, 6;
-       @%p46 bra       BB8_56;
+       @%p46 bra       BB6_56;
 
        setp.gt.s32     %p54, %r6, 3;
-       @%p54 bra       BB8_52;
+       @%p54 bra       BB6_52;
 
        setp.eq.s32     %p58, %r6, 1;
-       @%p58 bra       BB8_85;
+       @%p58 bra       BB6_85;
 
        setp.eq.s32     %p59, %r6, 2;
-       @%p59 bra       BB8_84;
-       bra.uni         BB8_50;
+       @%p59 bra       BB6_84;
+       bra.uni         BB6_50;
 
-BB8_84:
+BB6_84:
        mul.f64         %fd74, %fd1, %fd54;
-       bra.uni         BB8_87;
+       bra.uni         BB6_87;
 
-BB8_43:
+BB6_43:
        add.f64         %fd66, %fd1, %fd54;
 
-BB8_44:
+BB6_44:
        st.global.f64   [%rd1], %fd66;
-       bra.uni         BB8_88;
+       bra.uni         BB6_88;
 
-BB8_13:
+BB6_13:
        setp.gt.s32     %p7, %r6, 9;
-       @%p7 bra        BB8_18;
+       @%p7 bra        BB6_18;
 
        setp.eq.s32     %p11, %r6, 7;
-       @%p11 bra       BB8_25;
+       @%p11 bra       BB6_25;
 
        setp.eq.s32     %p12, %r6, 8;
-       @%p12 bra       BB8_24;
-       bra.uni         BB8_16;
+       @%p12 bra       BB6_24;
+       bra.uni         BB6_16;
 
-BB8_24:
+BB6_24:
        setp.le.f64     %p23, %fd1, %fd54;
        selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23;
-       bra.uni         BB8_44;
+       bra.uni         BB6_44;
 
-BB8_86:
+BB6_86:
        add.f64         %fd74, %fd1, %fd54;
 
-BB8_87:
+BB6_87:
        st.global.f64   [%rd1], %fd74;
 
-BB8_88:
+BB6_88:
        ret;
 
-BB8_56:
+BB6_56:
        setp.gt.s32     %p47, %r6, 9;
-       @%p47 bra       BB8_61;
+       @%p47 bra       BB6_61;
 
        setp.eq.s32     %p51, %r6, 7;
-       @%p51 bra       BB8_68;
+       @%p51 bra       BB6_68;
 
        setp.eq.s32     %p52, %r6, 8;
-       @%p52 bra       BB8_67;
-       bra.uni         BB8_59;
+       @%p52 bra       BB6_67;
+       bra.uni         BB6_59;
 
-BB8_67:
+BB6_67:
        setp.ge.f64     %p63, %fd1, %fd54;
        selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p63;
-       bra.uni         BB8_87;
+       bra.uni         BB6_87;
 
-BB8_9:
+BB6_9:
        setp.eq.s32     %p15, %r6, 4;
-       @%p15 bra       BB8_27;
+       @%p15 bra       BB6_27;
 
        setp.eq.s32     %p16, %r6, 5;
-       @%p16 bra       BB8_26;
-       bra.uni         BB8_11;
+       @%p16 bra       BB6_26;
+       bra.uni         BB6_11;
 
-BB8_26:
+BB6_26:
        setp.gt.f64     %p26, %fd1, %fd54;
        selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB8_44;
+       bra.uni         BB6_44;
 
-BB8_18:
+BB6_18:
        setp.eq.s32     %p8, %r6, 10;
-       @%p8 bra        BB8_23;
+       @%p8 bra        BB6_23;
 
        setp.eq.s32     %p9, %r6, 11;
-       @%p9 bra        BB8_22;
-       bra.uni         BB8_20;
+       @%p9 bra        BB6_22;
+       bra.uni         BB6_20;
 
-BB8_22:
+BB6_22:
        min.f64         %fd66, %fd54, %fd1;
-       bra.uni         BB8_44;
+       bra.uni         BB6_44;
 
-BB8_52:
+BB6_52:
        setp.eq.s32     %p55, %r6, 4;
-       @%p55 bra       BB8_70;
+       @%p55 bra       BB6_70;
 
        setp.eq.s32     %p56, %r6, 5;
-       @%p56 bra       BB8_69;
-       bra.uni         BB8_54;
+       @%p56 bra       BB6_69;
+       bra.uni         BB6_54;
 
-BB8_69:
+BB6_69:
        setp.lt.f64     %p66, %fd1, %fd54;
        selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p66;
-       bra.uni         BB8_87;
+       bra.uni         BB6_87;
 
-BB8_61:
+BB6_61:
        setp.eq.s32     %p48, %r6, 10;
-       @%p48 bra       BB8_66;
+       @%p48 bra       BB6_66;
 
        setp.eq.s32     %p49, %r6, 11;
-       @%p49 bra       BB8_65;
-       bra.uni         BB8_63;
+       @%p49 bra       BB6_65;
+       bra.uni         BB6_63;
 
-BB8_65:
+BB6_65:
        min.f64         %fd74, %fd1, %fd54;
-       bra.uni         BB8_87;
+       bra.uni         BB6_87;
 
-BB8_42:
+BB6_42:
        sub.f64         %fd66, %fd54, %fd1;
-       bra.uni         BB8_44;
+       bra.uni         BB6_44;
 
-BB8_7:
+BB6_7:
        setp.eq.s32     %p20, %r6, 3;
-       @%p20 bra       BB8_8;
-       bra.uni         BB8_44;
+       @%p20 bra       BB6_8;
+       bra.uni         BB6_44;
 
-BB8_8:
+BB6_8:
        div.rn.f64      %fd66, %fd54, %fd1;
-       bra.uni         BB8_44;
+       bra.uni         BB6_44;
 
-BB8_25:
+BB6_25:
        setp.lt.f64     %p24, %fd1, %fd54;
        selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24;
-       bra.uni         BB8_44;
+       bra.uni         BB6_44;
 
-BB8_16:
+BB6_16:
        setp.eq.s32     %p13, %r6, 9;
-       @%p13 bra       BB8_17;
-       bra.uni         BB8_44;
+       @%p13 bra       BB6_17;
+       bra.uni         BB6_44;
 
-BB8_17:
+BB6_17:
        setp.eq.f64     %p22, %fd1, %fd54;
        selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22;
-       bra.uni         BB8_44;
+       bra.uni         BB6_44;
 
-BB8_27:
+BB6_27:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r2}, %fd54;
@@ -1147,7 +847,7 @@ BB8_27:
        shl.b64         %rd2, %rd10, %r20;
        setp.eq.s64     %p27, %rd2, -9223372036854775808;
        abs.f64         %fd10, %fd54;
-       // Callseq Start 2
+       // Callseq Start 1
        {
        .reg .b32 temp_param_reg;
        // <end>}
@@ -1165,13 +865,13 @@ BB8_27:
        ld.param.f64    %fd65, [retval0+0];
        
        //{
-       }// Callseq End 2
+       }// Callseq End 1
        setp.lt.s32     %p28, %r2, 0;
        and.pred        %p1, %p28, %p27;
-       @!%p1 bra       BB8_29;
-       bra.uni         BB8_28;
+       @!%p1 bra       BB6_29;
+       bra.uni         BB6_28;
 
-BB8_28:
+BB6_28:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r21}, %fd65;
@@ -1183,72 +883,72 @@ BB8_28:
        }
        mov.b64         %fd65, {%r23, %r22};
 
-BB8_29:
+BB6_29:
        mov.f64         %fd64, %fd65;
        setp.eq.f64     %p29, %fd54, 0d0000000000000000;
-       @%p29 bra       BB8_32;
-       bra.uni         BB8_30;
+       @%p29 bra       BB6_32;
+       bra.uni         BB6_30;
 
-BB8_32:
+BB6_32:
        selp.b32        %r24, %r2, 0, %p27;
        or.b32          %r25, %r24, 2146435072;
        setp.lt.s32     %p33, %r3, 0;
        selp.b32        %r26, %r25, %r24, %p33;
        mov.u32         %r27, 0;
        mov.b64         %fd64, {%r27, %r26};
-       bra.uni         BB8_33;
+       bra.uni         BB6_33;
 
-BB8_11:
+BB6_11:
        setp.eq.s32     %p17, %r6, 6;
-       @%p17 bra       BB8_12;
-       bra.uni         BB8_44;
+       @%p17 bra       BB6_12;
+       bra.uni         BB6_44;
 
-BB8_12:
+BB6_12:
        setp.ge.f64     %p25, %fd1, %fd54;
        selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25;
-       bra.uni         BB8_44;
+       bra.uni         BB6_44;
 
-BB8_23:
+BB6_23:
        setp.neu.f64    %p21, %fd1, %fd54;
        selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p21;
-       bra.uni         BB8_44;
+       bra.uni         BB6_44;
 
-BB8_20:
+BB6_20:
        setp.ne.s32     %p10, %r6, 12;
-       @%p10 bra       BB8_44;
+       @%p10 bra       BB6_44;
 
        max.f64         %fd66, %fd54, %fd1;
-       bra.uni         BB8_44;
+       bra.uni         BB6_44;
 
-BB8_85:
+BB6_85:
        sub.f64         %fd74, %fd1, %fd54;
-       bra.uni         BB8_87;
+       bra.uni         BB6_87;
 
-BB8_50:
+BB6_50:
        setp.eq.s32     %p60, %r6, 3;
-       @%p60 bra       BB8_51;
-       bra.uni         BB8_87;
+       @%p60 bra       BB6_51;
+       bra.uni         BB6_87;
 
-BB8_51:
+BB6_51:
        div.rn.f64      %fd74, %fd1, %fd54;
-       bra.uni         BB8_87;
+       bra.uni         BB6_87;
 
-BB8_68:
+BB6_68:
        setp.gt.f64     %p64, %fd1, %fd54;
        selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p64;
-       bra.uni         BB8_87;
+       bra.uni         BB6_87;
 
-BB8_59:
+BB6_59:
        setp.eq.s32     %p53, %r6, 9;
-       @%p53 bra       BB8_60;
-       bra.uni         BB8_87;
+       @%p53 bra       BB6_60;
+       bra.uni         BB6_87;
 
-BB8_60:
+BB6_60:
        setp.eq.f64     %p62, %fd1, %fd54;
        selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p62;
-       bra.uni         BB8_87;
+       bra.uni         BB6_87;
 
-BB8_70:
+BB6_70:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r4}, %fd1;
@@ -1257,13 +957,13 @@ BB8_70:
        .reg .b32 %temp; 
        mov.b64         {%temp, %r5}, %fd54;
        }
-       bfe.u32         %r41, %r5, 20, 11;
-       add.s32         %r42, %r41, -1012;
+       bfe.u32         %r40, %r5, 20, 11;
+       add.s32         %r41, %r40, -1012;
        mov.b64          %rd11, %fd54;
-       shl.b64         %rd3, %rd11, %r42;
+       shl.b64         %rd3, %rd11, %r41;
        setp.eq.s64     %p67, %rd3, -9223372036854775808;
        abs.f64         %fd36, %fd1;
-       // Callseq Start 3
+       // Callseq Start 2
        {
        .reg .b32 temp_param_reg;
        // <end>}
@@ -1281,70 +981,70 @@ BB8_70:
        ld.param.f64    %fd73, [retval0+0];
        
        //{
-       }// Callseq End 3
+       }// Callseq End 2
        setp.lt.s32     %p68, %r4, 0;
        and.pred        %p2, %p68, %p67;
-       @!%p2 bra       BB8_72;
-       bra.uni         BB8_71;
+       @!%p2 bra       BB6_72;
+       bra.uni         BB6_71;
 
-BB8_71:
+BB6_71:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r43}, %fd73;
+       mov.b64         {%temp, %r42}, %fd73;
        }
-       xor.b32         %r44, %r43, -2147483648;
+       xor.b32         %r43, %r42, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r45, %temp}, %fd73;
+       mov.b64         {%r44, %temp}, %fd73;
        }
-       mov.b64         %fd73, {%r45, %r44};
+       mov.b64         %fd73, {%r44, %r43};
 
-BB8_72:
+BB6_72:
        mov.f64         %fd72, %fd73;
        setp.eq.f64     %p69, %fd1, 0d0000000000000000;
-       @%p69 bra       BB8_75;
-       bra.uni         BB8_73;
+       @%p69 bra       BB6_75;
+       bra.uni         BB6_73;
 
-BB8_75:
-       selp.b32        %r46, %r4, 0, %p67;
-       or.b32          %r47, %r46, 2146435072;
+BB6_75:
+       selp.b32        %r45, %r4, 0, %p67;
+       or.b32          %r46, %r45, 2146435072;
        setp.lt.s32     %p73, %r5, 0;
-       selp.b32        %r48, %r47, %r46, %p73;
-       mov.u32         %r49, 0;
-       mov.b64         %fd72, {%r49, %r48};
-       bra.uni         BB8_76;
+       selp.b32        %r47, %r46, %r45, %p73;
+       mov.u32         %r48, 0;
+       mov.b64         %fd72, {%r48, %r47};
+       bra.uni         BB6_76;
 
-BB8_54:
+BB6_54:
        setp.eq.s32     %p57, %r6, 6;
-       @%p57 bra       BB8_55;
-       bra.uni         BB8_87;
+       @%p57 bra       BB6_55;
+       bra.uni         BB6_87;
 
-BB8_55:
+BB6_55:
        setp.le.f64     %p65, %fd1, %fd54;
        selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p65;
-       bra.uni         BB8_87;
+       bra.uni         BB6_87;
 
-BB8_66:
+BB6_66:
        setp.neu.f64    %p61, %fd1, %fd54;
        selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p61;
-       bra.uni         BB8_87;
+       bra.uni         BB6_87;
 
-BB8_63:
+BB6_63:
        setp.ne.s32     %p50, %r6, 12;
-       @%p50 bra       BB8_87;
+       @%p50 bra       BB6_87;
 
        max.f64         %fd74, %fd1, %fd54;
-       bra.uni         BB8_87;
+       bra.uni         BB6_87;
 
-BB8_30:
+BB6_30:
        setp.gt.s32     %p30, %r2, -1;
-       @%p30 bra       BB8_33;
+       @%p30 bra       BB6_33;
 
        cvt.rzi.f64.f64 %fd56, %fd1;
        setp.neu.f64    %p31, %fd56, %fd1;
        selp.f64        %fd64, 0dFFF8000000000000, %fd64, %p31;
 
-BB8_33:
+BB6_33:
        mov.f64         %fd16, %fd64;
        add.f64         %fd17, %fd1, %fd54;
        {
@@ -1354,119 +1054,117 @@ BB8_33:
        and.b32         %r29, %r28, 2146435072;
        setp.ne.s32     %p34, %r29, 2146435072;
        mov.f64         %fd63, %fd16;
-       @%p34 bra       BB8_40;
+       @%p34 bra       BB6_40;
 
        setp.gtu.f64    %p35, %fd10, 0d7FF0000000000000;
        mov.f64         %fd63, %fd17;
-       @%p35 bra       BB8_40;
+       @%p35 bra       BB6_40;
 
        abs.f64         %fd18, %fd1;
        setp.gtu.f64    %p36, %fd18, 0d7FF0000000000000;
        mov.f64         %fd62, %fd17;
        mov.f64         %fd63, %fd62;
-       @%p36 bra       BB8_40;
+       @%p36 bra       BB6_40;
 
        setp.eq.f64     %p37, %fd18, 0d7FF0000000000000;
-       @%p37 bra       BB8_39;
-       bra.uni         BB8_37;
+       @%p37 bra       BB6_39;
+       bra.uni         BB6_37;
 
-BB8_39:
+BB6_39:
        setp.gt.f64     %p39, %fd10, 0d3FF0000000000000;
-       selp.b32        %r36, 2146435072, 0, %p39;
-       xor.b32         %r37, %r36, 2146435072;
+       selp.b32        %r35, 2146435072, 0, %p39;
+       xor.b32         %r36, %r35, 2146435072;
        setp.lt.s32     %p40, %r3, 0;
-       selp.b32        %r38, %r37, %r36, %p40;
+       selp.b32        %r37, %r36, %r35, %p40;
        setp.eq.f64     %p41, %fd54, 0dBFF0000000000000;
-       selp.b32        %r39, 1072693248, %r38, %p41;
-       mov.u32         %r40, 0;
-       mov.b64         %fd63, {%r40, %r39};
-       bra.uni         BB8_40;
+       selp.b32        %r38, 1072693248, %r37, %p41;
+       mov.u32         %r39, 0;
+       mov.b64         %fd63, {%r39, %r38};
+       bra.uni         BB6_40;
 
-BB8_73:
+BB6_73:
        setp.gt.s32     %p70, %r4, -1;
-       @%p70 bra       BB8_76;
+       @%p70 bra       BB6_76;
 
        cvt.rzi.f64.f64 %fd58, %fd54;
        setp.neu.f64    %p71, %fd58, %fd54;
        selp.f64        %fd72, 0dFFF8000000000000, %fd72, %p71;
 
-BB8_76:
+BB6_76:
        mov.f64         %fd42, %fd72;
        add.f64         %fd43, %fd1, %fd54;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r50}, %fd43;
+       mov.b64         {%temp, %r49}, %fd43;
        }
-       and.b32         %r51, %r50, 2146435072;
-       setp.ne.s32     %p74, %r51, 2146435072;
+       and.b32         %r50, %r49, 2146435072;
+       setp.ne.s32     %p74, %r50, 2146435072;
        mov.f64         %fd71, %fd42;
-       @%p74 bra       BB8_83;
+       @%p74 bra       BB6_83;
 
        setp.gtu.f64    %p75, %fd36, 0d7FF0000000000000;
        mov.f64         %fd71, %fd43;
-       @%p75 bra       BB8_83;
+       @%p75 bra       BB6_83;
 
        abs.f64         %fd44, %fd54;
        setp.gtu.f64    %p76, %fd44, 0d7FF0000000000000;
        mov.f64         %fd70, %fd43;
        mov.f64         %fd71, %fd70;
-       @%p76 bra       BB8_83;
+       @%p76 bra       BB6_83;
 
        setp.eq.f64     %p77, %fd44, 0d7FF0000000000000;
-       @%p77 bra       BB8_82;
-       bra.uni         BB8_80;
+       @%p77 bra       BB6_82;
+       bra.uni         BB6_80;
 
-BB8_82:
+BB6_82:
        setp.gt.f64     %p79, %fd36, 0d3FF0000000000000;
-       selp.b32        %r58, 2146435072, 0, %p79;
-       xor.b32         %r59, %r58, 2146435072;
+       selp.b32        %r56, 2146435072, 0, %p79;
+       xor.b32         %r57, %r56, 2146435072;
        setp.lt.s32     %p80, %r5, 0;
-       selp.b32        %r60, %r59, %r58, %p80;
+       selp.b32        %r58, %r57, %r56, %p80;
        setp.eq.f64     %p81, %fd1, 0dBFF0000000000000;
-       selp.b32        %r61, 1072693248, %r60, %p81;
-       mov.u32         %r62, 0;
-       mov.b64         %fd71, {%r62, %r61};
-       bra.uni         BB8_83;
+       selp.b32        %r59, 1072693248, %r58, %p81;
+       mov.u32         %r60, 0;
+       mov.b64         %fd71, {%r60, %r59};
+       bra.uni         BB6_83;
 
-BB8_37:
+BB6_37:
        setp.neu.f64    %p38, %fd10, 0d7FF0000000000000;
        mov.f64         %fd63, %fd16;
-       @%p38 bra       BB8_40;
+       @%p38 bra       BB6_40;
 
        shr.s32         %r30, %r3, 31;
        and.b32         %r31, %r30, -2146435072;
-       add.s32         %r32, %r31, 2146435072;
-       or.b32          %r33, %r32, -2147483648;
-       selp.b32        %r34, %r33, %r32, %p1;
-       mov.u32         %r35, 0;
-       mov.b64         %fd63, {%r35, %r34};
+       selp.b32        %r32, -1048576, 2146435072, %p1;
+       add.s32         %r33, %r32, %r31;
+       mov.u32         %r34, 0;
+       mov.b64         %fd63, {%r34, %r33};
 
-BB8_40:
+BB6_40:
        setp.eq.f64     %p42, %fd1, 0d0000000000000000;
        setp.eq.f64     %p43, %fd54, 0d3FF0000000000000;
        or.pred         %p44, %p43, %p42;
        selp.f64        %fd66, 0d3FF0000000000000, %fd63, %p44;
-       bra.uni         BB8_44;
+       bra.uni         BB6_44;
 
-BB8_80:
+BB6_80:
        setp.neu.f64    %p78, %fd36, 0d7FF0000000000000;
        mov.f64         %fd71, %fd42;
-       @%p78 bra       BB8_83;
+       @%p78 bra       BB6_83;
 
-       shr.s32         %r52, %r5, 31;
-       and.b32         %r53, %r52, -2146435072;
-       add.s32         %r54, %r53, 2146435072;
-       or.b32          %r55, %r54, -2147483648;
-       selp.b32        %r56, %r55, %r54, %p2;
-       mov.u32         %r57, 0;
-       mov.b64         %fd71, {%r57, %r56};
+       shr.s32         %r51, %r5, 31;
+       and.b32         %r52, %r51, -2146435072;
+       selp.b32        %r53, -1048576, 2146435072, %p2;
+       add.s32         %r54, %r53, %r52;
+       mov.u32         %r55, 0;
+       mov.b64         %fd71, {%r55, %r54};
 
-BB8_83:
+BB6_83:
        setp.eq.f64     %p82, %fd54, 0d0000000000000000;
        setp.eq.f64     %p83, %fd1, 0d3FF0000000000000;
        or.pred         %p84, %p83, %p82;
        selp.f64        %fd74, 0d3FF0000000000000, %fd71, %p84;
-       bra.uni         BB8_87;
+       bra.uni         BB6_87;
 }
 
        // .globl       fill
@@ -1490,14 +1188,191 @@ BB8_83:
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r4, %r3, %r5;
        setp.ge.s32     %p1, %r1, %r2;
-       @%p1 bra        BB9_2;
+       @%p1 bra        BB7_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        st.global.f64   [%rd4], %fd1;
 
-BB9_2:
+BB7_2:
+       ret;
+}
+
+       // .globl       reduce
+.visible .entry reduce(
+       .param .u64 reduce_param_0,
+       .param .u64 reduce_param_1,
+       .param .u32 reduce_param_2
+)
+{
+       .reg .pred      %p<18>;
+       .reg .b32       %r<31>;
+       .reg .f64       %fd<70>;
+       .reg .b64       %rd<15>;
+
+
+       ld.param.u64    %rd2, [reduce_param_0];
+       ld.param.u64    %rd3, [reduce_param_1];
+       ld.param.u32    %r5, [reduce_param_2];
+       mov.u32         %r6, %tid.x;
+       mov.u32         %r7, %ctaid.x;
+       shl.b32         %r8, %r7, 1;
+       mov.u32         %r9, %ntid.x;
+       mad.lo.s32      %r30, %r8, %r9, %r6;
+       mov.f64         %fd67, 0d0000000000000000;
+       mov.f64         %fd68, %fd67;
+       setp.ge.u32     %p1, %r30, %r5;
+       @%p1 bra        BB8_4;
+
+BB8_1:
+       mov.f64         %fd1, %fd68;
+       cvta.to.global.u64      %rd4, %rd2;
+       mul.wide.u32    %rd5, %r30, 8;
+       add.s64         %rd6, %rd4, %rd5;
+       ld.global.f64   %fd27, [%rd6];
+       add.f64         %fd69, %fd1, %fd27;
+       add.s32         %r3, %r30, %r9;
+       setp.ge.u32     %p2, %r3, %r5;
+       @%p2 bra        BB8_3;
+
+       mul.wide.u32    %rd8, %r3, 8;
+       add.s64         %rd9, %rd4, %rd8;
+       ld.global.f64   %fd28, [%rd9];
+       add.f64         %fd69, %fd69, %fd28;
+
+BB8_3:
+       mov.f64         %fd68, %fd69;
+       shl.b32         %r12, %r9, 1;
+       mov.u32         %r13, %nctaid.x;
+       mad.lo.s32      %r30, %r12, %r13, %r30;
+       setp.lt.u32     %p3, %r30, %r5;
+       mov.f64         %fd67, %fd68;
+       @%p3 bra        BB8_1;
+
+BB8_4:
+       mov.f64         %fd65, %fd67;
+       mul.wide.u32    %rd10, %r6, 8;
+       mov.u64         %rd11, sdata;
+       add.s64         %rd1, %rd11, %rd10;
+       st.shared.f64   [%rd1], %fd65;
+       bar.sync        0;
+       setp.lt.u32     %p4, %r9, 512;
+       @%p4 bra        BB8_8;
+
+       setp.gt.u32     %p5, %r6, 255;
+       mov.f64         %fd66, %fd65;
+       @%p5 bra        BB8_7;
+
+       ld.shared.f64   %fd29, [%rd1+2048];
+       add.f64         %fd66, %fd65, %fd29;
+       st.shared.f64   [%rd1], %fd66;
+
+BB8_7:
+       mov.f64         %fd65, %fd66;
+       bar.sync        0;
+
+BB8_8:
+       mov.f64         %fd63, %fd65;
+       setp.lt.u32     %p6, %r9, 256;
+       @%p6 bra        BB8_12;
+
+       setp.gt.u32     %p7, %r6, 127;
+       mov.f64         %fd64, %fd63;
+       @%p7 bra        BB8_11;
+
+       ld.shared.f64   %fd30, [%rd1+1024];
+       add.f64         %fd64, %fd63, %fd30;
+       st.shared.f64   [%rd1], %fd64;
+
+BB8_11:
+       mov.f64         %fd63, %fd64;
+       bar.sync        0;
+
+BB8_12:
+       mov.f64         %fd61, %fd63;
+       setp.lt.u32     %p8, %r9, 128;
+       @%p8 bra        BB8_16;
+
+       setp.gt.u32     %p9, %r6, 63;
+       mov.f64         %fd62, %fd61;
+       @%p9 bra        BB8_15;
+
+       ld.shared.f64   %fd31, [%rd1+512];
+       add.f64         %fd62, %fd61, %fd31;
+       st.shared.f64   [%rd1], %fd62;
+
+BB8_15:
+       mov.f64         %fd61, %fd62;
+       bar.sync        0;
+
+BB8_16:
+       mov.f64         %fd60, %fd61;
+       setp.gt.u32     %p10, %r6, 31;
+       @%p10 bra       BB8_29;
+
+       setp.lt.u32     %p11, %r9, 64;
+       @%p11 bra       BB8_19;
+
+       ld.volatile.shared.f64  %fd32, [%rd1+256];
+       add.f64         %fd60, %fd60, %fd32;
+       st.volatile.shared.f64  [%rd1], %fd60;
+
+BB8_19:
+       mov.f64         %fd59, %fd60;
+       setp.lt.u32     %p12, %r9, 32;
+       @%p12 bra       BB8_21;
+
+       ld.volatile.shared.f64  %fd33, [%rd1+128];
+       add.f64         %fd59, %fd59, %fd33;
+       st.volatile.shared.f64  [%rd1], %fd59;
+
+BB8_21:
+       mov.f64         %fd58, %fd59;
+       setp.lt.u32     %p13, %r9, 16;
+       @%p13 bra       BB8_23;
+
+       ld.volatile.shared.f64  %fd34, [%rd1+64];
+       add.f64         %fd58, %fd58, %fd34;
+       st.volatile.shared.f64  [%rd1], %fd58;
+
+BB8_23:
+       mov.f64         %fd57, %fd58;
+       setp.lt.u32     %p14, %r9, 8;
+       @%p14 bra       BB8_25;
+
+       ld.volatile.shared.f64  %fd35, [%rd1+32];
+       add.f64         %fd57, %fd57, %fd35;
+       st.volatile.shared.f64  [%rd1], %fd57;
+
+BB8_25:
+       mov.f64         %fd56, %fd57;
+       setp.lt.u32     %p15, %r9, 4;
+       @%p15 bra       BB8_27;
+
+       ld.volatile.shared.f64  %fd36, [%rd1+16];
+       add.f64         %fd56, %fd56, %fd36;
+       st.volatile.shared.f64  [%rd1], %fd56;
+
+BB8_27:
+       setp.lt.u32     %p16, %r9, 2;
+       @%p16 bra       BB8_29;
+
+       ld.volatile.shared.f64  %fd37, [%rd1+8];
+       add.f64         %fd38, %fd56, %fd37;
+       st.volatile.shared.f64  [%rd1], %fd38;
+
+BB8_29:
+       setp.ne.s32     %p17, %r6, 0;
+       @%p17 bra       BB8_31;
+
+       ld.shared.f64   %fd39, [sdata];
+       cvta.to.global.u64      %rd12, %rd3;
+       mul.wide.u32    %rd13, %r7, 8;
+       add.s64         %rd14, %rd12, %rd13;
+       st.global.f64   [%rd14], %fd39;
+
+BB8_31:
        ret;
 }
 
@@ -1509,7 +1384,7 @@ BB9_2:
        .reg .pred      %p<8>;
        .reg .f32       %f<3>;
        .reg .b32       %r<49>;
-       .reg .f64       %fd<136>;
+       .reg .f64       %fd<135>;
 
 
        ld.param.f64    %fd12, [__internal_accurate_pow_param_0];
@@ -1524,7 +1399,7 @@ BB9_2:
        }
        shr.u32         %r47, %r46, 20;
        setp.ne.s32     %p1, %r47, 0;
-       @%p1 bra        BB10_2;
+       @%p1 bra        BB9_2;
 
        mul.f64         %fd14, %fd12, 0d4350000000000000;
        {
@@ -1538,28 +1413,28 @@ BB9_2:
        shr.u32         %r16, %r46, 20;
        add.s32         %r47, %r16, -54;
 
-BB10_2:
+BB9_2:
        add.s32         %r48, %r47, -1023;
        and.b32         %r17, %r46, -2146435073;
        or.b32          %r18, %r17, 1072693248;
-       mov.b64         %fd134, {%r45, %r18};
+       mov.b64         %fd133, {%r45, %r18};
        setp.lt.u32     %p2, %r18, 1073127583;
-       @%p2 bra        BB10_4;
+       @%p2 bra        BB9_4;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r19, %temp}, %fd134;
+       mov.b64         {%r19, %temp}, %fd133;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r20}, %fd134;
+       mov.b64         {%temp, %r20}, %fd133;
        }
        add.s32         %r21, %r20, -1048576;
-       mov.b64         %fd134, {%r19, %r21};
+       mov.b64         %fd133, {%r19, %r21};
        add.s32         %r48, %r47, -1022;
 
-BB10_4:
-       add.f64         %fd16, %fd134, 0d3FF0000000000000;
+BB9_4:
+       add.f64         %fd16, %fd133, 0d3FF0000000000000;
        // inline asm
        rcp.approx.ftz.f64 %fd15,%fd16;
        // inline asm
@@ -1568,7 +1443,7 @@ BB10_4:
        fma.rn.f64      %fd19, %fd17, %fd15, %fd18;
        fma.rn.f64      %fd20, %fd19, %fd19, %fd19;
        fma.rn.f64      %fd21, %fd20, %fd15, %fd15;
-       add.f64         %fd22, %fd134, 0dBFF0000000000000;
+       add.f64         %fd22, %fd133, 0dBFF0000000000000;
        mul.f64         %fd23, %fd22, %fd21;
        fma.rn.f64      %fd24, %fd22, %fd21, %fd23;
        mul.f64         %fd25, %fd24, %fd24;
@@ -1671,52 +1546,51 @@ BB10_4:
        add.f64         %fd4, %fd94, %fd97;
        sub.f64         %fd98, %fd94, %fd4;
        add.f64         %fd5, %fd97, %fd98;
-       mov.f64         %fd99, 0d3FF71547652B82FE;
-       mul.rn.f64      %fd100, %fd4, %fd99;
-       mov.f64         %fd101, 0d4338000000000000;
-       add.rn.f64      %fd102, %fd100, %fd101;
+       mov.f64         %fd99, 0d4338000000000000;
+       mov.f64         %fd100, 0d3FF71547652B82FE;
+       fma.rn.f64      %fd101, %fd4, %fd100, %fd99;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r13, %temp}, %fd102;
+       mov.b64         {%r13, %temp}, %fd101;
        }
-       mov.f64         %fd103, 0dC338000000000000;
-       add.rn.f64      %fd104, %fd102, %fd103;
-       mov.f64         %fd105, 0dBFE62E42FEFA39EF;
-       fma.rn.f64      %fd106, %fd104, %fd105, %fd4;
-       mov.f64         %fd107, 0dBC7ABC9E3B39803F;
-       fma.rn.f64      %fd108, %fd104, %fd107, %fd106;
-       mov.f64         %fd109, 0d3E928AF3FCA213EA;
-       mov.f64         %fd110, 0d3E5ADE1569CE2BDF;
-       fma.rn.f64      %fd111, %fd110, %fd108, %fd109;
-       mov.f64         %fd112, 0d3EC71DEE62401315;
-       fma.rn.f64      %fd113, %fd111, %fd108, %fd112;
-       mov.f64         %fd114, 0d3EFA01997C89EB71;
-       fma.rn.f64      %fd115, %fd113, %fd108, %fd114;
-       mov.f64         %fd116, 0d3F2A01A014761F65;
-       fma.rn.f64      %fd117, %fd115, %fd108, %fd116;
-       mov.f64         %fd118, 0d3F56C16C1852B7AF;
-       fma.rn.f64      %fd119, %fd117, %fd108, %fd118;
-       mov.f64         %fd120, 0d3F81111111122322;
-       fma.rn.f64      %fd121, %fd119, %fd108, %fd120;
-       mov.f64         %fd122, 0d3FA55555555502A1;
-       fma.rn.f64      %fd123, %fd121, %fd108, %fd122;
-       mov.f64         %fd124, 0d3FC5555555555511;
-       fma.rn.f64      %fd125, %fd123, %fd108, %fd124;
-       mov.f64         %fd126, 0d3FE000000000000B;
-       fma.rn.f64      %fd127, %fd125, %fd108, %fd126;
-       fma.rn.f64      %fd128, %fd127, %fd108, %fd18;
-       fma.rn.f64      %fd129, %fd128, %fd108, %fd18;
+       mov.f64         %fd102, 0dC338000000000000;
+       add.rn.f64      %fd103, %fd101, %fd102;
+       mov.f64         %fd104, 0dBFE62E42FEFA39EF;
+       fma.rn.f64      %fd105, %fd103, %fd104, %fd4;
+       mov.f64         %fd106, 0dBC7ABC9E3B39803F;
+       fma.rn.f64      %fd107, %fd103, %fd106, %fd105;
+       mov.f64         %fd108, 0d3E928AF3FCA213EA;
+       mov.f64         %fd109, 0d3E5ADE1569CE2BDF;
+       fma.rn.f64      %fd110, %fd109, %fd107, %fd108;
+       mov.f64         %fd111, 0d3EC71DEE62401315;
+       fma.rn.f64      %fd112, %fd110, %fd107, %fd111;
+       mov.f64         %fd113, 0d3EFA01997C89EB71;
+       fma.rn.f64      %fd114, %fd112, %fd107, %fd113;
+       mov.f64         %fd115, 0d3F2A01A014761F65;
+       fma.rn.f64      %fd116, %fd114, %fd107, %fd115;
+       mov.f64         %fd117, 0d3F56C16C1852B7AF;
+       fma.rn.f64      %fd118, %fd116, %fd107, %fd117;
+       mov.f64         %fd119, 0d3F81111111122322;
+       fma.rn.f64      %fd120, %fd118, %fd107, %fd119;
+       mov.f64         %fd121, 0d3FA55555555502A1;
+       fma.rn.f64      %fd122, %fd120, %fd107, %fd121;
+       mov.f64         %fd123, 0d3FC5555555555511;
+       fma.rn.f64      %fd124, %fd122, %fd107, %fd123;
+       mov.f64         %fd125, 0d3FE000000000000B;
+       fma.rn.f64      %fd126, %fd124, %fd107, %fd125;
+       fma.rn.f64      %fd127, %fd126, %fd107, %fd18;
+       fma.rn.f64      %fd128, %fd127, %fd107, %fd18;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r14, %temp}, %fd129;
+       mov.b64         {%r14, %temp}, %fd128;
        }
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r15}, %fd129;
+       mov.b64         {%temp, %r15}, %fd128;
        }
        shl.b32         %r33, %r13, 20;
        add.s32         %r34, %r15, %r33;
-       mov.b64         %fd135, {%r14, %r34};
+       mov.b64         %fd134, {%r14, %r34};
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r35}, %fd4;
@@ -1724,36 +1598,36 @@ BB10_4:
        mov.b32          %f2, %r35;
        abs.f32         %f1, %f2;
        setp.lt.f32     %p4, %f1, 0f4086232B;
-       @%p4 bra        BB10_7;
+       @%p4 bra        BB9_7;
 
        setp.lt.f64     %p5, %fd4, 0d0000000000000000;
-       add.f64         %fd130, %fd4, 0d7FF0000000000000;
-       selp.f64        %fd135, 0d0000000000000000, %fd130, %p5;
+       add.f64         %fd129, %fd4, 0d7FF0000000000000;
+       selp.f64        %fd134, 0d0000000000000000, %fd129, %p5;
        setp.geu.f32    %p6, %f1, 0f40874800;
-       @%p6 bra        BB10_7;
+       @%p6 bra        BB9_7;
 
        shr.u32         %r36, %r13, 31;
        add.s32         %r37, %r13, %r36;
        shr.s32         %r38, %r37, 1;
        shl.b32         %r39, %r38, 20;
        add.s32         %r40, %r39, %r15;
-       mov.b64         %fd131, {%r14, %r40};
+       mov.b64         %fd130, {%r14, %r40};
        sub.s32         %r41, %r13, %r38;
        shl.b32         %r42, %r41, 20;
        add.s32         %r43, %r42, 1072693248;
        mov.u32         %r44, 0;
-       mov.b64         %fd132, {%r44, %r43};
-       mul.f64         %fd135, %fd131, %fd132;
+       mov.b64         %fd131, {%r44, %r43};
+       mul.f64         %fd134, %fd130, %fd131;
 
-BB10_7:
-       abs.f64         %fd133, %fd135;
-       setp.eq.f64     %p7, %fd133, 0d7FF0000000000000;
-       @%p7 bra        BB10_9;
+BB9_7:
+       abs.f64         %fd132, %fd134;
+       setp.eq.f64     %p7, %fd132, 0d7FF0000000000000;
+       @%p7 bra        BB9_9;
 
-       fma.rn.f64      %fd135, %fd135, %fd5, %fd135;
+       fma.rn.f64      %fd134, %fd134, %fd5, %fd134;
 
-BB10_9:
-       st.param.f64    [func_retval0+0], %fd135;
+BB9_9:
+       st.param.f64    [func_retval0+0], %fd134;
        ret;
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java 
b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
index 11dc3ce..5d795f1 100644
--- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysml.hops;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.AggBinaryOp.SparkAggType;
 import org.apache.sysml.hops.Hop.MultiThreadedHop;
@@ -143,6 +144,10 @@ public class AggUnaryOp extends Hop implements 
MultiThreadedHop
                                }                               
                                else { //general case           
                                        int k = 
OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
+                                       if(DMLScript.USE_ACCELERATOR && 
(DMLScript.FORCE_ACCELERATOR || getMemEstimate() < 
OptimizerUtils.GPU_MEMORY_BUDGET) && (_op == AggOp.SUM)) {
+                                               et = ExecType.GPU;
+                                               k = 1;
+                                       }
                                        agg1 = new 
PartialAggregate(input.constructLops(), 
                                                        HopsAgg2Lops.get(_op), 
HopsDirection2Lops.get(_direction), getDataType(),getValueType(), et, k);
                                }
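
The guard added above decides, at Hop compile time, when a full aggregation is planned on the GPU. Restated as a hypothetical helper (isGPUSumCandidate is illustrative only and is not part of this patch), the condition reads:

    // Illustrative restatement of the guard inserted in AggUnaryOp:
    // plan on the GPU only for sum, with the accelerator enabled, and only if
    // GPU execution is forced or the memory estimate fits the GPU budget.
    private boolean isGPUSumCandidate(AggOp op, double memEstimate) {
        return DMLScript.USE_ACCELERATOR
            && (DMLScript.FORCE_ACCELERATOR || memEstimate < OptimizerUtils.GPU_MEMORY_BUDGET)
            && op == AggOp.SUM;
    }

When the condition holds, the lop is built with ExecType.GPU and k = 1, i.e. no CPU-side multi-threading for the aggregate.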

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 9c8be5d..f988e5f 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -29,6 +29,7 @@ import 
org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import 
org.apache.sysml.runtime.instructions.gpu.GPUInstruction.GPUINSTRUCTION_TYPE;
 import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.ReorgGPUInstruction;
+import 
org.apache.sysml.runtime.instructions.gpu.context.AggregateUnaryGPUInstruction;
 
 public class GPUInstructionParser  extends InstructionParser 
 {
@@ -62,6 +63,8 @@ public class GPUInstructionParser  extends InstructionParser
                
                
                String2GPUInstructionType.put( "sel+"  , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
+
+               String2GPUInstructionType.put( "uak+"    , 
GPUINSTRUCTION_TYPE.AggregateUnary);
        }
        
        public static GPUInstruction parseSingleInstruction (String str ) 
@@ -88,6 +91,9 @@ public class GPUInstructionParser  extends InstructionParser
                        throw new DMLRuntimeException("The instruction is not 
GPU-enabled:" + str);
                
                switch(gputype) {
+                       case AggregateUnary:
+                               return 
AggregateUnaryGPUInstruction.parseInstruction(str);
+
                        case AggregateBinary:
                                return 
AggregateBinaryGPUInstruction.parseInstruction(str);
                        

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index b9cbfab..7219c6c 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -88,14 +88,14 @@ public class AggregateBinaryGPUInstruction extends 
GPUInstruction
                
                //get inputs
                MatrixObject m1 = 
ec.getMatrixInputForGPUInstruction(_input1.getName());
-        MatrixObject m2 = 
ec.getMatrixInputForGPUInstruction(_input2.getName());
-        
-        //compute matrix multiplication
-        int rlen = (int) (_isLeftTransposed ? m1.getNumColumns() : 
m1.getNumRows());
-        int clen = (int) (_isRightTransposed ? m2.getNumRows() : 
m2.getNumColumns());
-        
-        ec.setMetaData(_output.getName(), rlen, clen);
-        LibMatrixCUDA.matmult(ec, m1, m2, _output.getName(), 
_isLeftTransposed, _isRightTransposed);
+               MatrixObject m2 = 
ec.getMatrixInputForGPUInstruction(_input2.getName());
+
+               //compute matrix multiplication
+               int rlen = (int) (_isLeftTransposed ? m1.getNumColumns() : 
m1.getNumRows());
+               int clen = (int) (_isRightTransposed ? m2.getNumRows() : 
m2.getNumColumns());
+
+               ec.setMetaData(_output.getName(), rlen, clen);
+               LibMatrixCUDA.matmult(ec, m1, m2, _output.getName(), 
_isLeftTransposed, _isRightTransposed);
         
                //release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index 27a12fd..aca197e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -28,7 +28,7 @@ import org.apache.sysml.runtime.matrix.operators.Operator;
 
 public abstract class GPUInstruction extends Instruction 
 {
-       public enum GPUINSTRUCTION_TYPE { AggregateBinary, Convolution, MMTSJ, 
Reorg, ArithmeticBinary, BuiltinUnary }; 
+       public enum GPUINSTRUCTION_TYPE { AggregateUnary, AggregateBinary, 
Convolution, MMTSJ, Reorg, ArithmeticBinary, BuiltinUnary };
        
        protected GPUINSTRUCTION_TYPE _gputype;
        protected Operator _optr;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java
new file mode 100644
index 0000000..04221f6
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/AggregateUnaryGPUInstruction.java
@@ -0,0 +1,85 @@
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.functionobjects.IndexFunction;
+import org.apache.sysml.runtime.functionobjects.ReduceAll;
+import org.apache.sysml.runtime.functionobjects.ReduceCol;
+import org.apache.sysml.runtime.functionobjects.ReduceRow;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.operators.*;
+import org.apache.sysml.utils.Statistics;
+
+/**
+ * Implements aggregate unary instructions for CUDA
+ */
+public class AggregateUnaryGPUInstruction extends GPUInstruction {
+  private CPOperand _input1 = null;
+  private CPOperand _output = null;
+
+  public AggregateUnaryGPUInstruction(Operator op, CPOperand in1, CPOperand 
out,
+                                       String opcode, String istr)
+  {
+    super(op, opcode, istr);
+    _gputype = GPUINSTRUCTION_TYPE.AggregateUnary;
+    _input1 = in1;
+    _output = out;
+  }
+
+  public static AggregateUnaryGPUInstruction parseInstruction(String str )
+          throws DMLRuntimeException
+  {
+    String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+    String opcode = parts[0];
+    CPOperand in1 = new CPOperand(parts[1]);
+    CPOperand out = new CPOperand(parts[2]);
+
+    // This follows logic similar to AggregateUnaryCPInstruction.
+    // nrow, ncol & length should either read or refresh metadata
+    Operator aggop = null;
+    if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || 
opcode.equalsIgnoreCase("length")) {
+      throw new DMLRuntimeException("nrow, ncol & length should not be 
compiled as GPU instructions!");
+    } else {
+      aggop = InstructionUtils.parseBasicAggregateUnaryOperator(opcode);
+    }
+    return new AggregateUnaryGPUInstruction(aggop, in1, out, opcode, str);
+  }
+
+  @Override
+  public void processInstruction(ExecutionContext ec)
+          throws DMLRuntimeException
+  {
+    Statistics.incrementNoOfExecutedGPUInst();
+
+    String opcode = getOpcode();
+
+    // nrow, ncol & length should either read or refresh metadata
+    if(opcode.equalsIgnoreCase("nrow") || opcode.equalsIgnoreCase("ncol") || 
opcode.equalsIgnoreCase("length")) {
+      throw new DMLRuntimeException("nrow, ncol & length should not be 
compiled as GPU instructions!");
+    }
+
+    //get inputs
+    MatrixObject in1 = ec.getMatrixInputForGPUInstruction(_input1.getName());
+
+    int rlen = (int)in1.getNumRows();
+    int clen = (int)in1.getNumColumns();
+
+    LibMatrixCUDA.unaryAggregate(ec, in1, _output.getName(), 
(AggregateUnaryOperator)_optr);
+
+    //release inputs/outputs
+    ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+
+    // If the unary aggregate is a row reduction or a column reduction, it 
results in a vector
+    // which needs to be released. Otherwise a scalar is produced and it is
copied back to the host
+    // and set in the execution context by invoking the setScalarOutput
+    IndexFunction indexFunction = ((AggregateUnaryOperator) _optr).indexFn;
+    if (indexFunction instanceof ReduceRow || indexFunction instanceof 
ReduceCol) {
+      ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+    }
+  }
+
+}
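
The reduction itself is implemented in LibMatrixCUDA.unaryAggregate (the LibMatrixCUDA.java changes are listed in the diffstat above but not reproduced here). A typical implementation of the full sum (uak+) is an iterative tree reduction on the device followed by a single device-to-host copy of the result. The following is a minimal sketch of that pattern only, not the code in LibMatrixCUDA; it assumes a launchKernel(name, config, args...) helper, a device allocator (allocateOnDevice), and that the kernel compiled above is exposed under the name "reduce" -- all of these names are assumptions:

    // Illustrative sketch: sum-reduce n doubles that already live on the device.
    // Uses jcuda.Pointer, jcuda.Sizeof and jcuda.runtime.JCuda.cudaMemcpy
    // (with jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost).
    private static double reduceAll(JCudaKernels kernels, Pointer in, int n)
        throws DMLRuntimeException
    {
        int threads = 1024;                                     // threads per block
        int blocks  = (n + (threads * 2 - 1)) / (threads * 2);  // each thread first adds two elements
        int shmem   = threads * Sizeof.DOUBLE;                  // one partial sum per thread
        Pointer tmp = allocateOnDevice(blocks * Sizeof.DOUBLE); // hypothetical device allocator
        kernels.launchKernel("reduce", new ExecutionConfig(blocks, threads, shmem), in, tmp, n);
        int remaining = blocks;
        while (remaining > 1) {                                  // keep folding the per-block partial sums
            int b = (remaining + (threads * 2 - 1)) / (threads * 2);
            kernels.launchKernel("reduce", new ExecutionConfig(b, threads, shmem), tmp, tmp, remaining);
            remaining = b;
        }
        double[] result = new double[1];
        cudaMemcpy(Pointer.to(result), tmp, Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
        return result[0];
    }

The resulting scalar is handed back through the ExecutionContext (setScalarOutput), while row and column reductions produce a matrix output that stays on the device, which is why processInstruction above releases the matrix output only in the ReduceRow/ReduceCol case.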

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index c04e8a4..ae41bc3 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -39,9 +39,24 @@ public class ExecutionConfig {
        public int blockDimZ = 1;
        public int sharedMemBytes = 0;
        public CUstream stream = null;
+
+
        
        private static HashMap<Integer, Integer> maxBlockDimForDevice = new 
HashMap<Integer, Integer>();
-       
+
+       /**
+        * Convenience constructor for setting the number of blocks, number of 
threads and the
+        * shared memory size
+        * @param gridDimX       number of thread blocks along the x dimension
+        * @param blockDimX      number of threads per block along the x dimension
+        * @param sharedMemBytes dynamic shared memory per block, in bytes
+        */
+       public ExecutionConfig(int gridDimX, int blockDimX, int sharedMemBytes) 
{
+               this.gridDimX = gridDimX;
+               this.blockDimX = blockDimX;
+               this.sharedMemBytes = sharedMemBytes;
+       }
+
        /**
         * Use this for simple vector operations and use following in the 
kernel 
         * <code> 
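
For a one-dimensional reduction launch, the new three-argument constructor is typically filled in as follows (a sketch with illustrative sizes; the block size is bounded by the per-device maximum cached in maxBlockDimForDevice):

    int n = 1000000;                                   // number of elements to reduce
    int threads = 1024;                                // threads per block
    int blocks  = (n + (threads * 2 - 1)) / (threads * 2);
    int shmem   = threads * jcuda.Sizeof.DOUBLE;       // shared memory for per-block partial sums
    ExecutionConfig config = new ExecutionConfig(blocks, threads, shmem);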
