http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/02040346/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 8296f92..dfff5dd 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -514,14 +514,15 @@ BB2_36:
        ret;
 }
 
-       // .globl       _Z10reduce_rowI5SumOpEvPdS1_jjT_d
-.visible .func _Z10reduce_rowI5SumOpEvPdS1_jjT_d(
-       .param .b64 _Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_0,
-       .param .b64 _Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_1,
-       .param .b32 _Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_2,
-       .param .b32 _Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_3,
-       .param .align 1 .b8 _Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_4[1],
-       .param .b64 _Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_5
+       // .globl       _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d
+.visible .func _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d(
+       .param .b64 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_0,
+       .param .b64 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_1,
+       .param .b32 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_2,
+       .param .b32 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_3,
+       .param .align 1 .b8 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1],
+       .param .align 1 .b8 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1],
+       .param .b64 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6
 )
 {
        .reg .pred      %p<20>;
@@ -530,11 +531,11 @@ BB2_36:
        .reg .b64       %rd<10>;
 
 
-       ld.param.u64    %rd2, [_Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_0];
-       ld.param.u64    %rd3, [_Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_1];
-       ld.param.u32    %r7, [_Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_2];
-       ld.param.u32    %r6, [_Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_3];
-       ld.param.f64    %fd40, [_Z10reduce_rowI5SumOpEvPdS1_jjT_d_param_5];
+       ld.param.u64    %rd2, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_0];
+       ld.param.u64    %rd3, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_1];
+       ld.param.u32    %r7, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_2];
+       ld.param.u32    %r6, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_3];
+       ld.param.f64    %fd40, [_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
        mov.u32         %r1, %ctaid.x;
        setp.ge.u32     %p1, %r1, %r7;
        @%p1 bra        BB3_34;
@@ -682,14 +683,15 @@ BB3_34:
        ret;
 }
 
-       // .globl       _Z10reduce_colI5SumOpEvPdS1_jjT_d
-.visible .func _Z10reduce_colI5SumOpEvPdS1_jjT_d(
-       .param .b64 _Z10reduce_colI5SumOpEvPdS1_jjT_d_param_0,
-       .param .b64 _Z10reduce_colI5SumOpEvPdS1_jjT_d_param_1,
-       .param .b32 _Z10reduce_colI5SumOpEvPdS1_jjT_d_param_2,
-       .param .b32 _Z10reduce_colI5SumOpEvPdS1_jjT_d_param_3,
-       .param .align 1 .b8 _Z10reduce_colI5SumOpEvPdS1_jjT_d_param_4[1],
-       .param .b64 _Z10reduce_colI5SumOpEvPdS1_jjT_d_param_5
+       // .globl       _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d
+.visible .func _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d(
+       .param .b64 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_0,
+       .param .b64 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_1,
+       .param .b32 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_2,
+       .param .b32 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_3,
+       .param .align 1 .b8 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1],
+       .param .align 1 .b8 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1],
+       .param .b64 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6
 )
 {
        .reg .pred      %p<4>;
@@ -698,11 +700,11 @@ BB3_34:
        .reg .b64       %rd<7>;
 
 
-       ld.param.u64    %rd1, [_Z10reduce_colI5SumOpEvPdS1_jjT_d_param_0];
-       ld.param.u64    %rd2, [_Z10reduce_colI5SumOpEvPdS1_jjT_d_param_1];
-       ld.param.u32    %r5, [_Z10reduce_colI5SumOpEvPdS1_jjT_d_param_2];
-       ld.param.u32    %r6, [_Z10reduce_colI5SumOpEvPdS1_jjT_d_param_3];
-       ld.param.f64    %fd6, [_Z10reduce_colI5SumOpEvPdS1_jjT_d_param_5];
+       ld.param.u64    %rd1, [_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_0];
+       ld.param.u64    %rd2, [_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_1];
+       ld.param.u32    %r5, [_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_2];
+       ld.param.u32    %r6, [_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_3];
+       ld.param.f64    %fd6, [_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
        mov.u32         %r7, %ctaid.x;
        mov.u32         %r8, %ntid.x;
        mov.u32         %r9, %tid.x;
@@ -931,14 +933,15 @@ BB5_36:
        ret;
 }
 
-       // .globl       _Z10reduce_rowI5MaxOpEvPdS1_jjT_d
-.visible .func _Z10reduce_rowI5MaxOpEvPdS1_jjT_d(
-       .param .b64 _Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_0,
-       .param .b64 _Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_1,
-       .param .b32 _Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_2,
-       .param .b32 _Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_3,
-       .param .align 1 .b8 _Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_4[1],
-       .param .b64 _Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_5
+       // .globl       _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d
+.visible .func _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d(
+       .param .b64 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_0,
+       .param .b64 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_1,
+       .param .b32 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_2,
+       .param .b32 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_3,
+       .param .align 1 .b8 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1],
+       .param .align 1 .b8 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1],
+       .param .b64 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6
 )
 {
        .reg .pred      %p<20>;
@@ -947,11 +950,11 @@ BB5_36:
        .reg .b64       %rd<10>;
 
 
-       ld.param.u64    %rd2, [_Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_0];
-       ld.param.u64    %rd3, [_Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_1];
-       ld.param.u32    %r7, [_Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_2];
-       ld.param.u32    %r6, [_Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_3];
-       ld.param.f64    %fd40, [_Z10reduce_rowI5MaxOpEvPdS1_jjT_d_param_5];
+       ld.param.u64    %rd2, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_0];
+       ld.param.u64    %rd3, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_1];
+       ld.param.u32    %r7, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_2];
+       ld.param.u32    %r6, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_3];
+       ld.param.f64    %fd40, [_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
        mov.u32         %r1, %ctaid.x;
        setp.ge.u32     %p1, %r1, %r7;
        @%p1 bra        BB6_34;
@@ -1099,14 +1102,15 @@ BB6_34:
        ret;
 }
 
-       // .globl       _Z10reduce_colI5MaxOpEvPdS1_jjT_d
-.visible .func _Z10reduce_colI5MaxOpEvPdS1_jjT_d(
-       .param .b64 _Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_0,
-       .param .b64 _Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_1,
-       .param .b32 _Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_2,
-       .param .b32 _Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_3,
-       .param .align 1 .b8 _Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_4[1],
-       .param .b64 _Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_5
+       // .globl       _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d
+.visible .func _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d(
+       .param .b64 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_0,
+       .param .b64 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_1,
+       .param .b32 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_2,
+       .param .b32 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_3,
+       .param .align 1 .b8 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1],
+       .param .align 1 .b8 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1],
+       .param .b64 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6
 )
 {
        .reg .pred      %p<4>;
@@ -1115,11 +1119,11 @@ BB6_34:
        .reg .b64       %rd<7>;
 
 
-       ld.param.u64    %rd1, [_Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_0];
-       ld.param.u64    %rd2, [_Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_1];
-       ld.param.u32    %r5, [_Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_2];
-       ld.param.u32    %r6, [_Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_3];
-       ld.param.f64    %fd6, [_Z10reduce_colI5MaxOpEvPdS1_jjT_d_param_5];
+       ld.param.u64    %rd1, [_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_0];
+       ld.param.u64    %rd2, [_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_1];
+       ld.param.u32    %r5, [_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_2];
+       ld.param.u32    %r6, [_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_3];
+       ld.param.f64    %fd6, [_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
        mov.u32         %r7, %ctaid.x;
        mov.u32         %r8, %ntid.x;
        mov.u32         %r9, %tid.x;
@@ -1348,14 +1352,15 @@ BB8_36:
        ret;
 }
 
-       // .globl       _Z10reduce_rowI5MinOpEvPdS1_jjT_d
-.visible .func _Z10reduce_rowI5MinOpEvPdS1_jjT_d(
-       .param .b64 _Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_0,
-       .param .b64 _Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_1,
-       .param .b32 _Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_2,
-       .param .b32 _Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_3,
-       .param .align 1 .b8 _Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_4[1],
-       .param .b64 _Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_5
+       // .globl       _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d
+.visible .func _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d(
+       .param .b64 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_0,
+       .param .b64 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_1,
+       .param .b32 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_2,
+       .param .b32 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_3,
+       .param .align 1 .b8 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1],
+       .param .align 1 .b8 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1],
+       .param .b64 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6
 )
 {
        .reg .pred      %p<20>;
@@ -1364,11 +1369,11 @@ BB8_36:
        .reg .b64       %rd<10>;
 
 
-       ld.param.u64    %rd2, [_Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_0];
-       ld.param.u64    %rd3, [_Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_1];
-       ld.param.u32    %r7, [_Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_2];
-       ld.param.u32    %r6, [_Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_3];
-       ld.param.f64    %fd40, [_Z10reduce_rowI5MinOpEvPdS1_jjT_d_param_5];
+       ld.param.u64    %rd2, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_0];
+       ld.param.u64    %rd3, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_1];
+       ld.param.u32    %r7, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_2];
+       ld.param.u32    %r6, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_3];
+       ld.param.f64    %fd40, [_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
        mov.u32         %r1, %ctaid.x;
        setp.ge.u32     %p1, %r1, %r7;
        @%p1 bra        BB9_34;
@@ -1516,14 +1521,15 @@ BB9_34:
        ret;
 }
 
-       // .globl       _Z10reduce_colI5MinOpEvPdS1_jjT_d
-.visible .func _Z10reduce_colI5MinOpEvPdS1_jjT_d(
-       .param .b64 _Z10reduce_colI5MinOpEvPdS1_jjT_d_param_0,
-       .param .b64 _Z10reduce_colI5MinOpEvPdS1_jjT_d_param_1,
-       .param .b32 _Z10reduce_colI5MinOpEvPdS1_jjT_d_param_2,
-       .param .b32 _Z10reduce_colI5MinOpEvPdS1_jjT_d_param_3,
-       .param .align 1 .b8 _Z10reduce_colI5MinOpEvPdS1_jjT_d_param_4[1],
-       .param .b64 _Z10reduce_colI5MinOpEvPdS1_jjT_d_param_5
+       // .globl       _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d
+.visible .func _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d(
+       .param .b64 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_0,
+       .param .b64 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_1,
+       .param .b32 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_2,
+       .param .b32 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_3,
+       .param .align 1 .b8 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1],
+       .param .align 1 .b8 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1],
+       .param .b64 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6
 )
 {
        .reg .pred      %p<4>;
@@ -1532,11 +1538,11 @@ BB9_34:
        .reg .b64       %rd<7>;
 
 
-       ld.param.u64    %rd1, [_Z10reduce_colI5MinOpEvPdS1_jjT_d_param_0];
-       ld.param.u64    %rd2, [_Z10reduce_colI5MinOpEvPdS1_jjT_d_param_1];
-       ld.param.u32    %r5, [_Z10reduce_colI5MinOpEvPdS1_jjT_d_param_2];
-       ld.param.u32    %r6, [_Z10reduce_colI5MinOpEvPdS1_jjT_d_param_3];
-       ld.param.f64    %fd6, [_Z10reduce_colI5MinOpEvPdS1_jjT_d_param_5];
+       ld.param.u64    %rd1, [_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_0];
+       ld.param.u64    %rd2, [_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_1];
+       ld.param.u32    %r5, [_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_2];
+       ld.param.u32    %r6, [_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_3];
+       ld.param.f64    %fd6, [_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
        mov.u32         %r7, %ctaid.x;
        mov.u32         %r8, %ntid.x;
        mov.u32         %r9, %tid.x;
@@ -1570,6 +1576,431 @@ BB10_5:
        ret;
 }
 
+       // .globl       _Z6reduceI9ProductOpEvPdS1_jT_d
+.visible .func _Z6reduceI9ProductOpEvPdS1_jT_d(        // demangles to: void reduce<ProductOp>(double*, double*, unsigned, ProductOp, double); multiplies f64 values and stores one product per %ctaid.x
+       .param .b64 _Z6reduceI9ProductOpEvPdS1_jT_d_param_0,    // base address of the f64 values that are read
+       .param .b64 _Z6reduceI9ProductOpEvPdS1_jT_d_param_1,    // base address of the f64 results (slot index = %ctaid.x)
+       .param .b32 _Z6reduceI9ProductOpEvPdS1_jT_d_param_2,    // element count: indices >= this value are never loaded
+       .param .align 1 .b8 _Z6reduceI9ProductOpEvPdS1_jT_d_param_3[1], // 1-byte by-value argument; never loaded in this body
+       .param .b64 _Z6reduceI9ProductOpEvPdS1_jT_d_param_4     // f64 starting value of the running product
+)
+{
+       .reg .pred      %p<20>;
+       .reg .b32       %r<33>;
+       .reg .f64       %fd<79>;
+       .reg .b64       %rd<12>;
+
+
+       ld.param.u64    %rd2, [_Z6reduceI9ProductOpEvPdS1_jT_d_param_0];        // rd2 = input base
+       ld.param.u64    %rd3, [_Z6reduceI9ProductOpEvPdS1_jT_d_param_1];        // rd3 = output base
+       ld.param.u32    %r5, [_Z6reduceI9ProductOpEvPdS1_jT_d_param_2]; // r5 = element count
+       ld.param.f64    %fd76, [_Z6reduceI9ProductOpEvPdS1_jT_d_param_4];       // fd76 = starting value
+       mov.u32         %r6, %tid.x;    // r6 = thread index in the block
+       mov.u32         %r7, %ctaid.x;  // r7 = block index
+       shl.b32         %r8, %r7, 1;    // r8 = 2 * block index
+       mov.u32         %r9, %ntid.x;   // r9 = threads per block
+       mad.lo.s32      %r32, %r8, %r9, %r6;    // r32 = 2*ctaid.x*ntid.x + tid.x, first index for this thread
+       setp.ge.u32     %p1, %r32, %r5;
+       @%p1 bra        BB11_5; // nothing to load: keep the starting value as this thread's partial
+
+       mov.f64         %fd77, %fd76;
+
+BB11_2:        // load loop: two elements per iteration
+       mov.f64         %fd1, %fd77;
+       mul.wide.u32    %rd4, %r32, 8;  // byte offset = index * sizeof(f64)
+       add.s64         %rd5, %rd2, %rd4;
+       ld.f64  %fd29, [%rd5];  // generic-address load of in[r32]
+       mul.f64         %fd78, %fd1, %fd29;     // acc *= in[r32]
+       add.s32         %r3, %r32, %r9; // second index = r32 + ntid.x
+       setp.ge.u32     %p2, %r3, %r5;
+       @%p2 bra        BB11_4; // second element is out of range: skip it
+
+       mul.wide.u32    %rd6, %r3, 8;
+       add.s64         %rd7, %rd2, %rd6;
+       ld.f64  %fd30, [%rd7];
+       mul.f64         %fd78, %fd78, %fd30;    // acc *= in[r32 + ntid.x]
+
+BB11_4:
+       mov.f64         %fd77, %fd78;
+       shl.b32         %r12, %r9, 1;   // r12 = 2 * ntid.x
+       mov.u32         %r13, %nctaid.x;
+       mad.lo.s32      %r32, %r12, %r13, %r32; // advance index by 2*ntid.x*nctaid.x
+       setp.lt.u32     %p3, %r32, %r5;
+       mov.f64         %fd76, %fd77;
+       @%p3 bra        BB11_2; // repeat while the index is below the element count
+
+BB11_5:        // publish this thread's partial product, then combine within the block
+       mov.f64         %fd74, %fd76;
+       mul.wide.u32    %rd8, %r6, 8;   // byte offset of slot tid.x
+       mov.u64         %rd9, sdata;    // sdata is declared outside this function
+       add.s64         %rd1, %rd9, %rd8;       // rd1 = &sdata[tid.x]
+       st.shared.f64   [%rd1], %fd74;
+       bar.sync        0;      // all partials stored before any thread reads another slot
+       setp.lt.u32     %p4, %r9, 1024;
+       @%p4 bra        BB11_9; // step only applies when ntid.x >= 1024
+
+       setp.gt.u32     %p5, %r6, 511;
+       mov.f64         %fd75, %fd74;
+       @%p5 bra        BB11_8; // only threads 0..511 combine in this step
+
+       ld.shared.f64   %fd31, [%rd1+4096];     // slot tid.x + 512 (4096 bytes / 8)
+       mul.f64         %fd75, %fd74, %fd31;
+       st.shared.f64   [%rd1], %fd75;
+
+BB11_8:
+       mov.f64         %fd74, %fd75;
+       bar.sync        0;
+
+BB11_9:
+       mov.f64         %fd72, %fd74;
+       setp.lt.u32     %p6, %r9, 512;
+       @%p6 bra        BB11_13;        // step only applies when ntid.x >= 512
+
+       setp.gt.u32     %p7, %r6, 255;
+       mov.f64         %fd73, %fd72;
+       @%p7 bra        BB11_12;        // only threads 0..255 combine in this step
+
+       ld.shared.f64   %fd32, [%rd1+2048];     // slot tid.x + 256
+       mul.f64         %fd73, %fd72, %fd32;
+       st.shared.f64   [%rd1], %fd73;
+
+BB11_12:
+       mov.f64         %fd72, %fd73;
+       bar.sync        0;
+
+BB11_13:
+       mov.f64         %fd70, %fd72;
+       setp.lt.u32     %p8, %r9, 256;
+       @%p8 bra        BB11_17;        // step only applies when ntid.x >= 256
+
+       setp.gt.u32     %p9, %r6, 127;
+       mov.f64         %fd71, %fd70;
+       @%p9 bra        BB11_16;        // only threads 0..127 combine in this step
+
+       ld.shared.f64   %fd33, [%rd1+1024];     // slot tid.x + 128
+       mul.f64         %fd71, %fd70, %fd33;
+       st.shared.f64   [%rd1], %fd71;
+
+BB11_16:
+       mov.f64         %fd70, %fd71;
+       bar.sync        0;
+
+BB11_17:
+       mov.f64         %fd68, %fd70;
+       setp.lt.u32     %p10, %r9, 128;
+       @%p10 bra       BB11_21;        // step only applies when ntid.x >= 128
+
+       setp.gt.u32     %p11, %r6, 63;
+       mov.f64         %fd69, %fd68;
+       @%p11 bra       BB11_20;        // only threads 0..63 combine in this step
+
+       ld.shared.f64   %fd34, [%rd1+512];      // slot tid.x + 64
+       mul.f64         %fd69, %fd68, %fd34;
+       st.shared.f64   [%rd1], %fd69;
+
+BB11_20:
+       mov.f64         %fd68, %fd69;
+       bar.sync        0;
+
+BB11_21:       // remaining steps run on threads 0..31 with volatile accesses and no bar.sync between them
+       mov.f64         %fd67, %fd68;
+       setp.gt.u32     %p12, %r6, 31;
+       @%p12 bra       BB11_34;        // threads >= 32 skip straight to the final store check
+
+       setp.lt.u32     %p13, %r9, 64;
+       @%p13 bra       BB11_24;
+
+       ld.volatile.shared.f64  %fd35, [%rd1+256];      // slot tid.x + 32
+       mul.f64         %fd67, %fd67, %fd35;
+       st.volatile.shared.f64  [%rd1], %fd67;
+
+BB11_24:
+       mov.f64         %fd66, %fd67;
+       setp.lt.u32     %p14, %r9, 32;
+       @%p14 bra       BB11_26;
+
+       ld.volatile.shared.f64  %fd36, [%rd1+128];      // slot tid.x + 16
+       mul.f64         %fd66, %fd66, %fd36;
+       st.volatile.shared.f64  [%rd1], %fd66;
+
+BB11_26:
+       mov.f64         %fd65, %fd66;
+       setp.lt.u32     %p15, %r9, 16;
+       @%p15 bra       BB11_28;
+
+       ld.volatile.shared.f64  %fd37, [%rd1+64];       // slot tid.x + 8
+       mul.f64         %fd65, %fd65, %fd37;
+       st.volatile.shared.f64  [%rd1], %fd65;
+
+BB11_28:
+       mov.f64         %fd64, %fd65;
+       setp.lt.u32     %p16, %r9, 8;
+       @%p16 bra       BB11_30;
+
+       ld.volatile.shared.f64  %fd38, [%rd1+32];       // slot tid.x + 4
+       mul.f64         %fd64, %fd64, %fd38;
+       st.volatile.shared.f64  [%rd1], %fd64;
+
+BB11_30:
+       mov.f64         %fd63, %fd64;
+       setp.lt.u32     %p17, %r9, 4;
+       @%p17 bra       BB11_32;
+
+       ld.volatile.shared.f64  %fd39, [%rd1+16];       // slot tid.x + 2
+       mul.f64         %fd63, %fd63, %fd39;
+       st.volatile.shared.f64  [%rd1], %fd63;
+
+BB11_32:
+       setp.lt.u32     %p18, %r9, 2;
+       @%p18 bra       BB11_34;
+
+       ld.volatile.shared.f64  %fd40, [%rd1+8];        // slot tid.x + 1
+       mul.f64         %fd41, %fd63, %fd40;
+       st.volatile.shared.f64  [%rd1], %fd41;
+
+BB11_34:
+       setp.ne.s32     %p19, %r6, 0;
+       @%p19 bra       BB11_36;        // only thread 0 writes the block's result
+
+       ld.shared.f64   %fd42, [sdata]; // slot 0 holds the combined product
+       mul.wide.u32    %rd10, %r7, 8;
+       add.s64         %rd11, %rd3, %rd10;     // &out[ctaid.x]
+       st.f64  [%rd11], %fd42;
+
+BB11_36:
+       ret;
+}
+
+       // .globl       _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d
+.visible .func _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d(    // demangles to: void reduce_row<SumOp, MeanOp>(double*, double*, unsigned, unsigned, SumOp, MeanOp, double); one block sums one row and stores sum / divisor
+       .param .b64 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_0,        // base address of the f64 values that are read
+       .param .b64 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_1,        // base address of the f64 results (slot index = %ctaid.x)
+       .param .b32 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_2,        // row count: blocks with ctaid.x >= this return immediately
+       .param .b32 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_3,        // row length in elements (also the row stride)
+       .param .align 1 .b8 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_4[1],     // 1-byte by-value argument; never loaded in this body
+       .param .align 8 .b8 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_5[8],     // 8-byte by-value argument; read as a 64-bit integer divisor
+       .param .b64 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6 // f64 starting value of the running sum
+)
+{
+       .reg .pred      %p<20>;
+       .reg .b32       %r<30>;
+       .reg .f64       %fd<43>;
+       .reg .b64       %rd<11>;
+
+
+       ld.param.u64    %rd2, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_0];    // rd2 = input base
+       ld.param.u64    %rd3, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_1];    // rd3 = output base
+       ld.param.u32    %r6, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_2];     // r6 = row count
+       ld.param.u32    %r5, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_3];     // r5 = row length
+       ld.param.u64    %rd4, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_5];    // rd4 = raw 64 bits of the divisor argument
+       ld.param.f64    %fd42, [_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6];   // fd42 = running sum, seeded with the starting value
+       mov.u32         %r7, %ctaid.x;  // r7 = block index = row handled by this block
+       setp.ge.u32     %p1, %r7, %r6;
+       @%p1 bra        BB12_34;        // block index beyond the row count: nothing to do
+
+       mov.u32         %r29, %tid.x;   // r29 = position within the row, starts at tid.x
+       mul.lo.s32      %r2, %r7, %r5;  // r2 = index of the row's first element
+       setp.ge.u32     %p2, %r29, %r5;
+       @%p2 bra        BB12_3; // thread index past the row length: keep the starting value
+
+BB12_2:        // load loop: this thread adds every ntid.x-th element of the row
+       add.s32         %r9, %r29, %r2;
+       mul.wide.u32    %rd5, %r9, 8;   // byte offset = index * sizeof(f64)
+       add.s64         %rd6, %rd2, %rd5;
+       ld.f64  %fd27, [%rd6];
+       add.f64         %fd42, %fd42, %fd27;
+       mov.u32         %r10, %ntid.x;
+       add.s32         %r29, %r10, %r29;       // advance by the block size
+       setp.lt.u32     %p3, %r29, %r5;
+       @%p3 bra        BB12_2;
+
+BB12_3:        // publish this thread's partial sum, then combine within the block
+       mov.u32         %r11, %tid.x;
+       mul.wide.u32    %rd7, %r11, 8;
+       mov.u64         %rd8, sdata;    // sdata is declared outside this function
+       add.s64         %rd1, %rd8, %rd7;       // rd1 = &sdata[tid.x]
+       st.shared.f64   [%rd1], %fd42;
+       bar.sync        0;      // all partials stored before any thread reads another slot
+       mov.u32         %r12, %ntid.x;  // r12 = threads per block, gates every step below
+       setp.lt.u32     %p4, %r12, 1024;
+       @%p4 bra        BB12_7;
+
+       setp.gt.u32     %p5, %r11, 511;
+       @%p5 bra        BB12_6; // only threads 0..511 combine in this step
+
+       ld.shared.f64   %fd28, [%rd1+4096];     // slot tid.x + 512 (4096 bytes / 8)
+       add.f64         %fd42, %fd42, %fd28;
+       st.shared.f64   [%rd1], %fd42;
+
+BB12_6:
+       bar.sync        0;
+
+BB12_7:
+       setp.lt.u32     %p6, %r12, 512;
+       @%p6 bra        BB12_11;
+
+       setp.gt.u32     %p7, %r11, 255;
+       @%p7 bra        BB12_10;        // only threads 0..255 combine in this step
+
+       ld.shared.f64   %fd29, [%rd1+2048];     // slot tid.x + 256
+       add.f64         %fd42, %fd42, %fd29;
+       st.shared.f64   [%rd1], %fd42;
+
+BB12_10:
+       bar.sync        0;
+
+BB12_11:
+       setp.lt.u32     %p8, %r12, 256;
+       @%p8 bra        BB12_15;
+
+       setp.gt.u32     %p9, %r11, 127;
+       @%p9 bra        BB12_14;        // only threads 0..127 combine in this step
+
+       ld.shared.f64   %fd30, [%rd1+1024];     // slot tid.x + 128
+       add.f64         %fd42, %fd42, %fd30;
+       st.shared.f64   [%rd1], %fd42;
+
+BB12_14:
+       bar.sync        0;
+
+BB12_15:
+       setp.lt.u32     %p10, %r12, 128;
+       @%p10 bra       BB12_19;
+
+       setp.gt.u32     %p11, %r11, 63;
+       @%p11 bra       BB12_18;        // only threads 0..63 combine in this step
+
+       ld.shared.f64   %fd31, [%rd1+512];      // slot tid.x + 64
+       add.f64         %fd42, %fd42, %fd31;
+       st.shared.f64   [%rd1], %fd42;
+
+BB12_18:
+       bar.sync        0;
+
+BB12_19:       // remaining steps run on threads 0..31 with volatile accesses and no bar.sync between them
+       setp.gt.u32     %p12, %r11, 31;
+       @%p12 bra       BB12_32;
+
+       setp.lt.u32     %p13, %r12, 64;
+       @%p13 bra       BB12_22;
+
+       ld.volatile.shared.f64  %fd32, [%rd1+256];      // slot tid.x + 32
+       add.f64         %fd42, %fd42, %fd32;
+       st.volatile.shared.f64  [%rd1], %fd42;
+
+BB12_22:
+       setp.lt.u32     %p14, %r12, 32;
+       @%p14 bra       BB12_24;
+
+       ld.volatile.shared.f64  %fd33, [%rd1+128];      // slot tid.x + 16
+       add.f64         %fd42, %fd42, %fd33;
+       st.volatile.shared.f64  [%rd1], %fd42;
+
+BB12_24:
+       setp.lt.u32     %p15, %r12, 16;
+       @%p15 bra       BB12_26;
+
+       ld.volatile.shared.f64  %fd34, [%rd1+64];       // slot tid.x + 8
+       add.f64         %fd42, %fd42, %fd34;
+       st.volatile.shared.f64  [%rd1], %fd42;
+
+BB12_26:
+       setp.lt.u32     %p16, %r12, 8;
+       @%p16 bra       BB12_28;
+
+       ld.volatile.shared.f64  %fd35, [%rd1+32];       // slot tid.x + 4
+       add.f64         %fd42, %fd42, %fd35;
+       st.volatile.shared.f64  [%rd1], %fd42;
+
+BB12_28:
+       setp.lt.u32     %p17, %r12, 4;
+       @%p17 bra       BB12_30;
+
+       ld.volatile.shared.f64  %fd36, [%rd1+16];       // slot tid.x + 2
+       add.f64         %fd42, %fd42, %fd36;
+       st.volatile.shared.f64  [%rd1], %fd42;
+
+BB12_30:
+       setp.lt.u32     %p18, %r12, 2;
+       @%p18 bra       BB12_32;
+
+       ld.volatile.shared.f64  %fd37, [%rd1+8];        // slot tid.x + 1
+       add.f64         %fd38, %fd42, %fd37;
+       st.volatile.shared.f64  [%rd1], %fd38;
+
+BB12_32:
+       setp.ne.s32     %p19, %r11, 0;
+       @%p19 bra       BB12_34;        // only thread 0 writes the row's result
+
+       ld.shared.f64   %fd39, [sdata]; // slot 0 holds the row sum
+       cvt.rn.f64.s64  %fd40, %rd4;    // divisor: signed 64-bit integer converted to f64
+       div.rn.f64      %fd41, %fd39, %fd40;    // sum / divisor
+       mul.wide.u32    %rd9, %r7, 8;
+       add.s64         %rd10, %rd3, %rd9;      // &out[ctaid.x]
+       st.f64  [%rd10], %fd41;
+
+BB12_34:
+       ret;
+}
+
+       // .globl       _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d
+.visible .func _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d(    // demangles to: void reduce_col<SumOp, MeanOp>(double*, double*, unsigned, unsigned, SumOp, MeanOp, double); one thread sums one column and stores sum / divisor
+       .param .b64 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_0,        // base address of the f64 values that are read
+       .param .b64 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_1,        // base address of the f64 results (slot index = global thread index)
+       .param .b32 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_2,        // multiplied with param_3 to form the exclusive end index
+       .param .b32 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_3,        // column count: bounds the thread index and is the loop stride
+       .param .align 1 .b8 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_4[1],     // 1-byte by-value argument; never loaded in this body
+       .param .align 8 .b8 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_5[8],     // 8-byte by-value argument; read as a 64-bit integer divisor
+       .param .b64 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6 // f64 starting value of the running sum
+)
+{
+       .reg .pred      %p<4>;
+       .reg .b32       %r<11>;
+       .reg .f64       %fd<9>;
+       .reg .b64       %rd<8>;
+
+
+       ld.param.u64    %rd1, [_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_0];    // rd1 = input base
+       ld.param.u64    %rd2, [_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_1];    // rd2 = output base
+       ld.param.u32    %r5, [_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_2];     // r5 = number of strides (presumably the row count; confirm against the caller)
+       ld.param.u32    %r6, [_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_3];     // r6 = column count / stride
+       ld.param.u64    %rd3, [_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_5];    // rd3 = raw 64 bits of the divisor argument
+       ld.param.f64    %fd8, [_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6];    // fd8 = running sum, seeded with the starting value
+       mov.u32         %r7, %ntid.x;
+       mov.u32         %r8, %ctaid.x;
+       mov.u32         %r9, %tid.x;
+       mad.lo.s32      %r1, %r7, %r8, %r9;     // r1 = ntid.x*ctaid.x + tid.x, the column handled by this thread
+       setp.ge.u32     %p1, %r1, %r6;
+       @%p1 bra        BB13_5; // thread index beyond the column count: nothing to do
+
+       mul.lo.s32      %r2, %r6, %r5;  // r2 = exclusive end index (32-bit product)
+       setp.ge.u32     %p2, %r1, %r2;
+       @%p2 bra        BB13_4; // no element in range: store starting value / divisor
+
+       mov.u32         %r10, %r1;
+
+BB13_3:        // walk down the column: index r1, r1 + r6, r1 + 2*r6, ... while below r2
+       mov.u32         %r3, %r10;
+       mul.wide.u32    %rd4, %r3, 8;   // byte offset = index * sizeof(f64)
+       add.s64         %rd5, %rd1, %rd4;
+       ld.f64  %fd5, [%rd5];
+       add.f64         %fd8, %fd8, %fd5;
+       add.s32         %r4, %r3, %r6;  // next index = current + column count
+       setp.lt.u32     %p3, %r4, %r2;
+       mov.u32         %r10, %r4;
+       @%p3 bra        BB13_3;
+
+BB13_4:
+       cvt.rn.f64.s64  %fd6, %rd3;     // divisor: signed 64-bit integer converted to f64
+       div.rn.f64      %fd7, %fd8, %fd6;       // sum / divisor
+       mul.wide.u32    %rd6, %r1, 8;
+       add.s64         %rd7, %rd2, %rd6;       // &out[r1]
+       st.f64  [%rd7], %fd7;
+
+BB13_5:
+       ret;
+}
+
        // .globl       copyUpperToLowerTriangleDense
 .visible .entry copyUpperToLowerTriangleDense(
        .param .u64 copyUpperToLowerTriangleDense_param_0,
@@ -1598,10 +2029,10 @@ BB10_5:
        setp.gt.s32     %p1, %r2, %r1;
        setp.lt.s32     %p2, %r3, %r5;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB11_2;
-       bra.uni         BB11_1;
+       @!%p3 bra       BB14_2;
+       bra.uni         BB14_1;
 
-BB11_1:
+BB14_1:
        cvta.to.global.u64      %rd2, %rd1;
        mad.lo.s32      %r12, %r1, %r4, %r2;
        mul.wide.s32    %rd3, %r12, 8;
@@ -1611,7 +2042,7 @@ BB11_1:
        add.s64         %rd6, %rd2, %rd5;
        st.global.f64   [%rd6], %fd1;
 
-BB11_2:
+BB14_2:
        ret;
 }
 
@@ -1644,14 +2075,14 @@ BB11_2:
        mad.lo.s32      %r1, %r8, %r9, %r11;
        mul.lo.s32      %r12, %r3, %r2;
        setp.ge.s32     %p1, %r1, %r12;
-       @%p1 bra        BB12_2;
+       @%p1 bra        BB15_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        st.global.f64   [%rd4], %fd1;
 
-BB12_2:
+BB15_2:
        ret;
 }
 
@@ -1685,10 +2116,10 @@ BB12_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.lt.s32     %p2, %r11, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB13_2;
-       bra.uni         BB13_1;
+       @!%p3 bra       BB16_2;
+       bra.uni         BB16_1;
 
-BB13_1:
+BB16_1:
        cvta.to.global.u64      %rd3, %rd1;
        mul.wide.s32    %rd4, %r1, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -1697,7 +2128,7 @@ BB13_1:
        add.s64         %rd7, %rd6, %rd4;
        st.global.f64   [%rd7], %fd1;
 
-BB13_2:
+BB16_2:
        ret;
 }
 
@@ -1730,10 +2161,10 @@ BB13_2:
        setp.lt.s32     %p1, %r1, %r4;
        setp.lt.s32     %p2, %r2, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB14_2;
-       bra.uni         BB14_1;
+       @!%p3 bra       BB17_2;
+       bra.uni         BB17_1;
 
-BB14_1:
+BB17_1:
        cvta.to.global.u64      %rd3, %rd1;
        mad.lo.s32      %r11, %r1, %r3, %r2;
        mul.wide.s32    %rd4, %r11, 8;
@@ -1745,7 +2176,7 @@ BB14_1:
        add.s64         %rd7, %rd6, %rd4;
        st.global.f64   [%rd7], %fd3;
 
-BB14_2:
+BB17_2:
        ret;
 }
 
@@ -1780,10 +2211,10 @@ BB14_2:
        setp.lt.s32     %p1, %r1, %r5;
        setp.lt.s32     %p2, %r2, %r4;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB15_4;
-       bra.uni         BB15_1;
+       @!%p3 bra       BB18_4;
+       bra.uni         BB18_1;
 
-BB15_1:
+BB18_1:
        cvta.to.global.u64      %rd4, %rd1;
        mad.lo.s32      %r3, %r1, %r4, %r2;
        mul.wide.s32    %rd5, %r3, 8;
@@ -1791,18 +2222,18 @@ BB15_1:
        ld.global.f64   %fd4, [%rd6];
        mov.f64         %fd5, 0d0000000000000000;
        setp.leu.f64    %p4, %fd4, 0d0000000000000000;
-       @%p4 bra        BB15_3;
+       @%p4 bra        BB18_3;
 
        cvta.to.global.u64      %rd7, %rd2;
        add.s64         %rd9, %rd7, %rd5;
        ld.global.f64   %fd5, [%rd9];
 
-BB15_3:
+BB18_3:
        cvta.to.global.u64      %rd10, %rd3;
        add.s64         %rd12, %rd10, %rd5;
        st.global.f64   [%rd12], %fd5;
 
-BB15_4:
+BB18_4:
        ret;
 }
 
@@ -1839,10 +2270,10 @@ BB15_4:
        setp.lt.s32     %p1, %r1, %r5;
        setp.lt.s32     %p2, %r2, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB16_2;
-       bra.uni         BB16_1;
+       @!%p3 bra       BB19_2;
+       bra.uni         BB19_1;
 
-BB16_1:
+BB19_1:
        cvta.to.global.u64      %rd4, %rd1;
        mad.lo.s32      %r12, %r1, %r3, %r2;
        mul.wide.s32    %rd5, %r12, 8;
@@ -1858,7 +2289,7 @@ BB16_1:
        add.s64         %rd11, %rd10, %rd5;
        st.global.f64   [%rd11], %fd3;
 
-BB16_2:
+BB19_2:
        ret;
 }
 
@@ -1902,10 +2333,10 @@ BB16_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.lt.s32     %p2, %r11, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB17_6;
-       bra.uni         BB17_1;
+       @!%p3 bra       BB20_6;
+       bra.uni         BB20_1;
 
-BB17_1:
+BB20_1:
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.s32    %rd5, %r1, 8;
        add.s64         %rd6, %rd4, %rd5;
@@ -1915,26 +2346,26 @@ BB17_1:
        setp.lt.f64     %p4, %fd8, %fd3;
        cvta.to.global.u64      %rd7, %rd3;
        add.s64         %rd1, %rd7, %rd5;
-       @%p4 bra        BB17_5;
-       bra.uni         BB17_2;
+       @%p4 bra        BB20_5;
+       bra.uni         BB20_2;
 
-BB17_5:
+BB20_5:
        st.global.f64   [%rd1], %fd4;
-       bra.uni         BB17_6;
+       bra.uni         BB20_6;
 
-BB17_2:
+BB20_2:
        setp.lt.f64     %p5, %fd1, %fd2;
-       @%p5 bra        BB17_4;
-       bra.uni         BB17_3;
+       @%p5 bra        BB20_4;
+       bra.uni         BB20_3;
 
-BB17_4:
+BB20_4:
        st.global.f64   [%rd1], %fd5;
-       bra.uni         BB17_6;
+       bra.uni         BB20_6;
 
-BB17_3:
+BB20_3:
        st.global.f64   [%rd1], %fd6;
 
-BB17_6:
+BB20_6:
        ret;
 }
 
@@ -1975,42 +2406,42 @@ BB17_6:
        setp.lt.s32     %p2, %r1, %r14;
        setp.lt.s32     %p3, %r2, %r10;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB18_55;
-       bra.uni         BB18_1;
+       @!%p4 bra       BB21_55;
+       bra.uni         BB21_1;
 
-BB18_1:
+BB21_1:
        mad.lo.s32      %r3, %r1, %r10, %r2;
        setp.eq.s32     %p5, %r11, 1;
        mov.u32         %r54, %r1;
-       @%p5 bra        BB18_5;
+       @%p5 bra        BB21_5;
 
        setp.ne.s32     %p6, %r11, 2;
        mov.u32         %r55, %r3;
-       @%p6 bra        BB18_4;
+       @%p6 bra        BB21_4;
 
        mov.u32         %r55, %r2;
 
-BB18_4:
+BB21_4:
        mov.u32         %r49, %r55;
        mov.u32         %r4, %r49;
        mov.u32         %r54, %r4;
 
-BB18_5:
+BB21_5:
        mov.u32         %r5, %r54;
        setp.eq.s32     %p7, %r12, 1;
        mov.u32         %r52, %r1;
-       @%p7 bra        BB18_9;
+       @%p7 bra        BB21_9;
 
        setp.ne.s32     %p8, %r12, 2;
        mov.u32         %r53, %r3;
-       @%p8 bra        BB18_8;
+       @%p8 bra        BB21_8;
 
        mov.u32         %r53, %r2;
 
-BB18_8:
+BB21_8:
        mov.u32         %r52, %r53;
 
-BB18_9:
+BB21_9:
        cvta.to.global.u64      %rd5, %rd3;
        cvta.to.global.u64      %rd6, %rd2;
        mul.wide.s32    %rd7, %r5, 8;
@@ -2021,47 +2452,47 @@ BB18_9:
        ld.global.f64   %fd2, [%rd10];
        mov.f64         %fd39, 0dC08F380000000000;
        setp.gt.s32     %p9, %r13, 5;
-       @%p9 bra        BB18_19;
+       @%p9 bra        BB21_19;
 
        setp.gt.s32     %p19, %r13, 2;
-       @%p19 bra       BB18_15;
+       @%p19 bra       BB21_15;
 
        setp.eq.s32     %p23, %r13, 0;
-       @%p23 bra       BB18_53;
+       @%p23 bra       BB21_53;
 
        setp.eq.s32     %p24, %r13, 1;
-       @%p24 bra       BB18_52;
-       bra.uni         BB18_13;
+       @%p24 bra       BB21_52;
+       bra.uni         BB21_13;
 
-BB18_52:
+BB21_52:
        sub.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB18_54;
+       bra.uni         BB21_54;
 
-BB18_19:
+BB21_19:
        setp.gt.s32     %p10, %r13, 8;
-       @%p10 bra       BB18_24;
+       @%p10 bra       BB21_24;
 
        setp.eq.s32     %p16, %r13, 6;
-       @%p16 bra       BB18_34;
+       @%p16 bra       BB21_34;
 
        setp.eq.s32     %p17, %r13, 7;
-       @%p17 bra       BB18_33;
-       bra.uni         BB18_22;
+       @%p17 bra       BB21_33;
+       bra.uni         BB21_22;
 
-BB18_33:
+BB21_33:
        setp.gt.f64     %p29, %fd1, %fd2;
        selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29;
-       bra.uni         BB18_54;
+       bra.uni         BB21_54;
 
-BB18_15:
+BB21_15:
        setp.eq.s32     %p20, %r13, 3;
-       @%p20 bra       BB18_51;
+       @%p20 bra       BB21_51;
 
        setp.eq.s32     %p21, %r13, 4;
-       @%p21 bra       BB18_35;
-       bra.uni         BB18_17;
+       @%p21 bra       BB21_35;
+       bra.uni         BB21_17;
 
-BB18_35:
+BB21_35:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r8}, %fd1;
@@ -2097,10 +2528,10 @@ BB18_35:
        }// Callseq End 1
        setp.lt.s32     %p33, %r8, 0;
        and.pred        %p1, %p33, %p32;
-       @!%p1 bra       BB18_37;
-       bra.uni         BB18_36;
+       @!%p1 bra       BB21_37;
+       bra.uni         BB21_36;
 
-BB18_36:
+BB21_36:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r23}, %fd38;
@@ -2112,111 +2543,111 @@ BB18_36:
        }
        mov.b64         %fd38, {%r25, %r24};
 
-BB18_37:
+BB21_37:
        mov.f64         %fd37, %fd38;
        setp.eq.f64     %p34, %fd1, 0d0000000000000000;
-       @%p34 bra       BB18_40;
-       bra.uni         BB18_38;
+       @%p34 bra       BB21_40;
+       bra.uni         BB21_38;
 
-BB18_40:
+BB21_40:
        selp.b32        %r26, %r8, 0, %p32;
        or.b32          %r27, %r26, 2146435072;
        setp.lt.s32     %p38, %r9, 0;
        selp.b32        %r28, %r27, %r26, %p38;
        mov.u32         %r29, 0;
        mov.b64         %fd37, {%r29, %r28};
-       bra.uni         BB18_41;
+       bra.uni         BB21_41;
 
-BB18_24:
+BB21_24:
        setp.gt.s32     %p11, %r13, 10;
-       @%p11 bra       BB18_28;
+       @%p11 bra       BB21_28;
 
        setp.eq.s32     %p14, %r13, 9;
-       @%p14 bra       BB18_32;
-       bra.uni         BB18_26;
+       @%p14 bra       BB21_32;
+       bra.uni         BB21_26;
 
-BB18_32:
+BB21_32:
        setp.eq.f64     %p27, %fd1, %fd2;
        selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27;
-       bra.uni         BB18_54;
+       bra.uni         BB21_54;
 
-BB18_28:
+BB21_28:
        setp.eq.s32     %p12, %r13, 11;
-       @%p12 bra       BB18_31;
-       bra.uni         BB18_29;
+       @%p12 bra       BB21_31;
+       bra.uni         BB21_29;
 
-BB18_31:
+BB21_31:
        min.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB18_54;
+       bra.uni         BB21_54;
 
-BB18_53:
+BB21_53:
        add.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB18_54;
+       bra.uni         BB21_54;
 
-BB18_13:
+BB21_13:
        setp.eq.s32     %p25, %r13, 2;
-       @%p25 bra       BB18_14;
-       bra.uni         BB18_54;
+       @%p25 bra       BB21_14;
+       bra.uni         BB21_54;
 
-BB18_14:
+BB21_14:
        mul.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB18_54;
+       bra.uni         BB21_54;
 
-BB18_34:
+BB21_34:
        setp.le.f64     %p30, %fd1, %fd2;
        selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30;
-       bra.uni         BB18_54;
+       bra.uni         BB21_54;
 
-BB18_22:
+BB21_22:
        setp.eq.s32     %p18, %r13, 8;
-       @%p18 bra       BB18_23;
-       bra.uni         BB18_54;
+       @%p18 bra       BB21_23;
+       bra.uni         BB21_54;
 
-BB18_23:
+BB21_23:
        setp.ge.f64     %p28, %fd1, %fd2;
        selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28;
-       bra.uni         BB18_54;
+       bra.uni         BB21_54;
 
-BB18_51:
+BB21_51:
        div.rn.f64      %fd39, %fd1, %fd2;
-       bra.uni         BB18_54;
+       bra.uni         BB21_54;
 
-BB18_17:
+BB21_17:
        setp.eq.s32     %p22, %r13, 5;
-       @%p22 bra       BB18_18;
-       bra.uni         BB18_54;
+       @%p22 bra       BB21_18;
+       bra.uni         BB21_54;
 
-BB18_18:
+BB21_18:
        setp.lt.f64     %p31, %fd1, %fd2;
        selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31;
-       bra.uni         BB18_54;
+       bra.uni         BB21_54;
 
-BB18_26:
+BB21_26:
        setp.eq.s32     %p15, %r13, 10;
-       @%p15 bra       BB18_27;
-       bra.uni         BB18_54;
+       @%p15 bra       BB21_27;
+       bra.uni         BB21_54;
 
-BB18_27:
+BB21_27:
        setp.neu.f64    %p26, %fd1, %fd2;
        selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB18_54;
+       bra.uni         BB21_54;
 
-BB18_29:
+BB21_29:
        setp.ne.s32     %p13, %r13, 12;
-       @%p13 bra       BB18_54;
+       @%p13 bra       BB21_54;
 
        max.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB18_54;
+       bra.uni         BB21_54;
 
-BB18_38:
+BB21_38:
        setp.gt.s32     %p35, %r8, -1;
-       @%p35 bra       BB18_41;
+       @%p35 bra       BB21_41;
 
        cvt.rzi.f64.f64 %fd29, %fd2;
        setp.neu.f64    %p36, %fd29, %fd2;
        selp.f64        %fd37, 0dFFF8000000000000, %fd37, %p36;
 
-BB18_41:
+BB21_41:
        mov.f64         %fd17, %fd37;
        add.f64         %fd18, %fd1, %fd2;
        {
@@ -2226,35 +2657,35 @@ BB18_41:
        and.b32         %r31, %r30, 2146435072;
        setp.ne.s32     %p39, %r31, 2146435072;
        mov.f64         %fd36, %fd17;
-       @%p39 bra       BB18_50;
+       @%p39 bra       BB21_50;
 
        setp.gtu.f64    %p40, %fd11, 0d7FF0000000000000;
        mov.f64         %fd36, %fd18;
-       @%p40 bra       BB18_50;
+       @%p40 bra       BB21_50;
 
        abs.f64         %fd30, %fd2;
        setp.gtu.f64    %p41, %fd30, 0d7FF0000000000000;
        mov.f64         %fd35, %fd18;
        mov.f64         %fd36, %fd35;
-       @%p41 bra       BB18_50;
+       @%p41 bra       BB21_50;
 
        and.b32         %r32, %r9, 2147483647;
        setp.ne.s32     %p42, %r32, 2146435072;
-       @%p42 bra       BB18_46;
+       @%p42 bra       BB21_46;
 
        {
        .reg .b32 %temp; 
        mov.b64         {%r33, %temp}, %fd2;
        }
        setp.eq.s32     %p43, %r33, 0;
-       @%p43 bra       BB18_49;
+       @%p43 bra       BB21_49;
 
-BB18_46:
+BB21_46:
        and.b32         %r34, %r8, 2147483647;
        setp.ne.s32     %p44, %r34, 2146435072;
        mov.f64         %fd33, %fd17;
        mov.f64         %fd36, %fd33;
-       @%p44 bra       BB18_50;
+       @%p44 bra       BB21_50;
 
        {
        .reg .b32 %temp; 
@@ -2262,7 +2693,7 @@ BB18_46:
        }
        setp.ne.s32     %p45, %r35, 0;
        mov.f64         %fd36, %fd17;
-       @%p45 bra       BB18_50;
+       @%p45 bra       BB21_50;
 
        shr.s32         %r36, %r9, 31;
        and.b32         %r37, %r36, -2146435072;
@@ -2271,9 +2702,9 @@ BB18_46:
        selp.b32        %r40, %r39, %r38, %p1;
        mov.u32         %r41, 0;
        mov.b64         %fd36, {%r41, %r40};
-       bra.uni         BB18_50;
+       bra.uni         BB21_50;
 
-BB18_49:
+BB21_49:
        setp.gt.f64     %p46, %fd11, 0d3FF0000000000000;
        selp.b32        %r42, 2146435072, 0, %p46;
        xor.b32         %r43, %r42, 2146435072;
@@ -2284,19 +2715,19 @@ BB18_49:
        mov.u32         %r46, 0;
        mov.b64         %fd36, {%r46, %r45};
 
-BB18_50:
+BB21_50:
        setp.eq.f64     %p49, %fd2, 0d0000000000000000;
        setp.eq.f64     %p50, %fd1, 0d3FF0000000000000;
        or.pred         %p51, %p50, %p49;
        selp.f64        %fd39, 0d3FF0000000000000, %fd36, %p51;
 
-BB18_54:
+BB21_54:
        cvta.to.global.u64      %rd12, %rd4;
        mul.wide.s32    %rd13, %r3, 8;
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd39;
 
-BB18_55:
+BB21_55:
        ret;
 }
 
@@ -2335,7 +2766,7 @@ BB18_55:
        mad.lo.s32      %r1, %r14, %r15, %r17;
        mul.lo.s32      %r18, %r9, %r8;
        setp.ge.s32     %p3, %r1, %r18;
-       @%p3 bra        BB19_92;
+       @%p3 bra        BB22_92;
 
        cvta.to.global.u64      %rd6, %rd5;
        cvta.to.global.u64      %rd7, %rd4;
@@ -2344,178 +2775,178 @@ BB18_55:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd1, %rd6, %rd8;
        setp.eq.s32     %p4, %r7, 0;
-       @%p4 bra        BB19_47;
+       @%p4 bra        BB22_47;
 
        setp.eq.s32     %p5, %r6, 0;
-       @%p5 bra        BB19_45;
+       @%p5 bra        BB22_45;
 
        mov.f64         %fd67, 0dC08F380000000000;
        setp.gt.s32     %p6, %r6, 6;
-       @%p6 bra        BB19_13;
+       @%p6 bra        BB22_13;
 
        setp.gt.s32     %p14, %r6, 3;
-       @%p14 bra       BB19_9;
+       @%p14 bra       BB22_9;
 
        setp.eq.s32     %p18, %r6, 1;
-       @%p18 bra       BB19_44;
+       @%p18 bra       BB22_44;
 
        setp.eq.s32     %p19, %r6, 2;
-       @%p19 bra       BB19_43;
-       bra.uni         BB19_7;
+       @%p19 bra       BB22_43;
+       bra.uni         BB22_7;
 
-BB19_43:
+BB22_43:
        mul.f64         %fd67, %fd1, %fd52;
-       bra.uni         BB19_46;
+       bra.uni         BB22_46;
 
-BB19_47:
+BB22_47:
        setp.eq.s32     %p47, %r6, 0;
-       @%p47 bra       BB19_90;
+       @%p47 bra       BB22_90;
 
        mov.f64         %fd76, 0dC08F380000000000;
        setp.gt.s32     %p48, %r6, 6;
-       @%p48 bra       BB19_58;
+       @%p48 bra       BB22_58;
 
        setp.gt.s32     %p56, %r6, 3;
-       @%p56 bra       BB19_54;
+       @%p56 bra       BB22_54;
 
        setp.eq.s32     %p60, %r6, 1;
-       @%p60 bra       BB19_89;
+       @%p60 bra       BB22_89;
 
        setp.eq.s32     %p61, %r6, 2;
-       @%p61 bra       BB19_88;
-       bra.uni         BB19_52;
+       @%p61 bra       BB22_88;
+       bra.uni         BB22_52;
 
-BB19_88:
+BB22_88:
        mul.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB19_91;
+       bra.uni         BB22_91;
 
-BB19_45:
+BB22_45:
        add.f64         %fd67, %fd1, %fd52;
 
-BB19_46:
+BB22_46:
        st.global.f64   [%rd1], %fd67;
-       bra.uni         BB19_92;
+       bra.uni         BB22_92;
 
-BB19_13:
+BB22_13:
        setp.gt.s32     %p7, %r6, 9;
-       @%p7 bra        BB19_18;
+       @%p7 bra        BB22_18;
 
        setp.eq.s32     %p11, %r6, 7;
-       @%p11 bra       BB19_25;
+       @%p11 bra       BB22_25;
 
        setp.eq.s32     %p12, %r6, 8;
-       @%p12 bra       BB19_24;
-       bra.uni         BB19_16;
+       @%p12 bra       BB22_24;
+       bra.uni         BB22_16;
 
-BB19_24:
+BB22_24:
        setp.le.f64     %p23, %fd1, %fd52;
        selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
-       bra.uni         BB19_46;
+       bra.uni         BB22_46;
 
-BB19_90:
+BB22_90:
        add.f64         %fd76, %fd1, %fd52;
 
-BB19_91:
+BB22_91:
        st.global.f64   [%rd1], %fd76;
 
-BB19_92:
+BB22_92:
        ret;
 
-BB19_58:
+BB22_58:
        setp.gt.s32     %p49, %r6, 9;
-       @%p49 bra       BB19_63;
+       @%p49 bra       BB22_63;
 
        setp.eq.s32     %p53, %r6, 7;
-       @%p53 bra       BB19_70;
+       @%p53 bra       BB22_70;
 
        setp.eq.s32     %p54, %r6, 8;
-       @%p54 bra       BB19_69;
-       bra.uni         BB19_61;
+       @%p54 bra       BB22_69;
+       bra.uni         BB22_61;
 
-BB19_69:
+BB22_69:
        setp.ge.f64     %p65, %fd1, %fd52;
        selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
-       bra.uni         BB19_91;
+       bra.uni         BB22_91;
 
-BB19_9:
+BB22_9:
        setp.eq.s32     %p15, %r6, 4;
-       @%p15 bra       BB19_27;
+       @%p15 bra       BB22_27;
 
        setp.eq.s32     %p16, %r6, 5;
-       @%p16 bra       BB19_26;
-       bra.uni         BB19_11;
+       @%p16 bra       BB22_26;
+       bra.uni         BB22_11;
 
-BB19_26:
+BB22_26:
        setp.gt.f64     %p26, %fd1, %fd52;
        selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB19_46;
+       bra.uni         BB22_46;
 
-BB19_18:
+BB22_18:
        setp.eq.s32     %p8, %r6, 10;
-       @%p8 bra        BB19_23;
+       @%p8 bra        BB22_23;
 
        setp.eq.s32     %p9, %r6, 11;
-       @%p9 bra        BB19_22;
-       bra.uni         BB19_20;
+       @%p9 bra        BB22_22;
+       bra.uni         BB22_20;
 
-BB19_22:
+BB22_22:
        min.f64         %fd67, %fd52, %fd1;
-       bra.uni         BB19_46;
+       bra.uni         BB22_46;
 
-BB19_54:
+BB22_54:
        setp.eq.s32     %p57, %r6, 4;
-       @%p57 bra       BB19_72;
+       @%p57 bra       BB22_72;
 
        setp.eq.s32     %p58, %r6, 5;
-       @%p58 bra       BB19_71;
-       bra.uni         BB19_56;
+       @%p58 bra       BB22_71;
+       bra.uni         BB22_56;
 
-BB19_71:
+BB22_71:
        setp.lt.f64     %p68, %fd1, %fd52;
        selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68;
-       bra.uni         BB19_91;
+       bra.uni         BB22_91;
 
-BB19_63:
+BB22_63:
        setp.eq.s32     %p50, %r6, 10;
-       @%p50 bra       BB19_68;
+       @%p50 bra       BB22_68;
 
        setp.eq.s32     %p51, %r6, 11;
-       @%p51 bra       BB19_67;
-       bra.uni         BB19_65;
+       @%p51 bra       BB22_67;
+       bra.uni         BB22_65;
 
-BB19_67:
+BB22_67:
        min.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB19_91;
+       bra.uni         BB22_91;
 
-BB19_44:
+BB22_44:
        sub.f64         %fd67, %fd52, %fd1;
-       bra.uni         BB19_46;
+       bra.uni         BB22_46;
 
-BB19_7:
+BB22_7:
        setp.eq.s32     %p20, %r6, 3;
-       @%p20 bra       BB19_8;
-       bra.uni         BB19_46;
+       @%p20 bra       BB22_8;
+       bra.uni         BB22_46;
 
-BB19_8:
+BB22_8:
        div.rn.f64      %fd67, %fd52, %fd1;
-       bra.uni         BB19_46;
+       bra.uni         BB22_46;
 
-BB19_25:
+BB22_25:
        setp.lt.f64     %p24, %fd1, %fd52;
        selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
-       bra.uni         BB19_46;
+       bra.uni         BB22_46;
 
-BB19_16:
+BB22_16:
        setp.eq.s32     %p13, %r6, 9;
-       @%p13 bra       BB19_17;
-       bra.uni         BB19_46;
+       @%p13 bra       BB22_17;
+       bra.uni         BB22_46;
 
-BB19_17:
+BB22_17:
        setp.eq.f64     %p22, %fd1, %fd52;
        selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
-       bra.uni         BB19_46;
+       bra.uni         BB22_46;
 
-BB19_27:
+BB22_27:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r2}, %fd52;
@@ -2551,10 +2982,10 @@ BB19_27:
        }// Callseq End 2
        setp.lt.s32     %p28, %r2, 0;
        and.pred        %p1, %p28, %p27;
-       @!%p1 bra       BB19_29;
-       bra.uni         BB19_28;
+       @!%p1 bra       BB22_29;
+       bra.uni         BB22_28;
 
-BB19_28:
+BB22_28:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r21}, %fd66;
@@ -2566,72 +2997,72 @@ BB19_28:
        }
        mov.b64         %fd66, {%r23, %r22};
 
-BB19_29:
+BB22_29:
        mov.f64         %fd65, %fd66;
        setp.eq.f64     %p29, %fd52, 0d0000000000000000;
-       @%p29 bra       BB19_32;
-       bra.uni         BB19_30;
+       @%p29 bra       BB22_32;
+       bra.uni         BB22_30;
 
-BB19_32:
+BB22_32:
        selp.b32        %r24, %r2, 0, %p27;
        or.b32          %r25, %r24, 2146435072;
        setp.lt.s32     %p33, %r3, 0;
        selp.b32        %r26, %r25, %r24, %p33;
        mov.u32         %r27, 0;
        mov.b64         %fd65, {%r27, %r26};
-       bra.uni         BB19_33;
+       bra.uni         BB22_33;
 
-BB19_11:
+BB22_11:
        setp.eq.s32     %p17, %r6, 6;
-       @%p17 bra       BB19_12;
-       bra.uni         BB19_46;
+       @%p17 bra       BB22_12;
+       bra.uni         BB22_46;
 
-BB19_12:
+BB22_12:
        setp.ge.f64     %p25, %fd1, %fd52;
        selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
-       bra.uni         BB19_46;
+       bra.uni         BB22_46;
 
-BB19_23:
+BB22_23:
        setp.neu.f64    %p21, %fd1, %fd52;
        selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p21;
-       bra.uni         BB19_46;
+       bra.uni         BB22_46;
 
-BB19_20:
+BB22_20:
        setp.ne.s32     %p10, %r6, 12;
-       @%p10 bra       BB19_46;
+       @%p10 bra       BB22_46;
 
        max.f64         %fd67, %fd52, %fd1;
-       bra.uni         BB19_46;
+       bra.uni         BB22_46;
 
-BB19_89:
+BB22_89:
        sub.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB19_91;
+       bra.uni         BB22_91;
 
-BB19_52:
+BB22_52:
        setp.eq.s32     %p62, %r6, 3;
-       @%p62 bra       BB19_53;
-       bra.uni         BB19_91;
+       @%p62 bra       BB22_53;
+       bra.uni         BB22_91;
 
-BB19_53:
+BB22_53:
        div.rn.f64      %fd76, %fd1, %fd52;
-       bra.uni         BB19_91;
+       bra.uni         BB22_91;
 
-BB19_70:
+BB22_70:
        setp.gt.f64     %p66, %fd1, %fd52;
        selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
-       bra.uni         BB19_91;
+       bra.uni         BB22_91;
 
-BB19_61:
+BB22_61:
        setp.eq.s32     %p55, %r6, 9;
-       @%p55 bra       BB19_62;
-       bra.uni         BB19_91;
+       @%p55 bra       BB22_62;
+       bra.uni         BB22_91;
 
-BB19_62:
+BB22_62:
        setp.eq.f64     %p64, %fd1, %fd52;
        selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p64;
-       bra.uni         BB19_91;
+       bra.uni         BB22_91;
 
-BB19_72:
+BB22_72:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r4}, %fd1;
@@ -2667,10 +3098,10 @@ BB19_72:
        }// Callseq End 3
        setp.lt.s32     %p70, %r4, 0;
        and.pred        %p2, %p70, %p69;
-       @!%p2 bra       BB19_74;
-       bra.uni         BB19_73;
+       @!%p2 bra       BB22_74;
+       bra.uni         BB22_73;
 
-BB19_73:
+BB22_73:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r47}, %fd75;
@@ -2682,52 +3113,52 @@ BB19_73:
        }
        mov.b64         %fd75, {%r49, %r48};
 
-BB19_74:
+BB22_74:
        mov.f64         %fd74, %fd75;
        setp.eq.f64     %p71, %fd1, 0d0000000000000000;
-       @%p71 bra       BB19_77;
-       bra.uni         BB19_75;
+       @%p71 bra       BB22_77;
+       bra.uni         BB22_75;
 
-BB19_77:
+BB22_77:
        selp.b32        %r50, %r4, 0, %p69;
        or.b32          %r51, %r50, 2146435072;
        setp.lt.s32     %p75, %r5, 0;
        selp.b32        %r52, %r51, %r50, %p75;
        mov.u32         %r53, 0;
        mov.b64         %fd74, {%r53, %r52};
-       bra.uni         BB19_78;
+       bra.uni         BB22_78;
 
-BB19_56:
+BB22_56:
        setp.eq.s32     %p59, %r6, 6;
-       @%p59 bra       BB19_57;
-       bra.uni         BB19_91;
+       @%p59 bra       BB22_57;
+       bra.uni         BB22_91;
 
-BB19_57:
+BB22_57:
        setp.le.f64     %p67, %fd1, %fd52;
        selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67;
-       bra.uni         BB19_91;
+       bra.uni         BB22_91;
 
-BB19_68:
+BB22_68:
        setp.neu.f64    %p63, %fd1, %fd52;
        selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p63;
-       bra.uni         BB19_91;
+       bra.uni         BB22_91;
 
-BB19_65:
+BB22_65:
        setp.ne.s32     %p52, %r6, 12;
-       @%p52 bra       BB19_91;
+       @%p52 bra       BB22_91;
 
        max.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB19_91;
+       bra.uni         BB22_91;
 
-BB19_30:
+BB22_30:
        setp.gt.s32     %p30, %r2, -1;
-       @%p30 bra       BB19_33;
+       @%p30 bra       BB22_33;
 
        cvt.rzi.f64.f64 %fd54, %fd1;
        setp.neu.f64    %p31, %fd54, %fd1;
        selp.f64        %fd65, 0dFFF8000000000000, %fd65, %p31;
 
-BB19_33:
+BB22_33:
        mov.f64         %fd16, %fd65;
        add.f64         %fd17, %fd1, %fd52;
        {
@@ -2737,35 +3168,35 @@ BB19_33:
        and.b32         %r29, %r28, 2146435072;
        setp.ne.s32     %p34, %r29, 2146435072;
        mov.f64         %fd64, %fd16;
-       @%p34 bra       BB19_42;
+       @%p34 bra       BB22_42;
 
        setp.gtu.f64    %p35, %fd10, 0d7FF0000000000000;
        mov.f64         %fd64, %fd17;
-       @%p35 bra       BB19_42;
+       @%p35 bra       BB22_42;
 
        abs.f64         %fd55, %fd1;
        setp.gtu.f64    %p36, %fd55, 0d7FF0000000000000;
        mov.f64         %fd63, %fd17;
        mov.f64         %fd64, %fd63;
-       @%p36 bra       BB19_42;
+       @%p36 bra       BB22_42;
 
        and.b32         %r30, %r3, 2147483647;
        setp.ne.s32     %p37, %r30, 2146435072;
-       @%p37 bra       BB19_38;
+       @%p37 bra       BB22_38;
 
        {
        .reg .b32 %temp; 
        mov.b64         {%r31, %temp}, %fd1;
        }
        setp.eq.s32     %p38, %r31, 0;
-       @%p38 bra       BB19_41;
+       @%p38 bra       BB22_41;
 
-BB19_38:
+BB22_38:
        and.b32         %r32, %r2, 2147483647;
        setp.ne.s32     %p39, %r32, 2146435072;
        mov.f64         %fd61, %fd16;
        mov.f64         %fd64, %fd61;
-       @%p39 bra       BB19_42;
+       @%p39 bra       BB22_42;
 
        {
        .reg .b32 %temp; 
@@ -2773,7 +3204,7 @@ BB19_38:
        }
        setp.ne.s32     %p40, %r33, 0;
        mov.f64         %fd64, %fd16;
-       @%p40 bra       BB19_42;
+       @%p40 bra       BB22_42;
 
        shr.s32         %r34, %r3, 31;
        and.b32         %r35, %r34, -2146435072;
@@ -2782,17 +3213,17 @@ BB19_38:
        selp.b32        %r38, %r37, %r36, %p1;
        mov.u32         %r39, 0;
        mov.b64         %fd64, {%r39, %r38};
-       bra.uni         BB19_42;
+       bra.uni         BB22_42;
 
-BB19_75:
+BB22_75:
        setp.gt.s32     %p72, %r4, -1;
-       @%p72 bra       BB19_78;
+       @%p72 bra       BB22_78;
 
        cvt.rzi.f64.f64 %fd57, %fd52;
        setp.neu.f64    %p73, %fd57, %fd52;
        selp.f64        %fd74, 0dFFF8000000000000, %fd74, %p73;
 
-BB19_78:
+BB22_78:
        mov.f64         %fd41, %fd74;
        add.f64         %fd42, %fd1, %fd52;
        {
@@ -2802,35 +3233,35 @@ BB19_78:
        and.b32         %r55, %r54, 2146435072;
        setp.ne.s32     %p76, %r55, 2146435072;
        mov.f64         %fd73, %fd41;
-       @%p76 bra       BB19_87;
+       @%p76 bra       BB22_87;
 
        setp.gtu.f64    %p77, %fd35, 0d7FF0000000000000;
        mov.f64         %fd73, %fd42;
-       @%p77 bra       BB19_87;
+       @%p77 bra       BB22_87;
 
        abs.f64         %fd58, %fd52;
        setp.gtu.f64    %p78, %fd58, 0d7FF0000000000000;
        mov.f64         %fd72, %fd42;
        mov.f64         %fd73, %fd72;
-       @%p78 bra       BB19_87;
+       @%p78 bra       BB22_87;
 
        and.b32         %r56, %r5, 2147483647;
        setp.ne.s32     %p79, %r56, 2146435072;
-       @%p79 bra       BB19_83;
+       @%p79 bra       BB22_83;
 
        {
        .reg .b32 %temp; 
        mov.b64         {%r57, %temp}, %fd52;
        }
        setp.eq.s32     %p80, %r57, 0;
-       @%p80 bra       BB19_86;
+       @%p80 bra       BB22_86;
 
-BB19_83:
+BB22_83:
        and.b32         %r58, %r4, 2147483647;
        setp.ne.s32     %p81, %r58, 2146435072;
        mov.f64         %fd70, %fd41;
        mov.f64         %fd73, %fd70;
-       @%p81 bra       BB19_87;
+       @%p81 bra       BB22_87;
 
        {
        .reg .b32 %temp; 
@@ -2838,7 +3269,7 @@ BB19_83:
        }
        setp.ne.s32     %p82, %r59, 0;
        mov.f64         %fd73, %fd41;
-       @%p82 bra       BB19_87;
+       @%p82 bra       BB22_87;
 
        shr.s32         %r60, %r5, 31;
        and.b32         %r61, %r60, -2146435072;
@@ -2847,9 +3278,9 @@ BB19_83:
        selp.b32        %r64, %r63, %r62, %p2;
        mov.u32         %r65, 0;
        mov.b64         %fd73, {%r65, %r64};
-       bra.uni         BB19_87;
+       bra.uni         BB22_87;
 
-BB19_41:
+BB22_41:
        setp.gt.f64     %p41, %fd10, 0d3FF0000000000000;
        selp.b32        %r40, 2146435072, 0, %p41;
        xor.b32         %r41, %r40, 2146435072;
@@ -2860,14 +3291,14 @@ BB19_41:
        mov.u32         %r44, 0;
        mov.b64         %fd64, {%r44, %r43};
 
-BB19_42:
+BB22_42:
        setp.eq.f64     %p44, %fd1, 0d0000000000000000;
        setp.eq.f64     %p45, %fd52, 0d3FF0000000000000;
        or.pred         %p46, %p45, %p44;
        selp.f64        %fd67, 0d3FF0000000000000, %fd64, %p46;
-       bra.uni         BB19_46;
+       bra.uni         BB22_46;
 
-BB19_86:
+BB22_86:
        setp.gt.f64     %p83, %fd35, 0d3FF0000000000000;
        selp.b32        %r66, 2146435072, 0, %p83;
        xor.b32         %r67, %r66, 2146435072;
@@ -2878,12 +3309,12 @@ BB19_86:
        mov.u32         %r70, 0;
        mov.b64         %fd73, {%r70, %r69};
 
-BB19_87:
+BB22_87:
        setp.eq.f64     %p86, %fd52, 0d0000000000000000;
        setp.eq.f64     %p87, %fd1, 0d3FF0000000000000;
        or.pred         %p88, %p87, %p86;
        selp.f64        %fd76, 0d3FF0000000000000, %fd73, %p88;
-       bra.uni         BB19_91;
+       bra.uni         BB22_91;
 }
 
        // .globl       fill
@@ -2907,14 +3338,14 @@ BB19_87:
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r4, %r3, %r5;
        setp.ge.s32     %p1, %r1, %r2;
-       @%p1 bra        BB20_2;
+       @%p1 bra        BB23_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        st.global.f64   [%rd4], %fd1;
 
-BB20_2:
+BB23_2:
        ret;
 }
 
@@ -2942,9 +3373,9 @@ BB20_2:
        mov.f64         %fd76, 0d0000000000000000;
        mov.f64         %fd77, %fd76;
        setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB21_4;
+       @%p1 bra        BB24_4;
 
-BB21_1:
+BB24_1:
        mov.f64         %fd1, %fd77;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r32, 8;
@@ -2953,23 +3384,23 @@ BB21_1:
        add.f64         %fd78, %fd1, %fd30;
        add.s32         %r3, %r32, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB21_3;
+       @%p2 bra        BB24_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd31, [%rd9];
        add.f64         %fd78, %fd78, %fd31;
 
-BB21_3:
+BB24_3:
        mov.f64         %fd77, %fd78;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r32, %r12, %r13, %r32;
        setp.lt.u32     %p3, %r32, %r5;
        mov.f64         %fd76, %fd77;
-       @%p3 bra        BB21_1;
+       @%p3 bra        BB24_1;
 
-BB21_4:
+BB24_4:
        mov.f64         %fd74, %fd76;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -2977,130 +3408,130 @@ BB21_4:
        st.shared.f64   [%rd1], %fd74;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB21_8;
+       @%p4 bra        BB24_8;
 
        setp.gt.u32     %p5, %r6, 511;
        mov.f64         %fd75, %fd74;
-       @%p5 bra        BB21_7;
+       @%p5 bra        BB24_7;
 
        ld.shared.f64   %fd32, [%rd1+4096];
        add.f64         %fd75, %fd74, %fd32;
        st.shared.f64   [%rd1], %fd75;
 
-BB21_7:
+BB24_7:
        mov.f64         %fd74, %fd75;
        bar.sync        0;
 
-BB21_8:
+BB24_8:
        mov.f64         %fd72, %fd74;
        setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB21_12;
+       @%p6 bra        BB24_12;
 
        setp.gt.u32     %p7, %r6, 255;
        mov.f64         %fd73, %fd72;
-       @%p7 bra        BB21_11;
+       @%p7 bra        BB24_11;
 
        ld.shared.f64   %fd33, [%rd1+2048];
        add.f64         %fd73, %fd72, %fd33;
        st.shared.f64   [%rd1], %fd73;
 
-BB21_11:
+BB24_11:
        mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB21_12:
+BB24_12:
        mov.f64         %fd70, %fd72;
        setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB21_16;
+       @%p8 bra        BB24_16;
 
        setp.gt.u32     %p9, %r6, 127;
        mov.f64         %fd71, %fd70;
-       @%p9 bra        BB21_15;
+       @%p9 bra        BB24_15;
 
        ld.shared.f64   %fd34, [%rd1+1024];
        add.f64         %fd71, %fd70, %fd34;
        st.shared.f64   [%rd1], %fd71;
 
-BB21_15:
+BB24_15:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB21_16:
+BB24_16:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB21_20;
+       @%p10 bra       BB24_20;
 
        setp.gt.u32     %p11, %r6, 63;
        mov.f64         %fd69, %fd68;
-       @%p11 bra       BB21_19;
+       @%p11 bra       BB24_19;
 
        ld.shared.f64   %fd35, [%rd1+512];
        add.f64         %fd69, %fd68, %fd35;
        st.shared.f64   [%rd1], %fd69;
 
-BB21_19:
+BB24_19:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB21_20:
+BB24_20:
        mov.f64         %fd67, %fd68;
        setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB21_33;
+       @%p12 bra       BB24_33;
 
        setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB21_23;
+       @%p13 bra       BB24_23;
 
        ld.volatile.shared.f64  %fd36, [%rd1+256];
        add.f64         %fd67, %fd67, %fd36;
        st.volatile.shared.f64  [%rd1], %fd67;
 
-BB21_23:
+BB24_23:
        mov.f64         %fd66, %fd67;
        setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB21_25;
+       @%p14 bra       BB24_25;
 
        ld.volatile.shared.f64  %fd37, [%rd1+128];
        add.f64         %fd66, %fd66, %fd37;
        st.volatile.shared.f64  [%rd1], %fd66;
 
-BB21_25:
+BB24_25:
        mov.f64         %fd65, %fd66;
        setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB21_27;
+       @%p15 bra       BB24_27;
 
        ld.volatile.shared.f64  %fd38, [%rd1+64];
        add.f64         %fd65, %fd65, %fd38;
        st.volatile.shared.f64  [%rd1], %fd65;
 
-BB21_27:
+BB24_27:
        mov.f64         %fd64, %fd65;
        setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB21_29;
+       @%p16 bra       BB24_29;
 
        ld.volatile.shared.f64  %fd39, [%rd1+32];
        add.f64         %fd64, %fd64, %fd39;
        st.volatile.shared.f64  [%rd1], %fd64;
 
-BB21_29:
+BB24_29:
        mov.f64         %fd63, %fd64;
        setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB21_31;
+       @%p17 bra       BB24_31;
 
        ld.volatile.shared.f64  %fd40, [%rd1+16];
        add.f64         %fd63, %fd63, %fd40;
        st.volatile.shared.f64  [%rd1], %fd63;
 
-BB21_31:
+BB24_31:
        setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB21_33;
+       @%p18 bra       BB24_33;
 
        ld.volatile.shared.f64  %fd41, [%rd1+8];
        add.f64         %fd42, %fd63, %fd41;
        st.volatile.shared.f64  [%rd1], %fd42;
 
-BB21_33:
+BB24_33:
        setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB21_35;
+       @%p19 bra       BB24_35;
 
        ld.shared.f64   %fd43, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -3108,7 +3539,7 @@ BB21_33:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd43;
 
-BB21_35:
+BB24_35:
        ret;
 }
 
@@ -3132,29 +3563,466 @@ BB21_35:
        ld.param.u32    %r4, [reduce_row_sum_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB22_35;
+       @%p1 bra        BB25_35;
+
+       mov.u32         %r38, %tid.x;
+       mov.f64         %fd72, 0d0000000000000000;
+       mov.f64         %fd73, %fd72;
+       setp.ge.u32     %p2, %r38, %r4;
+       @%p2 bra        BB25_4;
+
+       cvta.to.global.u64      %rd3, %rd1;
+
+BB25_3:
+       mad.lo.s32      %r8, %r6, %r4, %r38;
+       mul.wide.u32    %rd4, %r8, 8;
+       add.s64         %rd5, %rd3, %rd4;
+       ld.global.f64   %fd28, [%rd5];
+       add.f64         %fd73, %fd73, %fd28;
+       mov.u32         %r9, %ntid.x;
+       add.s32         %r38, %r9, %r38;
+       setp.lt.u32     %p3, %r38, %r4;
+       mov.f64         %fd72, %fd73;
+       @%p3 bra        BB25_3;
+
+BB25_4:
+       mov.f64         %fd70, %fd72;
+       mov.u32         %r10, %tid.x;
+       mul.wide.u32    %rd6, %r10, 8;
+       mov.u64         %rd7, sdata;
+       add.s64         %rd8, %rd7, %rd6;
+       st.shared.f64   [%rd8], %fd70;
+       bar.sync        0;
+       mov.u32         %r11, %ntid.x;
+       setp.lt.u32     %p4, %r11, 1024;
+       @%p4 bra        BB25_8;
+
+       setp.gt.u32     %p5, %r10, 511;
+       mov.f64         %fd71, %fd70;
+       @%p5 bra        BB25_7;
+
+       ld.shared.f64   %fd29, [%rd8+4096];
+       add.f64         %fd71, %fd70, %fd29;
+       st.shared.f64   [%rd8], %fd71;
+
+BB25_7:
+       mov.f64         %fd70, %fd71;
+       bar.sync        0;
+
+BB25_8:
+       mov.f64         %fd68, %fd70;
+       setp.lt.u32     %p6, %r11, 512;
+       @%p6 bra        BB25_12;
+
+       setp.gt.u32     %p7, %r10, 255;
+       mov.f64         %fd69, %fd68;
+       @%p7 bra        BB25_11;
+
+       ld.shared.f64   %fd30, [%rd8+2048];
+       add.f64         %fd69, %fd68, %fd30;
+       st.shared.f64   [%rd8], %fd69;
+
+BB25_11:
+       mov.f64         %fd68, %fd69;
+       bar.sync        0;
+
+BB25_12:
+       mov.f64         %fd66, %fd68;
+       setp.lt.u32     %p8, %r11, 256;
+       @%p8 bra        BB25_16;
+
+       setp.gt.u32     %p9, %r10, 127;
+       mov.f64         %fd67, %fd66;
+       @%p9 bra        BB25_15;
+
+       ld.shared.f64   %fd31, [%rd8+1024];
+       add.f64         %fd67, %fd66, %fd31;
+       st.shared.f64   [%rd8], %fd67;
+
+BB25_15:
+       mov.f64         %fd66, %fd67;
+       bar.sync        0;
+
+BB25_16:
+       mov.f64         %fd64, %fd66;
+       setp.lt.u32     %p10, %r11, 128;
+       @%p10 bra       BB25_20;
+
+       setp.gt.u32     %p11, %r10, 63;
+       mov.f64         %fd65, %fd64;
+       @%p11 bra       BB25_19;
+
+       ld.shared.f64   %fd32, [%rd8+512];
+       add.f64         %fd65, %fd64, %fd32;
+       st.shared.f64   [%rd8], %fd65;
+
+BB25_19:
+       mov.f64         %fd64, %fd65;
+       bar.sync        0;
+
+BB25_20:
+       mov.f64         %fd63, %fd64;
+       setp.gt.u32     %p12, %r10, 31;
+       @%p12 bra       BB25_33;
+
+       setp.lt.u32     %p13, %r11, 64;
+       @%p13 bra       BB25_23;
+
+       ld.volatile.shared.f64  %fd33, [%rd8+256];
+       add.f64         %fd63, %fd63, %fd33;
+       st.volatile.shared.f64  [%rd8], %fd63;
+
+BB25_23:
+       mov.f64         %fd62, %fd63;
+       setp.lt.u32     %p14, %r11, 32;
+       @%p14 bra       BB25_25;
+
+       ld.volatile.shared.f64  %fd34, [%rd8+128];
+       add.f64         %fd62, %fd62, %fd34;
+       st.volatile.shared.f64  [%rd8], %fd62;
+
+BB25_25:
+       mov.f64         %fd61, %fd62;
+       setp.lt.u32     %p15, %r11, 16;
+       @%p15 bra       BB25_27;
+
+       ld.volatile.shared.f64  %fd35, [%rd8+64];
+       add.f64         %fd61, %fd61, %fd35;
+       st.volatile.shared.f64  [%rd8], %fd61;
+
+BB25_27:
+       mov.f64         %fd60, %fd61;
+       setp.lt.u32     %p16, %r11, 8;
+       @%p16 bra       BB25_29;
+
+       ld.volatile.shared.f64  %fd36, [%rd8+32];
+       add.f64         %fd60, %fd60, %fd36;
+       st.volatile.shared.f64  [%rd8], %fd60;
+
+BB25_29:
+       mov.f64         %fd59, %fd60;
+       setp.lt.u32     %p17, %r11, 4;
+       @%p17 bra       BB25_31;
+
+       ld.volatile.shared.f64  %fd37, [%rd8+16];
+       add.f64         %fd59, %fd59, %fd37;
+       st.volatile.shared.f64  [%rd8], %fd59;
+
+BB25_31:
+       setp.lt.u32     %p18, %r11, 2;
+       @%p18 bra       BB25_33;
+
+       ld.volatile.shared.f64  %fd38, [%rd8+8];
+       add.f64         %fd39, %fd59, %fd38;
+       st.volatile.shared.f64  [%rd8], %fd39;
+
+BB25_33:
+       setp.ne.s32     %p19, %r10, 0;
+       @%p19 bra       BB25_35;
+
+       ld.shared.f64   %fd40, [sdata];
+       cvta.to.global.u64      %rd39, %rd2;
+       mul.wide.u32    %rd40, %r6, 8;
+       add.s64         %rd41, %rd39, %rd40;
+       st.global.f64   [%rd41], %fd40;
+
+BB25_35:
+       ret;
+}
+
+       // .globl       reduce_col_sum -- thread t accumulates in[t], in[t + p3], in[t + 2*p3], ... (indices below p2*p3) and stores the sum to out[t]
+.visible .entry reduce_col_sum(
+       .param .u64 reduce_col_sum_param_0,     // in:  base address of the f64 input (converted to global space below)
+       .param .u64 reduce_col_sum_param_1,     // out: base address of the f64 output, one element per active thread
+       .param .u32 reduce_col_sum_param_2,     // p2: only used as a factor of the index bound p2*p3 (presumably the row count -- TODO confirm)
+       .param .u32 reduce_col_sum_param_3      // p3: active-thread limit and loop stride (presumably the column count -- TODO confirm)
+)
+{
+       .reg .pred      %p<4>;
+       .reg .b32       %r<11>;
+       .reg .f64       %fd<10>;
+       .reg .b64       %rd<9>;
+
+
+       ld.param.u64    %rd2, [reduce_col_sum_param_0];
+       ld.param.u64    %rd3, [reduce_col_sum_param_1];
+       ld.param.u32    %r5, [reduce_col_sum_param_2];
+       ld.param.u32    %r6, [reduce_col_sum_param_3];
+       mov.u32         %r7, %ntid.x;
+       mov.u32         %r8, %ctaid.x;
+       mov.u32         %r9, %tid.x;
+       mad.lo.s32      %r1, %r7, %r8, %r9;     // %r1 = ntid.x * ctaid.x + tid.x (global thread index t)
+       setp.ge.u32     %p1, %r1, %r6;
+       @%p1 bra        BB26_5;                 // threads with t >= p3 return without storing anything
+
+       cvta.to.global.u64      %rd1, %rd2;
+       mul.lo.s32      %r2, %r6, %r5;          // %r2 = p3 * p2, exclusive index bound (low 32 bits of the product)
+       mov.f64         %fd8, 0d0000000000000000;       // accumulator starts at 0.0
+       mov.f64         %fd9, %fd8;
+       setp.ge.u32     %p2, %r1, %r2;
+       @%p2 bra        BB26_4;                 // empty index range: the 0.0 start value is stored
+
+       mov.u32         %r10, %r1;
+
+BB26_3:
+       mov.u32         %r3, %r10;
+       mul.wide.u32    %rd4, %r3, 8;           // byte offset = index * 8 (size of an f64)
+       add.s64         %rd5, %rd1, %rd4;
+       ld.global.f64   %fd6, [%rd5];
+       add.f64         %fd9, %fd9, %fd6;
+       add.s32         %r4, %r3, %r6;          // advance the index by p3 elements
+       setp.lt.u32     %p3, %r4, %r2;
+       mov.u32         %r10, %r4;
+       mov.f64         %fd8, %fd9;
+       @%p3 bra        BB26_3;                 // loop while the next index is still below p2*p3
+
+BB26_4:
+       cvta.to.global.u64      %rd6, %rd3;
+       mul.wide.u32    %rd7, %r1, 8;
+       add.s64         %rd8, %rd6, %rd7;
+       st.global.f64   [%rd8], %fd8;           // out[t] = accumulated sum
+
+BB26_5:
+       ret;
+}
+
+       // .globl       reduce_max -- per-block max reduction over f64 input indices below n; thread 0 of each block stores the block's result to out[ctaid.x]
+.visible .entry reduce_max(
+       .param .u64 reduce_max_param_0,         // in:  base address of the f64 input
+       .param .u64 reduce_max_param_1,         // out: base address of the f64 output, one slot per block
+       .param .u32 reduce_max_param_2          // n:   exclusive upper bound on input indices
+)
+{
+       .reg .pred      %p<20>;
+       .reg .b32       %r<33>;
+       .reg .f64       %fd<79>;
+       .reg .b64       %rd<15>;
+
+
+       ld.param.u64    %rd2, [reduce_max_param_0];
+       ld.param.u64    %rd3, [reduce_max_param_1];
+       ld.param.u32    %r5, [reduce_max_param_2];
+       mov.u32         %r6, %tid.x;
+       mov.u32         %r7, %ctaid.x;
+       shl.b32         %r8, %r7, 1;
+       mov.u32         %r9, %ntid.x;
+       mad.lo.s32      %r32, %r8, %r9, %r6;    // first index i = 2 * ctaid.x * ntid.x + tid.x
+       mov.f64         %fd76, 0dFFEFFFFFFFFFFFFF;      // FIX: start at -DBL_MAX; the old 0d0010000000000000 is DBL_MIN (smallest positive normal), so any maximum <= 0 came out as ~2.2e-308
+       mov.f64         %fd77, %fd76;           // NOTE(review): file looks compiler-generated; the start value presumably also needs changing in the CUDA source -- TODO confirm
+       setp.ge.u32     %p1, %r32, %r5;
+       @%p1 bra        BB27_4;                 // nothing to read for this thread: keep the start value
+
+BB27_1:
+       mov.f64         %fd1, %fd77;
+       cvta.to.global.u64      %rd4, %rd2;
+       mul.wide.u32    %rd5, %r32, 8;
+       add.s64         %rd6, %rd4, %rd5;
+       ld.global.f64   %fd30, [%rd6];
+       max.f64         %fd78, %fd1, %fd30;     // acc = max(acc, in[i])
+       add.s32         %r3, %r32, %r9;
+       setp.ge.u32     %p2, %r3, %r5;
+       @%p2 bra        BB27_3;                 // second element in[i + ntid.x] only if it is in range
+
+       mul.wide.u32    %rd8, %r3, 8;
+       add.s64         %rd9, %rd4, %rd8;
+       ld.global.f64   %fd31, [%rd9];
+       max.f64         %fd78, %fd78, %fd31;
+
+BB27_3:
+       mov.f64         %fd77, %fd78;
+       shl.b32         %r12, %r9, 1;
+       mov.u32         %r13, %nctaid.x;
+       mad.lo.s32      %r32, %r12, %r13, %r32; // i += 2 * ntid.x * nctaid.x
+       setp.lt.u32     %p3, %r32, %r5;
+       mov.f64         %fd76, %fd77;
+       @%p3 bra        BB27_1;
+
+BB27_4:
+       mov.f64         %fd74, %fd76;
+       mul.wide.u32    %rd10, %r6, 8;
+       mov.u64         %rd11, sdata;
+       add.s64         %rd1, %rd11, %rd10;     // %rd1 = &sdata[tid.x]
+       st.shared.f64   [%rd1], %fd74;          // publish this thread's partial maximum
+       bar.sync        0;
+       setp.lt.u32     %p4, %r9, 1024;
+       @%p4 bra        BB27_8;                 // step skipped when ntid.x < 1024
+
+       setp.gt.u32     %p5, %r6, 511;
+       mov.f64         %fd75, %fd74;
+       @%p5 bra        BB27_7;
+
+       ld.shared.f64   %fd32, [%rd1+4096];     // sdata[tid.x + 512]
+       max.f64         %fd75, %fd74, %fd32;
+       st.shared.f64   [%rd1], %fd75;
+
+BB27_7:
+       mov.f64         %fd74, %fd75;
+       bar.sync        0;
+
+BB27_8:
+       mov.f64         %fd72, %fd74;
+       setp.lt.u32     %p6, %r9, 512;
+       @%p6 bra        BB27_12;                // step skipped when ntid.x < 512
+
+       setp.gt.u32     %p7, %r6, 255;
+       mov.f64         %fd73, %fd72;
+       @%p7 bra        BB27_11;
+
+       ld.shared.f64   %fd33, [%rd1+2048];     // sdata[tid.x + 256]
+       max.f64         %fd73, %fd72, %fd33;
+       st.shared.f64   [%rd1], %fd73;
+
+BB27_11:
+       mov.f64         %fd72, %fd73;
+       bar.sync        0;
+
+BB27_12:
+       mov.f64         %fd70, %fd72;
+       setp.lt.u32     %p8, %r9, 256;
+       @%p8 bra        BB27_16;                // step skipped when ntid.x < 256
+
+       setp.gt.u32     %p9, %r6, 127;
+       mov.f64         %fd71, %fd70;
+       @%p9 bra        BB27_15;
+
+       ld.shared.f64   %fd34, [%rd1+1024];     // sdata[tid.x + 128]
+       max.f64         %fd71, %fd70, %fd34;
+       st.shared.f64   [%rd1], %fd71;
+
+BB27_15:
+       mov.f64         %fd70, %fd71;
+       bar.sync        0;
+
+BB27_16:
+       mov.f64         %fd68, %fd70;
+       setp.lt.u32     %p10, %r9, 128;
+       @%p10 bra       BB27_20;                // step skipped when ntid.x < 128
+
+       setp.gt.u32     %p11, %r6, 63;
+       mov.f64         %fd69, %fd68;
+       @%p11 bra       BB27_19;
+
+       ld.shared.f64   %fd35, [%rd1+512];      // sdata[tid.x + 64]
+       max.f64         %fd69, %fd68, %fd35;
+       st.shared.f64   [%rd1], %fd69;
+
+BB27_19:
+       mov.f64         %fd68, %fd69;
+       bar.sync        0;
+
+BB27_20:
+       mov.f64         %fd67, %fd68;
+       setp.gt.u32     %p12, %r6, 31;
+       @%p12 bra       BB27_33;                // only threads 0..31 run the remaining steps; no bar.sync is issued from here on
+
+       setp.lt.u32     %p13, %r9, 64;
+       @%p13 bra       BB27_23;
+
+       ld.volatile.shared.f64  %fd36, [%rd1+256];      // sdata[tid.x + 32]
+       max.f64         %fd67, %fd67, %fd36;
+       st.volatile.shared.f64  [%rd1], %fd67;
+
+BB27_23:
+       mov.f64         %fd66, %fd67;
+       setp.lt.u32     %p14, %r9, 32;
+       @%p14 bra       BB27_25;
+
+       ld.volatile.shared.f64  %fd37, [%rd1+128];      // sdata[tid.x + 16]
+       max.f64         %fd66, %fd66, %fd37;
+       st.volatile.shared.f64  [%rd1], %fd66;
+
+BB27_25:
+       mov.f64         %fd65, %fd66;
+       setp.lt.u32     %p15, %r9, 16;
+       @%p15 bra       BB27_27;
+
+       ld.volatile.shared.f64  %fd38, [%rd1+64];       // sdata[tid.x + 8]
+       max.f64         %fd65, %fd65, %fd38;
+       st.volatile.shared.f64  [%rd1], %fd65;
+
+BB27_27:
+       mov.f64         %fd64, %fd65;
+       setp.lt.u32     %p16, %r9, 8;
+       @%p16 bra       BB27_29;
+
+       ld.volatile.shared.f64  %fd39, [%rd1+32];       // sdata[tid.x + 4]
+       max.f64         %fd64, %fd64, %fd39;
+       st.volatile.shared.f64  [%rd1], %fd64;
+
+BB27_29:
+       mov.f64         %fd63, %fd64;
+       setp.lt.u32     %p17, %r9, 4;
+       @%p17 bra       BB27_31;
+
+       ld.volatile.shared.f64  %fd40, [%rd1+16];       // sdata[tid.x + 2]
+       max.f64         %fd63, %fd63, %fd40;
+       st.volatile.shared.f64  [%rd1], %fd63;
+
+BB27_31:
+       setp.lt.u32     %p18, %r9, 2;
+       @%p18 bra       BB27_33;
+
+       ld.volatile.shared.f64  %fd41, [%rd1+8];        // sdata[tid.x + 1]
+       max.f64         %fd42, %fd63, %fd41;
+       st.volatile.shared.f64  [%rd1], %fd42;
+
+BB27_33:
+       setp.ne.s32     %p19, %r6, 0;
+       @%p19 bra       BB27_35;                // only thread 0 writes the block result
+
+       ld.shared.f64   %fd43, [sdata];
+       cvta.to.global.u64      %rd12, %rd3;
+       mul.wide.u32    %rd13, %r7, 8;
+       add.s64         %rd14, %rd12, %rd13;
+       st.global.f64   [%rd14], %fd43;         // out[ctaid.x] = sdata[0]
+
+BB27_35:
+       ret;
+}
+
+       // .globl       reduce_row_max -- block b takes the max over in[b*p3 + j] for j < p3 (threads stride by ntid.x); blocks with b >= p2 return immediately
+.visible .entry reduce_row_max(
+       .param .u64 reduce_row_max_param_0,     // in:  base address of the f64 input
+       .param .u64 reduce_row_max_param_1,     // out: base address of the f64 output
+       .param .u32 reduce_row_max_param_2,     // p2: number of blocks that do work (presumably the row count -- TODO confirm)
+       .param .u32 reduce_row_max_param_3      // p3: elements handled per block (presumably the column count -- TODO confirm)
+)
+{
+       .reg .pred      %p<20>;
+       .reg .b32       %r<39>;
+       .reg .f64       %fd<74>;
+       .reg .b64       %rd<42>;
+
+
+       ld.param.u64    %rd1, [reduce_row_max_param_0];
+       ld.param.u64    %rd2, [reduce_row_max_param_1];
+       ld.param.u32    %r5, [reduce_row_max_param_2];
+       ld.param.u32    %r4, [reduce_row_max_param_3];
+       mov.u32         %r6, %ctaid.x;
+       setp.ge.u32     %p1, %r6, %r5;
+       @%p1 bra        BB28_35;
 
        mov.u32         %r38, %tid.x;
-       mov.f64         %fd72, 0d0000000000000000;
+       mov.f64         %fd72, 0dFFEFFFFFFFFFFFFF;      // FIX: start at -DBL_MAX; the old 0d0010000000000000 is DBL_MIN (smallest positive normal), so any maximum <= 0 came out as ~2.2e-308
        mov.f64         %fd73, %fd72;           // NOTE(review): file looks compiler-generated; the start value presumably also needs changing in the CUDA source -- TODO confirm
        setp.ge.u32     %p2, %r38, %r4;
-       @%p2 bra        BB22_4;
+       @%p2 bra        BB28_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB22_3:
+BB28_3:
        mad.lo.s32      %r8, %r6, %r4, %r38;    // element index = ctaid.x * p3 + j
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
        ld.global.f64   %fd28, [%rd5];
-       add.f64         %fd73, %fd73, %fd28;
+       max.f64         %fd73, %fd73, %fd28;
        mov.u32         %r9, %ntid.x;
        add.s32         %r38, %r9, %r38;        // j += ntid.x
        setp.lt.u32     %p3, %r38, %r4;
        mov.f64         %fd72, %fd73;
-       @%p3 bra        BB22_3;
+       @%p3 bra        BB28_3;
 
-BB22_4:
+BB28_4:
        mov.f64         %fd70, %fd72;
        mov.u32         %r10, %tid.x;
        mul.wide.u32    %rd6, %r10, 8;
@@ -3164,130 +4032,130 @@ BB22_4:
        bar.sync        0;
        mov.u32         %r11, %ntid.x;
        setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB22_8;
+       @%p4 bra        BB28_8;
 
        setp.gt.u32     %p5, %r10, 511;
        mov.f64         %fd71, %fd70;
-       @%p5 bra        BB22_7;
+       @%p5 bra        BB28_7;
 
        ld.shared.f64   %fd29, [%rd8+4096];
-       add.f64         %fd71, %fd70, %fd29;
+       max.f64         %fd71, %fd70, %fd29;
        st.shared.f64   [%rd8], %fd71;
 
-BB22_7:
+BB28_7:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB22_8:
+BB28_8:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB22_12;
+       @%p6 bra        BB28_12;
 
        setp.gt.u32     %p7, %r10, 255;
        mov.f64         %fd69, %fd68;
-       @%p7 bra        BB22_11;
+       @%p7 bra        BB28_11;
 
        ld.shared.f64   %fd30, [%rd8+2048];
-       add.f64         %fd69, %fd68, %fd30;
+       max.f64         %fd69, %fd68, %fd30;
        st.shared.f64   [%rd8], %fd69;
 
-BB22_11:
+BB28_11:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB22_12:
+BB28_12:
        mov.f64         %fd66, %fd68;
        setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB22_16;
+       @%p8 bra        BB28_16;
 
        setp.gt.u32     %p9, %r10, 127;
        mov.f64         %fd67, %fd66;
-       @%p9 bra        BB22_15;
+       @%p9 bra        BB28_15;
 
        ld.shared.f64   %fd31, [%rd8+1024];
-       add.f64         %fd67, %fd66, %fd31;
+       max.f64         %fd67, %fd66, %fd31;
        st.shared.f64   [%rd8], %fd67;
 
-BB22_15:
+BB28_15:
        mov.f64         %fd66, %fd67;
        bar.sync        0;
 
-BB22_16:
+BB28_16:
        mov.f64         %fd64, %fd66;
        setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB22_20;
+       @%p10 bra       BB28_20;
 
        setp.gt.u32     %p11, %r10, 63;
        mov.f64         %fd65, %fd64;
-       @%p11 bra       BB22_19;
+       @%p11 bra       BB28_19;
 
        ld.shared.f64   %fd32, [%rd8+512];
-       add.f64         %fd65, %fd64, %fd32;
+       max.f64         %fd65, %fd64, %fd32;
        st.shared.f64   [%rd8], %fd65;
 
-BB22_19:
+BB28_19:
        mov.f64         %fd64, %fd65;
        bar.sync        0;
 
-BB22_20:
+BB28_20:
        mov.f64         %fd63, %fd64;
        setp.gt.u32     %p12, %r10, 31;         // only threads 0..31 run the remaining steps; no bar.sync is issued from here on
-       @%p12 bra       BB22_33;
+       @%p12 bra       BB28_33;
 
        setp.lt.u32     %p13, %r11, 64;
-       @%p13 bra       BB22_23;
+       @%p13 bra       BB28_23;
 
        ld.volatile.shared.f64  %fd33, [%rd8+256];
-       add.f64         %fd63, %fd63, %fd33;
+       max.f64         %fd63, %fd63, %fd33;
        st.volatile.shared.f64  [%rd8], %fd63;
 
-BB22_23:
+BB28_23:
        mov.f64         %fd62, %fd63;
        setp.lt.u32     %p14, %r11, 32;
-       @%p14 bra       BB22_25;
+       @%p14 bra       BB28_25;
 
        ld.volatile.shared.f64  %fd34, [%rd8+128];
-       add.f64         %fd62, %fd62, %fd34;
+       max.f64         %fd62, %fd62, %fd34;
        st.volatile.shared.f64  [%rd8], %fd62;
 
-BB22_25:
+BB28_25:
        mov.f64         %fd61, %fd62;
        setp.lt.u32     %p15, %r11, 16;
-       @%p15 bra       BB22_27;
+       @%p15 bra       BB28_27;
 
        ld.volatile.shared.f64  %fd35, [%rd8+64];
-       add.f64         %fd61, %fd61, %fd35;
+       max.f64         %fd61, %fd61, %fd35;
        st.volatile.shared.f64  [%rd8], %fd61;
 
-BB22_27:
+BB28_27:
        mov.f64         %fd60, %fd61;
        setp.lt.u32     %p16, %r11, 8;
-       @%p16 bra       BB22_29;
+       @%p16 bra       BB28_29;
 
        ld.volatile.shared.f64  %fd36, [%rd8+32];
-       add.f64         %fd60, %fd60, %fd36;
+       max.f64         %fd60, %fd60, %fd36;
        st.volatile.shared.f64  [%rd8], %fd60;
 
-BB22_29:
+BB28_29:
        mov.f64         %fd59, %fd60;
        setp.lt.u32     %p17, %r11, 4;
-       @%p17 bra       BB22_31;
+       @%p17 bra       BB28_31;
 
        ld.volatile.shared.f64  %fd37, [%rd8+16];
-       add.f64         %fd59, %fd59, %fd37;
+       max.f64         %fd59, %fd59, %fd37;
        st.volatile.shared.f64  [%rd8], %fd59;
 
-BB22_31:
+BB28_31:
        setp.lt.u32     %p18, %r11, 2;
-       @%p18 bra       BB22_33;
+       @%p18 bra       BB28_33;
 
        ld.volatile.shared.f64  %fd38, [%rd8+8];
-       add.f64         %fd39, %fd59, %fd38;
+       max.f64         %fd39, %fd59, %fd38;
        st.volatile.shared.f64  [%rd8], %fd39;
 
-BB22_33:
+BB28_33:
        setp.ne.s32     %p19, %r10, 0;          // only thread 0 writes the block result
-       @%p19 bra       BB22_35;
+       @%p19 bra       BB28_35;
 
        ld.shared.f64   %fd40, [sdata];
        cvta.to.global.u64      %rd39, %rd2;
@@ -3295,16 +4163,16 @@ BB22_33:
        add.s64         %rd41, %rd39, %rd40;
        st.global.f64   [%rd41], %fd40;
 
-BB22_35:
+BB28_35:
        ret;
 }
 
-       // .globl       reduce_col_sum
-.visible .entry reduce_col_sum(
-       .param .u64 reduce_col_sum_param_0,
-       .param .u64 reduce_col_sum_param_1,
-       .param .u32 reduce_col_sum_param_2,
-       .param .u32 reduce_col_sum_param_3
+       // .globl       reduce_col_max -- thread t takes the max of in[t], in[t + p3], in[t + 2*p3], ... (indices below p2*p3) and stores it to out[t]
+.visible .entry reduce_col_max(
+       .param .u64 reduce_col_max_param_0,     // in:  base address of the f64 input
+       .param .u64 reduce_col_max_param_1,     // out: base address of the f64 output, one element per active thread
+       .param .u32 reduce_col_max_param_2,     // p2: only used as a factor of the index bound p2*p3 (presumably the row count -- TODO confirm)
+       .param .u32 reduce_col_max_param_3      // p3: active-thread limit and loop stride (presumably the column count -- TODO confirm)
 )
 {
        .reg .pred      %p<4>;
@@ -3313,53 +4181,53 @@ BB22_35:
        .reg .b64       %rd<9>;
 
 
-       ld.param.u64    %rd2, [reduce_col_sum_param_0];
-       ld.param.u64    %rd3, [reduce_col_sum_param_1];
-       ld.param.u32    %r5, [reduce_col_sum_param_2];
-       ld.param.u32    %r6, [reduce_col_sum_param_3];
+       ld.param.u64    %rd2, [reduce_col_max_param_0];
+       ld.param.u64    %rd3, [reduce_col_max_param_1];
+       ld.param.u32    %r5, [reduce_col_max_param_2];
+       ld.param.u32    %r6, [reduce_col_max_param_3];
        mov.u32         %r7, %ntid.x;
        mov.u32         %r8, %ctaid.x;
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;     // %r1 = ntid.x * ctaid.x + tid.x (global thread index t)
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB23_5;
+       @%p1 bra        BB29_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;          // %r2 = p3 * p2, exclusive index bound (low 32 bits of the product)
-       mov.f64         %fd8, 0d0000000000000000;
+       mov.f64         %fd8, 0dFFEFFFFFFFFFFFFF;       // FIX: start at -DBL_MAX; the old 0d0010000000000000 is DBL_MIN (smallest positive normal), so any maximum <= 0 came out as ~2.2e-308
        mov.f64         %fd9, %fd8;             // NOTE(review): file looks compiler-generated; the start value presumably also needs changing in the CUDA source -- TODO confirm
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB23_4;
+       @%p2 bra        BB29_4;
 
        mov.u32         %r10, %r1;
 
-BB23_3:
+BB29_3:
        mov.u32         %r3, %r10;
        mul.wide.u32    %rd4, %r3, 8;
        add.s64         %rd5, %rd1, %rd4;
        ld.global.f64   %fd6, [%rd5];
-       add.f64         %fd9, %fd9, %fd6;
+       max.f64         %fd9, %fd9, %fd6;
        add.s32         %r4, %r3, %r6;          // advance the index by p3 elements
        setp.lt.u32     %p3, %r4, %r2;
        mov.u32         %r10, %r4;
        mov.f64         %fd8, %fd9;
-       @%p3 bra        BB23_3;
+       @%p3 bra        BB29_3;
 
-BB23_4:
+BB29_4:
        cvta.to.global.u64      %rd6, %rd3;
        mul.wide.u32    %rd7, %r1, 8;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd8;           // out[t] = running maximum
 
-BB23_5:
+BB29_5:
        ret;
 }
 
-       // .globl       reduce_max
-.visible .entry reduce_max(
-       .param .u64 reduce_max_param_0,
-       .param .u64 reduce_max_param_1,
-       .param .u32 reduce_max_param_2
+       // .globl       reduce_min -- per-block min reduction over f64 input indices below n; same instruction sequence as the removed reduce_max lines with max.f64 replaced by min.f64
+.visible .entry reduce_min(
+       .param .u64 reduce_min_param_0,         // in:  base address of the f64 input
+       .param .u64 reduce_min_param_1,         // out: base address of the f64 output
+       .param .u32 reduce_min_param_2          // n:   exclusive upper bound on input indices
 )
 {
        .reg .pred      %p<20>;
@@ -3368,45 +4236,45 @@ BB23_5:
        .reg .b64       %rd<15>;
 
 
-       ld.param.u64    %rd2, [reduce_max_param_0];
-       ld.param.u64    %rd3, [reduce_max_param_1];
-       ld.param.u32    %r5, [reduce_max_param_2];
+       ld.param.u64    %rd2, [reduce_min_param_0];
+       ld.param.u64    %rd3, [reduce_min_param_1];
+       ld.param.u32    %r5, [reduce_min_param_2];
        mov.u32         %r6, %tid.x;
        mov.u32         %r7, %ctaid.x;
        shl.b32         %r8, %r7, 1;
        mov.u32         %r9, %ntid.x;
        mad.lo.s32      %r32, %r8, %r9, %r6;    // first index i = 2 * ctaid.x * ntid.x + tid.x
-       mov.f64         %fd76, 0d0010000000000000;
+       mov.f64         %fd76, 0d7FEFFFFFFFFFFFFF;      // start value: largest finite double (DBL_MAX)
        mov.f64         %fd77, %fd76;
        setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB24_4;
+       @%p1 bra        BB30_4;
 
-BB24_1:
+BB30_1:
        mov.f64         %fd1, %fd77;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r32, 8;
        add.s64         %rd6, %rd4, %rd5;
        ld.global.f64   %fd30, [%rd6];
-       max.f64         %fd78, %fd1, %fd30;
+       min.f64         %fd78, %fd1, %fd30;
        add.s32         %r3, %r32, %r9;
        setp.ge.u32     %p2, %r3, %r5;          // second element in[i + ntid.x] only if it is in range
-       @%p2 bra        BB24_3;
+       @%p2 bra        BB30_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd31, [%rd9];
-       max.f64         %fd78, %fd78, %fd31;
+       min.f64         %fd78, %fd78, %fd31;
 
-BB24_3:
+BB30_3:
        mov.f64         %fd77, %fd78;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r32, %r12, %r13, %r32; // i += 2 * ntid.x * nctaid.x
        setp.lt.u32     %p3, %r32, %r5;
        mov.f64         %fd76, %fd77;
-       @%p3 bra        BB24_1;
+       @%p3 bra        BB30_1;
 
-BB24_4:
+BB30_4:
        mov.f64         %fd74, %fd76;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -3414,130 +4282,130 @@ BB24_4:
        st.shared.f64   [%rd1], %fd74;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB24_8;
+       @%p4 bra        BB30_8;
 
        setp.gt.u32     %p5, %r6, 511;
        mov.f64         %fd75, %fd74;
-       @%p5 bra        BB24_7;
+       @%p5 bra        BB30_7;
 
        ld.shared.f64   %fd32, [%rd1+4096];
-       max.f64         %fd75, %fd74, %fd32;
+       min.f64         %fd75, %fd74, %fd32;
        st.shared.f64   [%rd1], %fd75;
 
-BB24_7:
+BB30_7:
        mov.f64         %fd74, %fd75;
        bar.sync        0;
 
-BB24_8:
+BB30_8:
        mov.f64         %fd72, %fd74;
        setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB24_12;
+       @%p6 bra        BB30_12;
 
        setp.gt.u32     %p7, %r6, 255;
        mov.f64         %fd73, %fd72;
-       @%p7 bra        BB24_11;
+       @%p7 bra        BB30_11;
 
        ld.shared.f64   %fd33, [%rd1+2048];
-       max.f64         %fd73, %fd72, %fd33;
+       min.f64         %fd73, %fd72, %fd33;
        st.shared.f64   [%rd1], %fd73;
 
-BB24_11:
+BB30_11:
        mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB24_12:
+BB30_12:
        mov.f64         %fd70, %fd72;
        setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB24_16;
+       @%p8 bra        BB30_16;
 
        setp.gt.u32     %p9, %r6, 127;
        mov.f64         %fd71, %fd70;
-       @%p9 bra        BB24_15;
+       @%p9 bra        BB30_15;
 
        ld.shared.f64   %fd34, [%rd1+1024];
-       max.f64         %fd71, %fd70, %fd34;
+       min.f64         %fd71, %fd70, %fd34;
        st.shared.f64   [%rd1], %fd71;
 
-BB24_15:
+BB30_15:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB24_16:
+BB30_16:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB24_20;
+       @%p10 bra       BB30_20;
 
        setp.gt.u32     %p11, %r6, 63;
        mov.f64         %fd69, %fd68;
-       @%p11 bra       BB24_19;
+       @%p11 bra       BB30_19;
 
        ld.shared.f64   %fd35, [%rd1+512];
-       max.f64         %fd69, %fd68, %fd35;
+       min.f64         %fd69, %fd68, %fd35;
        st.shared.f64   [%rd1], %fd69;
 
-BB24_19:
+BB30_19:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB24_20:
+BB30_20:
        mov.f64         %fd67, %fd68;
        setp.gt.u32     %p12, %r6, 31;          // only threads 0..31 run the remaining steps; no bar.sync is issued from here on
-       @%p12 bra       BB24_33;
+       @%p12 bra       BB30_33;
 
        setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB24_23;
+       @%p13 bra       BB30_23;
 
        ld.volatile.shared.f64  %fd36, [%rd1+256];
-       max.f64         %fd67, %fd67, %fd36;
+       min.f64         %fd67, %fd67, %fd36;
        st.volatile.shared.f64  [%rd1], %fd67;
 
-BB24_23:
+BB30_23:
        mov.f64         %fd66, %fd67;
        setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB24_25;
+       @%p14 bra       BB30_25;
 
        ld.volatile.shared.f64  %fd37, [%rd1+128];
-       max.f64         %fd66, %fd66, %fd37;
+       min.f64         %fd66, %fd66, %fd37;
        st.volatile.shared.f64  [%rd1], %fd66;
 
-BB24_25:
+BB30_25:
        mov.f64         %fd65, %fd66;
        setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB24_27;
+       @%p15 bra       BB30_27;
 
        ld.volatile.shared.f64  %fd38, [%rd1+64];
-       max.f64         %fd65, %fd65, %fd38;
+       min.f64         %fd65, %fd65, %fd38;
        st.volatile.shared.f64  [%rd1], %fd65;
 
-BB24_27:
+BB30_27:
        mov.f64         %fd64, %fd65;
        setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB24_29;
+       @%p16 bra       BB30_29;
 
        ld.volatile.shared.f64  %fd39, [%rd1+32];
-       max.f64         %fd64, %fd64, %fd39;
+       min.f64         %fd64, %fd64, %fd39;
        st.volatile.shared.f64  [%rd1], %fd64;
 
-BB24_29:
+BB30_29:
        mov.f64         %fd63, %fd64;
        setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB24_31;
+       @%p17 bra       BB30_31;
 
        ld.volatile.shared.f64  %fd40, [%rd1+16];
-       max.f64         %fd63, %fd63, %fd40;
+       min.f64         %fd63, %fd63, %fd40;
        st.volatile.shared.f64  [%rd1], %fd63;
 
-BB24_31:
+BB30_31:
        setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB24_33;
+       @%p18 bra       BB30_33;
 
        ld.volatile.shared.f64  %fd41, [%rd1+8];
-       max.f64         %fd42, %fd63, %fd41;
+       min.f64         %fd42, %fd63, %fd41;
        st.volatile.shared.f64  [%rd1], %fd42;
 
-BB24_33:
+BB30_33:
        setp.ne.s32     %p19, %r6, 0;           // only thread 0 writes the block result
-       @%p19 bra       BB24_35;
+       @%p19 bra       BB30_35;
 
        ld.shared.f64   %fd43, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -3545,16 +4413,16 @@ BB24_33:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd43;
 
-BB24_35:
+BB30_35:
        ret;
 }
 
-       // .globl       reduce_row_max
-.visible .entry reduce_row_max(
-       .param .u64 reduce_row_max_param_0,
-       .param .u64 reduce_row_max_param_1,
-       .param .u32 reduce_row_max_param_2,
-       .param .u32 reduce_row_max_param_3
+       // .globl       reduce_row_min
+.visible .entry reduce_row_min(
+       .param .u64 reduce_row_min_param_0,
+       .param .u64 reduce_row_min_param_1,
+       .param .u32 reduce_row_min_param_2,
+       .param .u32 reduce_row_min_param_3
 )
 {
        .reg .pred      %p<20>;
@@ -3563,35 +4431,35 @@ BB24_35:
        .reg .b64       %rd<42>;
 
 
-       ld.param.u64    %rd1, [reduce_row_max_param_0];
-       ld.param.u64    %rd2, [reduce_row_max_param_1];
-       ld.param.u32    %r5, [reduce_row_max_param_2];
-       ld.param.u32    %r4, [reduce_row_max_param_3];
+       ld.param.u64    %rd1, [reduce_row_min_param_0];
+       ld.param.u64    %rd2, [reduce_row_min_param_1];
+       ld.param.u32    %r5, [reduce_row_min_param_2];
+       ld.param.u32    %r4, [reduce_row_min_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB25_35;
+       @%p1 bra        BB31_35;
 
        mov.u32         %r38, %tid.x;
-       mov.f64         %fd72, 0d0010000000000000;
+       mov.f64         %fd72, 0d7FEFFFFFFFFFFFFF;
        mov.f64         %fd73, %fd72;
        setp.ge.u32     %p2, %r38, %r4;
-       @%p2 bra        BB25_4;
+       @%p2 bra        BB31_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB25_3:
+BB31_3:
        mad.lo.s32      %r8, %r6, %r4, %r38;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
        ld.global.f64   %fd28, [%rd5];
-       max.f64         %fd73, %fd73, %fd28;
+       min.f64         %fd73, %fd73, %fd28;
        mov.u32         %r9, %ntid.x;
        add.s32         %r38, %r9, %r38;
        setp.lt.u32     %p3, %r38, %r4;
        mov.f64         %fd72, %fd73;
-       @%p3 bra        BB25_3;
+       @%p3 bra        BB31_3;
 
-BB25_4:
+BB31_4:
        mov.f64         %fd70, %fd72;
        mov.u32         %r10, %tid.x;
        mul.wide.u32    %rd6, %r10, 8;
@@ -3601,130 +4469,130 @@ BB25_4:
        bar.sync        0;
        mov.u32         %r11, %ntid.x;
        setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB25_8;
+       @%p4 bra        BB31_8;
 
        setp.gt.u32     %p5, %r10, 511;
        mov.f64         %fd71, %fd70;
-       @%p5 bra        BB25_7;
+       @%p5 bra        BB31_7;
 
        ld.shared.f64   %fd29, [%rd8+4096];
-       max.f64         %fd71, %fd70, %fd29;
+       min.f64         %fd71, %fd70, %fd29;
        st.shared.f64   [%rd8], %fd71;
 
-BB25_7:
+BB31_7:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB25_8:
+BB31_8:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB25_12;
+       @%p6 bra        BB31_12;
 
        setp.gt.u32     %p7, %r10, 255;
        mov.f64         %fd69, %fd68;
-       @%p7 bra        BB25_11;
+       @%p7 bra        BB31_11;
 
        ld.shared.f64   %fd30, [%rd8+2048];
-       max.f64         %fd69, %fd68, %fd30;
+       min.f64         %fd69, %fd68, %fd30;
        st.shared.f64   [%rd8], %fd69;
 
-BB25_11:
+BB31_11:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB25_12:
+BB31_12:
        mov.f64         %fd66, %fd68;
        setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB25_16;
+       @%p8 bra        BB31_16;
 
        setp.gt.u32     %p9, %r10, 127;
        mov.f64         %fd67, %fd66;
-       @%p9 bra        BB25_15;
+       @%p9 bra        BB31_15;
 
        ld.shared.f64   %fd31, [%rd8+1024];
-       max.f64         %fd67, %fd66, %fd31;
+       min.f64         %fd67, %fd66, %fd31;
        st.shared.f64   [%rd8], %fd67;
 
-BB25_15:
+BB31_15:
        mov.f64         %fd66, %fd67;
        bar.sync        0;
 
-BB25_16:
+BB31_16:
        mov.f64         %fd64, %fd66;
        setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB25_20;
+       @%p10 bra       BB31_20;
 
        setp.gt.u32     %p11, %r10, 63;
        mov.f64         %fd65, %fd64;
-       @%p11 bra       BB25_19;
+       @%p11 bra       BB31_19;
 
        ld.shared.f64   %fd32, [%rd8+512];
-       max.f64         %fd65, %fd64, %fd32;
+       min.f64         %fd65, %fd64, %fd32;
        st.shared.f64   [%rd8], %fd65;
 
-BB25_19:
+BB31_19:
        mov.f64         %fd64, %fd65;
        bar.sync        0;
 
-BB25_20:
+BB31_20:
        mov.f64         %fd63, %fd64;
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB25_33;
+       @%p12 bra       BB31_33;
 
        setp.lt.u32     %p13, %r11, 64;
-       @%p13 bra       BB25_23;
+       @%p13 bra       BB31_23;
 
        ld.volatile.shared.f64  %fd33, [%rd8+256];
-       max.f64         %fd63, %fd63, %fd33;
+       min.f64         %fd63, %fd63, %fd33;
        st.volatile.shared.f64  [%rd8], %fd63;
 
-BB25_23:
+BB31_23:
        mov.f64         %fd62, %fd63;
        setp.lt.u32     %p14, %r11, 32;
-       @%p14 bra       BB25_25;
+       @%p14 bra       BB31_25;
 
        ld.volatile.shared.f64  %fd34, [%rd8+128];
-       max.f64         %fd62, %fd62, %fd34;
+       min.f64         %fd62, %fd62, %fd34;
        st.volatile.shared.f64  [%rd8], %fd62;
 
-BB25_25:
+BB31_25:
        mov.f64         %fd61, %fd62;
        setp.lt.u32     %p15, %r11, 16;
-       @%p15 bra       BB25_27;
+       @%p15 bra       BB31_27;
 
        ld.volatile.shared.f64  %fd35, [%rd8+64];
-       max.f64         %fd61, %fd61, %fd35;
+       min.f64         %fd61, %fd61, %fd35;
        st.volatile.shared.f64  [%rd8], %fd61;
 
-BB25_27:
+BB31_27:
        mov.f64         %fd60, %fd61;
        setp.lt.u32     %p16, %r11, 8;
-       @%p16 bra       BB25_29;
+       @%p16 bra       BB31_29;
 
        ld.volatile.shared.f64  %fd36, [%rd8+32];
-       max.f64         %fd60, %fd60, %fd36;
+       min.f64         %fd60, %fd60, %fd36;
        st.volatile.shared.f64  [%rd8], %fd60;
 
-BB25_29:
+BB31_29:
        mov.f64         %fd59, %fd60;
        setp.lt.u32     %p17, %r11, 4;
-       @%p17 bra       BB25_31;
+       @%p17 bra       BB31_31;
 
        ld.volatile.shared.f64  %fd37, [%rd8+16];
-       max.f64         %fd59, %fd59, %fd37;
+       min.f64         %fd59, %fd59, %fd37;
        st.volatile.shared.f64  [%rd8], %fd59;
 
-BB25_31:
+BB31_31:
        setp.lt.u32     %p18, %r11, 2;
-       @%p18 bra       BB25_33;
+       @%p18 bra       BB31_33;
 
        ld.volatile.shared.f64  %fd38, [%rd8+8];
-       max.f64         %fd39, %fd59, %fd38;
+       min.f64         %fd39, %fd59, %fd38;
        st.volatile.shared.f64  [%rd8], %fd39;
 
-BB25_33:
+BB31_33:
        setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB25_35;
+       @%p19 bra       BB31_35;
 
        ld.shared.f64   %fd40, [sdata];
        cvta.to.global.u64      %rd39, %rd2;
@@ -3732,16 +4600,16 @@ BB25_33:
        add.s64         %rd41, %rd39, %rd40;
        st.global.f64   [%rd41], %fd40;
 
-BB25_35:
+BB31_35:
        ret;
 }
 
-       // .globl       reduce_col_max
-.visible .entry reduce_col_max(
-       .param .u64 reduce_col_max_param_0,
-       .param .u64 reduce_col_max_param_1,
-       .param .u32 reduce_col_max_param_2,
-       .param .u32 reduce_col_max_param_3
+       // .globl       reduce_col_min
+.visible .entry reduce_col_min(
+       .param .u64 reduce_col_min_param_0,
+       .param .u64 reduce_col_min_param_1,
+       .param .u32 reduce_col_min_param_2,
+       .param .u32 reduce_col_min_param_3
 )
 {
        .reg .pred      %p<4>;
@@ -3750,53 +4618,53 @@ BB25_35:
        .reg .b64       %rd<9>;
 
 
-       ld.param.u64    %rd2, [reduce_col_max_param_0];
-       ld.param.u64    %rd3, [reduce_col_max_param_1];
-       ld.param.u32    %r5, [reduce_col_max_param_2];
-       ld.param.u32    %r6, [reduce_col_max_param_3];
+       ld.param.u64    %rd2, [reduce_col_min_param_0];
+       ld.param.u64    %rd3, [reduce_col_min_param_1];
+       ld.param.u32    %r5, [reduce_col_min_param_2];
+       ld.param.u32    %r6, [reduce_col_min_param_3];
        mov.u32         %r7, %ntid.x;
        mov.u32         %r8, %ctaid.x;
        mov.u32         %r9, %tid.x;
        mad.lo.s32      %r1, %r7, %r8, %r9;
        setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB26_5;
+       @%p1 bra        BB32_5;
 
        cvta.to.global.u64      %rd1, %rd2;
        mul.lo.s32      %r2, %r6, %r5;
-       mov.f64         %fd8, 0d0010000000000000;
+       mov.f64         %fd8, 0d7FEFFFFFFFFFFFFF;
        mov.f64         %fd9, %fd8;
        setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB26_4;
+       @%p2 bra        BB32_4;
 
        mov.u32         %r10, %r1;
 
-BB26_3:
+BB32_3:
        mov.u32         %r3, %r10;
        mul.wide.u32    %rd4, %r3, 8;
        add.s64         %rd5, %rd1, %rd4;
        ld.global.f64   %fd6, [%rd5];
-       max.f64         %fd9, %fd9, %fd6;
+       min.f64         %fd9, %fd9, %fd6;
        add.s32         %r4, %r3, %r6;
        setp.lt.u32     %p3, %r4, %r2;
        mov.u32         %r10, %r4;
        mov.f64         %fd8, %fd9;
-       @%p3 bra        BB26_3;
+       @%p3 bra        BB32_3;
 
-BB26_4:
+BB32_4:
        cvta.to.global.u64      %rd6, %rd3;
        mul.wide.u32    %rd7, %r1, 8;
        add.s64         %rd8, %rd6, %rd7;
        st.global.f64   [%rd8], %fd8;
 
-BB26_5:
+BB32_5:
        ret;
 }
 
-       // .globl       reduce_min
-.visible .entry reduce_min(
-       .param .u64 reduce_min_param_0,
-       .param .u64 reduce_min_param_1,
-       .param .u32 reduce_min_param_2
+       // .globl       reduce_prod
+.visible .entry reduce_prod(
+       .param .u64 reduce_prod_param_0,
+       .param .u64 reduce_prod_param_1,
+       .param .u32 reduce_prod_param_2
 )
 {
        .reg .pred      %p<20>;
@@ -3805,45 +4673,45 @@ BB26_5:
        .reg .b64       %rd<15>;
 
 
-       ld.param.u64    %rd2, [reduce_min_param_0];
-       ld.param.u64    %rd3, [reduce_min_param_1];
-       ld.param.u32    %r5, [reduce_min_param_2];
+       ld.param.u64    %rd2, [reduce_prod_param_0];
+       ld.param.u64    %rd3, [reduce_prod_param_1];
+       ld.param.u32    %r5, [reduce_prod_param_2];
        mov.u32         %r6, %tid.x;
        mov.u32         %r7, %ctaid.x;
        shl.b32         %r8, %r7, 1;
        mov.u32         %r9, %ntid.x;
        mad.lo.s32      %r32, %r8, %r9, %r6;
-       mov.f64         %fd76, 0d7FEFFFFFFFFFFFFF;
+       mov.f64         %fd76, 0d3FF0000000000000;
        mov.f64         %fd77, %fd76;
        setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB27_4;
+       @%p1 bra        BB33_4;
 
-BB27_1:
+BB33_1:
        mov.f64         %fd1, %fd77;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r32, 8;
        add.s64         %rd6, %rd4, %rd5;
        ld.global.f64   %fd30, [%rd6];
-       min.f64         %fd78, %fd1, %fd30;
+       mul.f64         %fd78, %fd1, %fd30;
        add.s32         %r3, %r32, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB27_3;
+       @%p2 bra        BB33_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd31, [%rd9];
-       min.f64         %fd78, %fd78, %fd31;
+       mul.f64         %fd78, %fd78, %fd31;
 
-BB27_3:
+BB33_3:
        mov.f64         %fd77, %fd78;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r32, %r12, %r13, %r32;
        setp.lt.u32     %p3, %r32, %r5;
        mov.f64         %fd76, %fd77;
-       @%p3 bra        BB27_1;
+       @%p3 bra        BB33_1;
 
-BB27_4:
+BB33_4:
        mov.f64         %fd74, %fd76;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -3851,130 +4719,130 @@ BB27_4:
        st.shared.f64   [%rd1], %fd74;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB27_8;
+       @%p4 bra        BB33_8;
 
        setp.gt.u32     %p5, %r6, 511;
        mov.f64         %fd75, %fd74;
-       @%p5 bra        BB27_7;
+       @%p5 bra        BB33_7;
 
        ld.shared.f64   %fd32, [%rd1+4096];
-       min.f64         %fd75, %fd74, %fd32;
+       mul.f64         %fd75, %fd74, %fd32;
        st.shared.f64   [%rd1], %fd75;
 
-BB27_7:
+BB33_7:
        mov.f64         %fd74, %fd75;
        bar.sync        0;
 
-BB27_8:
+BB33_8:
        mov.f64         %fd72, %fd74;
        setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB27_12;
+       @%p6 bra        BB33_12;
 
        setp.gt.u32     %p7, %r6, 255;
        mov.f64         %fd73, %fd72;
-       @%p7 bra        BB27_11;
+       @%p7 bra        BB33_11;
 
        ld.shared.f64   %fd33, [%rd1+2048];
-       min.f64         %fd73, %fd72, %fd33;
+       mul.f64         %fd73, %fd72, %fd33;
        st.shared.f64   [%rd1], %fd73;
 
-BB27_11:
+BB33_11:
        mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB27_12:
+BB33_12:
        mov.f64         %fd70, %fd72;
        setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB27_16;
+       @%p8 bra        BB33_16;
 
        setp.gt.u32     %p9, %r6, 127;
        mov.f64         %fd71, %fd70;
-       @%p9 bra        BB27_15;
+       @%p9 bra        BB33_15;
 
        ld.shared.f64   %fd34, [%rd1+1024];
-       min.f64         %fd71, %fd70, %fd34;
+       mul.f64         %fd71, %fd70, %fd34;
        st.shared.f64   [%rd1], %fd71;
 
-BB27_15:
+BB33_15:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB27_16:
+BB33_16:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB27_20;
+       @%p10 bra       BB33_20;
 
        setp.gt.u32     %p11, %r6, 63;
        mov.f64         %fd69, %fd68;
-       @%p11 bra       BB27_19;
+       @%p11 bra       BB33_19;
 
        ld.shared.f64   %fd35, [%rd1+512];
-       min.f64         %fd69, %fd68, %fd35;
+       mul.f64         %fd69, %fd68, %fd35;
        st.shared.f64   [%rd1], %fd69;
 
-BB27_19:
+BB33_19:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB27_20:
+BB33_20:
        mov.f64         %fd67, %fd68;
        setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB27_33;
+       @%p12 bra       BB33_33;
 
        setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB27_23;
+       @%p13 bra       BB33_23;
 
        ld.volatile.shared.f64  %fd36, [%rd1+256];
-       min.f64         %fd67, %fd67, %fd36;
+       mul.f64         %fd67, %fd67, %fd36;
        st.volatile.shared.f64  [%rd1], %fd67;
 
-BB27_23:
+BB33_23:
        mov.f64         %fd66, %fd67;
        setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB27_25;
+       @%p14 bra       BB33_25;
 
        ld.volatile.shared.f64  %fd37, [%rd1+128];
-       min.f64         %fd66, %fd66, %fd37;
+       mul.f64         %fd66, %fd66, %fd37;
        st.volatile.shared.f64  [%rd1], %fd66;
 
-BB27_25:
+BB33_25:
        mov.f64         %fd65, %fd66;
        setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB27_27;
+       @%p15 bra       BB33_27;
 
        ld.volatile.shared.f64  %fd38, [%rd1+64];
-       min.f64         %fd65, %fd65, %fd38;
+       mul.f64         %fd65, %fd65, %fd38;
        st.volatile.shared.f64  [%rd1], %fd65;
 
-BB27_27:
+BB33_27:
        mov.f64         %fd64, %fd65;
        setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB27_29;
+       @%p16 bra       BB33_29;
 
        ld.volatile.shared.f64  %fd39, [%rd1+32];
-       min.f64         %fd64, %fd64, %fd39;
+       mul.f64         %fd64, %fd64, %fd39;
        st.volatile.shared.f64  [%rd1], %fd64;
 
-BB27_29:
+BB33_29:
        mov.f64         %fd63, %fd64;
        setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB27_31;
+       @%p17 bra       BB33_31;
 
        ld.volatile.shared.f64  %fd40, [%rd1+16];
-       min.f64         %fd63, %fd63, %fd40;
+       mul.f64         %fd63, %fd63, %fd40;
        st.volatile.shared.f64  [%rd1], %fd63;
 
-BB27_31:
+BB33_31:
        setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB27_33;
+       @%p18 bra       BB33_33;
 
        ld.volatile.shared.f64  %fd41, [%rd1+8];
-       min.f64         %fd42, %fd63, %fd41;
+       mul.f64         %fd42, %fd63, %fd41;
        st.volatile.shared.f64  [%rd1], %fd42;
 
-BB27_33:
+BB33_33:
        setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB27_35;
+       @%p19 bra       BB33_35;
 
        ld.shared.f64   %fd43, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -3982,250 +4850,257 @@ BB27_33:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd43;
 
-BB27_35:
+BB33_35:
        ret;
 }
 
-       // .globl       reduce_row_min
-.visible .entry reduce_row_min(
-       .param .u64 reduce_row_min_param_0,
-       .param .u64 reduce_row_min_param_1,
-       .param .u32 reduce_row_min_param_2,
-       .param .u32 reduce_row_min_param_3
+       // .globl       reduce_row_mean
+.visible .entry reduce_row_mean(
+       .param .u64 reduce_row_mean_param_0,
+       .param .u64 reduce_row_mean_param_1,
+       .param .u32 reduce_row_mean_param_2,
+       .param .u32 reduce_row_mean_param_3
 )
 {
        .reg .pred      %p<20>;
-       .reg .b32       %r<39>;
-       .reg .f64       %fd<74>;
-       .reg .b64       %rd<42>;
+       .reg .b32       %r<40>;
+       .reg .f64       %fd<76>;
+       .reg .b64       %rd<43>;
 
 
-       ld.param.u64    %rd1, [reduce_row_min_param_0];
-       ld.param.u64    %rd2, [reduce_row_min_param_1];
-       ld.param.u32    %r5, [reduce_row_min_param_2];
-       ld.param.u32    %r4, [reduce_row_min_param_3];
+       ld.param.u64    %rd1, [reduce_row_mean_param_0];
+       ld.param.u64    %rd2, [reduce_row_mean_param_1];
+       ld.param.u32    %r4, [reduce_row_mean_param_2];
+       ld.param.u32    %r5, [reduce_row_mean_param_3];
        mov.u32         %r6, %ctaid.x;
-       setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB28_35;
+       setp.ge.u32     %p1, %r6, %r4;
+       @%p1 bra        BB34_35;
 
-       mov.u32         %r38, %tid.x;
-       mov.f64         %fd72, 0d7FEFFFFFFFFFFFFF;
-       mov.f64         %fd73, %fd72;
-       setp.ge.u32     %p2, %r38, %r4;
-       @%p2 bra        BB28_4;
+       mov.u32         %r39, %tid.x;
+       mov.f64         %fd74, 0d0000000000000000;
+       mov.f64         %fd75, %fd74;
+       setp.ge.u32     %p2, %r39, %r5;
+       @%p2 bra        BB34_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB28_3:
-       mad.lo.s32      %r8, %r6, %r4, %r38;
+BB34_3:
+       mad.lo.s32      %r8, %r6, %r5, %r39;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
        ld.global.f64   %fd28, [%rd5];
-       min.f64         %fd73, %fd73, %fd28;
+       add.f64         %fd75, %fd75, %fd28;
        mov.u32         %r9, %ntid.x;
-       add.s32         %r38, %r9, %r38;
-       setp.lt.u32     %p3, %r38, %r4;
-       mov.f64         %fd72, %fd73;
-       @%p3 bra        BB28_3;
+       add.s32         %r39, %r9, %r39;
+       setp.lt.u32     %p3, %r39, %r5;
+       mov.f64         %fd74, %fd75;
+       @%p3 bra        BB34_3;
 
-BB28_4:
-       mov.f64         %fd70, %fd72;
+BB34_4:
+       mov.f64         %fd72, %fd74;
        mov.u32         %r10, %tid.x;
        mul.wide.u32    %rd6, %r10, 8;
        mov.u64         %rd7, sdata;
        add.s64         %rd8, %rd7, %rd6;
-       st.shared.f64   [%rd8], %fd70;
+       st.shared.f64   [%rd8], %fd72;
        bar.sync        0;
        mov.u32         %r11, %ntid.x;
        setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB28_8;
+       @%p4 bra        BB34_8;
 
        setp.gt.u32     %p5, %r10, 511;
-       mov.f64         %fd71, %fd70;
-       @%p5 bra        BB28_7;
+       mov.f64         %fd73, %fd72;
+       @%p5 bra        BB34_7;
 
        ld.shared.f64   %fd29, [%rd8+4096];
-       min.f64         %fd71, %fd70, %fd29;
-       st.shared.f64   [%rd8], %fd71;
+       add.f64         %fd73, %fd72, %fd29;
+       st.shared.f64   [%rd8], %fd73;
 
-BB28_7:
-       mov.f64         %fd70, %fd71;
+BB34_7:
+       mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB28_8:
-       mov.f64         %fd68, %fd70;
+BB34_8:
+       mov.f64         %fd70, %fd72;
        setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB28_12;
+       @%p6 bra        BB34_12;
 
        setp.gt.u32     %p7, %r10, 255;
-       mov.f64         %fd69, %fd68;
-       @%p7 bra        BB28_11;
+       mov.f64         %fd71, %fd70;
+       @%p7 bra        BB34_11;
 
        ld.shared.f64   %fd30, [%rd8+2048];
-       min.f64         %fd69, %fd68, %fd30;
-       st.shared.f64   [%rd8], %fd69;
+       add.f64         %fd71, %fd70, %fd30;
+       st.shared.f64   [%rd8], %fd71;
 
-BB28_11:
-       mov.f64         %fd68, %fd69;
+BB34_11:
+       mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB28_12:
-       mov.f64         %fd66, %fd68;
+BB34_12:
+       mov.f64         %fd68, %fd70;
        setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB28_16;
+       @%p8 bra        BB34_16;
 
        setp.gt.u32     %p9, %r10, 127;
-       mov.f64         %fd67, %fd66;
-       @%p9 bra        BB28_15;
+       mov.f64         %fd69, %fd68;
+       @%p9 bra        BB34_15;
 
        ld.shared.f64   %fd31, [%rd8+1024];
-       min.f64         %fd67, %fd66, %fd31;
-       st.shared.f64   [%rd8], %fd67;
+       add.f64         %fd69, %fd68, %fd31;
+       st.shared.f64   [%rd8], %fd69;
 
-BB28_15:
-       mov.f64         %fd66, %fd67;
+BB34_15:
+       mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB28_16:
-       mov.f64         %fd64, %fd66;
+BB34_16:
+       mov.f64         %fd66, %fd68;
        setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB28_20;
+       @%p10 bra       BB34_20;
 
        setp.gt.u32     %p11, %r10, 63;
-       mov.f64         %fd65, %fd64;
-       @%p11 bra       BB28_19;
+       mov.f64         %fd67, %fd66;
+       @%p11 bra       BB34_19;
 
        ld.shared.f64   %fd32, [%rd8+512];
-       min.f64         %fd65, %fd64, %fd32;
-       st.shared.f64   [%rd8], %fd65;
+       add.f64         %fd67, %fd66, %fd32;
+       st.shared.f64   [%rd8], %fd67;
 
-BB28_19:
-       mov.f64         %fd64, %fd65;
+BB

<TRUNCATED>

Reply via email to