http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx 
b/src/main/cpp/kernels/SystemML.ptx
index b9efd9b..efaf29b 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,16 +1,16 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-19856038
-// Cuda compilation tools, release 7.5, V7.5.17
+// Compiler Build ID: CL-21124049
+// Cuda compilation tools, release 8.0, V8.0.44
 // Based on LLVM 3.4svn
 //
 
-.version 4.3
+.version 5.0
 .target sm_30
 .address_size 64
 
-       // .globl       _Z6reduceI5SumOpEvPdS1_jT_d
+       // .globl       copy_u2l_dense
 .func  (.param .b64 func_retval0) __internal_accurate_pow
 (
        .param .b64 __internal_accurate_pow_param_0,
@@ -19,1692 +19,10 @@
 ;
 .extern .shared .align 8 .b8 sdata[];
 
-.visible .func _Z6reduceI5SumOpEvPdS1_jT_d(
-       .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_0,
-       .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_1,
-       .param .b32 _Z6reduceI5SumOpEvPdS1_jT_d_param_2,
-       .param .align 1 .b8 _Z6reduceI5SumOpEvPdS1_jT_d_param_3[1],
-       .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_4
-)
-{
-       .reg .pred      %p<20>;
-       .reg .b32       %r<33>;
-       .reg .f64       %fd<79>;
-       .reg .b64       %rd<12>;
-
-
-       ld.param.u64    %rd2, [_Z6reduceI5SumOpEvPdS1_jT_d_param_0];
-       ld.param.u64    %rd3, [_Z6reduceI5SumOpEvPdS1_jT_d_param_1];
-       ld.param.u32    %r5, [_Z6reduceI5SumOpEvPdS1_jT_d_param_2];
-       ld.param.f64    %fd76, [_Z6reduceI5SumOpEvPdS1_jT_d_param_4];
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r32, %r8, %r9, %r6;
-       setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB0_5;
-
-       mov.f64         %fd77, %fd76;
-
-BB0_2:
-       mov.f64         %fd1, %fd77;
-       mul.wide.u32    %rd4, %r32, 8;
-       add.s64         %rd5, %rd2, %rd4;
-       ld.f64  %fd29, [%rd5];
-       add.f64         %fd78, %fd1, %fd29;
-       add.s32         %r3, %r32, %r9;
-       setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB0_4;
-
-       mul.wide.u32    %rd6, %r3, 8;
-       add.s64         %rd7, %rd2, %rd6;
-       ld.f64  %fd30, [%rd7];
-       add.f64         %fd78, %fd78, %fd30;
-
-BB0_4:
-       mov.f64         %fd77, %fd78;
-       shl.b32         %r12, %r9, 1;
-       mov.u32         %r13, %nctaid.x;
-       mad.lo.s32      %r32, %r12, %r13, %r32;
-       setp.lt.u32     %p3, %r32, %r5;
-       mov.f64         %fd76, %fd77;
-       @%p3 bra        BB0_2;
-
-BB0_5:
-       mov.f64         %fd74, %fd76;
-       mul.wide.u32    %rd8, %r6, 8;
-       mov.u64         %rd9, sdata;
-       add.s64         %rd1, %rd9, %rd8;
-       st.shared.f64   [%rd1], %fd74;
-       bar.sync        0;
-       setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB0_9;
-
-       setp.gt.u32     %p5, %r6, 511;
-       mov.f64         %fd75, %fd74;
-       @%p5 bra        BB0_8;
-
-       ld.shared.f64   %fd31, [%rd1+4096];
-       add.f64         %fd75, %fd74, %fd31;
-       st.shared.f64   [%rd1], %fd75;
-
-BB0_8:
-       mov.f64         %fd74, %fd75;
-       bar.sync        0;
-
-BB0_9:
-       mov.f64         %fd72, %fd74;
-       setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB0_13;
-
-       setp.gt.u32     %p7, %r6, 255;
-       mov.f64         %fd73, %fd72;
-       @%p7 bra        BB0_12;
-
-       ld.shared.f64   %fd32, [%rd1+2048];
-       add.f64         %fd73, %fd72, %fd32;
-       st.shared.f64   [%rd1], %fd73;
-
-BB0_12:
-       mov.f64         %fd72, %fd73;
-       bar.sync        0;
-
-BB0_13:
-       mov.f64         %fd70, %fd72;
-       setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB0_17;
-
-       setp.gt.u32     %p9, %r6, 127;
-       mov.f64         %fd71, %fd70;
-       @%p9 bra        BB0_16;
-
-       ld.shared.f64   %fd33, [%rd1+1024];
-       add.f64         %fd71, %fd70, %fd33;
-       st.shared.f64   [%rd1], %fd71;
-
-BB0_16:
-       mov.f64         %fd70, %fd71;
-       bar.sync        0;
-
-BB0_17:
-       mov.f64         %fd68, %fd70;
-       setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB0_21;
-
-       setp.gt.u32     %p11, %r6, 63;
-       mov.f64         %fd69, %fd68;
-       @%p11 bra       BB0_20;
-
-       ld.shared.f64   %fd34, [%rd1+512];
-       add.f64         %fd69, %fd68, %fd34;
-       st.shared.f64   [%rd1], %fd69;
-
-BB0_20:
-       mov.f64         %fd68, %fd69;
-       bar.sync        0;
-
-BB0_21:
-       mov.f64         %fd67, %fd68;
-       setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB0_34;
-
-       setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB0_24;
-
-       ld.volatile.shared.f64  %fd35, [%rd1+256];
-       add.f64         %fd67, %fd67, %fd35;
-       st.volatile.shared.f64  [%rd1], %fd67;
-
-BB0_24:
-       mov.f64         %fd66, %fd67;
-       setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB0_26;
-
-       ld.volatile.shared.f64  %fd36, [%rd1+128];
-       add.f64         %fd66, %fd66, %fd36;
-       st.volatile.shared.f64  [%rd1], %fd66;
-
-BB0_26:
-       mov.f64         %fd65, %fd66;
-       setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB0_28;
-
-       ld.volatile.shared.f64  %fd37, [%rd1+64];
-       add.f64         %fd65, %fd65, %fd37;
-       st.volatile.shared.f64  [%rd1], %fd65;
-
-BB0_28:
-       mov.f64         %fd64, %fd65;
-       setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB0_30;
-
-       ld.volatile.shared.f64  %fd38, [%rd1+32];
-       add.f64         %fd64, %fd64, %fd38;
-       st.volatile.shared.f64  [%rd1], %fd64;
-
-BB0_30:
-       mov.f64         %fd63, %fd64;
-       setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB0_32;
-
-       ld.volatile.shared.f64  %fd39, [%rd1+16];
-       add.f64         %fd63, %fd63, %fd39;
-       st.volatile.shared.f64  [%rd1], %fd63;
-
-BB0_32:
-       setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB0_34;
-
-       ld.volatile.shared.f64  %fd40, [%rd1+8];
-       add.f64         %fd41, %fd63, %fd40;
-       st.volatile.shared.f64  [%rd1], %fd41;
-
-BB0_34:
-       setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB0_36;
-
-       ld.shared.f64   %fd42, [sdata];
-       mul.wide.u32    %rd10, %r7, 8;
-       add.s64         %rd11, %rd3, %rd10;
-       st.f64  [%rd11], %fd42;
-
-BB0_36:
-       ret;
-}
-
-       // .globl       _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d
-.visible .func _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d(
-       .param .b64 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_0,
-       .param .b64 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_1,
-       .param .b32 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_2,
-       .param .b32 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_3,
-       .param .align 1 .b8 
_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1],
-       .param .align 1 .b8 
_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1],
-       .param .b64 _Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6
-)
-{
-       .reg .pred      %p<20>;
-       .reg .b32       %r<29>;
-       .reg .f64       %fd<41>;
-       .reg .b64       %rd<10>;
-
-
-       ld.param.u64    %rd2, 
[_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_0];
-       ld.param.u64    %rd3, 
[_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_1];
-       ld.param.u32    %r7, 
[_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_2];
-       ld.param.u32    %r6, 
[_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_3];
-       ld.param.f64    %fd40, 
[_Z10reduce_rowI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
-       mov.u32         %r1, %ctaid.x;
-       setp.ge.u32     %p1, %r1, %r7;
-       @%p1 bra        BB1_34;
-
-       mov.u32         %r28, %tid.x;
-       mul.lo.s32      %r3, %r1, %r6;
-       setp.ge.u32     %p2, %r28, %r6;
-       @%p2 bra        BB1_3;
-
-BB1_2:
-       add.s32         %r8, %r28, %r3;
-       mul.wide.u32    %rd4, %r8, 8;
-       add.s64         %rd5, %rd2, %rd4;
-       ld.f64  %fd27, [%rd5];
-       add.f64         %fd40, %fd40, %fd27;
-       mov.u32         %r9, %ntid.x;
-       add.s32         %r28, %r9, %r28;
-       setp.lt.u32     %p3, %r28, %r6;
-       @%p3 bra        BB1_2;
-
-BB1_3:
-       mov.u32         %r10, %tid.x;
-       mul.wide.u32    %rd6, %r10, 8;
-       mov.u64         %rd7, sdata;
-       add.s64         %rd1, %rd7, %rd6;
-       st.shared.f64   [%rd1], %fd40;
-       bar.sync        0;
-       mov.u32         %r11, %ntid.x;
-       setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB1_7;
-
-       setp.gt.u32     %p5, %r10, 511;
-       @%p5 bra        BB1_6;
-
-       ld.shared.f64   %fd28, [%rd1+4096];
-       add.f64         %fd40, %fd40, %fd28;
-       st.shared.f64   [%rd1], %fd40;
-
-BB1_6:
-       bar.sync        0;
-
-BB1_7:
-       setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB1_11;
-
-       setp.gt.u32     %p7, %r10, 255;
-       @%p7 bra        BB1_10;
-
-       ld.shared.f64   %fd29, [%rd1+2048];
-       add.f64         %fd40, %fd40, %fd29;
-       st.shared.f64   [%rd1], %fd40;
-
-BB1_10:
-       bar.sync        0;
-
-BB1_11:
-       setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB1_15;
-
-       setp.gt.u32     %p9, %r10, 127;
-       @%p9 bra        BB1_14;
-
-       ld.shared.f64   %fd30, [%rd1+1024];
-       add.f64         %fd40, %fd40, %fd30;
-       st.shared.f64   [%rd1], %fd40;
-
-BB1_14:
-       bar.sync        0;
-
-BB1_15:
-       setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB1_19;
-
-       setp.gt.u32     %p11, %r10, 63;
-       @%p11 bra       BB1_18;
-
-       ld.shared.f64   %fd31, [%rd1+512];
-       add.f64         %fd40, %fd40, %fd31;
-       st.shared.f64   [%rd1], %fd40;
-
-BB1_18:
-       bar.sync        0;
-
-BB1_19:
-       setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB1_32;
-
-       setp.lt.u32     %p13, %r11, 64;
-       @%p13 bra       BB1_22;
-
-       ld.volatile.shared.f64  %fd32, [%rd1+256];
-       add.f64         %fd40, %fd40, %fd32;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB1_22:
-       setp.lt.u32     %p14, %r11, 32;
-       @%p14 bra       BB1_24;
-
-       ld.volatile.shared.f64  %fd33, [%rd1+128];
-       add.f64         %fd40, %fd40, %fd33;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB1_24:
-       setp.lt.u32     %p15, %r11, 16;
-       @%p15 bra       BB1_26;
-
-       ld.volatile.shared.f64  %fd34, [%rd1+64];
-       add.f64         %fd40, %fd40, %fd34;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB1_26:
-       setp.lt.u32     %p16, %r11, 8;
-       @%p16 bra       BB1_28;
-
-       ld.volatile.shared.f64  %fd35, [%rd1+32];
-       add.f64         %fd40, %fd40, %fd35;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB1_28:
-       setp.lt.u32     %p17, %r11, 4;
-       @%p17 bra       BB1_30;
-
-       ld.volatile.shared.f64  %fd36, [%rd1+16];
-       add.f64         %fd40, %fd40, %fd36;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB1_30:
-       setp.lt.u32     %p18, %r11, 2;
-       @%p18 bra       BB1_32;
-
-       ld.volatile.shared.f64  %fd37, [%rd1+8];
-       add.f64         %fd38, %fd40, %fd37;
-       st.volatile.shared.f64  [%rd1], %fd38;
-
-BB1_32:
-       setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB1_34;
-
-       ld.shared.f64   %fd39, [sdata];
-       mul.wide.u32    %rd8, %r1, 8;
-       add.s64         %rd9, %rd3, %rd8;
-       st.f64  [%rd9], %fd39;
-
-BB1_34:
-       ret;
-}
-
-       // .globl       _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d
-.visible .func _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d(
-       .param .b64 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_0,
-       .param .b64 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_1,
-       .param .b32 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_2,
-       .param .b32 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_3,
-       .param .align 1 .b8 
_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1],
-       .param .align 1 .b8 
_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1],
-       .param .b64 _Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6
-)
-{
-       .reg .pred      %p<4>;
-       .reg .b32       %r<11>;
-       .reg .f64       %fd<7>;
-       .reg .b64       %rd<7>;
-
-
-       ld.param.u64    %rd1, 
[_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_0];
-       ld.param.u64    %rd2, 
[_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_1];
-       ld.param.u32    %r5, 
[_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_2];
-       ld.param.u32    %r6, 
[_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_3];
-       ld.param.f64    %fd6, 
[_Z10reduce_colI5SumOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
-       mov.u32         %r7, %ctaid.x;
-       mov.u32         %r8, %ntid.x;
-       mov.u32         %r9, %tid.x;
-       mad.lo.s32      %r1, %r8, %r7, %r9;
-       setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB2_5;
-
-       mul.lo.s32      %r2, %r6, %r5;
-       setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB2_4;
-
-       mov.u32         %r10, %r1;
-
-BB2_3:
-       mov.u32         %r3, %r10;
-       mul.wide.u32    %rd3, %r3, 8;
-       add.s64         %rd4, %rd1, %rd3;
-       ld.f64  %fd5, [%rd4];
-       add.f64         %fd6, %fd6, %fd5;
-       add.s32         %r4, %r3, %r6;
-       setp.lt.u32     %p3, %r4, %r2;
-       mov.u32         %r10, %r4;
-       @%p3 bra        BB2_3;
-
-BB2_4:
-       mul.wide.u32    %rd5, %r1, 8;
-       add.s64         %rd6, %rd2, %rd5;
-       st.f64  [%rd6], %fd6;
-
-BB2_5:
-       ret;
-}
-
-       // .globl       _Z6reduceI5MaxOpEvPdS1_jT_d
-.visible .func _Z6reduceI5MaxOpEvPdS1_jT_d(
-       .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_0,
-       .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_1,
-       .param .b32 _Z6reduceI5MaxOpEvPdS1_jT_d_param_2,
-       .param .align 1 .b8 _Z6reduceI5MaxOpEvPdS1_jT_d_param_3[1],
-       .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_4
-)
-{
-       .reg .pred      %p<20>;
-       .reg .b32       %r<33>;
-       .reg .f64       %fd<79>;
-       .reg .b64       %rd<12>;
-
-
-       ld.param.u64    %rd2, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_0];
-       ld.param.u64    %rd3, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_1];
-       ld.param.u32    %r5, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_2];
-       ld.param.f64    %fd76, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_4];
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r32, %r8, %r9, %r6;
-       setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB3_5;
-
-       mov.f64         %fd77, %fd76;
-
-BB3_2:
-       mov.f64         %fd1, %fd77;
-       mul.wide.u32    %rd4, %r32, 8;
-       add.s64         %rd5, %rd2, %rd4;
-       ld.f64  %fd29, [%rd5];
-       max.f64         %fd78, %fd1, %fd29;
-       add.s32         %r3, %r32, %r9;
-       setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB3_4;
-
-       mul.wide.u32    %rd6, %r3, 8;
-       add.s64         %rd7, %rd2, %rd6;
-       ld.f64  %fd30, [%rd7];
-       max.f64         %fd78, %fd78, %fd30;
-
-BB3_4:
-       mov.f64         %fd77, %fd78;
-       shl.b32         %r12, %r9, 1;
-       mov.u32         %r13, %nctaid.x;
-       mad.lo.s32      %r32, %r12, %r13, %r32;
-       setp.lt.u32     %p3, %r32, %r5;
-       mov.f64         %fd76, %fd77;
-       @%p3 bra        BB3_2;
-
-BB3_5:
-       mov.f64         %fd74, %fd76;
-       mul.wide.u32    %rd8, %r6, 8;
-       mov.u64         %rd9, sdata;
-       add.s64         %rd1, %rd9, %rd8;
-       st.shared.f64   [%rd1], %fd74;
-       bar.sync        0;
-       setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB3_9;
-
-       setp.gt.u32     %p5, %r6, 511;
-       mov.f64         %fd75, %fd74;
-       @%p5 bra        BB3_8;
-
-       ld.shared.f64   %fd31, [%rd1+4096];
-       max.f64         %fd75, %fd74, %fd31;
-       st.shared.f64   [%rd1], %fd75;
-
-BB3_8:
-       mov.f64         %fd74, %fd75;
-       bar.sync        0;
-
-BB3_9:
-       mov.f64         %fd72, %fd74;
-       setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB3_13;
-
-       setp.gt.u32     %p7, %r6, 255;
-       mov.f64         %fd73, %fd72;
-       @%p7 bra        BB3_12;
-
-       ld.shared.f64   %fd32, [%rd1+2048];
-       max.f64         %fd73, %fd72, %fd32;
-       st.shared.f64   [%rd1], %fd73;
-
-BB3_12:
-       mov.f64         %fd72, %fd73;
-       bar.sync        0;
-
-BB3_13:
-       mov.f64         %fd70, %fd72;
-       setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB3_17;
-
-       setp.gt.u32     %p9, %r6, 127;
-       mov.f64         %fd71, %fd70;
-       @%p9 bra        BB3_16;
-
-       ld.shared.f64   %fd33, [%rd1+1024];
-       max.f64         %fd71, %fd70, %fd33;
-       st.shared.f64   [%rd1], %fd71;
-
-BB3_16:
-       mov.f64         %fd70, %fd71;
-       bar.sync        0;
-
-BB3_17:
-       mov.f64         %fd68, %fd70;
-       setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB3_21;
-
-       setp.gt.u32     %p11, %r6, 63;
-       mov.f64         %fd69, %fd68;
-       @%p11 bra       BB3_20;
-
-       ld.shared.f64   %fd34, [%rd1+512];
-       max.f64         %fd69, %fd68, %fd34;
-       st.shared.f64   [%rd1], %fd69;
-
-BB3_20:
-       mov.f64         %fd68, %fd69;
-       bar.sync        0;
-
-BB3_21:
-       mov.f64         %fd67, %fd68;
-       setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB3_34;
-
-       setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB3_24;
-
-       ld.volatile.shared.f64  %fd35, [%rd1+256];
-       max.f64         %fd67, %fd67, %fd35;
-       st.volatile.shared.f64  [%rd1], %fd67;
-
-BB3_24:
-       mov.f64         %fd66, %fd67;
-       setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB3_26;
-
-       ld.volatile.shared.f64  %fd36, [%rd1+128];
-       max.f64         %fd66, %fd66, %fd36;
-       st.volatile.shared.f64  [%rd1], %fd66;
-
-BB3_26:
-       mov.f64         %fd65, %fd66;
-       setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB3_28;
-
-       ld.volatile.shared.f64  %fd37, [%rd1+64];
-       max.f64         %fd65, %fd65, %fd37;
-       st.volatile.shared.f64  [%rd1], %fd65;
-
-BB3_28:
-       mov.f64         %fd64, %fd65;
-       setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB3_30;
-
-       ld.volatile.shared.f64  %fd38, [%rd1+32];
-       max.f64         %fd64, %fd64, %fd38;
-       st.volatile.shared.f64  [%rd1], %fd64;
-
-BB3_30:
-       mov.f64         %fd63, %fd64;
-       setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB3_32;
-
-       ld.volatile.shared.f64  %fd39, [%rd1+16];
-       max.f64         %fd63, %fd63, %fd39;
-       st.volatile.shared.f64  [%rd1], %fd63;
-
-BB3_32:
-       setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB3_34;
-
-       ld.volatile.shared.f64  %fd40, [%rd1+8];
-       max.f64         %fd41, %fd63, %fd40;
-       st.volatile.shared.f64  [%rd1], %fd41;
-
-BB3_34:
-       setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB3_36;
-
-       ld.shared.f64   %fd42, [sdata];
-       mul.wide.u32    %rd10, %r7, 8;
-       add.s64         %rd11, %rd3, %rd10;
-       st.f64  [%rd11], %fd42;
-
-BB3_36:
-       ret;
-}
-
-       // .globl       _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d
-.visible .func _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d(
-       .param .b64 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_0,
-       .param .b64 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_1,
-       .param .b32 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_2,
-       .param .b32 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_3,
-       .param .align 1 .b8 
_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1],
-       .param .align 1 .b8 
_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1],
-       .param .b64 _Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6
-)
-{
-       .reg .pred      %p<20>;
-       .reg .b32       %r<29>;
-       .reg .f64       %fd<41>;
-       .reg .b64       %rd<10>;
-
-
-       ld.param.u64    %rd2, 
[_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_0];
-       ld.param.u64    %rd3, 
[_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_1];
-       ld.param.u32    %r7, 
[_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_2];
-       ld.param.u32    %r6, 
[_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_3];
-       ld.param.f64    %fd40, 
[_Z10reduce_rowI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
-       mov.u32         %r1, %ctaid.x;
-       setp.ge.u32     %p1, %r1, %r7;
-       @%p1 bra        BB4_34;
-
-       mov.u32         %r28, %tid.x;
-       mul.lo.s32      %r3, %r1, %r6;
-       setp.ge.u32     %p2, %r28, %r6;
-       @%p2 bra        BB4_3;
-
-BB4_2:
-       add.s32         %r8, %r28, %r3;
-       mul.wide.u32    %rd4, %r8, 8;
-       add.s64         %rd5, %rd2, %rd4;
-       ld.f64  %fd27, [%rd5];
-       max.f64         %fd40, %fd40, %fd27;
-       mov.u32         %r9, %ntid.x;
-       add.s32         %r28, %r9, %r28;
-       setp.lt.u32     %p3, %r28, %r6;
-       @%p3 bra        BB4_2;
-
-BB4_3:
-       mov.u32         %r10, %tid.x;
-       mul.wide.u32    %rd6, %r10, 8;
-       mov.u64         %rd7, sdata;
-       add.s64         %rd1, %rd7, %rd6;
-       st.shared.f64   [%rd1], %fd40;
-       bar.sync        0;
-       mov.u32         %r11, %ntid.x;
-       setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB4_7;
-
-       setp.gt.u32     %p5, %r10, 511;
-       @%p5 bra        BB4_6;
-
-       ld.shared.f64   %fd28, [%rd1+4096];
-       max.f64         %fd40, %fd40, %fd28;
-       st.shared.f64   [%rd1], %fd40;
-
-BB4_6:
-       bar.sync        0;
-
-BB4_7:
-       setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB4_11;
-
-       setp.gt.u32     %p7, %r10, 255;
-       @%p7 bra        BB4_10;
-
-       ld.shared.f64   %fd29, [%rd1+2048];
-       max.f64         %fd40, %fd40, %fd29;
-       st.shared.f64   [%rd1], %fd40;
-
-BB4_10:
-       bar.sync        0;
-
-BB4_11:
-       setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB4_15;
-
-       setp.gt.u32     %p9, %r10, 127;
-       @%p9 bra        BB4_14;
-
-       ld.shared.f64   %fd30, [%rd1+1024];
-       max.f64         %fd40, %fd40, %fd30;
-       st.shared.f64   [%rd1], %fd40;
-
-BB4_14:
-       bar.sync        0;
-
-BB4_15:
-       setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB4_19;
-
-       setp.gt.u32     %p11, %r10, 63;
-       @%p11 bra       BB4_18;
-
-       ld.shared.f64   %fd31, [%rd1+512];
-       max.f64         %fd40, %fd40, %fd31;
-       st.shared.f64   [%rd1], %fd40;
-
-BB4_18:
-       bar.sync        0;
-
-BB4_19:
-       setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB4_32;
-
-       setp.lt.u32     %p13, %r11, 64;
-       @%p13 bra       BB4_22;
-
-       ld.volatile.shared.f64  %fd32, [%rd1+256];
-       max.f64         %fd40, %fd40, %fd32;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB4_22:
-       setp.lt.u32     %p14, %r11, 32;
-       @%p14 bra       BB4_24;
-
-       ld.volatile.shared.f64  %fd33, [%rd1+128];
-       max.f64         %fd40, %fd40, %fd33;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB4_24:
-       setp.lt.u32     %p15, %r11, 16;
-       @%p15 bra       BB4_26;
-
-       ld.volatile.shared.f64  %fd34, [%rd1+64];
-       max.f64         %fd40, %fd40, %fd34;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB4_26:
-       setp.lt.u32     %p16, %r11, 8;
-       @%p16 bra       BB4_28;
-
-       ld.volatile.shared.f64  %fd35, [%rd1+32];
-       max.f64         %fd40, %fd40, %fd35;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB4_28:
-       setp.lt.u32     %p17, %r11, 4;
-       @%p17 bra       BB4_30;
-
-       ld.volatile.shared.f64  %fd36, [%rd1+16];
-       max.f64         %fd40, %fd40, %fd36;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB4_30:
-       setp.lt.u32     %p18, %r11, 2;
-       @%p18 bra       BB4_32;
-
-       ld.volatile.shared.f64  %fd37, [%rd1+8];
-       max.f64         %fd38, %fd40, %fd37;
-       st.volatile.shared.f64  [%rd1], %fd38;
-
-BB4_32:
-       setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB4_34;
-
-       ld.shared.f64   %fd39, [sdata];
-       mul.wide.u32    %rd8, %r1, 8;
-       add.s64         %rd9, %rd3, %rd8;
-       st.f64  [%rd9], %fd39;
-
-BB4_34:
-       ret;
-}
-
-       // .globl       _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d
-.visible .func _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d(
-       .param .b64 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_0,
-       .param .b64 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_1,
-       .param .b32 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_2,
-       .param .b32 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_3,
-       .param .align 1 .b8 
_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1],
-       .param .align 1 .b8 
_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1],
-       .param .b64 _Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6
-)
-{
-       .reg .pred      %p<4>;
-       .reg .b32       %r<11>;
-       .reg .f64       %fd<7>;
-       .reg .b64       %rd<7>;
-
-
-       ld.param.u64    %rd1, 
[_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_0];
-       ld.param.u64    %rd2, 
[_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_1];
-       ld.param.u32    %r5, 
[_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_2];
-       ld.param.u32    %r6, 
[_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_3];
-       ld.param.f64    %fd6, 
[_Z10reduce_colI5MaxOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
-       mov.u32         %r7, %ctaid.x;
-       mov.u32         %r8, %ntid.x;
-       mov.u32         %r9, %tid.x;
-       mad.lo.s32      %r1, %r8, %r7, %r9;
-       setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB5_5;
-
-       mul.lo.s32      %r2, %r6, %r5;
-       setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB5_4;
-
-       mov.u32         %r10, %r1;
-
-BB5_3:
-       mov.u32         %r3, %r10;
-       mul.wide.u32    %rd3, %r3, 8;
-       add.s64         %rd4, %rd1, %rd3;
-       ld.f64  %fd5, [%rd4];
-       max.f64         %fd6, %fd6, %fd5;
-       add.s32         %r4, %r3, %r6;
-       setp.lt.u32     %p3, %r4, %r2;
-       mov.u32         %r10, %r4;
-       @%p3 bra        BB5_3;
-
-BB5_4:
-       mul.wide.u32    %rd5, %r1, 8;
-       add.s64         %rd6, %rd2, %rd5;
-       st.f64  [%rd6], %fd6;
-
-BB5_5:
-       ret;
-}
-
-       // .globl       _Z6reduceI5MinOpEvPdS1_jT_d
-.visible .func _Z6reduceI5MinOpEvPdS1_jT_d(
-       .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_0,
-       .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_1,
-       .param .b32 _Z6reduceI5MinOpEvPdS1_jT_d_param_2,
-       .param .align 1 .b8 _Z6reduceI5MinOpEvPdS1_jT_d_param_3[1],
-       .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_4
-)
-{
-       .reg .pred      %p<20>;
-       .reg .b32       %r<33>;
-       .reg .f64       %fd<79>;
-       .reg .b64       %rd<12>;
-
-
-       ld.param.u64    %rd2, [_Z6reduceI5MinOpEvPdS1_jT_d_param_0];
-       ld.param.u64    %rd3, [_Z6reduceI5MinOpEvPdS1_jT_d_param_1];
-       ld.param.u32    %r5, [_Z6reduceI5MinOpEvPdS1_jT_d_param_2];
-       ld.param.f64    %fd76, [_Z6reduceI5MinOpEvPdS1_jT_d_param_4];
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r32, %r8, %r9, %r6;
-       setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB6_5;
-
-       mov.f64         %fd77, %fd76;
-
-BB6_2:
-       mov.f64         %fd1, %fd77;
-       mul.wide.u32    %rd4, %r32, 8;
-       add.s64         %rd5, %rd2, %rd4;
-       ld.f64  %fd29, [%rd5];
-       min.f64         %fd78, %fd1, %fd29;
-       add.s32         %r3, %r32, %r9;
-       setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB6_4;
-
-       mul.wide.u32    %rd6, %r3, 8;
-       add.s64         %rd7, %rd2, %rd6;
-       ld.f64  %fd30, [%rd7];
-       min.f64         %fd78, %fd78, %fd30;
-
-BB6_4:
-       mov.f64         %fd77, %fd78;
-       shl.b32         %r12, %r9, 1;
-       mov.u32         %r13, %nctaid.x;
-       mad.lo.s32      %r32, %r12, %r13, %r32;
-       setp.lt.u32     %p3, %r32, %r5;
-       mov.f64         %fd76, %fd77;
-       @%p3 bra        BB6_2;
-
-BB6_5:
-       mov.f64         %fd74, %fd76;
-       mul.wide.u32    %rd8, %r6, 8;
-       mov.u64         %rd9, sdata;
-       add.s64         %rd1, %rd9, %rd8;
-       st.shared.f64   [%rd1], %fd74;
-       bar.sync        0;
-       setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB6_9;
-
-       setp.gt.u32     %p5, %r6, 511;
-       mov.f64         %fd75, %fd74;
-       @%p5 bra        BB6_8;
-
-       ld.shared.f64   %fd31, [%rd1+4096];
-       min.f64         %fd75, %fd74, %fd31;
-       st.shared.f64   [%rd1], %fd75;
-
-BB6_8:
-       mov.f64         %fd74, %fd75;
-       bar.sync        0;
-
-BB6_9:
-       mov.f64         %fd72, %fd74;
-       setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB6_13;
-
-       setp.gt.u32     %p7, %r6, 255;
-       mov.f64         %fd73, %fd72;
-       @%p7 bra        BB6_12;
-
-       ld.shared.f64   %fd32, [%rd1+2048];
-       min.f64         %fd73, %fd72, %fd32;
-       st.shared.f64   [%rd1], %fd73;
-
-BB6_12:
-       mov.f64         %fd72, %fd73;
-       bar.sync        0;
-
-BB6_13:
-       mov.f64         %fd70, %fd72;
-       setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB6_17;
-
-       setp.gt.u32     %p9, %r6, 127;
-       mov.f64         %fd71, %fd70;
-       @%p9 bra        BB6_16;
-
-       ld.shared.f64   %fd33, [%rd1+1024];
-       min.f64         %fd71, %fd70, %fd33;
-       st.shared.f64   [%rd1], %fd71;
-
-BB6_16:
-       mov.f64         %fd70, %fd71;
-       bar.sync        0;
-
-BB6_17:
-       mov.f64         %fd68, %fd70;
-       setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB6_21;
-
-       setp.gt.u32     %p11, %r6, 63;
-       mov.f64         %fd69, %fd68;
-       @%p11 bra       BB6_20;
-
-       ld.shared.f64   %fd34, [%rd1+512];
-       min.f64         %fd69, %fd68, %fd34;
-       st.shared.f64   [%rd1], %fd69;
-
-BB6_20:
-       mov.f64         %fd68, %fd69;
-       bar.sync        0;
-
-BB6_21:
-       mov.f64         %fd67, %fd68;
-       setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB6_34;
-
-       setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB6_24;
-
-       ld.volatile.shared.f64  %fd35, [%rd1+256];
-       min.f64         %fd67, %fd67, %fd35;
-       st.volatile.shared.f64  [%rd1], %fd67;
-
-BB6_24:
-       mov.f64         %fd66, %fd67;
-       setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB6_26;
-
-       ld.volatile.shared.f64  %fd36, [%rd1+128];
-       min.f64         %fd66, %fd66, %fd36;
-       st.volatile.shared.f64  [%rd1], %fd66;
-
-BB6_26:
-       mov.f64         %fd65, %fd66;
-       setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB6_28;
-
-       ld.volatile.shared.f64  %fd37, [%rd1+64];
-       min.f64         %fd65, %fd65, %fd37;
-       st.volatile.shared.f64  [%rd1], %fd65;
-
-BB6_28:
-       mov.f64         %fd64, %fd65;
-       setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB6_30;
-
-       ld.volatile.shared.f64  %fd38, [%rd1+32];
-       min.f64         %fd64, %fd64, %fd38;
-       st.volatile.shared.f64  [%rd1], %fd64;
-
-BB6_30:
-       mov.f64         %fd63, %fd64;
-       setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB6_32;
-
-       ld.volatile.shared.f64  %fd39, [%rd1+16];
-       min.f64         %fd63, %fd63, %fd39;
-       st.volatile.shared.f64  [%rd1], %fd63;
-
-BB6_32:
-       setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB6_34;
-
-       ld.volatile.shared.f64  %fd40, [%rd1+8];
-       min.f64         %fd41, %fd63, %fd40;
-       st.volatile.shared.f64  [%rd1], %fd41;
-
-BB6_34:
-       setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB6_36;
-
-       ld.shared.f64   %fd42, [sdata];
-       mul.wide.u32    %rd10, %r7, 8;
-       add.s64         %rd11, %rd3, %rd10;
-       st.f64  [%rd11], %fd42;
-
-BB6_36:
-       ret;
-}
-
-       // .globl       _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d
-.visible .func _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d(
-       .param .b64 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_0,
-       .param .b64 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_1,
-       .param .b32 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_2,
-       .param .b32 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_3,
-       .param .align 1 .b8 
_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1],
-       .param .align 1 .b8 
_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1],
-       .param .b64 _Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6
-)
-{
-       .reg .pred      %p<20>;
-       .reg .b32       %r<29>;
-       .reg .f64       %fd<41>;
-       .reg .b64       %rd<10>;
-
-
-       ld.param.u64    %rd2, 
[_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_0];
-       ld.param.u64    %rd3, 
[_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_1];
-       ld.param.u32    %r7, 
[_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_2];
-       ld.param.u32    %r6, 
[_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_3];
-       ld.param.f64    %fd40, 
[_Z10reduce_rowI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
-       mov.u32         %r1, %ctaid.x;
-       setp.ge.u32     %p1, %r1, %r7;
-       @%p1 bra        BB7_34;
-
-       mov.u32         %r28, %tid.x;
-       mul.lo.s32      %r3, %r1, %r6;
-       setp.ge.u32     %p2, %r28, %r6;
-       @%p2 bra        BB7_3;
-
-BB7_2:
-       add.s32         %r8, %r28, %r3;
-       mul.wide.u32    %rd4, %r8, 8;
-       add.s64         %rd5, %rd2, %rd4;
-       ld.f64  %fd27, [%rd5];
-       min.f64         %fd40, %fd40, %fd27;
-       mov.u32         %r9, %ntid.x;
-       add.s32         %r28, %r9, %r28;
-       setp.lt.u32     %p3, %r28, %r6;
-       @%p3 bra        BB7_2;
-
-BB7_3:
-       mov.u32         %r10, %tid.x;
-       mul.wide.u32    %rd6, %r10, 8;
-       mov.u64         %rd7, sdata;
-       add.s64         %rd1, %rd7, %rd6;
-       st.shared.f64   [%rd1], %fd40;
-       bar.sync        0;
-       mov.u32         %r11, %ntid.x;
-       setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB7_7;
-
-       setp.gt.u32     %p5, %r10, 511;
-       @%p5 bra        BB7_6;
-
-       ld.shared.f64   %fd28, [%rd1+4096];
-       min.f64         %fd40, %fd40, %fd28;
-       st.shared.f64   [%rd1], %fd40;
-
-BB7_6:
-       bar.sync        0;
-
-BB7_7:
-       setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB7_11;
-
-       setp.gt.u32     %p7, %r10, 255;
-       @%p7 bra        BB7_10;
-
-       ld.shared.f64   %fd29, [%rd1+2048];
-       min.f64         %fd40, %fd40, %fd29;
-       st.shared.f64   [%rd1], %fd40;
-
-BB7_10:
-       bar.sync        0;
-
-BB7_11:
-       setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB7_15;
-
-       setp.gt.u32     %p9, %r10, 127;
-       @%p9 bra        BB7_14;
-
-       ld.shared.f64   %fd30, [%rd1+1024];
-       min.f64         %fd40, %fd40, %fd30;
-       st.shared.f64   [%rd1], %fd40;
-
-BB7_14:
-       bar.sync        0;
-
-BB7_15:
-       setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB7_19;
-
-       setp.gt.u32     %p11, %r10, 63;
-       @%p11 bra       BB7_18;
-
-       ld.shared.f64   %fd31, [%rd1+512];
-       min.f64         %fd40, %fd40, %fd31;
-       st.shared.f64   [%rd1], %fd40;
-
-BB7_18:
-       bar.sync        0;
-
-BB7_19:
-       setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB7_32;
-
-       setp.lt.u32     %p13, %r11, 64;
-       @%p13 bra       BB7_22;
-
-       ld.volatile.shared.f64  %fd32, [%rd1+256];
-       min.f64         %fd40, %fd40, %fd32;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB7_22:
-       setp.lt.u32     %p14, %r11, 32;
-       @%p14 bra       BB7_24;
-
-       ld.volatile.shared.f64  %fd33, [%rd1+128];
-       min.f64         %fd40, %fd40, %fd33;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB7_24:
-       setp.lt.u32     %p15, %r11, 16;
-       @%p15 bra       BB7_26;
-
-       ld.volatile.shared.f64  %fd34, [%rd1+64];
-       min.f64         %fd40, %fd40, %fd34;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB7_26:
-       setp.lt.u32     %p16, %r11, 8;
-       @%p16 bra       BB7_28;
-
-       ld.volatile.shared.f64  %fd35, [%rd1+32];
-       min.f64         %fd40, %fd40, %fd35;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB7_28:
-       setp.lt.u32     %p17, %r11, 4;
-       @%p17 bra       BB7_30;
-
-       ld.volatile.shared.f64  %fd36, [%rd1+16];
-       min.f64         %fd40, %fd40, %fd36;
-       st.volatile.shared.f64  [%rd1], %fd40;
-
-BB7_30:
-       setp.lt.u32     %p18, %r11, 2;
-       @%p18 bra       BB7_32;
-
-       ld.volatile.shared.f64  %fd37, [%rd1+8];
-       min.f64         %fd38, %fd40, %fd37;
-       st.volatile.shared.f64  [%rd1], %fd38;
-
-BB7_32:
-       setp.ne.s32     %p19, %r10, 0;
-       @%p19 bra       BB7_34;
-
-       ld.shared.f64   %fd39, [sdata];
-       mul.wide.u32    %rd8, %r1, 8;
-       add.s64         %rd9, %rd3, %rd8;
-       st.f64  [%rd9], %fd39;
-
-BB7_34:
-       ret;
-}
-
-       // .globl       _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d
-.visible .func _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d(
-       .param .b64 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_0,
-       .param .b64 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_1,
-       .param .b32 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_2,
-       .param .b32 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_3,
-       .param .align 1 .b8 
_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_4[1],
-       .param .align 1 .b8 
_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_5[1],
-       .param .b64 _Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6
-)
-{
-       .reg .pred      %p<4>;
-       .reg .b32       %r<11>;
-       .reg .f64       %fd<7>;
-       .reg .b64       %rd<7>;
-
-
-       ld.param.u64    %rd1, 
[_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_0];
-       ld.param.u64    %rd2, 
[_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_1];
-       ld.param.u32    %r5, 
[_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_2];
-       ld.param.u32    %r6, 
[_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_3];
-       ld.param.f64    %fd6, 
[_Z10reduce_colI5MinOp10IdentityOpEvPdS2_jjT_T0_d_param_6];
-       mov.u32         %r7, %ctaid.x;
-       mov.u32         %r8, %ntid.x;
-       mov.u32         %r9, %tid.x;
-       mad.lo.s32      %r1, %r8, %r7, %r9;
-       setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB8_5;
-
-       mul.lo.s32      %r2, %r6, %r5;
-       setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB8_4;
-
-       mov.u32         %r10, %r1;
-
-BB8_3:
-       mov.u32         %r3, %r10;
-       mul.wide.u32    %rd3, %r3, 8;
-       add.s64         %rd4, %rd1, %rd3;
-       ld.f64  %fd5, [%rd4];
-       min.f64         %fd6, %fd6, %fd5;
-       add.s32         %r4, %r3, %r6;
-       setp.lt.u32     %p3, %r4, %r2;
-       mov.u32         %r10, %r4;
-       @%p3 bra        BB8_3;
-
-BB8_4:
-       mul.wide.u32    %rd5, %r1, 8;
-       add.s64         %rd6, %rd2, %rd5;
-       st.f64  [%rd6], %fd6;
-
-BB8_5:
-       ret;
-}
-
-       // .globl       _Z6reduceI9ProductOpEvPdS1_jT_d
-.visible .func _Z6reduceI9ProductOpEvPdS1_jT_d(
-       .param .b64 _Z6reduceI9ProductOpEvPdS1_jT_d_param_0,
-       .param .b64 _Z6reduceI9ProductOpEvPdS1_jT_d_param_1,
-       .param .b32 _Z6reduceI9ProductOpEvPdS1_jT_d_param_2,
-       .param .align 1 .b8 _Z6reduceI9ProductOpEvPdS1_jT_d_param_3[1],
-       .param .b64 _Z6reduceI9ProductOpEvPdS1_jT_d_param_4
-)
-{
-       .reg .pred      %p<20>;
-       .reg .b32       %r<33>;
-       .reg .f64       %fd<79>;
-       .reg .b64       %rd<12>;
-
-
-       ld.param.u64    %rd2, [_Z6reduceI9ProductOpEvPdS1_jT_d_param_0];
-       ld.param.u64    %rd3, [_Z6reduceI9ProductOpEvPdS1_jT_d_param_1];
-       ld.param.u32    %r5, [_Z6reduceI9ProductOpEvPdS1_jT_d_param_2];
-       ld.param.f64    %fd76, [_Z6reduceI9ProductOpEvPdS1_jT_d_param_4];
-       mov.u32         %r6, %tid.x;
-       mov.u32         %r7, %ctaid.x;
-       shl.b32         %r8, %r7, 1;
-       mov.u32         %r9, %ntid.x;
-       mad.lo.s32      %r32, %r8, %r9, %r6;
-       setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB9_5;
-
-       mov.f64         %fd77, %fd76;
-
-BB9_2:
-       mov.f64         %fd1, %fd77;
-       mul.wide.u32    %rd4, %r32, 8;
-       add.s64         %rd5, %rd2, %rd4;
-       ld.f64  %fd29, [%rd5];
-       mul.f64         %fd78, %fd1, %fd29;
-       add.s32         %r3, %r32, %r9;
-       setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB9_4;
-
-       mul.wide.u32    %rd6, %r3, 8;
-       add.s64         %rd7, %rd2, %rd6;
-       ld.f64  %fd30, [%rd7];
-       mul.f64         %fd78, %fd78, %fd30;
-
-BB9_4:
-       mov.f64         %fd77, %fd78;
-       shl.b32         %r12, %r9, 1;
-       mov.u32         %r13, %nctaid.x;
-       mad.lo.s32      %r32, %r12, %r13, %r32;
-       setp.lt.u32     %p3, %r32, %r5;
-       mov.f64         %fd76, %fd77;
-       @%p3 bra        BB9_2;
-
-BB9_5:
-       mov.f64         %fd74, %fd76;
-       mul.wide.u32    %rd8, %r6, 8;
-       mov.u64         %rd9, sdata;
-       add.s64         %rd1, %rd9, %rd8;
-       st.shared.f64   [%rd1], %fd74;
-       bar.sync        0;
-       setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB9_9;
-
-       setp.gt.u32     %p5, %r6, 511;
-       mov.f64         %fd75, %fd74;
-       @%p5 bra        BB9_8;
-
-       ld.shared.f64   %fd31, [%rd1+4096];
-       mul.f64         %fd75, %fd74, %fd31;
-       st.shared.f64   [%rd1], %fd75;
-
-BB9_8:
-       mov.f64         %fd74, %fd75;
-       bar.sync        0;
-
-BB9_9:
-       mov.f64         %fd72, %fd74;
-       setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB9_13;
-
-       setp.gt.u32     %p7, %r6, 255;
-       mov.f64         %fd73, %fd72;
-       @%p7 bra        BB9_12;
-
-       ld.shared.f64   %fd32, [%rd1+2048];
-       mul.f64         %fd73, %fd72, %fd32;
-       st.shared.f64   [%rd1], %fd73;
-
-BB9_12:
-       mov.f64         %fd72, %fd73;
-       bar.sync        0;
-
-BB9_13:
-       mov.f64         %fd70, %fd72;
-       setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB9_17;
-
-       setp.gt.u32     %p9, %r6, 127;
-       mov.f64         %fd71, %fd70;
-       @%p9 bra        BB9_16;
-
-       ld.shared.f64   %fd33, [%rd1+1024];
-       mul.f64         %fd71, %fd70, %fd33;
-       st.shared.f64   [%rd1], %fd71;
-
-BB9_16:
-       mov.f64         %fd70, %fd71;
-       bar.sync        0;
-
-BB9_17:
-       mov.f64         %fd68, %fd70;
-       setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB9_21;
-
-       setp.gt.u32     %p11, %r6, 63;
-       mov.f64         %fd69, %fd68;
-       @%p11 bra       BB9_20;
-
-       ld.shared.f64   %fd34, [%rd1+512];
-       mul.f64         %fd69, %fd68, %fd34;
-       st.shared.f64   [%rd1], %fd69;
-
-BB9_20:
-       mov.f64         %fd68, %fd69;
-       bar.sync        0;
-
-BB9_21:
-       mov.f64         %fd67, %fd68;
-       setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB9_34;
-
-       setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB9_24;
-
-       ld.volatile.shared.f64  %fd35, [%rd1+256];
-       mul.f64         %fd67, %fd67, %fd35;
-       st.volatile.shared.f64  [%rd1], %fd67;
-
-BB9_24:
-       mov.f64         %fd66, %fd67;
-       setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB9_26;
-
-       ld.volatile.shared.f64  %fd36, [%rd1+128];
-       mul.f64         %fd66, %fd66, %fd36;
-       st.volatile.shared.f64  [%rd1], %fd66;
-
-BB9_26:
-       mov.f64         %fd65, %fd66;
-       setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB9_28;
-
-       ld.volatile.shared.f64  %fd37, [%rd1+64];
-       mul.f64         %fd65, %fd65, %fd37;
-       st.volatile.shared.f64  [%rd1], %fd65;
-
-BB9_28:
-       mov.f64         %fd64, %fd65;
-       setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB9_30;
-
-       ld.volatile.shared.f64  %fd38, [%rd1+32];
-       mul.f64         %fd64, %fd64, %fd38;
-       st.volatile.shared.f64  [%rd1], %fd64;
-
-BB9_30:
-       mov.f64         %fd63, %fd64;
-       setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB9_32;
-
-       ld.volatile.shared.f64  %fd39, [%rd1+16];
-       mul.f64         %fd63, %fd63, %fd39;
-       st.volatile.shared.f64  [%rd1], %fd63;
-
-BB9_32:
-       setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB9_34;
-
-       ld.volatile.shared.f64  %fd40, [%rd1+8];
-       mul.f64         %fd41, %fd63, %fd40;
-       st.volatile.shared.f64  [%rd1], %fd41;
-
-BB9_34:
-       setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB9_36;
-
-       ld.shared.f64   %fd42, [sdata];
-       mul.wide.u32    %rd10, %r7, 8;
-       add.s64         %rd11, %rd3, %rd10;
-       st.f64  [%rd11], %fd42;
-
-BB9_36:
-       ret;
-}
-
-       // .globl       _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d
-.visible .func _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d(
-       .param .b64 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_0,
-       .param .b64 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_1,
-       .param .b32 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_2,
-       .param .b32 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_3,
-       .param .align 1 .b8 
_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_4[1],
-       .param .align 8 .b8 
_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_5[8],
-       .param .b64 _Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6
-)
-{
-       .reg .pred      %p<20>;
-       .reg .b32       %r<30>;
-       .reg .f64       %fd<43>;
-       .reg .b64       %rd<11>;
-
-
-       ld.param.u64    %rd2, 
[_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_0];
-       ld.param.u64    %rd3, 
[_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_1];
-       ld.param.u32    %r6, 
[_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_2];
-       ld.param.u32    %r5, 
[_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_3];
-       ld.param.u64    %rd4, 
[_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_5];
-       ld.param.f64    %fd42, 
[_Z10reduce_rowI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6];
-       mov.u32         %r7, %ctaid.x;
-       setp.ge.u32     %p1, %r7, %r6;
-       @%p1 bra        BB10_34;
-
-       mov.u32         %r29, %tid.x;
-       mul.lo.s32      %r2, %r7, %r5;
-       setp.ge.u32     %p2, %r29, %r5;
-       @%p2 bra        BB10_3;
-
-BB10_2:
-       add.s32         %r9, %r29, %r2;
-       mul.wide.u32    %rd5, %r9, 8;
-       add.s64         %rd6, %rd2, %rd5;
-       ld.f64  %fd27, [%rd6];
-       add.f64         %fd42, %fd42, %fd27;
-       mov.u32         %r10, %ntid.x;
-       add.s32         %r29, %r10, %r29;
-       setp.lt.u32     %p3, %r29, %r5;
-       @%p3 bra        BB10_2;
-
-BB10_3:
-       mov.u32         %r11, %tid.x;
-       mul.wide.u32    %rd7, %r11, 8;
-       mov.u64         %rd8, sdata;
-       add.s64         %rd1, %rd8, %rd7;
-       st.shared.f64   [%rd1], %fd42;
-       bar.sync        0;
-       mov.u32         %r12, %ntid.x;
-       setp.lt.u32     %p4, %r12, 1024;
-       @%p4 bra        BB10_7;
-
-       setp.gt.u32     %p5, %r11, 511;
-       @%p5 bra        BB10_6;
-
-       ld.shared.f64   %fd28, [%rd1+4096];
-       add.f64         %fd42, %fd42, %fd28;
-       st.shared.f64   [%rd1], %fd42;
-
-BB10_6:
-       bar.sync        0;
-
-BB10_7:
-       setp.lt.u32     %p6, %r12, 512;
-       @%p6 bra        BB10_11;
-
-       setp.gt.u32     %p7, %r11, 255;
-       @%p7 bra        BB10_10;
-
-       ld.shared.f64   %fd29, [%rd1+2048];
-       add.f64         %fd42, %fd42, %fd29;
-       st.shared.f64   [%rd1], %fd42;
-
-BB10_10:
-       bar.sync        0;
-
-BB10_11:
-       setp.lt.u32     %p8, %r12, 256;
-       @%p8 bra        BB10_15;
-
-       setp.gt.u32     %p9, %r11, 127;
-       @%p9 bra        BB10_14;
-
-       ld.shared.f64   %fd30, [%rd1+1024];
-       add.f64         %fd42, %fd42, %fd30;
-       st.shared.f64   [%rd1], %fd42;
-
-BB10_14:
-       bar.sync        0;
-
-BB10_15:
-       setp.lt.u32     %p10, %r12, 128;
-       @%p10 bra       BB10_19;
-
-       setp.gt.u32     %p11, %r11, 63;
-       @%p11 bra       BB10_18;
-
-       ld.shared.f64   %fd31, [%rd1+512];
-       add.f64         %fd42, %fd42, %fd31;
-       st.shared.f64   [%rd1], %fd42;
-
-BB10_18:
-       bar.sync        0;
-
-BB10_19:
-       setp.gt.u32     %p12, %r11, 31;
-       @%p12 bra       BB10_32;
-
-       setp.lt.u32     %p13, %r12, 64;
-       @%p13 bra       BB10_22;
-
-       ld.volatile.shared.f64  %fd32, [%rd1+256];
-       add.f64         %fd42, %fd42, %fd32;
-       st.volatile.shared.f64  [%rd1], %fd42;
-
-BB10_22:
-       setp.lt.u32     %p14, %r12, 32;
-       @%p14 bra       BB10_24;
-
-       ld.volatile.shared.f64  %fd33, [%rd1+128];
-       add.f64         %fd42, %fd42, %fd33;
-       st.volatile.shared.f64  [%rd1], %fd42;
-
-BB10_24:
-       setp.lt.u32     %p15, %r12, 16;
-       @%p15 bra       BB10_26;
-
-       ld.volatile.shared.f64  %fd34, [%rd1+64];
-       add.f64         %fd42, %fd42, %fd34;
-       st.volatile.shared.f64  [%rd1], %fd42;
-
-BB10_26:
-       setp.lt.u32     %p16, %r12, 8;
-       @%p16 bra       BB10_28;
-
-       ld.volatile.shared.f64  %fd35, [%rd1+32];
-       add.f64         %fd42, %fd42, %fd35;
-       st.volatile.shared.f64  [%rd1], %fd42;
-
-BB10_28:
-       setp.lt.u32     %p17, %r12, 4;
-       @%p17 bra       BB10_30;
-
-       ld.volatile.shared.f64  %fd36, [%rd1+16];
-       add.f64         %fd42, %fd42, %fd36;
-       st.volatile.shared.f64  [%rd1], %fd42;
-
-BB10_30:
-       setp.lt.u32     %p18, %r12, 2;
-       @%p18 bra       BB10_32;
-
-       ld.volatile.shared.f64  %fd37, [%rd1+8];
-       add.f64         %fd38, %fd42, %fd37;
-       st.volatile.shared.f64  [%rd1], %fd38;
-
-BB10_32:
-       setp.ne.s32     %p19, %r11, 0;
-       @%p19 bra       BB10_34;
-
-       ld.shared.f64   %fd39, [sdata];
-       cvt.rn.f64.s64  %fd40, %rd4;
-       div.rn.f64      %fd41, %fd39, %fd40;
-       mul.wide.u32    %rd9, %r7, 8;
-       add.s64         %rd10, %rd3, %rd9;
-       st.f64  [%rd10], %fd41;
-
-BB10_34:
-       ret;
-}
-
-       // .globl       _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d
-.visible .func _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d(
-       .param .b64 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_0,
-       .param .b64 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_1,
-       .param .b32 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_2,
-       .param .b32 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_3,
-       .param .align 1 .b8 
_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_4[1],
-       .param .align 8 .b8 
_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_5[8],
-       .param .b64 _Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6
-)
-{
-       .reg .pred      %p<4>;
-       .reg .b32       %r<11>;
-       .reg .f64       %fd<9>;
-       .reg .b64       %rd<8>;
-
-
-       ld.param.u64    %rd1, 
[_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_0];
-       ld.param.u64    %rd2, 
[_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_1];
-       ld.param.u32    %r5, 
[_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_2];
-       ld.param.u32    %r6, 
[_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_3];
-       ld.param.u64    %rd3, 
[_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_5];
-       ld.param.f64    %fd8, 
[_Z10reduce_colI5SumOp6MeanOpEvPdS2_jjT_T0_d_param_6];
-       mov.u32         %r7, %ntid.x;
-       mov.u32         %r8, %ctaid.x;
-       mov.u32         %r9, %tid.x;
-       mad.lo.s32      %r1, %r7, %r8, %r9;
-       setp.ge.u32     %p1, %r1, %r6;
-       @%p1 bra        BB11_5;
-
-       mul.lo.s32      %r2, %r6, %r5;
-       setp.ge.u32     %p2, %r1, %r2;
-       @%p2 bra        BB11_4;
-
-       mov.u32         %r10, %r1;
-
-BB11_3:
-       mov.u32         %r3, %r10;
-       mul.wide.u32    %rd4, %r3, 8;
-       add.s64         %rd5, %rd1, %rd4;
-       ld.f64  %fd5, [%rd5];
-       add.f64         %fd8, %fd8, %fd5;
-       add.s32         %r4, %r3, %r6;
-       setp.lt.u32     %p3, %r4, %r2;
-       mov.u32         %r10, %r4;
-       @%p3 bra        BB11_3;
-
-BB11_4:
-       cvt.rn.f64.s64  %fd6, %rd3;
-       div.rn.f64      %fd7, %fd8, %fd6;
-       mul.wide.u32    %rd6, %r1, 8;
-       add.s64         %rd7, %rd2, %rd6;
-       st.f64  [%rd7], %fd7;
-
-BB11_5:
-       ret;
-}
-
-       // .globl       copyUpperToLowerTriangleDense
-.visible .entry copyUpperToLowerTriangleDense(
-       .param .u64 copyUpperToLowerTriangleDense_param_0,
-       .param .u32 copyUpperToLowerTriangleDense_param_1,
-       .param .u32 copyUpperToLowerTriangleDense_param_2
+.visible .entry copy_u2l_dense(
+       .param .u64 copy_u2l_dense_param_0,
+       .param .u32 copy_u2l_dense_param_1,
+       .param .u32 copy_u2l_dense_param_2
 )
 {
        .reg .pred      %p<4>;
@@ -1713,9 +31,9 @@ BB11_5:
        .reg .b64       %rd<7>;
 
 
-       ld.param.u64    %rd1, [copyUpperToLowerTriangleDense_param_0];
-       ld.param.u32    %r4, [copyUpperToLowerTriangleDense_param_1];
-       ld.param.u32    %r5, [copyUpperToLowerTriangleDense_param_2];
+       ld.param.u64    %rd1, [copy_u2l_dense_param_0];
+       ld.param.u32    %r4, [copy_u2l_dense_param_1];
+       ld.param.u32    %r5, [copy_u2l_dense_param_2];
        mov.u32         %r6, %ntid.x;
        mov.u32         %r7, %ctaid.x;
        mov.u32         %r8, %tid.x;
@@ -1728,10 +46,10 @@ BB11_5:
        setp.gt.s32     %p1, %r2, %r1;
        setp.lt.s32     %p2, %r3, %r5;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB12_2;
-       bra.uni         BB12_1;
+       @!%p3 bra       BB0_2;
+       bra.uni         BB0_1;
 
-BB12_1:
+BB0_1:
        cvta.to.global.u64      %rd2, %rd1;
        mad.lo.s32      %r12, %r1, %r4, %r2;
        mul.wide.s32    %rd3, %r12, 8;
@@ -1741,93 +59,7 @@ BB12_1:
        add.s64         %rd6, %rd2, %rd5;
        st.global.f64   [%rd6], %fd1;
 
-BB12_2:
-       ret;
-}
-
-       // .globl       dense_matrix_set
-.visible .entry dense_matrix_set(
-       .param .u64 dense_matrix_set_param_0,
-       .param .f64 dense_matrix_set_param_1,
-       .param .u32 dense_matrix_set_param_2,
-       .param .u32 dense_matrix_set_param_3
-)
-{
-       .reg .pred      %p<2>;
-       .reg .b32       %r<13>;
-       .reg .f64       %fd<2>;
-       .reg .b64       %rd<5>;
-
-
-       ld.param.u64    %rd1, [dense_matrix_set_param_0];
-       ld.param.f64    %fd1, [dense_matrix_set_param_1];
-       ld.param.u32    %r2, [dense_matrix_set_param_2];
-       ld.param.u32    %r3, [dense_matrix_set_param_3];
-       mov.u32         %r4, %ctaid.x;
-       mov.u32         %r5, %ntid.x;
-       mov.u32         %r6, %tid.x;
-       mad.lo.s32      %r7, %r5, %r4, %r6;
-       mov.u32         %r8, %ntid.y;
-       mov.u32         %r9, %ctaid.y;
-       mov.u32         %r10, %tid.y;
-       mad.lo.s32      %r11, %r7, %r3, %r10;
-       mad.lo.s32      %r1, %r8, %r9, %r11;
-       mul.lo.s32      %r12, %r3, %r2;
-       setp.ge.s32     %p1, %r1, %r12;
-       @%p1 bra        BB13_2;
-
-       cvta.to.global.u64      %rd2, %rd1;
-       mul.wide.s32    %rd3, %r1, 8;
-       add.s64         %rd4, %rd2, %rd3;
-       st.global.f64   [%rd4], %fd1;
-
-BB13_2:
-       ret;
-}
-
-       // .globl       dense_matrix_copy
-.visible .entry dense_matrix_copy(
-       .param .u64 dense_matrix_copy_param_0,
-       .param .u64 dense_matrix_copy_param_1,
-       .param .u32 dense_matrix_copy_param_2,
-       .param .u32 dense_matrix_copy_param_3
-)
-{
-       .reg .pred      %p<4>;
-       .reg .b32       %r<12>;
-       .reg .f64       %fd<2>;
-       .reg .b64       %rd<8>;
-
-
-       ld.param.u64    %rd1, [dense_matrix_copy_param_0];
-       ld.param.u64    %rd2, [dense_matrix_copy_param_1];
-       ld.param.u32    %r2, [dense_matrix_copy_param_2];
-       ld.param.u32    %r3, [dense_matrix_copy_param_3];
-       mov.u32         %r4, %ctaid.x;
-       mov.u32         %r5, %ntid.x;
-       mov.u32         %r6, %tid.x;
-       mad.lo.s32      %r7, %r5, %r4, %r6;
-       mov.u32         %r8, %ntid.y;
-       mov.u32         %r9, %ctaid.y;
-       mov.u32         %r10, %tid.y;
-       mad.lo.s32      %r11, %r8, %r9, %r10;
-       mad.lo.s32      %r1, %r7, %r3, %r11;
-       setp.lt.s32     %p1, %r7, %r2;
-       setp.lt.s32     %p2, %r11, %r3;
-       and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB14_2;
-       bra.uni         BB14_1;
-
-BB14_1:
-       cvta.to.global.u64      %rd3, %rd1;
-       mul.wide.s32    %rd4, %r1, 8;
-       add.s64         %rd5, %rd3, %rd4;
-       ld.global.f64   %fd1, [%rd5];
-       cvta.to.global.u64      %rd6, %rd2;
-       add.s64         %rd7, %rd6, %rd4;
-       st.global.f64   [%rd7], %fd1;
-
-BB14_2:
+BB0_2:
        ret;
 }
 
@@ -1860,10 +92,10 @@ BB14_2:
        setp.lt.s32     %p1, %r1, %r4;
        setp.lt.s32     %p2, %r2, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB15_2;
-       bra.uni         BB15_1;
+       @!%p3 bra       BB1_2;
+       bra.uni         BB1_1;
 
-BB15_1:
+BB1_1:
        cvta.to.global.u64      %rd3, %rd1;
        mad.lo.s32      %r11, %r1, %r3, %r2;
        mul.wide.s32    %rd4, %r11, 8;
@@ -1875,75 +107,78 @@ BB15_1:
        add.s64         %rd7, %rd6, %rd4;
        st.global.f64   [%rd7], %fd3;
 
-BB15_2:
+BB1_2:
        ret;
 }
 
-       // .globl       reluBackward
-.visible .entry reluBackward(
-       .param .u64 reluBackward_param_0,
-       .param .u64 reluBackward_param_1,
-       .param .u64 reluBackward_param_2,
-       .param .u32 reluBackward_param_3,
-       .param .u32 reluBackward_param_4
+       // .globl       relu_backward
+.visible .entry relu_backward(
+       .param .u64 relu_backward_param_0,
+       .param .u64 relu_backward_param_1,
+       .param .u64 relu_backward_param_2,
+       .param .u32 relu_backward_param_3,
+       .param .u32 relu_backward_param_4
 )
 {
        .reg .pred      %p<5>;
        .reg .b32       %r<12>;
        .reg .f64       %fd<6>;
-       .reg .b64       %rd<13>;
+       .reg .b64       %rd<14>;
 
 
-       ld.param.u64    %rd1, [reluBackward_param_0];
-       ld.param.u64    %rd2, [reluBackward_param_1];
-       ld.param.u64    %rd3, [reluBackward_param_2];
-       ld.param.u32    %r5, [reluBackward_param_3];
-       ld.param.u32    %r4, [reluBackward_param_4];
-       mov.u32         %r6, %ntid.x;
-       mov.u32         %r7, %ctaid.x;
-       mov.u32         %r8, %tid.x;
-       mad.lo.s32      %r1, %r6, %r7, %r8;
-       mov.u32         %r9, %ntid.y;
-       mov.u32         %r10, %ctaid.y;
-       mov.u32         %r11, %tid.y;
-       mad.lo.s32      %r2, %r9, %r10, %r11;
-       setp.lt.s32     %p1, %r1, %r5;
-       setp.lt.s32     %p2, %r2, %r4;
+       ld.param.u64    %rd2, [relu_backward_param_0];
+       ld.param.u64    %rd3, [relu_backward_param_1];
+       ld.param.u64    %rd4, [relu_backward_param_2];
+       ld.param.u32    %r4, [relu_backward_param_3];
+       ld.param.u32    %r3, [relu_backward_param_4];
+       mov.u32         %r5, %ntid.x;
+       mov.u32         %r6, %ctaid.x;
+       mov.u32         %r7, %tid.x;
+       mad.lo.s32      %r1, %r5, %r6, %r7;
+       mov.u32         %r8, %ntid.y;
+       mov.u32         %r9, %ctaid.y;
+       mov.u32         %r10, %tid.y;
+       mad.lo.s32      %r2, %r8, %r9, %r10;
+       setp.lt.s32     %p1, %r1, %r4;
+       setp.lt.s32     %p2, %r2, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB16_4;
-       bra.uni         BB16_1;
+       @!%p3 bra       BB2_4;
+       bra.uni         BB2_1;
 
-BB16_1:
-       cvta.to.global.u64      %rd4, %rd1;
-       mad.lo.s32      %r3, %r1, %r4, %r2;
-       mul.wide.s32    %rd5, %r3, 8;
-       add.s64         %rd6, %rd4, %rd5;
-       ld.global.f64   %fd4, [%rd6];
+BB2_1:
+       cvta.to.global.u64      %rd5, %rd2;
+       mad.lo.s32      %r11, %r1, %r3, %r2;
+       cvt.s64.s32     %rd1, %r11;
+       mul.wide.s32    %rd6, %r11, 8;
+       add.s64         %rd7, %rd5, %rd6;
+       ld.global.f64   %fd4, [%rd7];
        mov.f64         %fd5, 0d0000000000000000;
        setp.leu.f64    %p4, %fd4, 0d0000000000000000;
-       @%p4 bra        BB16_3;
+       @%p4 bra        BB2_3;
 
-       cvta.to.global.u64      %rd7, %rd2;
-       add.s64         %rd9, %rd7, %rd5;
-       ld.global.f64   %fd5, [%rd9];
+       cvta.to.global.u64      %rd8, %rd3;
+       shl.b64         %rd9, %rd1, 3;
+       add.s64         %rd10, %rd8, %rd9;
+       ld.global.f64   %fd5, [%rd10];
 
-BB16_3:
-       cvta.to.global.u64      %rd10, %rd3;
-       add.s64         %rd12, %rd10, %rd5;
-       st.global.f64   [%rd12], %fd5;
+BB2_3:
+       cvta.to.global.u64      %rd11, %rd4;
+       shl.b64         %rd12, %rd1, 3;
+       add.s64         %rd13, %rd11, %rd12;
+       st.global.f64   [%rd13], %fd5;
 
-BB16_4:
+BB2_4:
        ret;
 }
 
-       // .globl       biasAdd
-.visible .entry biasAdd(
-       .param .u64 biasAdd_param_0,
-       .param .u64 biasAdd_param_1,
-       .param .u64 biasAdd_param_2,
-       .param .u32 biasAdd_param_3,
-       .param .u32 biasAdd_param_4,
-       .param .u32 biasAdd_param_5
+       // .globl       bias_add
+.visible .entry bias_add(
+       .param .u64 bias_add_param_0,
+       .param .u64 bias_add_param_1,
+       .param .u64 bias_add_param_2,
+       .param .u32 bias_add_param_3,
+       .param .u32 bias_add_param_4,
+       .param .u32 bias_add_param_5
 )
 {
        .reg .pred      %p<4>;
@@ -1952,12 +187,12 @@ BB16_4:
        .reg .b64       %rd<12>;
 
 
-       ld.param.u64    %rd1, [biasAdd_param_0];
-       ld.param.u64    %rd2, [biasAdd_param_1];
-       ld.param.u64    %rd3, [biasAdd_param_2];
-       ld.param.u32    %r5, [biasAdd_param_3];
-       ld.param.u32    %r3, [biasAdd_param_4];
-       ld.param.u32    %r4, [biasAdd_param_5];
+       ld.param.u64    %rd1, [bias_add_param_0];
+       ld.param.u64    %rd2, [bias_add_param_1];
+       ld.param.u64    %rd3, [bias_add_param_2];
+       ld.param.u32    %r5, [bias_add_param_3];
+       ld.param.u32    %r3, [bias_add_param_4];
+       ld.param.u32    %r4, [bias_add_param_5];
        mov.u32         %r6, %ctaid.x;
        mov.u32         %r7, %ntid.x;
        mov.u32         %r8, %tid.x;
@@ -1969,10 +204,10 @@ BB16_4:
        setp.lt.s32     %p1, %r1, %r5;
        setp.lt.s32     %p2, %r2, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB17_2;
-       bra.uni         BB17_1;
+       @!%p3 bra       BB3_2;
+       bra.uni         BB3_1;
 
-BB17_1:
+BB3_1:
        cvta.to.global.u64      %rd4, %rd1;
        mad.lo.s32      %r12, %r1, %r3, %r2;
        mul.wide.s32    %rd5, %r12, 8;
@@ -1988,21 +223,21 @@ BB17_1:
        add.s64         %rd11, %rd10, %rd5;
        st.global.f64   [%rd11], %fd3;
 
-BB17_2:
+BB3_2:
        ret;
 }
 
-       // .globl       compareAndSet
-.visible .entry compareAndSet(
-       .param .u64 compareAndSet_param_0,
-       .param .u64 compareAndSet_param_1,
-       .param .u32 compareAndSet_param_2,
-       .param .u32 compareAndSet_param_3,
-       .param .f64 compareAndSet_param_4,
-       .param .f64 compareAndSet_param_5,
-       .param .f64 compareAndSet_param_6,
-       .param .f64 compareAndSet_param_7,
-       .param .f64 compareAndSet_param_8
+       // .globl       compare_and_set
+.visible .entry compare_and_set(
+       .param .u64 compare_and_set_param_0,
+       .param .u64 compare_and_set_param_1,
+       .param .u32 compare_and_set_param_2,
+       .param .u32 compare_and_set_param_3,
+       .param .f64 compare_and_set_param_4,
+       .param .f64 compare_and_set_param_5,
+       .param .f64 compare_and_set_param_6,
+       .param .f64 compare_and_set_param_7,
+       .param .f64 compare_and_set_param_8
 )
 {
        .reg .pred      %p<6>;
@@ -2011,15 +246,15 @@ BB17_2:
        .reg .b64       %rd<8>;
 
 
-       ld.param.u64    %rd2, [compareAndSet_param_0];
-       ld.param.u64    %rd3, [compareAndSet_param_1];
-       ld.param.u32    %r2, [compareAndSet_param_2];
-       ld.param.u32    %r3, [compareAndSet_param_3];
-       ld.param.f64    %fd2, [compareAndSet_param_4];
-       ld.param.f64    %fd3, [compareAndSet_param_5];
-       ld.param.f64    %fd4, [compareAndSet_param_6];
-       ld.param.f64    %fd5, [compareAndSet_param_7];
-       ld.param.f64    %fd6, [compareAndSet_param_8];
+       ld.param.u64    %rd2, [compare_and_set_param_0];
+       ld.param.u64    %rd3, [compare_and_set_param_1];
+       ld.param.u32    %r2, [compare_and_set_param_2];
+       ld.param.u32    %r3, [compare_and_set_param_3];
+       ld.param.f64    %fd2, [compare_and_set_param_4];
+       ld.param.f64    %fd3, [compare_and_set_param_5];
+       ld.param.f64    %fd4, [compare_and_set_param_6];
+       ld.param.f64    %fd5, [compare_and_set_param_7];
+       ld.param.f64    %fd6, [compare_and_set_param_8];
        mov.u32         %r4, %ctaid.x;
        mov.u32         %r5, %ntid.x;
        mov.u32         %r6, %tid.x;
@@ -2032,10 +267,10 @@ BB17_2:
        setp.lt.s32     %p1, %r7, %r2;
        setp.lt.s32     %p2, %r11, %r3;
        and.pred        %p3, %p1, %p2;
-       @!%p3 bra       BB18_6;
-       bra.uni         BB18_1;
+       @!%p3 bra       BB4_6;
+       bra.uni         BB4_1;
 
-BB18_1:
+BB4_1:
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.s32    %rd5, %r1, 8;
        add.s64         %rd6, %rd4, %rd5;
@@ -2045,26 +280,26 @@ BB18_1:
        setp.lt.f64     %p4, %fd8, %fd3;
        cvta.to.global.u64      %rd7, %rd3;
        add.s64         %rd1, %rd7, %rd5;
-       @%p4 bra        BB18_5;
-       bra.uni         BB18_2;
+       @%p4 bra        BB4_5;
+       bra.uni         BB4_2;
 
-BB18_5:
+BB4_5:
        st.global.f64   [%rd1], %fd4;
-       bra.uni         BB18_6;
+       bra.uni         BB4_6;
 
-BB18_2:
+BB4_2:
        setp.lt.f64     %p5, %fd1, %fd2;
-       @%p5 bra        BB18_4;
-       bra.uni         BB18_3;
+       @%p5 bra        BB4_4;
+       bra.uni         BB4_3;
 
-BB18_4:
+BB4_4:
        st.global.f64   [%rd1], %fd5;
-       bra.uni         BB18_6;
+       bra.uni         BB4_6;
 
-BB18_3:
+BB4_3:
        st.global.f64   [%rd1], %fd6;
 
-BB18_6:
+BB4_6:
        ret;
 }
 
@@ -2080,9 +315,9 @@ BB18_6:
        .param .u32 matrix_matrix_cellwise_op_param_7
 )
 {
-       .reg .pred      %p<52>;
-       .reg .b32       %r<56>;
-       .reg .f64       %fd<40>;
+       .reg .pred      %p<54>;
+       .reg .b32       %r<55>;
+       .reg .f64       %fd<39>;
        .reg .b64       %rd<15>;
 
 
@@ -2105,93 +340,93 @@ BB18_6:
        setp.lt.s32     %p2, %r1, %r14;
        setp.lt.s32     %p3, %r2, %r10;
        and.pred        %p4, %p2, %p3;
-       @!%p4 bra       BB19_55;
-       bra.uni         BB19_1;
+       @!%p4 bra       BB5_53;
+       bra.uni         BB5_1;
 
-BB19_1:
+BB5_1:
        mad.lo.s32      %r3, %r1, %r10, %r2;
        setp.eq.s32     %p5, %r11, 1;
-       mov.u32         %r54, %r1;
-       @%p5 bra        BB19_5;
+       mov.u32         %r53, %r1;
+       @%p5 bra        BB5_5;
 
        setp.ne.s32     %p6, %r11, 2;
-       mov.u32         %r55, %r3;
-       @%p6 bra        BB19_4;
+       mov.u32         %r54, %r3;
+       @%p6 bra        BB5_4;
 
-       mov.u32         %r55, %r2;
+       mov.u32         %r54, %r2;
 
-BB19_4:
-       mov.u32         %r49, %r55;
-       mov.u32         %r4, %r49;
-       mov.u32         %r54, %r4;
+BB5_4:
+       mov.u32         %r48, %r54;
+       mov.u32         %r4, %r48;
+       mov.u32         %r53, %r4;
 
-BB19_5:
-       mov.u32         %r5, %r54;
+BB5_5:
+       mov.u32         %r5, %r53;
        setp.eq.s32     %p7, %r12, 1;
-       mov.u32         %r52, %r1;
-       @%p7 bra        BB19_9;
+       mov.u32         %r51, %r1;
+       @%p7 bra        BB5_9;
 
        setp.ne.s32     %p8, %r12, 2;
-       mov.u32         %r53, %r3;
-       @%p8 bra        BB19_8;
+       mov.u32         %r52, %r3;
+       @%p8 bra        BB5_8;
 
-       mov.u32         %r53, %r2;
+       mov.u32         %r52, %r2;
 
-BB19_8:
-       mov.u32         %r52, %r53;
+BB5_8:
+       mov.u32         %r51, %r52;
 
-BB19_9:
+BB5_9:
        cvta.to.global.u64      %rd5, %rd3;
        cvta.to.global.u64      %rd6, %rd2;
        mul.wide.s32    %rd7, %r5, 8;
        add.s64         %rd8, %rd6, %rd7;
        ld.global.f64   %fd1, [%rd8];
-       mul.wide.s32    %rd9, %r52, 8;
+       mul.wide.s32    %rd9, %r51, 8;
        add.s64         %rd10, %rd5, %rd9;
        ld.global.f64   %fd2, [%rd10];
-       mov.f64         %fd39, 0d7FEFFFFFFFFFFFFF;
+       mov.f64         %fd38, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p9, %r13, 5;
-       @%p9 bra        BB19_19;
+       @%p9 bra        BB5_19;
 
        setp.gt.s32     %p19, %r13, 2;
-       @%p19 bra       BB19_15;
+       @%p19 bra       BB5_15;
 
        setp.eq.s32     %p23, %r13, 0;
-       @%p23 bra       BB19_53;
+       @%p23 bra       BB5_51;
 
        setp.eq.s32     %p24, %r13, 1;
-       @%p24 bra       BB19_52;
-       bra.uni         BB19_13;
+       @%p24 bra       BB5_50;
+       bra.uni         BB5_13;
 
-BB19_52:
-       sub.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB19_54;
+BB5_50:
+       sub.f64         %fd38, %fd1, %fd2;
+       bra.uni         BB5_52;
 
-BB19_19:
+BB5_19:
        setp.gt.s32     %p10, %r13, 8;
-       @%p10 bra       BB19_24;
+       @%p10 bra       BB5_24;
 
        setp.eq.s32     %p16, %r13, 6;
-       @%p16 bra       BB19_34;
+       @%p16 bra       BB5_34;
 
        setp.eq.s32     %p17, %r13, 7;
-       @%p17 bra       BB19_33;
-       bra.uni         BB19_22;
+       @%p17 bra       BB5_33;
+       bra.uni         BB5_22;
 
-BB19_33:
+BB5_33:
        setp.gt.f64     %p29, %fd1, %fd2;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29;
-       bra.uni         BB19_54;
+       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p29;
+       bra.uni         BB5_52;
 
-BB19_15:
+BB5_15:
        setp.eq.s32     %p20, %r13, 3;
-       @%p20 bra       BB19_51;
+       @%p20 bra       BB5_49;
 
        setp.eq.s32     %p21, %r13, 4;
-       @%p21 bra       BB19_35;
-       bra.uni         BB19_17;
+       @%p21 bra       BB5_35;
+       bra.uni         BB5_17;
 
-BB19_35:
+BB5_35:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r8}, %fd1;
@@ -2221,133 +456,133 @@ BB19_35:
        param0, 
        param1
        );
-       ld.param.f64    %fd38, [retval0+0];
+       ld.param.f64    %fd37, [retval0+0];
        
        //{
        }// Callseq End 0
        setp.lt.s32     %p33, %r8, 0;
        and.pred        %p1, %p33, %p32;
-       @!%p1 bra       BB19_37;
-       bra.uni         BB19_36;
+       @!%p1 bra       BB5_37;
+       bra.uni         BB5_36;
 
-BB19_36:
+BB5_36:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r23}, %fd38;
+       mov.b64         {%temp, %r23}, %fd37;
        }
        xor.b32         %r24, %r23, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r25, %temp}, %fd38;
+       mov.b64         {%r25, %temp}, %fd37;
        }
-       mov.b64         %fd38, {%r25, %r24};
+       mov.b64         %fd37, {%r25, %r24};
 
-BB19_37:
-       mov.f64         %fd37, %fd38;
+BB5_37:
+       mov.f64         %fd36, %fd37;
        setp.eq.f64     %p34, %fd1, 0d0000000000000000;
-       @%p34 bra       BB19_40;
-       bra.uni         BB19_38;
+       @%p34 bra       BB5_40;
+       bra.uni         BB5_38;
 
-BB19_40:
+BB5_40:
        selp.b32        %r26, %r8, 0, %p32;
        or.b32          %r27, %r26, 2146435072;
        setp.lt.s32     %p38, %r9, 0;
        selp.b32        %r28, %r27, %r26, %p38;
        mov.u32         %r29, 0;
-       mov.b64         %fd37, {%r29, %r28};
-       bra.uni         BB19_41;
+       mov.b64         %fd36, {%r29, %r28};
+       bra.uni         BB5_41;
 
-BB19_24:
+BB5_24:
        setp.gt.s32     %p11, %r13, 10;
-       @%p11 bra       BB19_28;
+       @%p11 bra       BB5_28;
 
        setp.eq.s32     %p14, %r13, 9;
-       @%p14 bra       BB19_32;
-       bra.uni         BB19_26;
+       @%p14 bra       BB5_32;
+       bra.uni         BB5_26;
 
-BB19_32:
+BB5_32:
        setp.eq.f64     %p27, %fd1, %fd2;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27;
-       bra.uni         BB19_54;
+       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p27;
+       bra.uni         BB5_52;
 
-BB19_28:
+BB5_28:
        setp.eq.s32     %p12, %r13, 11;
-       @%p12 bra       BB19_31;
-       bra.uni         BB19_29;
+       @%p12 bra       BB5_31;
+       bra.uni         BB5_29;
 
-BB19_31:
-       min.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB19_54;
+BB5_31:
+       min.f64         %fd38, %fd1, %fd2;
+       bra.uni         BB5_52;
 
-BB19_53:
-       add.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB19_54;
+BB5_51:
+       add.f64         %fd38, %fd1, %fd2;
+       bra.uni         BB5_52;
 
-BB19_13:
+BB5_13:
        setp.eq.s32     %p25, %r13, 2;
-       @%p25 bra       BB19_14;
-       bra.uni         BB19_54;
+       @%p25 bra       BB5_14;
+       bra.uni         BB5_52;
 
-BB19_14:
-       mul.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB19_54;
+BB5_14:
+       mul.f64         %fd38, %fd1, %fd2;
+       bra.uni         BB5_52;
 
-BB19_34:
+BB5_34:
        setp.le.f64     %p30, %fd1, %fd2;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30;
-       bra.uni         BB19_54;
+       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p30;
+       bra.uni         BB5_52;
 
-BB19_22:
+BB5_22:
        setp.eq.s32     %p18, %r13, 8;
-       @%p18 bra       BB19_23;
-       bra.uni         BB19_54;
+       @%p18 bra       BB5_23;
+       bra.uni         BB5_52;
 
-BB19_23:
+BB5_23:
        setp.ge.f64     %p28, %fd1, %fd2;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28;
-       bra.uni         BB19_54;
+       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p28;
+       bra.uni         BB5_52;
 
-BB19_51:
-       div.rn.f64      %fd39, %fd1, %fd2;
-       bra.uni         BB19_54;
+BB5_49:
+       div.rn.f64      %fd38, %fd1, %fd2;
+       bra.uni         BB5_52;
 
-BB19_17:
+BB5_17:
        setp.eq.s32     %p22, %r13, 5;
-       @%p22 bra       BB19_18;
-       bra.uni         BB19_54;
+       @%p22 bra       BB5_18;
+       bra.uni         BB5_52;
 
-BB19_18:
+BB5_18:
        setp.lt.f64     %p31, %fd1, %fd2;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31;
-       bra.uni         BB19_54;
+       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p31;
+       bra.uni         BB5_52;
 
-BB19_26:
+BB5_26:
        setp.eq.s32     %p15, %r13, 10;
-       @%p15 bra       BB19_27;
-       bra.uni         BB19_54;
+       @%p15 bra       BB5_27;
+       bra.uni         BB5_52;
 
-BB19_27:
+BB5_27:
        setp.neu.f64    %p26, %fd1, %fd2;
-       selp.f64        %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB19_54;
+       selp.f64        %fd38, 0d3FF0000000000000, 0d0000000000000000, %p26;
+       bra.uni         BB5_52;
 
-BB19_29:
+BB5_29:
        setp.ne.s32     %p13, %r13, 12;
-       @%p13 bra       BB19_54;
+       @%p13 bra       BB5_52;
 
-       max.f64         %fd39, %fd1, %fd2;
-       bra.uni         BB19_54;
+       max.f64         %fd38, %fd1, %fd2;
+       bra.uni         BB5_52;
 
-BB19_38:
+BB5_38:
        setp.gt.s32     %p35, %r8, -1;
-       @%p35 bra       BB19_41;
+       @%p35 bra       BB5_41;
 
        cvt.rzi.f64.f64 %fd29, %fd2;
        setp.neu.f64    %p36, %fd29, %fd2;
-       selp.f64        %fd37, 0dFFF8000000000000, %fd37, %p36;
+       selp.f64        %fd36, 0dFFF8000000000000, %fd36, %p36;
 
-BB19_41:
-       mov.f64         %fd17, %fd37;
+BB5_41:
+       mov.f64         %fd17, %fd36;
        add.f64         %fd18, %fd1, %fd2;
        {
        .reg .b32 %temp; 
@@ -2355,79 +590,77 @@ BB19_41:
        }
        and.b32         %r31, %r30, 2146435072;
        setp.ne.s32     %p39, %r31, 2146435072;
-       mov.f64         %fd36, %fd17;
-       @%p39 bra       BB19_50;
+       mov.f64         %fd35, %fd17;
+       @%p39 bra       BB5_48;
 
        setp.gtu.f64    %p40, %fd11, 0d7FF0000000000000;
-       mov.f64         %fd36, %fd18;
-       @%p40 bra       BB19_50;
+       mov.f64         %fd35, %fd18;
+       @%p40 bra       BB5_48;
 
        abs.f64         %fd30, %fd2;
        setp.gtu.f64    %p41, %fd30, 0d7FF0000000000000;
-       mov.f64         %fd35, %fd18;
-       mov.f64         %fd36, %fd35;
-       @%p41 bra       BB19_50;
-
-       and.b32         %r32, %r9, 2147483647;
-       setp.ne.s32     %p42, %r32, 2146435072;
-       @%p42 bra       BB19_46;
+       mov.f64         %fd34, %fd18;
+       mov.f64         %fd35, %fd34;
+       @%p41 bra       BB5_48;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r33, %temp}, %fd2;
+       mov.b64         {%r32, %temp}, %fd2;
        }
-       setp.eq.s32     %p43, %r33, 0;
-       @%p43 bra       BB19_49;
-
-BB19_46:
-       and.b32         %r34, %r8, 2147483647;
-       setp.ne.s32     %p44, %r34, 2146435072;
-       mov.f64         %fd33, %fd17;
-       mov.f64         %fd36, %fd33;
-       @%p44 bra       BB19_50;
-
+       and.b32         %r33, %r9, 2147483647;
+       setp.eq.s32     %p42, %r33, 2146435072;
+       setp.eq.s32     %p43, %r32, 0;
+       and.pred        %p44, %p42, %p43;
+       @%p44 bra       BB5_47;
+       bra.uni         BB5_45;
+
+BB5_47:
+       setp.gt.f64     %p48, %fd11, 0d3FF0000000000000;
+       selp.b32        %r41, 2146435072, 0, %p48;
+       xor.b32         %r42, %r41, 2146435072;
+       setp.lt.s32     %p49, %r9, 0;
+       selp.b32        %r43, %r42, %r41, %p49;
+       setp.eq.f64     %p50, %fd1, 0dBFF0000000000000;
+       selp.b32        %r44, 1072693248, %r43, %p50;
+       mov.u32         %r45, 0;
+       mov.b64         %fd35, {%r45, %r44};
+       bra.uni         BB5_48;
+
+BB5_45:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r35, %temp}, %fd1;
+       mov.b64         {%r34, %temp}, %fd1;
        }
-       setp.ne.s32     %p45, %r35, 0;
-       mov.f64         %fd36, %fd17;
-       @%p45 bra       BB19_50;
-
+       and.b32         %r35, %r8, 2147483647;
+       setp.eq.s32     %p45, %r35, 2146435072;
+       setp.eq.s32     %p46, %r34, 0;
+       and.pred        %p47, %p45, %p46;
+       mov.f64         %fd35, %fd17;
+       @!%p47 bra      BB5_48;
+       bra.uni         BB5_46;
+
+BB5_46:
        shr.s32         %r36, %r9, 31;
        and.b32         %r37, %r36, -2146435072;
-       add.s32         %r38, %r37, 2146435072;
-       or.b32          %r39, %r38, -2147483648;
-       selp.b32        %r40, %r39, %r38, %p1;
-       mov.u32         %r41, 0;
-       mov.b64         %fd36, {%r41, %r40};
-       bra.uni         BB19_50;
-
-BB19_49:
-       setp.gt.f64     %p46, %fd11, 0d3FF0000000000000;
-       selp.b32        %r42, 2146435072, 0, %p46;
-       xor.b32         %r43, %r42, 2146435072;
-       setp.lt.s32     %p47, %r9, 0;
-       selp.b32        %r44, %r43, %r42, %p47;
-       setp.eq.f64     %p48, %fd1, 0dBFF0000000000000;
-       selp.b32        %r45, 1072693248, %r44, %p48;
-       mov.u32         %r46, 0;
-       mov.b64         %fd36, {%r46, %r45};
-
-BB19_50:
-       setp.eq.f64     %p49, %fd2, 0d0000000000000000;
-       setp.eq.f64     %p50, %fd1, 0d3FF0000000000000;
-       or.pred         %p51, %p50, %p49;
-       selp.f64        %fd39, 0d3FF0000000000000, %fd36, %p51;
-
-BB19_54:
+       selp.b32        %r38, -1048576, 2146435072, %p1;
+       add.s32         %r39, %r38, %r37;
+       mov.u32         %r40, 0;
+       mov.b64         %fd35, {%r40, %r39};
+
+BB5_48:
+       setp.eq.f64     %p51, %fd2, 0d0000000000000000;
+       setp.eq.f64     %p52, %fd1, 0d3FF0000000000000;
+       or.pred         %p53, %p52, %p51;
+       selp.f64        %fd38, 0d3FF0000000000000, %fd35, %p53;
+
+BB5_52:
        cvta.to.global.u64      %rd12, %rd4;
        mul.wide.s32    %rd13, %r3, 8;
        add.s64         %rd14, %rd12, %rd13;
-       st.global.f64   [%rd14], %fd39;
+       st.global.f64   [%rd14], %fd38;
        bar.sync        0;
 
-BB19_55:
+BB5_53:
        ret;
 }
 
@@ -2441,9 +674,9 @@ BB19_55:
        .param .u32 matrix_scalar_op_param_5
 )
 {
-       .reg .pred      %p<91>;
-       .reg .b32       %r<64>;
-       .reg .f64       %fd<77>;
+       .reg .pred      %p<95>;
+       .reg .b32       %r<62>;
+       .reg .f64       %fd<75>;
        .reg .b64       %rd<12>;
 
 
@@ -2458,7 +691,7 @@ BB19_55:
        mov.u32         %r11, %tid.x;
        mad.lo.s32      %r1, %r10, %r9, %r11;
        setp.ge.s32     %p3, %r1, %r8;
-       @%p3 bra        BB20_94;
+       @%p3 bra        BB6_90;
 
        cvta.to.global.u64      %rd6, %rd5;
        cvta.to.global.u64      %rd7, %rd4;
@@ -2467,86 +700,86 @@ BB19_55:
        ld.global.f64   %fd1, [%rd9];
        add.s64         %rd1, %rd6, %rd8;
        setp.eq.s32     %p4, %r7, 0;
-       @%p4 bra        BB20_48;
+       @%p4 bra        BB6_46;
 
-       mov.f64         %fd67, 0d7FEFFFFFFFFFFFFF;
+       mov.f64         %fd66, 0d7FEFFFFFFFFFFFFF;
        setp.gt.s32     %p5, %r6, 5;
-       @%p5 bra        BB20_12;
+       @%p5 bra        BB6_12;
 
        setp.gt.s32     %p15, %r6, 2;
-       @%p15 bra       BB20_8;
+       @%p15 bra       BB6_8;
 
        setp.eq.s32     %p19, %r6, 0;
-       @%p19 bra       BB20_46;
+       @%p19 bra       BB6_44;
 
        setp.eq.s32     %p20, %r6, 1;
-       @%p20 bra       BB20_45;
-       bra.uni         BB20_6;
+       @%p20 bra       BB6_43;
+       bra.uni         BB6_6;
 
-BB20_45:
-       sub.f64         %fd67, %fd52, %fd1;
-       bra.uni         BB20_47;
+BB6_43:
+       sub.f64         %fd66, %fd52, %fd1;
+       bra.uni         BB6_45;
 
-BB20_48:
-       mov.f64         %fd76, 0d7FEFFFFFFFFFFFFF;
-       setp.gt.s32     %p48, %r6, 5;
-       @%p48 bra       BB20_58;
+BB6_46:
+       mov.f64         %fd74, 0d7FEFFFFFFFFFFFFF;
+       setp.gt.s32     %p50, %r6, 5;
+       @%p50 bra       BB6_56;
 
-       setp.gt.s32     %p58, %r6, 2;
-       @%p58 bra       BB20_54;
+       setp.gt.s32     %p60, %r6, 2;
+       @%p60 bra       BB6_52;
 
-       setp.eq.s32     %p62, %r6, 0;
-       @%p62 bra       BB20_92;
+       setp.eq.s32     %p64, %r6, 0;
+       @%p64 bra       BB6_88;
 
-       setp.eq.s32     %p63, %r6, 1;
-       @%p63 bra       BB20_91;
-       bra.uni         BB20_52;
+       setp.eq.s32     %p65, %r6, 1;
+       @%p65 bra       BB6_87;
+       bra.uni         BB6_50;
 
-BB20_91:
-       sub.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB20_93;
+BB6_87:
+       sub.f64         %fd74, %fd1, %fd52;
+       bra.uni         BB6_89;
 
-BB20_12:
+BB6_12:
        setp.gt.s32     %p6, %r6, 8;
-       @%p6 bra        BB20_17;
+       @%p6 bra        BB6_17;
 
        setp.eq.s32     %p12, %r6, 6;
-       @%p12 bra       BB20_27;
+       @%p12 bra       BB6_27;
 
        setp.eq.s32     %p13, %r6, 7;
-       @%p13 bra       BB20_26;
-       bra.uni         BB20_15;
+       @%p13 bra       BB6_26;
+       bra.uni         BB6_15;
 
-BB20_26:
+BB6_26:
        setp.lt.f64     %p25, %fd1, %fd52;
-       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
-       bra.uni         BB20_47;
+       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p25;
+       bra.uni         BB6_45;
 
-BB20_58:
-       setp.gt.s32     %p49, %r6, 8;
-       @%p49 bra       BB20_63;
+BB6_56:
+       setp.gt.s32     %p51, %r6, 8;
+       @%p51 bra       BB6_61;
 
-       setp.eq.s32     %p55, %r6, 6;
-       @%p55 bra       BB20_73;
+       setp.eq.s32     %p57, %r6, 6;
+       @%p57 bra       BB6_71;
 
-       setp.eq.s32     %p56, %r6, 7;
-       @%p56 bra       BB20_72;
-       bra.uni         BB20_61;
+       setp.eq.s32     %p58, %r6, 7;
+       @%p58 bra       BB6_70;
+       bra.uni         BB6_59;
 
-BB20_72:
-       setp.gt.f64     %p68, %fd1, %fd52;
-       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68;
-       bra.uni         BB20_93;
+BB6_70:
+       setp.gt.f64     %p70, %fd1, %fd52;
+       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p70;
+       bra.uni         BB6_89;
 
-BB20_8:
+BB6_8:
        setp.eq.s32     %p16, %r6, 3;
-       @%p16 bra       BB20_44;
+       @%p16 bra       BB6_42;
 
        setp.eq.s32     %p17, %r6, 4;
-       @%p17 bra       BB20_28;
-       bra.uni         BB20_10;
+       @%p17 bra       BB6_28;
+       bra.uni         BB6_10;
 
-BB20_28:
+BB6_28:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r2}, %fd52;
@@ -2576,64 +809,64 @@ BB20_28:
        param0, 
        param1
        );
-       ld.param.f64    %fd66, [retval0+0];
+       ld.param.f64    %fd65, [retval0+0];
        
        //{
        }// Callseq End 1
        setp.lt.s32     %p29, %r2, 0;
        and.pred        %p1, %p29, %p28;
-       @!%p1 bra       BB20_30;
-       bra.uni         BB20_29;
+       @!%p1 bra       BB6_30;
+       bra.uni         BB6_29;
 
-BB20_29:
+BB6_29:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r14}, %fd66;
+       mov.b64         {%temp, %r14}, %fd65;
        }
        xor.b32         %r15, %r14, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r16, %temp}, %fd66;
+       mov.b64         {%r16, %temp}, %fd65;
        }
-       mov.b64         %fd66, {%r16, %r15};
+       mov.b64         %fd65, {%r16, %r15};
 
-BB20_30:
-       mov.f64         %fd65, %fd66;
+BB6_30:
+       mov.f64         %fd64, %fd65;
        setp.eq.f64     %p30, %fd52, 0d0000000000000000;
-       @%p30 bra       BB20_33;
-       bra.uni         BB20_31;
+       @%p30 bra       BB6_33;
+       bra.uni         BB6_31;
 
-BB20_33:
+BB6_33:
        selp.b32        %r17, %r2, 0, %p28;
        or.b32          %r18, %r17, 2146435072;
        setp.lt.s32     %p34, %r3, 0;
        selp.b32        %r19, %r18, %r17, %p34;
        mov.u32         %r20, 0;
-       mov.b64         %fd65, {%r20, %r19};
-       bra.uni         BB20_34;
+       mov.b64         %fd64, {%r20, %r19};
+       bra.uni         BB6_34;
 
-BB20_17:
+BB6_17:
        setp.gt.s32     %p7, %r6, 10;
-       @%p7 bra        BB20_21;
+       @%p7 bra        BB6_21;
 
        setp.eq.s32     %p10, %r6, 9;
-       @%p10 bra       BB20_25;
-       bra.uni         BB20_19;
+       @%p10 bra       BB6_25;
+       bra.uni         BB6_19;
 
-BB20_25:
+BB6_25:
        setp.eq.f64     %p23, %fd1, %fd52;
-       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
-       bra.uni         BB20_47;
+       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p23;
+       bra.uni         BB6_45;
 
-BB20_54:
-       setp.eq.s32     %p59, %r6, 3;
-       @%p59 bra       BB20_90;
+BB6_52:
+       setp.eq.s32     %p61, %r6, 3;
+       @%p61 bra       BB6_86;
 
-       setp.eq.s32     %p60, %r6, 4;
-       @%p60 bra       BB20_74;
-       bra.uni         BB20_56;
+       setp.eq.s32     %p62, %r6, 4;
+       @%p62 bra       BB6_72;
+       bra.uni         BB6_54;
 
-BB20_74:
+BB6_72:
        {
        .reg .b32 %temp; 
        mov.b64         {%temp, %r4}, %fd1;
@@ -2642,11 +875,11 @@ BB20_74:
        .reg .b32 %temp; 
        mov.b64         {%temp, %r5}, %fd52;
        }
-       bfe.u32         %r38, %r5, 20, 11;
-       add.s32         %r39, %r38, -1012;
+       bfe.u32         %r37, %r5, 20, 11;
+       add.s32         %r38, %r37, -1012;
        mov.b64          %rd11, %fd52;
-       shl.b64         %rd3, %rd11, %r39;
-       setp.eq.s64     %p71, %rd3, -9223372036854775808;
+       shl.b64         %rd3, %rd11, %r38;
+       setp.eq.s64     %p73, %rd3, -9223372036854775808;
        abs.f64         %fd35, %fd1;
        // Callseq Start 2
        {
@@ -2663,201 +896,201 @@ BB20_74:
        param0, 
        param1
        );
-       ld.param.f64    %fd75, [retval0+0];
+       ld.param.f64    %fd73, [retval0+0];
        
        //{
        }// Callseq End 2
-       setp.lt.s32     %p72, %r4, 0;
-       and.pred        %p2, %p72, %p71;
-       @!%p2 bra       BB20_76;
-       bra.uni         BB20_75;
+       setp.lt.s32     %p74, %r4, 0;
+       and.pred        %p2, %p74, %p73;
+       @!%p2 bra       BB6_74;
+       bra.uni         BB6_73;
 
-BB20_75:
+BB6_73:
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r40}, %fd75;
+       mov.b64         {%temp, %r39}, %fd73;
        }
-       xor.b32         %r41, %r40, -2147483648;
+       xor.b32         %r40, %r39, -2147483648;
        {
        .reg .b32 %temp; 
-       mov.b64         {%r42, %temp}, %fd75;
+       mov.b64         {%r41, %temp}, %fd73;
        }
-       mov.b64         %fd75, {%r42, %r41};
+       mov.b64         %fd73, {%r41, %r40};
 
-BB20_76:
-       mov.f64         %fd74, %fd75;
-       setp.eq.f64     %p73, %fd1, 0d0000000000000000;
-       @%p73 bra       BB20_79;
-       bra.uni         BB20_77;
-
-BB20_79:
-       selp.b32        %r43, %r4, 0, %p71;
-       or.b32          %r44, %r43, 2146435072;
-       setp.lt.s32     %p77, %r5, 0;
-       selp.b32        %r45, %r44, %r43, %p77;
-       mov.u32         %r46, 0;
-       mov.b64         %fd74, {%r46, %r45};
-       bra.uni         BB20_80;
-
-BB20_63:
-       setp.gt.s32     %p50, %r6, 10;
-       @%p50 bra       BB20_67;
-
-       setp.eq.s32     %p53, %r6, 9;
-       @%p53 bra       BB20_71;
-       bra.uni         BB20_65;
-
-BB20_71:
-       setp.eq.f64     %p66, %fd1, %fd52;
-       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
-       bra.uni         BB20_93;
-
-BB20_21:
+BB6_74:
+       mov.f64         %fd72, %fd73;
+       setp.eq.f64     %p75, %fd1, 0d0000000000000000;
+       @%p75 bra       BB6_77;
+       bra.uni         BB6_75;
+
+BB6_77:
+       selp.b32        %r42, %r4, 0, %p73;
+       or.b32          %r43, %r42, 2146435072;
+       setp.lt.s32     %p79, %r5, 0;
+       selp.b32        %r44, %r43, %r42, %p79;
+       mov.u32         %r45, 0;
+       mov.b64         %fd72, {%r45, %r44};
+       bra.uni         BB6_78;
+
+BB6_61:
+       setp.gt.s32     %p52, %r6, 10;
+       @%p52 bra       BB6_65;
+
+       setp.eq.s32     %p55, %r6, 9;
+       @%p55 bra       BB6_69;
+       bra.uni         BB6_63;
+
+BB6_69:
+       setp.eq.f64     %p68, %fd1, %fd52;
+       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p68;
+       bra.uni         BB6_89;
+
+BB6_21:
        setp.eq.s32     %p8, %r6, 11;
-       @%p8 bra        BB20_24;
-       bra.uni         BB20_22;
+       @%p8 bra        BB6_24;
+       bra.uni         BB6_22;
 
-BB20_24:
-       min.f64         %fd67, %fd52, %fd1;
-       bra.uni         BB20_47;
+BB6_24:
+       min.f64         %fd66, %fd52, %fd1;
+       bra.uni         BB6_45;
 
-BB20_46:
-       add.f64         %fd67, %fd1, %fd52;
-       bra.uni         BB20_47;
+BB6_44:
+       add.f64         %fd66, %fd1, %fd52;
+       bra.uni         BB6_45;
 
-BB20_6:
+BB6_6:
        setp.eq.s32     %p21, %r6, 2;
-       @%p21 bra       BB20_7;
-       bra.uni         BB20_47;
+       @%p21 bra       BB6_7;
+       bra.uni         BB6_45;
 
-BB20_7:
-       mul.f64         %fd67, %fd1, %fd52;
-       bra.uni         BB20_47;
+BB6_7:
+       mul.f64         %fd66, %fd1, %fd52;
+       bra.uni         BB6_45;
 
-BB20_27:
+BB6_27:
        setp.ge.f64     %p26, %fd1, %fd52;
-       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
-       bra.uni         BB20_47;
+       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p26;
+       bra.uni         BB6_45;
 
-BB20_15:
+BB6_15:
        setp.eq.s32     %p14, %r6, 8;
-       @%p14 bra       BB20_16;
-       bra.uni         BB20_47;
+       @%p14 bra       BB6_16;
+       bra.uni         BB6_45;
 
-BB20_16:
+BB6_16:
        setp.le.f64     %p24, %fd1, %fd52;
-       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
-       bra.uni         BB20_47;
+       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p24;
+       bra.uni         BB6_45;
 
-BB20_44:
-       div.rn.f64      %fd67, %fd52, %fd1;
-       bra.uni         BB20_47;
+BB6_42:
+       div.rn.f64      %fd66, %fd52, %fd1;
+       bra.uni         BB6_45;
 
-BB20_10:
+BB6_10:
        setp.eq.s32     %p18, %r6, 5;
-       @%p18 bra       BB20_11;
-       bra.uni         BB20_47;
+       @%p18 bra       BB6_11;
+       bra.uni         BB6_45;
 
-BB20_11:
+BB6_11:
        setp.gt.f64     %p27, %fd1, %fd52;
-       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p27;
-       bra.uni         BB20_47;
+       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p27;
+       bra.uni         BB6_45;
 
-BB20_67:
-       setp.eq.s32     %p51, %r6, 11;
-       @%p51 bra       BB20_70;
-       bra.uni         BB20_68;
+BB6_65:
+       setp.eq.s32     %p53, %r6, 11;
+       @%p53 bra       BB6_68;
+       bra.uni         BB6_66;
 
-BB20_70:
-       min.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB20_93;
+BB6_68:
+       min.f64         %fd74, %fd1, %fd52;
+       bra.uni         BB6_89;
 
-BB20_19:
+BB6_19:
        setp.eq.s32     %p11, %r6, 10;
-       @%p11 bra       BB20_20;
-       bra.uni         BB20_47;
+       @%p11 bra       BB6_20;
+       bra.uni         BB6_45;
 
-BB20_20:
+BB6_20:
        setp.neu.f64    %p22, %fd1, %fd52;
-       selp.f64        %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
-       bra.uni         BB20_47;
+       selp.f64        %fd66, 0d3FF0000000000000, 0d0000000000000000, %p22;
+       bra.uni         BB6_45;
 
-BB20_22:
+BB6_22:
        setp.ne.s32     %p9, %r6, 12;
-       @%p9 bra        BB20_47;
-
-       max.f64         %fd67, %fd52, %fd1;
-       bra.uni         BB20_47;
-
-BB20_92:
-       add.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB20_93;
-
-BB20_52:
-       setp.eq.s32     %p64, %r6, 2;
-       @%p64 bra       BB20_53;
-       bra.uni         BB20_93;
-
-BB20_53:
-       mul.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB20_93;
-
-BB20_73:
-       setp.le.f64     %p69, %fd1, %fd52;
-       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p69;
-       bra.uni         BB20_93;
-
-BB20_61:
-       setp.eq.s32     %p57, %r6, 8;
-       @%p57 bra       BB20_62;
-       bra.uni         BB20_93;
-
-BB20_62:
-       setp.ge.f64     %p67, %fd1, %fd52;
-       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67;
-       bra.uni         BB20_93;
-
-BB20_90:
-       div.rn.f64      %fd76, %fd1, %fd52;
-       bra.uni         BB20_93;
-
-BB20_56:
-       setp.eq.s32     %p61, %r6, 5;
-       @%p61 bra       BB20_57;
-       bra.uni         BB20_93;
-
-BB20_57:
-       setp.lt.f64     %p70, %fd1, %fd52;
-       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p70;
-       bra.uni         BB20_93;
-
-BB20_65:
-       setp.eq.s32     %p54, %r6, 10;
-       @%p54 bra       BB20_66;
-       bra.uni         BB20_93;
-
-BB20_66:
-       setp.neu.f64    %p65, %fd1, %fd52;
-       selp.f64        %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
-       bra.uni         BB20_93;
-
-BB20_68:
-       setp.ne.s32     %p52, %r6, 12;
-       @%p52 bra       BB20_93;
-
-       max.f64         %fd76, %fd1, %fd52;
-       bra.uni         BB20_93;
-
-BB20_31:
+       @%p9 bra        BB6_45;
+
+       max.f64         %fd66, %fd52, %fd1;
+       bra.uni         BB6_45;
+
+BB6_88:
+       add.f64         %fd74, %fd1, %fd52;
+       bra.uni         BB6_89;
+
+BB6_50:
+       setp.eq.s32     %p66, %r6, 2;
+       @%p66 bra       BB6_51;
+       bra.uni         BB6_89;
+
+BB6_51:
+       mul.f64         %fd74, %fd1, %fd52;
+       bra.uni         BB6_89;
+
+BB6_71:
+       setp.le.f64     %p71, %fd1, %fd52;
+       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p71;
+       bra.uni         BB6_89;
+
+BB6_59:
+       setp.eq.s32     %p59, %r6, 8;
+       @%p59 bra       BB6_60;
+       bra.uni         BB6_89;
+
+BB6_60:
+       setp.ge.f64     %p69, %fd1, %fd52;
+       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p69;
+       bra.uni         BB6_89;
+
+BB6_86:
+       div.rn.f64      %fd74, %fd1, %fd52;
+       bra.uni         BB6_89;
+
+BB6_54:
+       setp.eq.s32     %p63, %r6, 5;
+       @%p63 bra       BB6_55;
+       bra.uni         BB6_89;
+
+BB6_55:
+       setp.lt.f64     %p72, %fd1, %fd52;
+       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p72;
+       bra.uni         BB6_89;
+
+BB6_63:
+       setp.eq.s32     %p56, %r6, 10;
+       @%p56 bra       BB6_64;
+       bra.uni         BB6_89;
+
+BB6_64:
+       setp.neu.f64    %p67, %fd1, %fd52;
+       selp.f64        %fd74, 0d3FF0000000000000, 0d0000000000000000, %p67;
+       bra.uni         BB6_89;
+
+BB6_66:
+       setp.ne.s32     %p54, %r6, 12;
+       @%p54 bra       BB6_89;
+
+       max.f64         %fd74, %fd1, %fd52;
+       bra.uni         BB6_89;
+
+BB6_31:
        setp.gt.s32     %p31, %r2, -1;
-       @%p31 bra       BB20_34;
+       @%p31 bra       BB6_34;
 
        cvt.rzi.f64.f64 %fd54, %fd1;
        setp.neu.f64    %p32, %fd54, %fd1;
-       selp.f64        %fd65, 0dFFF8000000000000, %fd65, %p32;
+       selp.f64        %fd64, 0dFFF8000000000000, %fd64, %p32;
 
-BB20_34:
-       mov.f64         %fd16, %fd65;
+BB6_34:
+       mov.f64         %fd16, %fd64;
        add.f64         %fd17, %fd1, %fd52;
        {
        .reg .b32 %temp; 
@@ -2865,161 +1098,157 @@ BB20_34:
        }
        and.b32         %r22, %r21, 2146435072;
        setp.ne.s32     %p35, %r22, 2146435072;
-       mov.f64         %fd64, %fd16;
-       @%p35 bra       BB20_43;
+       mov.f64         %fd63, %fd16;
+       @%p35 bra       BB6_41;
 
        setp.gtu.f64    %p36, %fd10, 0d7FF0000000000000;
-       mov.f64         %fd64, %fd17;
-       @%p36 bra       BB20_43;
+       mov.f64         %fd63, %fd17;
+       @%p36 bra       BB6_41;
 
        abs.f64         %fd55, %fd1;
        setp.gtu.f64    %p37, %fd55, 0d7FF0000000000000;
-       mov.f64         %fd63, %fd17;
-       mov.f64         %fd64, %fd63;
-       @%p37 bra       BB20_43;
-
-       and.b32         %r23, %r3, 2147483647;
-       setp.ne.s32     %p38, %r23, 2146435072;
-       @%p38 bra       BB20_39;
-
-       {
-       .reg .b32 %temp; 
-       mov.b64         {%r24, %temp}, %fd1;
-       }
-       setp.eq.s32     %p39, %r24, 0;
-       @%p39 bra       BB20_42;
-
-BB20_39:
-       and.b32         %r25, %r2, 2147483647;
-       setp.ne.s32     %p40, %r25, 2146435072;
-       mov.f64         %fd61, %fd16;
-       mov.f64         %fd64, %fd61;
-       @%p40 bra       BB20_43;
+       mov.f64         %fd62, %fd17;
+       mov.f64         %fd63, %fd62;
+       @%p37 bra       BB6_41;
 
        {
        .reg .b32 %temp; 
-       mov.b64         {%r26, %temp}, %fd52;
+       mov.b64         {%r23, %temp}, %fd1;
        }
-       setp.ne.s32     %p41, %r26, 0;
-       mov.f64         %fd64, %fd16;
-       @%p41 bra       BB20_43;
-
-       shr.s32         %r27, %r3, 31;
-       and.b32         %r28, %r27, -2146435072;
-       add.s32         %r29, %r28, 2146435072;
-       or.b32          %r30, %r29, -2147483648;
-       selp.b32        %r31, %r30, %r29, %p1;
-       mov.u32         %r32, 0;
-       mov.b64         %fd64, {%r32, %r31};
-       bra.uni         BB20_43;
-
-BB20_77:
-       setp.gt.s32     %p74, %r4, -1;
-       @%p74 bra       BB20_80;
+       and.b32         %r24, %r3, 2147483647;
+       setp.eq.s32     %p38, %r24, 2146435072;
+       setp.eq.s32     %p39, %r23, 0;
+       and.pred        %p40, %p38, %p39;
+       @%p40 bra       BB6_40;
+       bra.uni         BB6_38;
+
+BB6_40:
+       setp.gt.f64     %p44, %fd10, 0d3FF0000000000000;
+       selp.b32        %r32, 2146435072, 0, %p44;
+       xor.b32         %r33, %r32, 2146435072;
+       setp.lt.s32     %p45, %r3, 0;
+       selp.b32        %r34, %r33, %r32, %p45;
+       setp.eq.f64     %p46, %fd52, 0dBFF0000000000000;
+       selp.b32        %r35, 1072693248, %r34, %p46;
+       mov.u32         %r36, 0;
+       mov.b64         %fd63, {%r36, %r35};
+       bra.uni         BB6_41;
+
+BB6_75:
+       setp.gt.s32     %p76, %r4, -1;
+       @%p76 bra       BB6_78;
 
        cvt.rzi.f64.f64 %fd57, %fd52;
-       setp.neu.f64    %p75, %fd57, %fd52;
-       selp.f64        %fd74, 0dFFF8000000000000, %fd74, %p75;
+       setp.neu.f64    %p77, %fd57, %fd52;
+       selp.f64        %fd72, 0dFFF8000000000000, %fd72, %p77;
 
-BB20_80:
-       mov.f64         %fd41, %fd74;
+BB6_78:
+       mov.f64         %fd41, %fd72;
        add.f64         %fd42, %fd1, %fd52;
        {
        .reg .b32 %temp; 
-       mov.b64         {%temp, %r47}, %fd42;
+       mov.b64         {%temp, %r46}, %fd42;
        }
-       and.b32         %r48, %r47, 2146435072;
-       setp.ne.s32     %p78, %r48, 2146435072;
-       mov.f64         %fd73, %fd41;
-       @%p78 bra       BB20_89;
+       and.b32         %r47, %r46, 2146435072;
+       setp.ne.s32     %p80, %r47, 2146435072;
+       mov.f64         %fd71, %fd41;
+       @%p80 bra       BB6_85;
 
-       setp.gtu.f64    %p79, %fd35, 0d7FF0000000000000;
-       mov.f64         %fd73, %fd42;
-       @%p79 bra       BB20_89;
+       setp.gtu.f64    %p81, %fd35, 0d7FF0000000000000;
+       mov.f64         %fd71, %fd42;
+       @%p81 bra       BB6_85;
 
        abs.f64         %fd58, %fd52;
-       setp.gtu.f64    %p80, %fd58, 0d7FF0000000000000;
-       mov.f64         %fd72, %fd42;
-       mov.f64         %fd73, %fd72;
-       @%p80 bra       BB20_89;
+       setp.gtu.f64    %p82, %fd58, 0d7FF0000000000000;
+       mov.f64         %fd70, %fd42;
+       mov.f64         %fd71, %fd70;
+       @%p82 bra       BB6_85;
 
+       {
+       .reg .b32 %temp; 
+       mov.b64         {%r48, %temp}, %fd52;
+       }
        and.b32         %r49, %r5, 2147483647;
-       setp.ne.s32     %p81, %r49, 2146435072;
-       @%p81 bra       BB20_85;
-
+       setp.eq.s32     %p83, %r49, 2146435072;
+       setp.eq.s32     %p84, %r48, 0;
+       and.pred        %p85, %p83, %p84;
+       @%p85 bra       BB6_84;
+       bra.uni         BB6_82;
+
+BB6_84:
+       setp.gt.f64     %p89, %fd35, 0d3FF0000000000000;
+       selp.b32        %r57, 2146435072, 0, %p89;
+       xor.b32         %r58, %r57, 2146435072;
+       setp.lt.s32     %p90, %r5, 0;
+       selp.b32        %r59, %r58, %r57, %p90;
+       setp.eq.f64     %p91, %fd1, 0dBFF0000000000000;
+       selp.b32        %r60, 1072693248, %r59, %p91;
+       mov.u32         %r61, 0;
+       mov.b64         %fd71, {%r61, %r60};
+       bra.uni         BB6_85;
+
+BB6_38:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r50, %temp}, %fd52;
+       mov.b64         {%r25, %temp}, %fd52;
        }
-       setp.eq.s32     %p82, %r50, 0;
-       @%p82 bra       BB20_88;
-
-BB20_85:
-       and.b32         %r51, %r4, 2147483647;
-       setp.ne.s32     %p83, %r51, 2146435072;
-       mov.f64         %fd70, %fd41;
-       mov.f64         %fd73, %fd70;
-       @%p83 bra       BB20_89;
-
+       and.b32         %r26, %r2, 2147483647;
+       setp.eq.s32     %p41, %r26, 2146435072;
+       setp.eq.s32     %p42, %r25, 0;
+       and.pred        %p43, %p41, %p42;
+       mov.f64         %fd63, %fd16;
+       @!%p43 bra      BB6_41;
+       bra.uni         BB6_39;
+
+BB6_39:
+       shr.s32         %r27, %r3, 31;
+       and.b32         %r28, %r27, -2146435072;
+       selp.b32        %r29, -1048576, 2146435072, %p1;
+       add.s32         %r30, %r29, %r28;
+       mov.u32         %r31, 0;
+       mov.b64         %fd63, {%r31, %r30};
+
+BB6_41:
+       setp.eq.f64     %p47, %fd1, 0d0000000000000000;
+       setp.eq.f64     %p48, %fd52, 0d3FF0000000000000;
+       or.pred         %p49, %p48, %p47;
+       selp.f64        %fd66, 0d3FF0000000000000, %fd63, %p49;
+
+BB6_45:
+       st.global.f64   [%rd1], %fd66;
+       bra.uni         BB6_90;
+
+BB6_82:
        {
        .reg .b32 %temp; 
-       mov.b64         {%r52, %temp}, %fd1;
+       mov.b64         {%r50, %temp}, %fd1;
        }
-       setp.ne.s32     %p84, %r52, 0;
-       mov.f64         %fd73, %fd41;
-       @%p84 bra       BB20_89;
-
-       shr.s32         %r53, %r5, 31;
-       and.b32         %r54, %r53, -2146435072;
-       add.s32         %r55, %r54, 2146435072;
-       or.b32          %r56, %r55, -2147483648;
-       selp.b32        %r57, %r56, %r55, %p2;
-       mov.u32         %r58, 0;
-       mov.b64         %fd73, {%r58, %r57};
-       bra.uni         BB20_89;
-
-BB20_42:
-       setp.gt.f64     %p42, %fd10, 0d3FF0000000000000;
-       selp.b32        %r33, 2146435072, 0, %p42;
-       xor.b32         %r34, %r33, 2146435072;
-       setp.lt.s32     %p43, %r3, 0;
-       selp.b32        %r35, %r34, %r33, %p43;
-       setp.eq.f64     %p44, %fd52, 0dBFF0000000000000;
-       selp.b32        %r36, 1072693248, %r35, %p44;
-       mov.u32         %r37, 0;
-       mov.b64         %fd64, {%r37, %r36};
-
-BB20_43:
-       setp.eq.f64     %p45, %fd1, 0d0000000000000000;
-       setp.eq.f64     %p46, %fd52, 0d3FF0000000000000;
-       or.pred         %p47, %p46, %p45;
-       selp.f64        %fd67, 0d3FF0000000000000, %fd64, %p47;
-
-BB20_47:
-       st.global.f64   [%rd1], %fd67;
-       bra.uni         BB20_94;
-
-BB20_88:
-       setp.gt.f64     %p85, %fd35, 0d3FF0000000000000;
-       selp.b32        %r59, 2146435072, 0, %p85;
-       xor.b32         %r60, %r59, 2146435072;
-       setp.lt.s32     %p86, %r5, 0;
-       selp.b32        %r61, %r60, %r59, %p86;
-       setp.eq.f64     %p87, %fd1, 0dBFF0000000000000;
-       selp.b32        %r62, 1072693248, %r61, %p87;
-       mov.u32         %r63, 0;
-       mov.b64         %fd73, {%r63, %r62};
-
-BB20_89:
-       setp.eq.f64     %p88, %fd52, 0d0000000000000000;
-       setp.eq.f64     %p89, %fd1, 0d3FF0000000000000;
-       or.pred         %p90, %p89, %p88;
-       selp.f64        %fd76, 0d3FF0000000000000, %fd73, %p90;
-
-BB20_93:
-       st.global.f64   [%rd1], %fd76;
-
-BB20_94:
+       and.b32         %r51, %r4, 2147483647;
+       setp.eq.s32     %p86, %r51, 2146435072;
+       setp.eq.s32     %p87, %r50, 0;
+       and.pred        %p88, %p86, %p87;
+       mov.f64         %fd71, %fd41;
+       @!%p88 bra      BB6_85;
+       bra.uni         BB6_83;
+
+BB6_83:
+       shr.s32         %r52, %r5, 31;
+       and.b32         %r53, %r52, -2146435072;
+       selp.b32        %r54, -1048576, 2146435072, %p2;
+       add.s32         %r55, %r54, %r53;
+       mov.u32         %r56, 0;
+       mov.b64         %fd71, {%r56, %r55};
+
+BB6_85:
+       setp.eq.f64     %p92, %fd52, 0d0000000000000000;
+       setp.eq.f64     %p93, %fd1, 0d3FF0000000000000;
+       or.pred         %p94, %p93, %p92;
+       selp.f64        %fd74, 0d3FF0000000000000, %fd71, %p94;
+
+BB6_89:
+       st.global.f64   [%rd1], %fd74;
+
+BB6_90:
        bar.sync        0;
        ret;
 }
@@ -3045,14 +1274,14 @@ BB20_94:
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r4, %r3, %r5;
        setp.ge.s32     %p1, %r1, %r2;
-       @%p1 bra        BB21_2;
+       @%p1 bra        BB7_2;
 
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r1, 8;
        add.s64         %rd4, %rd2, %rd3;
        st.global.f64   [%rd4], %fd1;
 
-BB21_2:
+BB7_2:
        ret;
 }
 
@@ -3080,9 +1309,9 @@ BB21_2:
        mov.f64         %fd76, 0d0000000000000000;
        mov.f64         %fd77, %fd76;
        setp.ge.u32     %p1, %r32, %r5;
-       @%p1 bra        BB22_4;
+       @%p1 bra        BB8_4;
 
-BB22_1:
+BB8_1:
        mov.f64         %fd1, %fd77;
        cvta.to.global.u64      %rd4, %rd2;
        mul.wide.u32    %rd5, %r32, 8;
@@ -3091,23 +1320,23 @@ BB22_1:
        add.f64         %fd78, %fd1, %fd30;
        add.s32         %r3, %r32, %r9;
        setp.ge.u32     %p2, %r3, %r5;
-       @%p2 bra        BB22_3;
+       @%p2 bra        BB8_3;
 
        mul.wide.u32    %rd8, %r3, 8;
        add.s64         %rd9, %rd4, %rd8;
        ld.global.f64   %fd31, [%rd9];
        add.f64         %fd78, %fd78, %fd31;
 
-BB22_3:
+BB8_3:
        mov.f64         %fd77, %fd78;
        shl.b32         %r12, %r9, 1;
        mov.u32         %r13, %nctaid.x;
        mad.lo.s32      %r32, %r12, %r13, %r32;
        setp.lt.u32     %p3, %r32, %r5;
        mov.f64         %fd76, %fd77;
-       @%p3 bra        BB22_1;
+       @%p3 bra        BB8_1;
 
-BB22_4:
+BB8_4:
        mov.f64         %fd74, %fd76;
        mul.wide.u32    %rd10, %r6, 8;
        mov.u64         %rd11, sdata;
@@ -3115,130 +1344,130 @@ BB22_4:
        st.shared.f64   [%rd1], %fd74;
        bar.sync        0;
        setp.lt.u32     %p4, %r9, 1024;
-       @%p4 bra        BB22_8;
+       @%p4 bra        BB8_8;
 
        setp.gt.u32     %p5, %r6, 511;
        mov.f64         %fd75, %fd74;
-       @%p5 bra        BB22_7;
+       @%p5 bra        BB8_7;
 
        ld.shared.f64   %fd32, [%rd1+4096];
        add.f64         %fd75, %fd74, %fd32;
        st.shared.f64   [%rd1], %fd75;
 
-BB22_7:
+BB8_7:
        mov.f64         %fd74, %fd75;
        bar.sync        0;
 
-BB22_8:
+BB8_8:
        mov.f64         %fd72, %fd74;
        setp.lt.u32     %p6, %r9, 512;
-       @%p6 bra        BB22_12;
+       @%p6 bra        BB8_12;
 
        setp.gt.u32     %p7, %r6, 255;
        mov.f64         %fd73, %fd72;
-       @%p7 bra        BB22_11;
+       @%p7 bra        BB8_11;
 
        ld.shared.f64   %fd33, [%rd1+2048];
        add.f64         %fd73, %fd72, %fd33;
        st.shared.f64   [%rd1], %fd73;
 
-BB22_11:
+BB8_11:
        mov.f64         %fd72, %fd73;
        bar.sync        0;
 
-BB22_12:
+BB8_12:
        mov.f64         %fd70, %fd72;
        setp.lt.u32     %p8, %r9, 256;
-       @%p8 bra        BB22_16;
+       @%p8 bra        BB8_16;
 
        setp.gt.u32     %p9, %r6, 127;
        mov.f64         %fd71, %fd70;
-       @%p9 bra        BB22_15;
+       @%p9 bra        BB8_15;
 
        ld.shared.f64   %fd34, [%rd1+1024];
        add.f64         %fd71, %fd70, %fd34;
        st.shared.f64   [%rd1], %fd71;
 
-BB22_15:
+BB8_15:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB22_16:
+BB8_16:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p10, %r9, 128;
-       @%p10 bra       BB22_20;
+       @%p10 bra       BB8_20;
 
        setp.gt.u32     %p11, %r6, 63;
        mov.f64         %fd69, %fd68;
-       @%p11 bra       BB22_19;
+       @%p11 bra       BB8_19;
 
        ld.shared.f64   %fd35, [%rd1+512];
        add.f64         %fd69, %fd68, %fd35;
        st.shared.f64   [%rd1], %fd69;
 
-BB22_19:
+BB8_19:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB22_20:
+BB8_20:
        mov.f64         %fd67, %fd68;
        setp.gt.u32     %p12, %r6, 31;
-       @%p12 bra       BB22_33;
+       @%p12 bra       BB8_33;
 
        setp.lt.u32     %p13, %r9, 64;
-       @%p13 bra       BB22_23;
+       @%p13 bra       BB8_23;
 
        ld.volatile.shared.f64  %fd36, [%rd1+256];
        add.f64         %fd67, %fd67, %fd36;
        st.volatile.shared.f64  [%rd1], %fd67;
 
-BB22_23:
+BB8_23:
        mov.f64         %fd66, %fd67;
        setp.lt.u32     %p14, %r9, 32;
-       @%p14 bra       BB22_25;
+       @%p14 bra       BB8_25;
 
        ld.volatile.shared.f64  %fd37, [%rd1+128];
        add.f64         %fd66, %fd66, %fd37;
        st.volatile.shared.f64  [%rd1], %fd66;
 
-BB22_25:
+BB8_25:
        mov.f64         %fd65, %fd66;
        setp.lt.u32     %p15, %r9, 16;
-       @%p15 bra       BB22_27;
+       @%p15 bra       BB8_27;
 
        ld.volatile.shared.f64  %fd38, [%rd1+64];
        add.f64         %fd65, %fd65, %fd38;
        st.volatile.shared.f64  [%rd1], %fd65;
 
-BB22_27:
+BB8_27:
        mov.f64         %fd64, %fd65;
        setp.lt.u32     %p16, %r9, 8;
-       @%p16 bra       BB22_29;
+       @%p16 bra       BB8_29;
 
        ld.volatile.shared.f64  %fd39, [%rd1+32];
        add.f64         %fd64, %fd64, %fd39;
        st.volatile.shared.f64  [%rd1], %fd64;
 
-BB22_29:
+BB8_29:
        mov.f64         %fd63, %fd64;
        setp.lt.u32     %p17, %r9, 4;
-       @%p17 bra       BB22_31;
+       @%p17 bra       BB8_31;
 
        ld.volatile.shared.f64  %fd40, [%rd1+16];
        add.f64         %fd63, %fd63, %fd40;
        st.volatile.shared.f64  [%rd1], %fd63;
 
-BB22_31:
+BB8_31:
        setp.lt.u32     %p18, %r9, 2;
-       @%p18 bra       BB22_33;
+       @%p18 bra       BB8_33;
 
        ld.volatile.shared.f64  %fd41, [%rd1+8];
        add.f64         %fd42, %fd63, %fd41;
        st.volatile.shared.f64  [%rd1], %fd42;
 
-BB22_33:
+BB8_33:
        setp.ne.s32     %p19, %r6, 0;
-       @%p19 bra       BB22_35;
+       @%p19 bra       BB8_35;
 
        ld.shared.f64   %fd43, [sdata];
        cvta.to.global.u64      %rd12, %rd3;
@@ -3246,7 +1475,7 @@ BB22_33:
        add.s64         %rd14, %rd12, %rd13;
        st.global.f64   [%rd14], %fd43;
 
-BB22_35:
+BB8_35:
        ret;
 }
 
@@ -3270,17 +1499,17 @@ BB22_35:
        ld.param.u32    %r4, [reduce_row_sum_param_3];
        mov.u32         %r6, %ctaid.x;
        setp.ge.u32     %p1, %r6, %r5;
-       @%p1 bra        BB23_35;
+       @%p1 bra        BB9_35;
 
        mov.u32         %r38, %tid.x;
        mov.f64         %fd72, 0d0000000000000000;
        mov.f64         %fd73, %fd72;
        setp.ge.u32     %p2, %r38, %r4;
-       @%p2 bra        BB23_4;
+       @%p2 bra        BB9_4;
 
        cvta.to.global.u64      %rd3, %rd1;
 
-BB23_3:
+BB9_3:
        mad.lo.s32      %r8, %r6, %r4, %r38;
        mul.wide.u32    %rd4, %r8, 8;
        add.s64         %rd5, %rd3, %rd4;
@@ -3290,9 +1519,9 @@ BB23_3:
        add.s32         %r38, %r9, %r38;
        setp.lt.u32     %p3, %r38, %r4;
        mov.f64         %fd72, %fd73;
-       @%p3 bra        BB23_3;
+       @%p3 bra        BB9_3;
 
-BB23_4:
+BB9_4:
        mov.f64         %fd70, %fd72;
        mov.u32         %r10, %tid.x;
        mul.wide.u32    %rd6, %r10, 8;
@@ -3302,130 +1531,130 @@ BB23_4:
        bar.sync        0;
        mov.u32         %r11, %ntid.x;
        setp.lt.u32     %p4, %r11, 1024;
-       @%p4 bra        BB23_8;
+       @%p4 bra        BB9_8;
 
        setp.gt.u32     %p5, %r10, 511;
        mov.f64         %fd71, %fd70;
-       @%p5 bra        BB23_7;
+       @%p5 bra        BB9_7;
 
        ld.shared.f64   %fd29, [%rd8+4096];
        add.f64         %fd71, %fd70, %fd29;
        st.shared.f64   [%rd8], %fd71;
 
-BB23_7:
+BB9_7:
        mov.f64         %fd70, %fd71;
        bar.sync        0;
 
-BB23_8:
+BB9_8:
        mov.f64         %fd68, %fd70;
        setp.lt.u32     %p6, %r11, 512;
-       @%p6 bra        BB23_12;
+       @%p6 bra        BB9_12;
 
        setp.gt.u32     %p7, %r10, 255;
        mov.f64         %fd69, %fd68;
-       @%p7 bra        BB23_11;
+       @%p7 bra        BB9_11;
 
        ld.shared.f64   %fd30, [%rd8+2048];
        add.f64         %fd69, %fd68, %fd30;
        st.shared.f64   [%rd8], %fd69;
 
-BB23_11:
+BB9_11:
        mov.f64         %fd68, %fd69;
        bar.sync        0;
 
-BB23_12:
+BB9_12:
        mov.f64         %fd66, %fd68;
        setp.lt.u32     %p8, %r11, 256;
-       @%p8 bra        BB23_16;
+       @%p8 bra        BB9_16;
 
        setp.gt.u32     %p9, %r10, 127;
        mov.f64         %fd67, %fd66;
-       @%p9 bra        BB23_15;
+       @%p9 bra        BB9_15;
 
        ld.shared.f64   %fd31, [%rd8+1024];
        add.f64         %fd67, %fd66, %fd31;
        st.shared.f64   [%rd8], %fd67;
 
-BB23_15:
+BB9_15:
        mov.f64         %fd66, %fd67;
        bar.sync        0;
 
-BB23_16:
+BB9_16:
        mov.f64         %fd64, %fd66;
        setp.lt.u32     %p10, %r11, 128;
-       @%p10 bra       BB23_20;
+       @%p10 bra       BB9_20;
 
        setp.gt.u32     %p11, %r10, 63;
        mov.f64         %fd65, %fd64;
-       @%p11 bra       BB23_19;
+       @%p11 bra       BB9_19;
 
        ld.shared.f64   %fd32, [%rd8+512];
        add.f64         %fd65, %fd64, %fd32;
        st.shared.f64   [%rd8], %fd65;
 
-BB23_19:
+BB9_19:
        mov.f64         %fd64, %fd65;
        bar.sync        0;
 
-BB23_20:
+BB9_20:
        mov.f64         %fd63, %fd64;
        setp.gt.u32     %p12, %r10, 31;
-       @%p12 bra       BB23_33;
+       @%p12 bra       BB9_33;
 
        setp.lt.u32     %p13, %r11, 64;
-       @%p13 bra       BB23_23;
+       @%p13 bra       BB9_23;
 
        ld.volatile.shared.f64  %fd33, [%rd8+256];
        add.f64         %fd63, %fd63, %fd33;
        st.volatile.shared.f64  [%rd8], %fd63;
 
-BB23_23:
+BB9_23:
        mov.f64         %fd62, %fd63;
        setp.lt.u32     %p14, %r11, 32;
-       @%p14 bra       BB23_25;
+       @%p14 b

<TRUNCATED>

Reply via email to