http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88526702/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx
b/src/main/cpp/kernels/SystemML.ptx
index c708134..99d5898 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -319,407 +319,1511 @@ BB1_12:
bra.uni BB1_41;
}
- // .globl copyUpperToLowerTriangleDense
-.visible .entry copyUpperToLowerTriangleDense(
- .param .u64 copyUpperToLowerTriangleDense_param_0,
- .param .u32 copyUpperToLowerTriangleDense_param_1,
- .param .u32 copyUpperToLowerTriangleDense_param_2
+ // .globl _Z6reduceI5SumOpEvPdS1_jT_d -- mangled name of reduce<SumOp>(double*, double*, unsigned int, SumOp, double). Each CTA folds its share of the input with add.f64, combines the per-thread partials through the shared array sdata, and thread 0 stores the CTA result at param_1[%ctaid.x].
+.visible .func _Z6reduceI5SumOpEvPdS1_jT_d(
+ .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_0, // address of the f64 input array (read with generic ld.f64)
+ .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_1, // address of the f64 output array, one slot per CTA (written with generic st.f64)
+ .param .b32 _Z6reduceI5SumOpEvPdS1_jT_d_param_2, // element count n; all index checks against it are unsigned
+ .param .align 1 .b8 _Z6reduceI5SumOpEvPdS1_jT_d_param_3[1], // 1-byte by-value argument (SumOp per the mangled name); never loaded in this body
+ .param .b64 _Z6reduceI5SumOpEvPdS1_jT_d_param_4 // declared .b64 but loaded as f64: starting value of the accumulator
)
{
- .reg .pred %p<4>;
- .reg .b32 %r<13>;
- .reg .f64 %fd<2>;
- .reg .b64 %rd<7>;
+ .reg .pred %p<18>;
+ .reg .b32 %r<31>;
+ .reg .f64 %fd<70>;
+ .reg .b64 %rd<12>;
- ld.param.u64 %rd1, [copyUpperToLowerTriangleDense_param_0];
- ld.param.u32 %r4, [copyUpperToLowerTriangleDense_param_1];
- ld.param.u32 %r5, [copyUpperToLowerTriangleDense_param_2];
- mov.u32 %r6, %ntid.x;
+ ld.param.u64 %rd2, [_Z6reduceI5SumOpEvPdS1_jT_d_param_0]; // %rd2 = input base
+ ld.param.u64 %rd3, [_Z6reduceI5SumOpEvPdS1_jT_d_param_1]; // %rd3 = output base
+ ld.param.u32 %r5, [_Z6reduceI5SumOpEvPdS1_jT_d_param_2]; // %r5 = n
+ ld.param.f64 %fd67, [_Z6reduceI5SumOpEvPdS1_jT_d_param_4]; // %fd67 = initial accumulator value
+ mov.u32 %r6, %tid.x; // %r6 = thread index inside the CTA (%r7, set next, is %ctaid.x)
 mov.u32 %r7, %ctaid.x;
- mov.u32 %r8, %tid.x;
- mad.lo.s32 %r1, %r6, %r7, %r8;
- mov.u32 %r9, %ntid.y;
- mov.u32 %r10, %ctaid.y;
- mov.u32 %r11, %tid.y;
- mad.lo.s32 %r2, %r9, %r10, %r11;
- mad.lo.s32 %r3, %r2, %r4, %r1;
- setp.gt.s32 %p1, %r2, %r1;
- setp.lt.s32 %p2, %r3, %r5;
- and.pred %p3, %p1, %p2;
- @!%p3 bra BB2_2;
- bra.uni BB2_1;
+ shl.b32 %r8, %r7, 1; // %r8 = 2 * ctaid.x
+ mov.u32 %r9, %ntid.x; // %r9 = CTA size in x
+ mad.lo.s32 %r30, %r8, %r9, %r6; // %r30 = 2*ctaid.x*ntid.x + tid.x: first element index for this thread
+ setp.ge.u32 %p1, %r30, %r5;
+ @%p1 bra BB2_5; // nothing in range: publish the initial value unchanged
-BB2_1:
- cvta.to.global.u64 %rd2, %rd1;
- mad.lo.s32 %r12, %r1, %r4, %r2;
- mul.wide.s32 %rd3, %r12, 8;
- add.s64 %rd4, %rd2, %rd3;
- ld.global.f64 %fd1, [%rd4];
- mul.wide.s32 %rd5, %r3, 8;
- add.s64 %rd6, %rd2, %rd5;
- st.global.f64 [%rd6], %fd1;
+ mov.f64 %fd68, %fd67;
BB2_2:
- ret;
-}
+ mov.f64 %fd1, %fd68; // loop head: %fd1 = running accumulator
+ mul.wide.u32 %rd4, %r30, 8; // byte offset of element %r30 (8 bytes per f64)
+ add.s64 %rd5, %rd2, %rd4;
+ ld.f64 %fd26, [%rd5]; // first element of this iteration
+ add.f64 %fd69, %fd1, %fd26;
+ add.s32 %r3, %r30, %r9; // second element sits ntid.x further on
+ setp.ge.u32 %p2, %r3, %r5;
+ @%p2 bra BB2_4; // skip the second element when it is past the end
- // .globl dense_matrix_set
-.visible .entry dense_matrix_set(
- .param .u64 dense_matrix_set_param_0,
- .param .f64 dense_matrix_set_param_1,
- .param .u32 dense_matrix_set_param_2,
- .param .u32 dense_matrix_set_param_3
-)
-{
- .reg .pred %p<2>;
- .reg .b32 %r<13>;
- .reg .f64 %fd<2>;
- .reg .b64 %rd<5>;
+ mul.wide.u32 %rd6, %r3, 8;
+ add.s64 %rd7, %rd2, %rd6;
+ ld.f64 %fd27, [%rd7];
+ add.f64 %fd69, %fd69, %fd27;
+BB2_4:
+ mov.f64 %fd68, %fd69;
+ shl.b32 %r12, %r9, 1; // 2 * ntid.x
+ mov.u32 %r13, %nctaid.x;
+ mad.lo.s32 %r30, %r12, %r13, %r30; // advance the index by 2*ntid.x*nctaid.x
+ setp.lt.u32 %p3, %r30, %r5;
+ mov.f64 %fd67, %fd68;
+ @%p3 bra BB2_2; // keep looping while the index is in range
- ld.param.u64 %rd1, [dense_matrix_set_param_0];
- ld.param.f64 %fd1, [dense_matrix_set_param_1];
- ld.param.u32 %r2, [dense_matrix_set_param_2];
- ld.param.u32 %r3, [dense_matrix_set_param_3];
- mov.u32 %r4, %ctaid.x;
- mov.u32 %r5, %ntid.x;
- mov.u32 %r6, %tid.x;
- mad.lo.s32 %r7, %r5, %r4, %r6;
- mov.u32 %r8, %ntid.y;
- mov.u32 %r9, %ctaid.y;
- mov.u32 %r10, %tid.y;
- mad.lo.s32 %r11, %r7, %r3, %r10;
- mad.lo.s32 %r1, %r8, %r9, %r11;
- mul.lo.s32 %r12, %r3, %r2;
- setp.ge.s32 %p1, %r1, %r12;
- @%p1 bra BB3_2;
+BB2_5:
+ mov.f64 %fd65, %fd67; // %fd65 = this thread's partial result
+ mul.wide.u32 %rd8, %r6, 8;
+ mov.u64 %rd9, sdata; // sdata is declared outside this view; it is accessed below with .shared loads/stores
+ add.s64 %rd1, %rd9, %rd8; // %rd1 = &sdata[tid.x]
+ st.shared.f64 [%rd1], %fd65; // publish the partial result
+ bar.sync 0; // every partial must be stored before any is combined
+ setp.lt.u32 %p4, %r9, 512;
+ @%p4 bra BB2_9; // this step applies only when ntid.x >= 512
- cvta.to.global.u64 %rd2, %rd1;
- mul.wide.s32 %rd3, %r1, 8;
- add.s64 %rd4, %rd2, %rd3;
- st.global.f64 [%rd4], %fd1;
+ setp.gt.u32 %p5, %r6, 255;
+ mov.f64 %fd66, %fd65;
+ @%p5 bra BB2_8; // only threads 0..255 combine in this step
-BB3_2:
- ret;
-}
+ ld.shared.f64 %fd28, [%rd1+2048]; // sdata[tid.x + 256] (2048 bytes / 8)
+ add.f64 %fd66, %fd65, %fd28;
+ st.shared.f64 [%rd1], %fd66;
- // .globl dense_matrix_copy
-.visible .entry dense_matrix_copy(
- .param .u64 dense_matrix_copy_param_0,
- .param .u64 dense_matrix_copy_param_1,
- .param .u32 dense_matrix_copy_param_2,
- .param .u32 dense_matrix_copy_param_3
-)
-{
- .reg .pred %p<4>;
- .reg .b32 %r<12>;
- .reg .f64 %fd<2>;
- .reg .b64 %rd<8>;
+BB2_8:
+ mov.f64 %fd65, %fd66;
+ bar.sync 0;
+BB2_9:
+ mov.f64 %fd63, %fd65;
+ setp.lt.u32 %p6, %r9, 256;
+ @%p6 bra BB2_13; // this step applies only when ntid.x >= 256
- ld.param.u64 %rd1, [dense_matrix_copy_param_0];
- ld.param.u64 %rd2, [dense_matrix_copy_param_1];
- ld.param.u32 %r2, [dense_matrix_copy_param_2];
- ld.param.u32 %r3, [dense_matrix_copy_param_3];
- mov.u32 %r4, %ctaid.x;
- mov.u32 %r5, %ntid.x;
- mov.u32 %r6, %tid.x;
- mad.lo.s32 %r7, %r5, %r4, %r6;
- mov.u32 %r8, %ntid.y;
- mov.u32 %r9, %ctaid.y;
- mov.u32 %r10, %tid.y;
- mad.lo.s32 %r11, %r8, %r9, %r10;
- mad.lo.s32 %r1, %r7, %r3, %r11;
- setp.lt.s32 %p1, %r7, %r2;
- setp.lt.s32 %p2, %r11, %r3;
- and.pred %p3, %p1, %p2;
- @!%p3 bra BB4_2;
- bra.uni BB4_1;
+ setp.gt.u32 %p7, %r6, 127;
+ mov.f64 %fd64, %fd63;
+ @%p7 bra BB2_12; // only threads 0..127 combine in this step
-BB4_1:
- cvta.to.global.u64 %rd3, %rd1;
- mul.wide.s32 %rd4, %r1, 8;
- add.s64 %rd5, %rd3, %rd4;
- ld.global.f64 %fd1, [%rd5];
- cvta.to.global.u64 %rd6, %rd2;
- add.s64 %rd7, %rd6, %rd4;
- st.global.f64 [%rd7], %fd1;
+ ld.shared.f64 %fd29, [%rd1+1024]; // sdata[tid.x + 128]
+ add.f64 %fd64, %fd63, %fd29;
+ st.shared.f64 [%rd1], %fd64;
-BB4_2:
- ret;
-}
+BB2_12:
+ mov.f64 %fd63, %fd64;
+ bar.sync 0;
- // .globl relu
-.visible .entry relu(
- .param .u64 relu_param_0,
- .param .u64 relu_param_1,
- .param .u32 relu_param_2,
- .param .u32 relu_param_3
-)
-{
- .reg .pred %p<4>;
- .reg .b32 %r<12>;
- .reg .f64 %fd<4>;
- .reg .b64 %rd<8>;
+BB2_13:
+ mov.f64 %fd61, %fd63;
+ setp.lt.u32 %p8, %r9, 128;
+ @%p8 bra BB2_17; // this step applies only when ntid.x >= 128
+ setp.gt.u32 %p9, %r6, 63;
+ mov.f64 %fd62, %fd61;
+ @%p9 bra BB2_16; // only threads 0..63 combine in this step
- ld.param.u64 %rd1, [relu_param_0];
- ld.param.u64 %rd2, [relu_param_1];
- ld.param.u32 %r4, [relu_param_2];
- ld.param.u32 %r3, [relu_param_3];
- mov.u32 %r5, %ctaid.x;
- mov.u32 %r6, %ntid.x;
- mov.u32 %r7, %tid.x;
- mad.lo.s32 %r1, %r6, %r5, %r7;
- mov.u32 %r8, %ntid.y;
- mov.u32 %r9, %ctaid.y;
- mov.u32 %r10, %tid.y;
- mad.lo.s32 %r2, %r8, %r9, %r10;
- setp.lt.s32 %p1, %r1, %r4;
- setp.lt.s32 %p2, %r2, %r3;
- and.pred %p3, %p1, %p2;
- @!%p3 bra BB5_2;
- bra.uni BB5_1;
+ ld.shared.f64 %fd30, [%rd1+512]; // sdata[tid.x + 64]
+ add.f64 %fd62, %fd61, %fd30;
+ st.shared.f64 [%rd1], %fd62;
-BB5_1:
- cvta.to.global.u64 %rd3, %rd1;
- mad.lo.s32 %r11, %r1, %r3, %r2;
- mul.wide.s32 %rd4, %r11, 8;
- add.s64 %rd5, %rd3, %rd4;
- ld.global.f64 %fd1, [%rd5];
- mov.f64 %fd2, 0d0000000000000000;
- max.f64 %fd3, %fd2, %fd1;
- cvta.to.global.u64 %rd6, %rd2;
- add.s64 %rd7, %rd6, %rd4;
- st.global.f64 [%rd7], %fd3;
+BB2_16:
+ mov.f64 %fd61, %fd62;
+ bar.sync 0; // last barrier in this function
-BB5_2:
- ret;
-}
+BB2_17:
+ mov.f64 %fd60, %fd61;
+ setp.gt.u32 %p10, %r6, 31;
+ @%p10 bra BB2_30; // threads 32 and above take no part in the remaining steps
- // .globl compareAndSet
-.visible .entry compareAndSet(
- .param .u64 compareAndSet_param_0,
- .param .u64 compareAndSet_param_1,
- .param .u32 compareAndSet_param_2,
- .param .u32 compareAndSet_param_3,
- .param .f64 compareAndSet_param_4,
- .param .f64 compareAndSet_param_5,
- .param .f64 compareAndSet_param_6,
- .param .f64 compareAndSet_param_7,
- .param .f64 compareAndSet_param_8
-)
-{
- .reg .pred %p<6>;
- .reg .b32 %r<12>;
- .reg .f64 %fd<9>;
- .reg .b64 %rd<8>;
+ setp.lt.u32 %p11, %r9, 64;
+ @%p11 bra BB2_20; // step applies only when ntid.x >= 64
+ ld.volatile.shared.f64 %fd31, [%rd1+256]; // sdata[tid.x + 32]; from here on the steps use volatile accesses and no bar.sync. NOTE(review): presumably relies on threads 0..31 running in lockstep -- confirm for the target architecture
+ add.f64 %fd60, %fd60, %fd31;
+ st.volatile.shared.f64 [%rd1], %fd60;
- ld.param.u64 %rd2, [compareAndSet_param_0];
- ld.param.u64 %rd3, [compareAndSet_param_1];
- ld.param.u32 %r2, [compareAndSet_param_2];
- ld.param.u32 %r3, [compareAndSet_param_3];
- ld.param.f64 %fd2, [compareAndSet_param_4];
- ld.param.f64 %fd3, [compareAndSet_param_5];
- ld.param.f64 %fd4, [compareAndSet_param_6];
- ld.param.f64 %fd5, [compareAndSet_param_7];
- ld.param.f64 %fd6, [compareAndSet_param_8];
- mov.u32 %r4, %ctaid.x;
- mov.u32 %r5, %ntid.x;
- mov.u32 %r6, %tid.x;
- mad.lo.s32 %r7, %r5, %r4, %r6;
- mov.u32 %r8, %ntid.y;
- mov.u32 %r9, %ctaid.y;
- mov.u32 %r10, %tid.y;
- mad.lo.s32 %r11, %r8, %r9, %r10;
- mad.lo.s32 %r1, %r7, %r3, %r11;
- setp.lt.s32 %p1, %r7, %r2;
- setp.lt.s32 %p2, %r11, %r3;
- and.pred %p3, %p1, %p2;
- @!%p3 bra BB6_6;
- bra.uni BB6_1;
+BB2_20:
+ mov.f64 %fd59, %fd60;
+ setp.lt.u32 %p12, %r9, 32;
+ @%p12 bra BB2_22; // step applies only when ntid.x >= 32
-BB6_1:
- cvta.to.global.u64 %rd4, %rd2;
- mul.wide.s32 %rd5, %r1, 8;
- add.s64 %rd6, %rd4, %rd5;
- ld.global.f64 %fd1, [%rd6];
- sub.f64 %fd7, %fd1, %fd2;
- abs.f64 %fd8, %fd7;
- setp.lt.f64 %p4, %fd8, %fd3;
- cvta.to.global.u64 %rd7, %rd3;
- add.s64 %rd1, %rd7, %rd5;
- @%p4 bra BB6_5;
- bra.uni BB6_2;
+ ld.volatile.shared.f64 %fd32, [%rd1+128]; // sdata[tid.x + 16]
+ add.f64 %fd59, %fd59, %fd32;
+ st.volatile.shared.f64 [%rd1], %fd59;
-BB6_5:
- st.global.f64 [%rd1], %fd4;
- bra.uni BB6_6;
+BB2_22:
+ mov.f64 %fd58, %fd59;
+ setp.lt.u32 %p13, %r9, 16;
+ @%p13 bra BB2_24; // step applies only when ntid.x >= 16
-BB6_2:
- setp.lt.f64 %p5, %fd1, %fd2;
- @%p5 bra BB6_4;
- bra.uni BB6_3;
+ ld.volatile.shared.f64 %fd33, [%rd1+64]; // sdata[tid.x + 8]
+ add.f64 %fd58, %fd58, %fd33;
+ st.volatile.shared.f64 [%rd1], %fd58;
-BB6_4:
- st.global.f64 [%rd1], %fd5;
- bra.uni BB6_6;
+BB2_24:
+ mov.f64 %fd57, %fd58;
+ setp.lt.u32 %p14, %r9, 8;
+ @%p14 bra BB2_26; // step applies only when ntid.x >= 8
-BB6_3:
- st.global.f64 [%rd1], %fd6;
+ ld.volatile.shared.f64 %fd34, [%rd1+32]; // sdata[tid.x + 4]
+ add.f64 %fd57, %fd57, %fd34;
+ st.volatile.shared.f64 [%rd1], %fd57;
-BB6_6:
- ret;
+BB2_26:
+ mov.f64 %fd56, %fd57;
+ setp.lt.u32 %p15, %r9, 4;
+ @%p15 bra BB2_28; // step applies only when ntid.x >= 4
+
+ ld.volatile.shared.f64 %fd35, [%rd1+16]; // sdata[tid.x + 2]
+ add.f64 %fd56, %fd56, %fd35;
+ st.volatile.shared.f64 [%rd1], %fd56;
+
+BB2_28:
+ setp.lt.u32 %p16, %r9, 2;
+ @%p16 bra BB2_30; // step applies only when ntid.x >= 2
+
+ ld.volatile.shared.f64 %fd36, [%rd1+8]; // sdata[tid.x + 1]
+ add.f64 %fd37, %fd56, %fd36;
+ st.volatile.shared.f64 [%rd1], %fd37;
+
+BB2_30:
+ setp.ne.s32 %p17, %r6, 0;
+ @%p17 bra BB2_32; // only thread 0 writes the CTA result
+
+ ld.shared.f64 %fd38, [sdata]; // sdata[0] holds the combined value
+ mul.wide.u32 %rd10, %r7, 8;
+ add.s64 %rd11, %rd3, %rd10;
+ st.f64 [%rd11], %fd38; // out[ctaid.x] = sdata[0]
+
+BB2_32:
+ ret;
+}
+
+ // .globl _Z6reduceI5MaxOpEvPdS1_jT_d -- mangled name of reduce<MaxOp>(double*, double*, unsigned int, MaxOp, double). Each CTA folds its share of the input with max.f64, combines the per-thread partials through the shared array sdata, and thread 0 stores the CTA result at param_1[%ctaid.x].
+.visible .func _Z6reduceI5MaxOpEvPdS1_jT_d(
+ .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_0, // address of the f64 input array (read with generic ld.f64)
+ .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_1, // address of the f64 output array, one slot per CTA
+ .param .b32 _Z6reduceI5MaxOpEvPdS1_jT_d_param_2, // element count n; compared as unsigned
+ .param .align 1 .b8 _Z6reduceI5MaxOpEvPdS1_jT_d_param_3[1], // 1-byte by-value argument (MaxOp per the mangled name); never loaded in this body
+ .param .b64 _Z6reduceI5MaxOpEvPdS1_jT_d_param_4 // declared .b64 but loaded as f64: starting value of the accumulator
+)
+{
+ .reg .pred %p<18>;
+ .reg .b32 %r<31>;
+ .reg .f64 %fd<70>;
+ .reg .b64 %rd<12>;
+
+
+ ld.param.u64 %rd2, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_0]; // %rd2 = input base
+ ld.param.u64 %rd3, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_1]; // %rd3 = output base
+ ld.param.u32 %r5, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_2]; // %r5 = n
+ ld.param.f64 %fd67, [_Z6reduceI5MaxOpEvPdS1_jT_d_param_4]; // %fd67 = initial accumulator value
+ mov.u32 %r6, %tid.x; // %r6 = thread index inside the CTA
+ mov.u32 %r7, %ctaid.x; // %r7 = CTA index, reused at the end as the output slot
+ shl.b32 %r8, %r7, 1; // 2 * ctaid.x
+ mov.u32 %r9, %ntid.x; // %r9 = CTA size in x
+ mad.lo.s32 %r30, %r8, %r9, %r6; // %r30 = 2*ctaid.x*ntid.x + tid.x: first element index for this thread
+ setp.ge.u32 %p1, %r30, %r5;
+ @%p1 bra BB3_5; // nothing in range: publish the initial value unchanged
+
+ mov.f64 %fd68, %fd67;
+
+BB3_2:
+ mov.f64 %fd1, %fd68; // loop head: %fd1 = running maximum
+ mul.wide.u32 %rd4, %r30, 8; // byte offset of element %r30 (8 bytes per f64)
+ add.s64 %rd5, %rd2, %rd4;
+ ld.f64 %fd26, [%rd5]; // first element of this iteration
+ max.f64 %fd69, %fd1, %fd26;
+ add.s32 %r3, %r30, %r9; // second element sits ntid.x further on
+ setp.ge.u32 %p2, %r3, %r5;
+ @%p2 bra BB3_4; // skip the second element when it is past the end
+
+ mul.wide.u32 %rd6, %r3, 8;
+ add.s64 %rd7, %rd2, %rd6;
+ ld.f64 %fd27, [%rd7];
+ max.f64 %fd69, %fd69, %fd27;
+
+BB3_4:
+ mov.f64 %fd68, %fd69;
+ shl.b32 %r12, %r9, 1; // 2 * ntid.x
+ mov.u32 %r13, %nctaid.x;
+ mad.lo.s32 %r30, %r12, %r13, %r30; // advance the index by 2*ntid.x*nctaid.x
+ setp.lt.u32 %p3, %r30, %r5;
+ mov.f64 %fd67, %fd68;
+ @%p3 bra BB3_2; // keep looping while the index is in range
+
+BB3_5:
+ mov.f64 %fd65, %fd67; // %fd65 = this thread's partial result
+ mul.wide.u32 %rd8, %r6, 8;
+ mov.u64 %rd9, sdata; // sdata is declared outside this view; it is accessed below with .shared loads/stores
+ add.s64 %rd1, %rd9, %rd8; // %rd1 = &sdata[tid.x]
+ st.shared.f64 [%rd1], %fd65; // publish the partial result
+ bar.sync 0; // every partial must be stored before any is combined
+ setp.lt.u32 %p4, %r9, 512;
+ @%p4 bra BB3_9; // this step applies only when ntid.x >= 512
+
+ setp.gt.u32 %p5, %r6, 255;
+ mov.f64 %fd66, %fd65;
+ @%p5 bra BB3_8; // only threads 0..255 combine in this step
+
+ ld.shared.f64 %fd28, [%rd1+2048]; // sdata[tid.x + 256] (2048 bytes / 8)
+ max.f64 %fd66, %fd65, %fd28;
+ st.shared.f64 [%rd1], %fd66;
+
+BB3_8:
+ mov.f64 %fd65, %fd66;
+ bar.sync 0;
+
+BB3_9:
+ mov.f64 %fd63, %fd65;
+ setp.lt.u32 %p6, %r9, 256;
+ @%p6 bra BB3_13; // this step applies only when ntid.x >= 256
+
+ setp.gt.u32 %p7, %r6, 127;
+ mov.f64 %fd64, %fd63;
+ @%p7 bra BB3_12; // only threads 0..127 combine in this step
+
+ ld.shared.f64 %fd29, [%rd1+1024]; // sdata[tid.x + 128]
+ max.f64 %fd64, %fd63, %fd29;
+ st.shared.f64 [%rd1], %fd64;
+
+BB3_12:
+ mov.f64 %fd63, %fd64;
+ bar.sync 0;
+
+BB3_13:
+ mov.f64 %fd61, %fd63;
+ setp.lt.u32 %p8, %r9, 128;
+ @%p8 bra BB3_17; // this step applies only when ntid.x >= 128
+
+ setp.gt.u32 %p9, %r6, 63;
+ mov.f64 %fd62, %fd61;
+ @%p9 bra BB3_16; // only threads 0..63 combine in this step
+
+ ld.shared.f64 %fd30, [%rd1+512]; // sdata[tid.x + 64]
+ max.f64 %fd62, %fd61, %fd30;
+ st.shared.f64 [%rd1], %fd62;
+
+BB3_16:
+ mov.f64 %fd61, %fd62;
+ bar.sync 0; // last barrier in this function
+
+BB3_17:
+ mov.f64 %fd60, %fd61;
+ setp.gt.u32 %p10, %r6, 31;
+ @%p10 bra BB3_30; // threads 32 and above take no part in the remaining steps
+
+ setp.lt.u32 %p11, %r9, 64;
+ @%p11 bra BB3_20; // step applies only when ntid.x >= 64
+
+ ld.volatile.shared.f64 %fd31, [%rd1+256]; // sdata[tid.x + 32]; from here on the steps use volatile accesses and no bar.sync. NOTE(review): presumably relies on threads 0..31 running in lockstep -- confirm for the target architecture
+ max.f64 %fd60, %fd60, %fd31;
+ st.volatile.shared.f64 [%rd1], %fd60;
+
+BB3_20:
+ mov.f64 %fd59, %fd60;
+ setp.lt.u32 %p12, %r9, 32;
+ @%p12 bra BB3_22; // step applies only when ntid.x >= 32
+
+ ld.volatile.shared.f64 %fd32, [%rd1+128]; // sdata[tid.x + 16]
+ max.f64 %fd59, %fd59, %fd32;
+ st.volatile.shared.f64 [%rd1], %fd59;
+
+BB3_22:
+ mov.f64 %fd58, %fd59;
+ setp.lt.u32 %p13, %r9, 16;
+ @%p13 bra BB3_24; // step applies only when ntid.x >= 16
+
+ ld.volatile.shared.f64 %fd33, [%rd1+64]; // sdata[tid.x + 8]
+ max.f64 %fd58, %fd58, %fd33;
+ st.volatile.shared.f64 [%rd1], %fd58;
+
+BB3_24:
+ mov.f64 %fd57, %fd58;
+ setp.lt.u32 %p14, %r9, 8;
+ @%p14 bra BB3_26; // step applies only when ntid.x >= 8
+
+ ld.volatile.shared.f64 %fd34, [%rd1+32]; // sdata[tid.x + 4]
+ max.f64 %fd57, %fd57, %fd34;
+ st.volatile.shared.f64 [%rd1], %fd57;
+
+BB3_26:
+ mov.f64 %fd56, %fd57;
+ setp.lt.u32 %p15, %r9, 4;
+ @%p15 bra BB3_28; // step applies only when ntid.x >= 4
+
+ ld.volatile.shared.f64 %fd35, [%rd1+16]; // sdata[tid.x + 2]
+ max.f64 %fd56, %fd56, %fd35;
+ st.volatile.shared.f64 [%rd1], %fd56;
+
+BB3_28:
+ setp.lt.u32 %p16, %r9, 2;
+ @%p16 bra BB3_30; // step applies only when ntid.x >= 2
+
+ ld.volatile.shared.f64 %fd36, [%rd1+8]; // sdata[tid.x + 1]
+ max.f64 %fd37, %fd56, %fd36;
+ st.volatile.shared.f64 [%rd1], %fd37;
+
+BB3_30:
+ setp.ne.s32 %p17, %r6, 0;
+ @%p17 bra BB3_32; // only thread 0 writes the CTA result
+
+ ld.shared.f64 %fd38, [sdata]; // sdata[0] holds the combined value
+ mul.wide.u32 %rd10, %r7, 8;
+ add.s64 %rd11, %rd3, %rd10;
+ st.f64 [%rd11], %fd38; // out[ctaid.x] = sdata[0]
+
+BB3_32:
+ ret;
+}
+
+ // .globl _Z6reduceI5MinOpEvPdS1_jT_d -- mangled name of reduce<MinOp>(double*, double*, unsigned int, MinOp, double). Each CTA folds its share of the input with min.f64, combines the per-thread partials through the shared array sdata, and thread 0 stores the CTA result at param_1[%ctaid.x].
+.visible .func _Z6reduceI5MinOpEvPdS1_jT_d(
+ .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_0, // address of the f64 input array (read with generic ld.f64)
+ .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_1, // address of the f64 output array, one slot per CTA
+ .param .b32 _Z6reduceI5MinOpEvPdS1_jT_d_param_2, // element count n; compared as unsigned
+ .param .align 1 .b8 _Z6reduceI5MinOpEvPdS1_jT_d_param_3[1], // 1-byte by-value argument (MinOp per the mangled name); never loaded in this body
+ .param .b64 _Z6reduceI5MinOpEvPdS1_jT_d_param_4 // declared .b64 but loaded as f64: starting value of the accumulator
+)
+{
+ .reg .pred %p<18>;
+ .reg .b32 %r<31>;
+ .reg .f64 %fd<70>;
+ .reg .b64 %rd<12>;
+
+
+ ld.param.u64 %rd2, [_Z6reduceI5MinOpEvPdS1_jT_d_param_0]; // %rd2 = input base
+ ld.param.u64 %rd3, [_Z6reduceI5MinOpEvPdS1_jT_d_param_1]; // %rd3 = output base
+ ld.param.u32 %r5, [_Z6reduceI5MinOpEvPdS1_jT_d_param_2]; // %r5 = n
+ ld.param.f64 %fd67, [_Z6reduceI5MinOpEvPdS1_jT_d_param_4]; // %fd67 = initial accumulator value
+ mov.u32 %r6, %tid.x; // %r6 = thread index inside the CTA
+ mov.u32 %r7, %ctaid.x; // %r7 = CTA index, reused at the end as the output slot
+ shl.b32 %r8, %r7, 1; // 2 * ctaid.x
+ mov.u32 %r9, %ntid.x; // %r9 = CTA size in x
+ mad.lo.s32 %r30, %r8, %r9, %r6; // %r30 = 2*ctaid.x*ntid.x + tid.x: first element index for this thread
+ setp.ge.u32 %p1, %r30, %r5;
+ @%p1 bra BB4_5; // nothing in range: publish the initial value unchanged
+
+ mov.f64 %fd68, %fd67;
+
+BB4_2:
+ mov.f64 %fd1, %fd68; // loop head: %fd1 = running minimum
+ mul.wide.u32 %rd4, %r30, 8; // byte offset of element %r30 (8 bytes per f64)
+ add.s64 %rd5, %rd2, %rd4;
+ ld.f64 %fd26, [%rd5]; // first element of this iteration
+ min.f64 %fd69, %fd1, %fd26;
+ add.s32 %r3, %r30, %r9; // second element sits ntid.x further on
+ setp.ge.u32 %p2, %r3, %r5;
+ @%p2 bra BB4_4; // skip the second element when it is past the end
+
+ mul.wide.u32 %rd6, %r3, 8;
+ add.s64 %rd7, %rd2, %rd6;
+ ld.f64 %fd27, [%rd7];
+ min.f64 %fd69, %fd69, %fd27;
+
+BB4_4:
+ mov.f64 %fd68, %fd69;
+ shl.b32 %r12, %r9, 1; // 2 * ntid.x
+ mov.u32 %r13, %nctaid.x;
+ mad.lo.s32 %r30, %r12, %r13, %r30; // advance the index by 2*ntid.x*nctaid.x
+ setp.lt.u32 %p3, %r30, %r5;
+ mov.f64 %fd67, %fd68;
+ @%p3 bra BB4_2; // keep looping while the index is in range
+
+BB4_5:
+ mov.f64 %fd65, %fd67; // %fd65 = this thread's partial result
+ mul.wide.u32 %rd8, %r6, 8;
+ mov.u64 %rd9, sdata; // sdata is declared outside this view; it is accessed below with .shared loads/stores
+ add.s64 %rd1, %rd9, %rd8; // %rd1 = &sdata[tid.x]
+ st.shared.f64 [%rd1], %fd65; // publish the partial result
+ bar.sync 0; // every partial must be stored before any is combined
+ setp.lt.u32 %p4, %r9, 512;
+ @%p4 bra BB4_9; // this step applies only when ntid.x >= 512
+
+ setp.gt.u32 %p5, %r6, 255;
+ mov.f64 %fd66, %fd65;
+ @%p5 bra BB4_8; // only threads 0..255 combine in this step
+
+ ld.shared.f64 %fd28, [%rd1+2048]; // sdata[tid.x + 256] (2048 bytes / 8)
+ min.f64 %fd66, %fd65, %fd28;
+ st.shared.f64 [%rd1], %fd66;
+
+BB4_8:
+ mov.f64 %fd65, %fd66;
+ bar.sync 0;
+
+BB4_9:
+ mov.f64 %fd63, %fd65;
+ setp.lt.u32 %p6, %r9, 256;
+ @%p6 bra BB4_13; // this step applies only when ntid.x >= 256
+
+ setp.gt.u32 %p7, %r6, 127;
+ mov.f64 %fd64, %fd63;
+ @%p7 bra BB4_12; // only threads 0..127 combine in this step
+
+ ld.shared.f64 %fd29, [%rd1+1024]; // sdata[tid.x + 128]
+ min.f64 %fd64, %fd63, %fd29;
+ st.shared.f64 [%rd1], %fd64;
+
+BB4_12:
+ mov.f64 %fd63, %fd64;
+ bar.sync 0;
+
+BB4_13:
+ mov.f64 %fd61, %fd63;
+ setp.lt.u32 %p8, %r9, 128;
+ @%p8 bra BB4_17; // this step applies only when ntid.x >= 128
+
+ setp.gt.u32 %p9, %r6, 63;
+ mov.f64 %fd62, %fd61;
+ @%p9 bra BB4_16; // only threads 0..63 combine in this step
+
+ ld.shared.f64 %fd30, [%rd1+512]; // sdata[tid.x + 64]
+ min.f64 %fd62, %fd61, %fd30;
+ st.shared.f64 [%rd1], %fd62;
+
+BB4_16:
+ mov.f64 %fd61, %fd62;
+ bar.sync 0; // last barrier in this function
+
+BB4_17:
+ mov.f64 %fd60, %fd61;
+ setp.gt.u32 %p10, %r6, 31;
+ @%p10 bra BB4_30; // threads 32 and above take no part in the remaining steps
+
+ setp.lt.u32 %p11, %r9, 64;
+ @%p11 bra BB4_20; // step applies only when ntid.x >= 64
+
+ ld.volatile.shared.f64 %fd31, [%rd1+256]; // sdata[tid.x + 32]; from here on the steps use volatile accesses and no bar.sync. NOTE(review): presumably relies on threads 0..31 running in lockstep -- confirm for the target architecture
+ min.f64 %fd60, %fd60, %fd31;
+ st.volatile.shared.f64 [%rd1], %fd60;
+
+BB4_20:
+ mov.f64 %fd59, %fd60;
+ setp.lt.u32 %p12, %r9, 32;
+ @%p12 bra BB4_22; // step applies only when ntid.x >= 32
+
+ ld.volatile.shared.f64 %fd32, [%rd1+128]; // sdata[tid.x + 16]
+ min.f64 %fd59, %fd59, %fd32;
+ st.volatile.shared.f64 [%rd1], %fd59;
+
+BB4_22:
+ mov.f64 %fd58, %fd59;
+ setp.lt.u32 %p13, %r9, 16;
+ @%p13 bra BB4_24; // step applies only when ntid.x >= 16
+
+ ld.volatile.shared.f64 %fd33, [%rd1+64]; // sdata[tid.x + 8]
+ min.f64 %fd58, %fd58, %fd33;
+ st.volatile.shared.f64 [%rd1], %fd58;
+
+BB4_24:
+ mov.f64 %fd57, %fd58;
+ setp.lt.u32 %p14, %r9, 8;
+ @%p14 bra BB4_26; // step applies only when ntid.x >= 8
+
+ ld.volatile.shared.f64 %fd34, [%rd1+32]; // sdata[tid.x + 4]
+ min.f64 %fd57, %fd57, %fd34;
+ st.volatile.shared.f64 [%rd1], %fd57;
+
+BB4_26:
+ mov.f64 %fd56, %fd57;
+ setp.lt.u32 %p15, %r9, 4;
+ @%p15 bra BB4_28; // step applies only when ntid.x >= 4
+
+ ld.volatile.shared.f64 %fd35, [%rd1+16]; // sdata[tid.x + 2]
+ min.f64 %fd56, %fd56, %fd35;
+ st.volatile.shared.f64 [%rd1], %fd56;
+
+BB4_28:
+ setp.lt.u32 %p16, %r9, 2;
+ @%p16 bra BB4_30; // step applies only when ntid.x >= 2
+
+ ld.volatile.shared.f64 %fd36, [%rd1+8]; // sdata[tid.x + 1]
+ min.f64 %fd37, %fd56, %fd36;
+ st.volatile.shared.f64 [%rd1], %fd37;
+
+BB4_30:
+ setp.ne.s32 %p17, %r6, 0;
+ @%p17 bra BB4_32; // only thread 0 writes the CTA result
+
+ ld.shared.f64 %fd38, [sdata]; // sdata[0] holds the combined value
+ mul.wide.u32 %rd10, %r7, 8;
+ add.s64 %rd11, %rd3, %rd10;
+ st.f64 [%rd11], %fd38; // out[ctaid.x] = sdata[0]
+
+BB4_32:
+ ret;
+}
+
+ // .globl copyUpperToLowerTriangleDense -- kernel: for thread coordinates (ix, iy) with iy > ix, copies the f64 at index ix*param_1 + iy to index iy*param_1 + ix of the same array.
+.visible .entry copyUpperToLowerTriangleDense(
+ .param .u64 copyUpperToLowerTriangleDense_param_0, // address of the f64 array, updated in place
+ .param .u32 copyUpperToLowerTriangleDense_param_1, // multiplier used to flatten (ix, iy) -- presumably the matrix dimension; confirm against the caller
+ .param .u32 copyUpperToLowerTriangleDense_param_2 // exclusive bound on the destination index -- presumably the total element count; confirm against the caller
+)
+{
+ .reg .pred %p<4>;
+ .reg .b32 %r<13>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<7>;
+
+
+ ld.param.u64 %rd1, [copyUpperToLowerTriangleDense_param_0];
+ ld.param.u32 %r4, [copyUpperToLowerTriangleDense_param_1];
+ ld.param.u32 %r5, [copyUpperToLowerTriangleDense_param_2];
+ mov.u32 %r6, %ntid.x;
+ mov.u32 %r7, %ctaid.x;
+ mov.u32 %r8, %tid.x;
+ mad.lo.s32 %r1, %r6, %r7, %r8; // ix = ntid.x*ctaid.x + tid.x
+ mov.u32 %r9, %ntid.y;
+ mov.u32 %r10, %ctaid.y;
+ mov.u32 %r11, %tid.y;
+ mad.lo.s32 %r2, %r9, %r10, %r11; // iy = ntid.y*ctaid.y + tid.y
+ mad.lo.s32 %r3, %r2, %r4, %r1; // destination index = iy*param_1 + ix
+ setp.gt.s32 %p1, %r2, %r1; // iy > ix (signed)
+ setp.lt.s32 %p2, %r3, %r5; // destination index < param_2 (signed)
+ and.pred %p3, %p1, %p2;
+ @!%p3 bra BB5_2; // threads failing either test do nothing
+ bra.uni BB5_1;
+
+BB5_1:
+ cvta.to.global.u64 %rd2, %rd1;
+ mad.lo.s32 %r12, %r1, %r4, %r2; // source index = ix*param_1 + iy (the mirrored position)
+ mul.wide.s32 %rd3, %r12, 8;
+ add.s64 %rd4, %rd2, %rd3;
+ ld.global.f64 %fd1, [%rd4];
+ mul.wide.s32 %rd5, %r3, 8;
+ add.s64 %rd6, %rd2, %rd5;
+ st.global.f64 [%rd6], %fd1; // array[destination] = array[source]
+
+BB5_2:
+ ret;
+}
+
+ // .globl dense_matrix_set -- kernel: stores the scalar param_1 at flattened index ix*param_3 + iy of an f64 array when that index is below param_2*param_3.
+.visible .entry dense_matrix_set(
+ .param .u64 dense_matrix_set_param_0, // address of the f64 array to fill
+ .param .f64 dense_matrix_set_param_1, // value to store
+ .param .u32 dense_matrix_set_param_2, // first factor of the index bound -- presumably the row count; confirm against the caller
+ .param .u32 dense_matrix_set_param_3 // multiplier applied to ix and second factor of the bound -- presumably the column count; confirm against the caller
+)
+{
+ .reg .pred %p<2>;
+ .reg .b32 %r<13>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<5>;
+
+
+ ld.param.u64 %rd1, [dense_matrix_set_param_0];
+ ld.param.f64 %fd1, [dense_matrix_set_param_1];
+ ld.param.u32 %r2, [dense_matrix_set_param_2];
+ ld.param.u32 %r3, [dense_matrix_set_param_3];
+ mov.u32 %r4, %ctaid.x;
+ mov.u32 %r5, %ntid.x;
+ mov.u32 %r6, %tid.x;
+ mad.lo.s32 %r7, %r5, %r4, %r6; // ix = ntid.x*ctaid.x + tid.x
+ mov.u32 %r8, %ntid.y;
+ mov.u32 %r9, %ctaid.y;
+ mov.u32 %r10, %tid.y;
+ mad.lo.s32 %r11, %r7, %r3, %r10; // ix*param_3 + tid.y
+ mad.lo.s32 %r1, %r8, %r9, %r11; // index = ix*param_3 + (ntid.y*ctaid.y + tid.y)
+ mul.lo.s32 %r12, %r3, %r2; // bound = param_3 * param_2
+ setp.ge.s32 %p1, %r1, %r12; // only the flattened index is checked; ix and iy are not range-checked individually
+ @%p1 bra BB6_2;
+
+ cvta.to.global.u64 %rd2, %rd1;
+ mul.wide.s32 %rd3, %r1, 8;
+ add.s64 %rd4, %rd2, %rd3;
+ st.global.f64 [%rd4], %fd1; // array[index] = param_1
+
+BB6_2:
+ ret;
+}
+
+ // .globl dense_matrix_copy -- kernel: copies one f64 from the param_0 array to the same flattened index (ix*param_3 + iy) of the param_1 array, for ix < param_2 and iy < param_3.
+.visible .entry dense_matrix_copy(
+ .param .u64 dense_matrix_copy_param_0, // address of the source f64 array
+ .param .u64 dense_matrix_copy_param_1, // address of the destination f64 array
+ .param .u32 dense_matrix_copy_param_2, // exclusive bound on ix
+ .param .u32 dense_matrix_copy_param_3 // exclusive bound on iy, also the multiplier applied to ix
+)
+{
+ .reg .pred %p<4>;
+ .reg .b32 %r<12>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<8>;
+
+
+ ld.param.u64 %rd1, [dense_matrix_copy_param_0];
+ ld.param.u64 %rd2, [dense_matrix_copy_param_1];
+ ld.param.u32 %r2, [dense_matrix_copy_param_2];
+ ld.param.u32 %r3, [dense_matrix_copy_param_3];
+ mov.u32 %r4, %ctaid.x;
+ mov.u32 %r5, %ntid.x;
+ mov.u32 %r6, %tid.x;
+ mad.lo.s32 %r7, %r5, %r4, %r6; // ix = ntid.x*ctaid.x + tid.x
+ mov.u32 %r8, %ntid.y;
+ mov.u32 %r9, %ctaid.y;
+ mov.u32 %r10, %tid.y;
+ mad.lo.s32 %r11, %r8, %r9, %r10; // iy = ntid.y*ctaid.y + tid.y
+ mad.lo.s32 %r1, %r7, %r3, %r11; // index = ix*param_3 + iy
+ setp.lt.s32 %p1, %r7, %r2; // ix < param_2 (signed)
+ setp.lt.s32 %p2, %r11, %r3; // iy < param_3 (signed)
+ and.pred %p3, %p1, %p2;
+ @!%p3 bra BB7_2; // out-of-range threads do nothing
+ bra.uni BB7_1;
+
+BB7_1:
+ cvta.to.global.u64 %rd3, %rd1;
+ mul.wide.s32 %rd4, %r1, 8; // the same byte offset is used for source and destination
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd1, [%rd5];
+ cvta.to.global.u64 %rd6, %rd2;
+ add.s64 %rd7, %rd6, %rd4;
+ st.global.f64 [%rd7], %fd1; // dst[index] = src[index]
+
+BB7_2:
+ ret;
+}
+
+ // .globl relu -- kernel: writes max(0.0, in[index]) to out[index] with index = ix*param_3 + iy, for ix < param_2 and iy < param_3.
+.visible .entry relu(
+ .param .u64 relu_param_0, // address of the input f64 array
+ .param .u64 relu_param_1, // address of the output f64 array
+ .param .u32 relu_param_2, // exclusive bound on ix
+ .param .u32 relu_param_3 // exclusive bound on iy, also the multiplier applied to ix
+)
+{
+ .reg .pred %p<4>;
+ .reg .b32 %r<12>;
+ .reg .f64 %fd<4>;
+ .reg .b64 %rd<8>;
+
+
+ ld.param.u64 %rd1, [relu_param_0];
+ ld.param.u64 %rd2, [relu_param_1];
+ ld.param.u32 %r4, [relu_param_2];
+ ld.param.u32 %r3, [relu_param_3];
+ mov.u32 %r5, %ctaid.x;
+ mov.u32 %r6, %ntid.x;
+ mov.u32 %r7, %tid.x;
+ mad.lo.s32 %r1, %r6, %r5, %r7; // ix = ntid.x*ctaid.x + tid.x
+ mov.u32 %r8, %ntid.y;
+ mov.u32 %r9, %ctaid.y;
+ mov.u32 %r10, %tid.y;
+ mad.lo.s32 %r2, %r8, %r9, %r10; // iy = ntid.y*ctaid.y + tid.y
+ setp.lt.s32 %p1, %r1, %r4; // ix < param_2 (signed)
+ setp.lt.s32 %p2, %r2, %r3; // iy < param_3 (signed)
+ and.pred %p3, %p1, %p2;
+ @!%p3 bra BB8_2; // out-of-range threads do nothing
+ bra.uni BB8_1;
+
+BB8_1:
+ cvta.to.global.u64 %rd3, %rd1;
+ mad.lo.s32 %r11, %r1, %r3, %r2; // index = ix*param_3 + iy
+ mul.wide.s32 %rd4, %r11, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd1, [%rd5];
+ mov.f64 %fd2, 0d0000000000000000; // 0.0
+ max.f64 %fd3, %fd2, %fd1; // max(0.0, in[index])
+ cvta.to.global.u64 %rd6, %rd2;
+ add.s64 %rd7, %rd6, %rd4;
+ st.global.f64 [%rd7], %fd3;
+
+BB8_2:
+ ret;
+}
+
+ // .globl compareAndSet -- kernel: for each in-range element a of the param_0 array, writes to the param_1 array: param_6 if |a - param_4| < param_5, else param_7 if a < param_4, else param_8.
+.visible .entry compareAndSet(
+ .param .u64 compareAndSet_param_0, // address of the input f64 array
+ .param .u64 compareAndSet_param_1, // address of the output f64 array
+ .param .u32 compareAndSet_param_2, // exclusive bound on ix
+ .param .u32 compareAndSet_param_3, // exclusive bound on iy, also the multiplier applied to ix
+ .param .f64 compareAndSet_param_4, // value each element is compared against
+ .param .f64 compareAndSet_param_5, // tolerance for the |a - param_4| test
+ .param .f64 compareAndSet_param_6, // output when the element is within tolerance
+ .param .f64 compareAndSet_param_7, // output when the element is less than param_4
+ .param .f64 compareAndSet_param_8 // output in every remaining case
+)
+{
+ .reg .pred %p<6>;
+ .reg .b32 %r<12>;
+ .reg .f64 %fd<9>;
+ .reg .b64 %rd<8>;
+
+
+ ld.param.u64 %rd2, [compareAndSet_param_0];
+ ld.param.u64 %rd3, [compareAndSet_param_1];
+ ld.param.u32 %r2, [compareAndSet_param_2];
+ ld.param.u32 %r3, [compareAndSet_param_3];
+ ld.param.f64 %fd2, [compareAndSet_param_4];
+ ld.param.f64 %fd3, [compareAndSet_param_5];
+ ld.param.f64 %fd4, [compareAndSet_param_6];
+ ld.param.f64 %fd5, [compareAndSet_param_7];
+ ld.param.f64 %fd6, [compareAndSet_param_8];
+ mov.u32 %r4, %ctaid.x;
+ mov.u32 %r5, %ntid.x;
+ mov.u32 %r6, %tid.x;
+ mad.lo.s32 %r7, %r5, %r4, %r6; // ix = ntid.x*ctaid.x + tid.x
+ mov.u32 %r8, %ntid.y;
+ mov.u32 %r9, %ctaid.y;
+ mov.u32 %r10, %tid.y;
+ mad.lo.s32 %r11, %r8, %r9, %r10; // iy = ntid.y*ctaid.y + tid.y
+ mad.lo.s32 %r1, %r7, %r3, %r11; // index = ix*param_3 + iy
+ setp.lt.s32 %p1, %r7, %r2; // ix < param_2 (signed)
+ setp.lt.s32 %p2, %r11, %r3; // iy < param_3 (signed)
+ and.pred %p3, %p1, %p2;
+ @!%p3 bra BB9_6; // out-of-range threads do nothing
+ bra.uni BB9_1;
+
+BB9_1:
+ cvta.to.global.u64 %rd4, %rd2;
+ mul.wide.s32 %rd5, %r1, 8;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.global.f64 %fd1, [%rd6]; // a = in[index]
+ sub.f64 %fd7, %fd1, %fd2;
+ abs.f64 %fd8, %fd7; // |a - param_4|
+ setp.lt.f64 %p4, %fd8, %fd3;
+ cvta.to.global.u64 %rd7, %rd3;
+ add.s64 %rd1, %rd7, %rd5; // %rd1 = &out[index], shared by all three store paths
+ @%p4 bra BB9_5;
+ bra.uni BB9_2;
+
+BB9_5:
+ st.global.f64 [%rd1], %fd4; // within tolerance -> param_6
+ bra.uni BB9_6;
+
+BB9_2:
+ setp.lt.f64 %p5, %fd1, %fd2; // ordered compare: false when either operand is NaN
+ @%p5 bra BB9_4;
+ bra.uni BB9_3;
+
+BB9_4:
+ st.global.f64 [%rd1], %fd5; // a < param_4 -> param_7
+ bra.uni BB9_6;
+
+BB9_3:
+ st.global.f64 [%rd1], %fd6; // otherwise -> param_8
+
+BB9_6:
+ ret;
+}
+
+ // .globl binCellOp -- kernel: C[ix*param_4 + iy] = op(A[ia], B[ib]) on f64 values, where op is chosen by param_7 and the operand indices ia / ib are chosen by param_5 / param_6.
+.visible .entry binCellOp(
+ .param .u64 binCellOp_param_0, // address of operand array A (f64)
+ .param .u64 binCellOp_param_1, // address of operand array B (f64)
+ .param .u64 binCellOp_param_2, // address of result array C (f64)
+ .param .u32 binCellOp_param_3, // exclusive bound on ix
+ .param .u32 binCellOp_param_4, // exclusive bound on iy, also the multiplier applied to ix
+ .param .u32 binCellOp_param_5, // index mode for A: 1 -> ix, 2 -> iy, anything else -> ix*param_4 + iy
+ .param .u32 binCellOp_param_6, // index mode for B, same encoding as param_5
+ .param .u32 binCellOp_param_7 // operation code: 0 add, 1 sub, 2 mul, 3 div, 4 pow, 5 <, 6 <=, 7 >, 8 >=, 9 ==, 10 !=, 11 min, 12 max
+)
+{
+ .reg .pred %p<52>;
+ .reg .b32 %r<56>;
+ .reg .f64 %fd<40>;
+ .reg .b64 %rd<15>;
+
+
+ ld.param.u64 %rd2, [binCellOp_param_0];
+ ld.param.u64 %rd3, [binCellOp_param_1];
+ ld.param.u64 %rd4, [binCellOp_param_2];
+ ld.param.u32 %r14, [binCellOp_param_3];
+ ld.param.u32 %r10, [binCellOp_param_4];
+ ld.param.u32 %r11, [binCellOp_param_5];
+ ld.param.u32 %r12, [binCellOp_param_6];
+ ld.param.u32 %r13, [binCellOp_param_7];
+ mov.u32 %r15, %ntid.x;
+ mov.u32 %r16, %ctaid.x;
+ mov.u32 %r17, %tid.x;
+ mad.lo.s32 %r1, %r15, %r16, %r17; // ix = ntid.x*ctaid.x + tid.x
+ mov.u32 %r18, %ntid.y;
+ mov.u32 %r19, %ctaid.y;
+ mov.u32 %r20, %tid.y;
+ mad.lo.s32 %r2, %r18, %r19, %r20; // iy = ntid.y*ctaid.y + tid.y
+ setp.lt.s32 %p2, %r1, %r14; // ix < param_3 (signed)
+ setp.lt.s32 %p3, %r2, %r10; // iy < param_4 (signed)
+ and.pred %p4, %p2, %p3;
+ @!%p4 bra BB10_55; // out-of-range threads do nothing
+ bra.uni BB10_1;
+
+BB10_1:
+ mad.lo.s32 %r3, %r1, %r10, %r2; // %r3 = ix*param_4 + iy: output index and default operand index
+ setp.eq.s32 %p5, %r11, 1;
+ mov.u32 %r54, %r1; // mode 1: A is indexed by ix
+ @%p5 bra BB10_5;
+
+ setp.ne.s32 %p6, %r11, 2;
+ mov.u32 %r55, %r3; // any other mode: A is indexed by the flattened index
+ @%p6 bra BB10_4;
+
+ mov.u32 %r55, %r2; // mode 2: A is indexed by iy
+
+BB10_4:
+ mov.u32 %r49, %r55;
+ mov.u32 %r4, %r49;
+ mov.u32 %r54, %r4;
+
+BB10_5:
+ mov.u32 %r5, %r54; // %r5 = index into A
+ setp.eq.s32 %p7, %r12, 1;
+ mov.u32 %r52, %r1; // mode 1: B is indexed by ix
+ @%p7 bra BB10_9;
+
+ setp.ne.s32 %p8, %r12, 2;
+ mov.u32 %r53, %r3; // any other mode: B is indexed by the flattened index
+ @%p8 bra BB10_8;
+
+ mov.u32 %r53, %r2; // mode 2: B is indexed by iy
+
+BB10_8:
+ mov.u32 %r52, %r53; // %r52 = index into B
+
+BB10_9:
+ cvta.to.global.u64 %rd5, %rd3;
+ cvta.to.global.u64 %rd6, %rd2;
+ mul.wide.s32 %rd7, %r5, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ ld.global.f64 %fd1, [%rd8]; // x = A[ia]
+ mul.wide.s32 %rd9, %r52, 8;
+ add.s64 %rd10, %rd5, %rd9;
+ ld.global.f64 %fd2, [%rd10]; // y = B[ib]
+ mov.f64 %fd39, 0dC08F380000000000; // default result -999.0, stored when param_7 matches no operation code
+ setp.gt.s32 %p9, %r13, 5; // dispatch on the operation code starts here, as a compare tree
+ @%p9 bra BB10_19;
+
+ setp.gt.s32 %p19, %r13, 2;
+ @%p19 bra BB10_15;
+
+ setp.eq.s32 %p23, %r13, 0;
+ @%p23 bra BB10_53;
+
+ setp.eq.s32 %p24, %r13, 1;
+ @%p24 bra BB10_52;
+ bra.uni BB10_13;
+
+BB10_52:
+ sub.f64 %fd39, %fd1, %fd2; // op 1: x - y
+ bra.uni BB10_54;
+
+BB10_19:
+ setp.gt.s32 %p10, %r13, 8;
+ @%p10 bra BB10_24;
+
+ setp.eq.s32 %p16, %r13, 6;
+ @%p16 bra BB10_34;
+
+ setp.eq.s32 %p17, %r13, 7;
+ @%p17 bra BB10_33;
+ bra.uni BB10_22;
+
+BB10_33:
+ setp.gt.f64 %p29, %fd1, %fd2;
+ selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29; // op 7: x > y ? 1.0 : 0.0
+ bra.uni BB10_54;
+
+BB10_15:
+ setp.eq.s32 %p20, %r13, 3;
+ @%p20 bra BB10_51;
+
+ setp.eq.s32 %p21, %r13, 4;
+ @%p21 bra BB10_35;
+ bra.uni BB10_17;
+
+BB10_35:
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r8}, %fd1; // op 4 (pow): %r8 = upper 32 bits of x
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r9}, %fd2; // %r9 = upper 32 bits of y
+ }
+ bfe.u32 %r21, %r9, 20, 11; // 11-bit exponent field of y
+ add.s32 %r22, %r21, -1012;
+ mov.b64 %rd11, %fd2;
+ shl.b64 %rd1, %rd11, %r22;
+ setp.eq.s64 %p32, %rd1, -9223372036854775808; // NOTE(review): appears to test whether y is an odd integer (units bit shifted into bit 63, no fraction bits left) -- confirm against the CUDA math library
+ abs.f64 %fd11, %fd1; // |x|
+ // Callseq Start 1
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.f64 [param0+0], %fd11;
+ .param .b64 param1;
+ st.param.f64 [param1+0], %fd2;
+ .param .b64 retval0;
+ call.uni (retval0),
+ __internal_accurate_pow,
+ (
+ param0,
+ param1
+ );
+ ld.param.f64 %fd38, [retval0+0]; // %fd38 = __internal_accurate_pow(|x|, y); the callee is defined outside this view
+
+ //{
+ }// Callseq End 1
+ setp.lt.s32 %p33, %r8, 0; // sign bit of x is set
+ and.pred %p1, %p33, %p32;
+ @!%p1 bra BB10_37;
+ bra.uni BB10_36;
+
+BB10_36:
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r23}, %fd38;
+ }
+ xor.b32 %r24, %r23, -2147483648; // flip the sign bit of the pow result
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r25, %temp}, %fd38;
+ }
+ mov.b64 %fd38, {%r25, %r24};
+
+BB10_37:
+ mov.f64 %fd37, %fd38;
+ setp.eq.f64 %p34, %fd1, 0d0000000000000000; // x == 0 gets its own result construction
+ @%p34 bra BB10_40;
+ bra.uni BB10_38;
+
+BB10_40:
+ selp.b32 %r26, %r8, 0, %p32;
+ or.b32 %r27, %r26, 2146435072; // 2146435072 = 0x7FF00000: all-ones exponent field in the upper word
+ setp.lt.s32 %p38, %r9, 0; // sign bit of y is set
+ selp.b32 %r28, %r27, %r26, %p38;
+ mov.u32 %r29, 0;
+ mov.b64 %fd37, {%r29, %r28}; // result assembled from a zero low word and the selected upper word
+ bra.uni BB10_41;
+
+BB10_24:
+ setp.gt.s32 %p11, %r13, 10;
+ @%p11 bra BB10_28;
+
+ setp.eq.s32 %p14, %r13, 9;
+ @%p14 bra BB10_32;
+ bra.uni BB10_26;
+
+BB10_32:
+ setp.eq.f64 %p27, %fd1, %fd2;
+ selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27; // op 9: x == y ? 1.0 : 0.0
+ bra.uni BB10_54;
+
+BB10_28:
+ setp.eq.s32 %p12, %r13, 11;
+ @%p12 bra BB10_31;
+ bra.uni BB10_29;
+
+BB10_31:
+ min.f64 %fd39, %fd1, %fd2; // op 11: min(x, y)
+ bra.uni BB10_54;
+
+BB10_53:
+ add.f64 %fd39, %fd1, %fd2; // op 0: x + y
+ bra.uni BB10_54;
+
+BB10_13:
+ setp.eq.s32 %p25, %r13, 2;
+ @%p25 bra BB10_14;
+ bra.uni BB10_54; // no matching code: keep the default
+
+BB10_14:
+ mul.f64 %fd39, %fd1, %fd2; // op 2: x * y
+ bra.uni BB10_54;
+
+BB10_34:
+ setp.le.f64 %p30, %fd1, %fd2;
+ selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30; // op 6: x <= y ? 1.0 : 0.0
+ bra.uni BB10_54;
+
+BB10_22:
+ setp.eq.s32 %p18, %r13, 8;
+ @%p18 bra BB10_23;
+ bra.uni BB10_54; // no matching code: keep the default
+
+BB10_23:
+ setp.ge.f64 %p28, %fd1, %fd2;
+ selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28; // op 8: x >= y ? 1.0 : 0.0
+ bra.uni BB10_54;
+
+BB10_51:
+ div.rn.f64 %fd39, %fd1, %fd2; // op 3: x / y, round to nearest even
+ bra.uni BB10_54;
+
+BB10_17:
+ setp.eq.s32 %p22, %r13, 5;
+ @%p22 bra BB10_18;
+ bra.uni BB10_54; // no matching code: keep the default
+
+BB10_18:
+ setp.lt.f64 %p31, %fd1, %fd2;
+ selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31; // op 5: x < y ? 1.0 : 0.0
+ bra.uni BB10_54;
+
+BB10_26:
+ setp.eq.s32 %p15, %r13, 10;
+ @%p15 bra BB10_27;
+ bra.uni BB10_54; // no matching code: keep the default
+
+BB10_27:
+ setp.neu.f64 %p26, %fd1, %fd2; // neu: also true when either operand is NaN
+ selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26; // op 10: x != y ? 1.0 : 0.0
+ bra.uni BB10_54;
+
+BB10_29:
+ setp.ne.s32 %p13, %r13, 12;
+ @%p13 bra BB10_54; // codes above 12: keep the default
+
+ max.f64 %fd39, %fd1, %fd2; // op 12: max(x, y)
+ bra.uni BB10_54;
+
+BB10_38:
+ setp.gt.s32 %p35, %r8, -1; // pow, x != 0: sign bit of x clear -> keep the pow result
+ @%p35 bra BB10_41;
+
+ cvt.rzi.f64.f64 %fd29, %fd2; // y rounded toward zero to an integral value
+ setp.neu.f64 %p36, %fd29, %fd2;
+ selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p36; // sign bit of x set and y not integral: result becomes NaN (0xFFF8000000000000)
+
+BB10_41:
+ mov.f64 %fd17, %fd37;
+ add.f64 %fd18, %fd1, %fd2; // x + y, used to detect and propagate Inf/NaN operands
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r30}, %fd18;
+ }
+ and.b32 %r31, %r30, 2146435072;
+ setp.ne.s32 %p39, %r31, 2146435072; // exponent field of x + y not all ones: no special handling
+ mov.f64 %fd36, %fd17;
+ @%p39 bra BB10_50;
+
+ setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000; // true when |x| is NaN (gtu is true on unordered)
+ mov.f64 %fd36, %fd18;
+ @%p40 bra BB10_50;
+
+ abs.f64 %fd30, %fd2;
+ setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000; // true when |y| is NaN
+ mov.f64 %fd35, %fd18;
+ mov.f64 %fd36, %fd35;
+ @%p41 bra BB10_50;
+
+ and.b32 %r32, %r9, 2147483647;
+ setp.ne.s32 %p42, %r32, 2146435072;
+ @%p42 bra BB10_46;
+
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r33, %temp}, %fd2;
+ }
+ setp.eq.s32 %p43, %r33, 0; // upper word 0x7FF00000 with zero low word: y is +/-Inf
+ @%p43 bra BB10_49;
+
+BB10_46:
+ and.b32 %r34, %r8, 2147483647;
+ setp.ne.s32 %p44, %r34, 2146435072;
+ mov.f64 %fd33, %fd17;
+ mov.f64 %fd36, %fd33;
+ @%p44 bra BB10_50;
+
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r35, %temp}, %fd1;
+ }
+ setp.ne.s32 %p45, %r35, 0; // same bit test applied to x: continue only when x is +/-Inf
+ mov.f64 %fd36, %fd17;
+ @%p45 bra BB10_50;
+
+ shr.s32 %r36, %r9, 31; // all ones when the sign bit of y is set, else zero
+ and.b32 %r37, %r36, -2146435072;
+ add.s32 %r38, %r37, 2146435072; // upper word 0 when y is negative, 0x7FF00000 otherwise
+ or.b32 %r39, %r38, -2147483648;
+ selp.b32 %r40, %r39, %r38, %p1; // set the sign bit when %p1 (x negative and the odd-integer test) holds
+ mov.u32 %r41, 0;
+ mov.b64 %fd36, {%r41, %r40};
+ bra.uni BB10_50;
+
+BB10_49:
+ setp.gt.f64 %p46, %fd11, 0d3FF0000000000000; // |x| > 1.0
+ selp.b32 %r42, 2146435072, 0, %p46;
+ xor.b32 %r43, %r42, 2146435072;
+ setp.lt.s32 %p47, %r9, 0;
+ selp.b32 %r44, %r43, %r42, %p47; // the choice is inverted when the sign bit of y is set
+ setp.eq.f64 %p48, %fd1, 0dBFF0000000000000; // x == -1.0
+ selp.b32 %r45, 1072693248, %r44, %p48; // 1072693248 = 0x3FF00000, the upper word of 1.0
+ mov.u32 %r46, 0;
+ mov.b64 %fd36, {%r46, %r45};
+
+BB10_50:
+ setp.eq.f64 %p49, %fd2, 0d0000000000000000;
+ setp.eq.f64 %p50, %fd1, 0d3FF0000000000000;
+ or.pred %p51, %p50, %p49;
+ selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p51; // y == 0 or x == 1.0 forces the pow result to 1.0
+
+BB10_54:
+ cvta.to.global.u64 %rd12, %rd4;
+ mul.wide.s32 %rd13, %r3, 8;
+ add.s64 %rd14, %rd12, %rd13;
+ st.global.f64 [%rd14], %fd39; // C[ix*param_4 + iy] = result
+
+BB10_55:
+ ret;
 }
- // .globl binCellOp
-.visible .entry binCellOp(
- .param .u64 binCellOp_param_0,
- .param .u64 binCellOp_param_1,
- .param .u64 binCellOp_param_2,
- .param .u32 binCellOp_param_3,
- .param .u32 binCellOp_param_4,
- .param .u32 binCellOp_param_5,
- .param .u32 binCellOp_param_6,
- .param .u32 binCellOp_param_7
+ // .globl binCellScalarOp
+.visible .entry binCellScalarOp( // kernel: combines one loaded f64 element with the scalar param_1 and stores one f64 result
+ .param .u64 binCellScalarOp_param_0, // pointer the f64 input element is loaded from
+ .param .f64 binCellScalarOp_param_1, // f64 scalar operand
+ .param .u64 binCellScalarOp_param_2, // pointer the f64 result is stored to
+ .param .u32 binCellScalarOp_param_3, // multiplied with param_4 to form the index bound
+ .param .u32 binCellScalarOp_param_4, // multiplier applied to the x thread index in the flat index
+ .param .u32 binCellScalarOp_param_5, // operation selector; values 0..12 are handled below
+ .param .u32 binCellScalarOp_param_6 // 0: element OP scalar; nonzero: scalar OP element
 )
 {
- .reg .pred %p<52>;
- .reg .b32 %r<56>;
- .reg .f64 %fd<40>;
- .reg .b64 %rd<15>;
+ .reg .pred %p<89>;
+ .reg .b32 %r<71>;
+ .reg .f64 %fd<77>;
+ .reg .b64 %rd<12>;
- ld.param.u64 %rd2, [binCellOp_param_0];
- ld.param.u64 %rd3, [binCellOp_param_1];
- ld.param.u64 %rd4, [binCellOp_param_2];
- ld.param.u32 %r14, [binCellOp_param_3];
- ld.param.u32 %r10, [binCellOp_param_4];
- ld.param.u32 %r11, [binCellOp_param_5];
- ld.param.u32 %r12, [binCellOp_param_6];
- ld.param.u32 %r13, [binCellOp_param_7];
- mov.u32 %r15, %ntid.x;
- mov.u32 %r16, %ctaid.x;
- mov.u32 %r17, %tid.x;
- mad.lo.s32 %r1, %r15, %r16, %r17;
- mov.u32 %r18, %ntid.y;
- mov.u32 %r19, %ctaid.y;
- mov.u32 %r20, %tid.y;
- mad.lo.s32 %r2, %r18, %r19, %r20;
- setp.lt.s32 %p2, %r1, %r14;
- setp.lt.s32 %p3, %r2, %r10;
- and.pred %p4, %p2, %p3;
- @!%p4 bra BB7_55;
- bra.uni BB7_1;
+ ld.param.u64 %rd4, [binCellScalarOp_param_0];
+ ld.param.f64 %fd52, [binCellScalarOp_param_1]; // %fd52 = scalar operand
+ ld.param.u64 %rd5, [binCellScalarOp_param_2];
+ ld.param.u32 %r8, [binCellScalarOp_param_3];
+ ld.param.u32 %r9, [binCellScalarOp_param_4];
+ ld.param.u32 %r6, [binCellScalarOp_param_5]; // %r6 = operation selector
+ ld.param.u32 %r7, [binCellScalarOp_param_6]; // %r7 = operand-order flag
+ mov.u32 %r10, %ctaid.x;
+ mov.u32 %r11, %ntid.x;
+ mov.u32 %r12, %tid.x;
+ mad.lo.s32 %r13, %r11, %r10, %r12; // %r13 = ntid.x * ctaid.x + tid.x
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r17, %r13, %r9, %r16; // %r17 = %r13 * param_4 + tid.y
+ mad.lo.s32 %r1, %r14, %r15, %r17; // %r1 = flat index = %r17 + ntid.y * ctaid.y
+ mul.lo.s32 %r18, %r9, %r8; // bound = param_4 * param_3
+ setp.ge.s32 %p3, %r1, %r18;
+ @%p3 bra BB11_92; // flat index >= bound: return without storing
+
+ cvta.to.global.u64 %rd6, %rd5;
+ cvta.to.global.u64 %rd7, %rd4;
+ mul.wide.s32 %rd8, %r1, 8; // byte offset = index * 8 (f64)
+ add.s64 %rd9, %rd7, %rd8;
+ ld.global.f64 %fd1, [%rd9]; // %fd1 = input element
+ add.s64 %rd1, %rd6, %rd8; // %rd1 = address of the output element (same offset)
+ setp.eq.s32 %p4, %r7, 0;
+ @%p4 bra BB11_47; // param_6 == 0: "element OP scalar" path (result in %fd76)
+
+ setp.eq.s32 %p5, %r6, 0; // param_6 != 0: "scalar OP element" path (result in %fd67)
+ @%p5 bra BB11_45;
+
+ mov.f64 %fd67, 0dC08F380000000000; // default result -999.0, stored when the selector matches no case
+ setp.gt.s32 %p6, %r6, 6;
+ @%p6 bra BB11_13;
+
+ setp.gt.s32 %p14, %r6, 3;
+ @%p14 bra BB11_9;
+
+ setp.eq.s32 %p18, %r6, 1;
+ @%p18 bra BB11_44;
+
+ setp.eq.s32 %p19, %r6, 2;
+ @%p19 bra BB11_43;
+ bra.uni BB11_7;
+
+BB11_43:
+ mul.f64 %fd67, %fd1, %fd52; // selector 2: product
+ bra.uni BB11_46;
+
+BB11_47:
+ setp.eq.s32 %p47, %r6, 0;
+ @%p47 bra BB11_90;
+
+ mov.f64 %fd76, 0dC08F380000000000; // default result -999.0, stored when the selector matches no case
+ setp.gt.s32 %p48, %r6, 6;
+ @%p48 bra BB11_58;
+
+ setp.gt.s32 %p56, %r6, 3;
+ @%p56 bra BB11_54;
+
+ setp.eq.s32 %p60, %r6, 1;
+ @%p60 bra BB11_89;
+
+ setp.eq.s32 %p61, %r6, 2;
+ @%p61 bra BB11_88;
+ bra.uni BB11_52;
+
+BB11_88:
+ mul.f64 %fd76, %fd1, %fd52; // selector 2: product
+ bra.uni BB11_91;
+
+BB11_45:
+ add.f64 %fd67, %fd1, %fd52; // selector 0: sum
+
+BB11_46:
+ st.global.f64 [%rd1], %fd67; // common store for the "scalar OP element" path
+ bra.uni BB11_92;
+
+BB11_13:
+ setp.gt.s32 %p7, %r6, 9;
+ @%p7 bra BB11_18;
+
+ setp.eq.s32 %p11, %r6, 7;
+ @%p11 bra BB11_25;
+
+ setp.eq.s32 %p12, %r6, 8;
+ @%p12 bra BB11_24;
+ bra.uni BB11_16;
+
+BB11_24:
+ setp.le.f64 %p23, %fd1, %fd52; // selector 8: 1.0 if element <= scalar, else 0.0
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
+ bra.uni BB11_46;
+
+BB11_90:
+ add.f64 %fd76, %fd1, %fd52; // selector 0: sum
+
+BB11_91:
+ st.global.f64 [%rd1], %fd76; // common store for the "element OP scalar" path
+
+BB11_92:
+ ret;
+
+BB11_58:
+ setp.gt.s32 %p49, %r6, 9;
+ @%p49 bra BB11_63;
+
+ setp.eq.s32 %p53, %r6, 7;
+ @%p53 bra BB11_70;
+
+ setp.eq.s32 %p54, %r6, 8;
+ @%p54 bra BB11_69;
+ bra.uni BB11_61;
+
+BB11_69:
+ setp.ge.f64 %p65, %fd1, %fd52; // selector 8: 1.0 if element >= scalar, else 0.0
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
+ bra.uni BB11_91;
+
+BB11_9:
+ setp.eq.s32 %p15, %r6, 4;
+ @%p15 bra BB11_27;
+
+ setp.eq.s32 %p16, %r6, 5;
+ @%p16 bra BB11_26;
+ bra.uni BB11_11;
+
+BB11_26:
+ setp.gt.f64 %p26, %fd1, %fd52; // selector 5: 1.0 if element > scalar, else 0.0
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
+ bra.uni BB11_46;
+
+BB11_18:
+ setp.eq.s32 %p8, %r6, 10;
+ @%p8 bra BB11_23;
+
+ setp.eq.s32 %p9, %r6, 11;
+ @%p9 bra BB11_22;
+ bra.uni BB11_20;
+
+BB11_22:
+ min.f64 %fd67, %fd52, %fd1; // selector 11: minimum
+ bra.uni BB11_46;
+
+BB11_54:
+ setp.eq.s32 %p57, %r6, 4;
+ @%p57 bra BB11_72;
+
+ setp.eq.s32 %p58, %r6, 5;
+ @%p58 bra BB11_71;
+ bra.uni BB11_56;
+
+BB11_71:
+ setp.lt.f64 %p68, %fd1, %fd52; // selector 5: 1.0 if element < scalar, else 0.0
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68;
+ bra.uni BB11_91;
-BB7_1:
- mad.lo.s32 %r3, %r1, %r10, %r2;
- setp.eq.s32 %p5, %r11, 1;
- mov.u32 %r54, %r1;
- @%p5 bra BB7_5;
+BB11_63:
+ setp.eq.s32 %p50, %r6, 10;
+ @%p50 bra BB11_68;
- setp.ne.s32 %p6, %r11, 2;
- mov.u32 %r55, %r3;
- @%p6 bra BB7_4;
+ setp.eq.s32 %p51, %r6, 11;
+ @%p51 bra BB11_67;
+ bra.uni BB11_65;
- mov.u32 %r55, %r2;
+BB11_67:
+ min.f64 %fd76, %fd1, %fd52; // selector 11: minimum
+ bra.uni BB11_91;
-BB7_4:
- mov.u32 %r49, %r55;
- mov.u32 %r4, %r49;
- mov.u32 %r54, %r4;
+BB11_44:
+ sub.f64 %fd67, %fd52, %fd1; // selector 1: scalar - element
+ bra.uni BB11_46;
-BB7_5:
- mov.u32 %r5, %r54;
- setp.eq.s32 %p7, %r12, 1;
- mov.u32 %r52, %r1;
- @%p7 bra BB7_9;
+BB11_7:
+ setp.eq.s32 %p20, %r6, 3;
+ @%p20 bra BB11_8;
+ bra.uni BB11_46;
- setp.ne.s32 %p8, %r12, 2;
- mov.u32 %r53, %r3;
- @%p8 bra BB7_8;
+BB11_8:
+ div.rn.f64 %fd67, %fd52, %fd1; // selector 3: scalar / element
+ bra.uni BB11_46;
- mov.u32 %r53, %r2;
+BB11_25:
+ setp.lt.f64 %p24, %fd1, %fd52; // selector 7: 1.0 if element < scalar, else 0.0
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
+ bra.uni BB11_46;
-BB7_8:
- mov.u32 %r52, %r53;
+BB11_16:
+ setp.eq.s32 %p13, %r6, 9;
+ @%p13 bra BB11_17;
+ bra.uni BB11_46;
-BB7_9:
- cvta.to.global.u64 %rd5, %rd3;
- cvta.to.global.u64 %rd6, %rd2;
- mul.wide.s32 %rd7, %r5, 8;
- add.s64 %rd8, %rd6, %rd7;
- ld.global.f64 %fd1, [%rd8];
- mul.wide.s32 %rd9, %r52, 8;
- add.s64 %rd10, %rd5, %rd9;
- ld.global.f64 %fd2, [%rd10];
- mov.f64 %fd39, 0dC08F380000000000;
- setp.gt.s32 %p9, %r13, 5;
- @%p9 bra BB7_19;
+BB11_17:
+ setp.eq.f64 %p22, %fd1, %fd52; // selector 9: 1.0 if element == scalar, else 0.0
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
+ bra.uni BB11_46;
- setp.gt.s32 %p19, %r13, 2;
- @%p19 bra BB7_15;
+BB11_27:
+ { // selector 4: power with the scalar as first argument and the element as second
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r2}, %fd52; // %r2 = high 32 bits of the scalar
+ }
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r3}, %fd1; // %r3 = high 32 bits of the element
+ }
+ bfe.u32 %r19, %r3, 20, 11; // 11-bit field at bit 20 of the element's high word
+ add.s32 %r20, %r19, -1012;
+ mov.b64 %rd10, %fd1;
+ shl.b64 %rd2, %rd10, %r20;
+ setp.eq.s64 %p27, %rd2, -9223372036854775808; // NOTE(review): looks like an odd-integer test on the exponent operand - confirm
+ abs.f64 %fd10, %fd52;
+ // Callseq Start 2
+ {
+ .reg .b32 temp_param_reg;
+ // <end>}
+ .param .b64 param0;
+ st.param.f64 [param0+0], %fd10; // first argument: |scalar|
+ .param .b64 param1;
+ st.param.f64 [param1+0], %fd1; // second argument: element
+ .param .b64 retval0;
+ call.uni (retval0),
+ __internal_accurate_pow,
+ (
+ param0,
+ param1
+ );
+ ld.param.f64 %fd66, [retval0+0];
+
+ //{
+ }// Callseq End 2
+ setp.lt.s32 %p28, %r2, 0;
+ and.pred %p1, %p28, %p27; // %p1 = scalar has sign bit set AND %p27 holds
+ @!%p1 bra BB11_29;
+ bra.uni BB11_28;
- setp.eq.s32 %p23, %r13, 0;
- @%p23 bra BB7_53;
+BB11_28:
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r21}, %fd66;
+ }
+ xor.b32 %r22, %r21, -2147483648; // flip the sign bit of the call result
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r23, %temp}, %fd66;
+ }
+ mov.b64 %fd66, {%r23, %r22};
- setp.eq.s32 %p24, %r13, 1;
- @%p24 bra BB7_52;
- bra.uni BB7_13;
+BB11_29:
+ mov.f64 %fd65, %fd66;
+ setp.eq.f64 %p29, %fd52, 0d0000000000000000;
+ @%p29 bra BB11_32; // scalar == 0.0 is handled separately
+ bra.uni BB11_30;
-BB7_52:
- sub.f64 %fd39, %fd1, %fd2;
- bra.uni BB7_54;
+BB11_32:
+ selp.b32 %r24, %r2, 0, %p27;
+ or.b32 %r25, %r24, 2146435072; // 2146435072 = 0x7FF00000 (all exponent bits of the high word)
+ setp.lt.s32 %p33, %r3, 0;
+ selp.b32 %r26, %r25, %r24, %p33;
+ mov.u32 %r27, 0;
+ mov.b64 %fd65, {%r27, %r26};
+ bra.uni BB11_33;
-BB7_19:
- setp.gt.s32 %p10, %r13, 8;
- @%p10 bra BB7_24;
+BB11_11:
+ setp.eq.s32 %p17, %r6, 6;
+ @%p17 bra BB11_12;
+ bra.uni BB11_46;
- setp.eq.s32 %p16, %r13, 6;
- @%p16 bra BB7_34;
+BB11_12:
+ setp.ge.f64 %p25, %fd1, %fd52; // selector 6: 1.0 if element >= scalar, else 0.0
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
+ bra.uni BB11_46;
- setp.eq.s32 %p17, %r13, 7;
- @%p17 bra BB7_33;
- bra.uni BB7_22;
+BB11_23:
+ setp.neu.f64 %p21, %fd1, %fd52; // selector 10: 1.0 if element != scalar (or unordered), else 0.0
+ selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p21;
+ bra.uni BB11_46;
-BB7_33:
- setp.gt.f64 %p29, %fd1, %fd2;
- selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p29;
- bra.uni BB7_54;
+BB11_20:
+ setp.ne.s32 %p10, %r6, 12;
+ @%p10 bra BB11_46; // selector is none of 0..12: store the -999.0 default
-BB7_15:
- setp.eq.s32 %p20, %r13, 3;
- @%p20 bra BB7_51;
+ max.f64 %fd67, %fd52, %fd1; // selector 12: maximum
+ bra.uni BB11_46;
- setp.eq.s32 %p21, %r13, 4;
- @%p21 bra BB7_35;
- bra.uni BB7_17;
+BB11_89:
+ sub.f64 %fd76, %fd1, %fd52; // selector 1: element - scalar
+ bra.uni BB11_91;
+
+BB11_52:
+ setp.eq.s32 %p62, %r6, 3;
+ @%p62 bra BB11_53;
+ bra.uni BB11_91;
+
+BB11_53:
+ div.rn.f64 %fd76, %fd1, %fd52; // selector 3: element / scalar
+ bra.uni BB11_91;
+
+BB11_70:
+ setp.gt.f64 %p66, %fd1, %fd52; // selector 7: 1.0 if element > scalar, else 0.0
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
+ bra.uni BB11_91;
+
+BB11_61:
+ setp.eq.s32 %p55, %r6, 9;
+ @%p55 bra BB11_62;
+ bra.uni BB11_91;
+
+BB11_62:
+ setp.eq.f64 %p64, %fd1, %fd52; // selector 9: 1.0 if element == scalar, else 0.0
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p64;
+ bra.uni BB11_91;
-BB7_35:
+BB11_72:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r8}, %fd1;
+ mov.b64 {%temp, %r4}, %fd1; // selector 4: power with the element as first argument; %r4 = high 32 bits of the element
}
{
.reg .b32 %temp;
- mov.b64 {%temp, %r9}, %fd2;
+ mov.b64 {%temp, %r5}, %fd52; // %r5 = high 32 bits of the scalar
}
- bfe.u32 %r21, %r9, 20, 11;
- add.s32 %r22, %r21, -1012;
- mov.b64 %rd11, %fd2;
- shl.b64 %rd1, %rd11, %r22;
- setp.eq.s64 %p32, %rd1, -9223372036854775808;
- abs.f64 %fd11, %fd1;
- // Callseq Start 1
+ bfe.u32 %r45, %r5, 20, 11;
+ add.s32 %r46, %r45, -1012;
+ mov.b64 %rd11, %fd52;
+ shl.b64 %rd3, %rd11, %r46;
+ setp.eq.s64 %p69, %rd3, -9223372036854775808; // NOTE(review): looks like an odd-integer test on the exponent operand - confirm
+ abs.f64 %fd35, %fd1;
+ // Callseq Start 3
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
- st.param.f64 [param0+0], %fd11;
+ st.param.f64 [param0+0], %fd35; // first argument: |element|
.param .b64 param1;
- st.param.f64 [param1+0], %fd2;
+ st.param.f64 [param1+0], %fd52; // second argument: scalar
.param .b64 retval0;
call.uni (retval0),
__internal_accurate_pow,
@@ -727,838 +1831,671 @@ BB7_35:
param0,
param1
);
- ld.param.f64 %fd38, [retval0+0];
+ ld.param.f64 %fd75, [retval0+0];
//{
- }// Callseq End 1
- setp.lt.s32 %p33, %r8, 0;
- and.pred %p1, %p33, %p32;
- @!%p1 bra BB7_37;
- bra.uni BB7_36;
+ }// Callseq End 3
+ setp.lt.s32 %p70, %r4, 0;
+ and.pred %p2, %p70, %p69; // %p2 = element has sign bit set AND %p69 holds
+ @!%p2 bra BB11_74;
+ bra.uni BB11_73;
-BB7_36:
+BB11_73:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r23}, %fd38;
+ mov.b64 {%temp, %r47}, %fd75;
}
- xor.b32 %r24, %r23, -2147483648;
+ xor.b32 %r48, %r47, -2147483648; // flip the sign bit of the call result
{
.reg .b32 %temp;
- mov.b64 {%r25, %temp}, %fd38;
+ mov.b64 {%r49, %temp}, %fd75;
}
- mov.b64 %fd38, {%r25, %r24};
-
-BB7_37:
- mov.f64 %fd37, %fd38;
- setp.eq.f64 %p34, %fd1, 0d0000000000000000;
- @%p34 bra BB7_40;
- bra.uni BB7_38;
-
-BB7_40:
- selp.b32 %r26, %r8, 0, %p32;
- or.b32 %r27, %r26, 2146435072;
- setp.lt.s32 %p38, %r9, 0;
- selp.b32 %r28, %r27, %r26, %p38;
- mov.u32 %r29, 0;
- mov.b64 %fd37, {%r29, %r28};
- bra.uni BB7_41;
-
-BB7_24:
- setp.gt.s32 %p11, %r13, 10;
- @%p11 bra BB7_28;
+ mov.b64 %fd75, {%r49, %r48};
- setp.eq.s32 %p14, %r13, 9;
- @%p14 bra BB7_32;
- bra.uni BB7_26;
+BB11_74:
+ mov.f64 %fd74, %fd75;
+ setp.eq.f64 %p71, %fd1, 0d0000000000000000;
+ @%p71 bra BB11_77; // element == 0.0 is handled separately
+ bra.uni BB11_75;
-BB7_32:
- setp.eq.f64 %p27, %fd1, %fd2;
- selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p27;
- bra.uni BB7_54;
+BB11_77:
+ selp.b32 %r50, %r4, 0, %p69;
+ or.b32 %r51, %r50, 2146435072; // 2146435072 = 0x7FF00000 (all exponent bits of the high word)
+ setp.lt.s32 %p75, %r5, 0;
+ selp.b32 %r52, %r51, %r50, %p75;
+ mov.u32 %r53, 0;
+ mov.b64 %fd74, {%r53, %r52};
+ bra.uni BB11_78;
-BB7_28:
- setp.eq.s32 %p12, %r13, 11;
- @%p12 bra BB7_31;
- bra.uni BB7_29;
+BB11_56:
+ setp.eq.s32 %p59, %r6, 6;
+ @%p59 bra BB11_57;
+ bra.uni BB11_91;
-BB7_31:
- min.f64 %fd39, %fd1, %fd2;
- bra.uni BB7_54;
+BB11_57:
+ setp.le.f64 %p67, %fd1, %fd52; // selector 6: 1.0 if element <= scalar, else 0.0
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67;
+ bra.uni BB11_91;
-BB7_53:
- add.f64 %fd39, %fd1, %fd2;
- bra.uni BB7_54;
+BB11_68:
+ setp.neu.f64 %p63, %fd1, %fd52; // selector 10: 1.0 if element != scalar (or unordered), else 0.0
+ selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p63;
+ bra.uni BB11_91;
-BB7_13:
- setp.eq.s32 %p25, %r13, 2;
- @%p25 bra BB7_14;
- bra.uni BB7_54;
+BB11_65:
+ setp.ne.s32 %p52, %r6, 12;
+ @%p52 bra BB11_91; // selector is none of 0..12: store the -999.0 default
-BB7_14:
- mul.f64 %fd39, %fd1, %fd2;
- bra.uni BB7_54;
+ max.f64 %fd76, %fd1, %fd52; // selector 12: maximum
+ bra.uni BB11_91;
-BB7_34:
- setp.le.f64 %p30, %fd1, %fd2;
- selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p30;
- bra.uni BB7_54;
+BB11_30:
+ setp.gt.s32 %p30, %r2, -1;
+ @%p30 bra BB11_33;
-BB7_22:
- setp.eq.s32 %p18, %r13, 8;
- @%p18 bra BB7_23;
- bra.uni BB7_54;
+ cvt.rzi.f64.f64 %fd54, %fd1; // scalar has its sign bit set: compare the element with its truncation
+ setp.neu.f64 %p31, %fd54, %fd1;
+ selp.f64 %fd65, 0dFFF8000000000000, %fd65, %p31; // element is not an integer: result becomes NaN
-BB7_23:
- setp.ge.f64 %p28, %fd1, %fd2;
- selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p28;
- bra.uni BB7_54;
+BB11_33:
+ mov.f64 %fd16, %fd65;
+ add.f64 %fd17, %fd1, %fd52;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%temp, %r28}, %fd17;
+ }
+ and.b32 %r29, %r28, 2146435072;
+ setp.ne.s32 %p34, %r29, 2146435072; // exponent bits of (element + scalar) not all ones: keep %fd16
+ mov.f64 %fd64, %fd16;
+ @%p34 bra BB11_42;
-BB7_51:
- div.rn.f64 %fd39, %fd1, %fd2;
- bra.uni BB7_54;
+ setp.gtu.f64 %p35, %fd10, 0d7FF0000000000000; // true when |scalar| is NaN: result is element + scalar
+ mov.f64 %fd64, %fd17;
+ @%p35 bra BB11_42;
-BB7_17:
- setp.eq.s32 %p22, %r13, 5;
- @%p22 bra BB7_18;
- bra.uni BB7_54;
+ abs.f64 %fd55, %fd1;
+ setp.gtu.f64 %p36, %fd55, 0d7FF0000000000000; // true when |element| is NaN: result is element + scalar
+ mov.f64 %fd63, %fd17;
+ mov.f64 %fd64, %fd63;
+ @%p36 bra BB11_42;
-BB7_18:
- setp.lt.f64 %p31, %fd1, %fd2;
- selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p31;
- bra.uni BB7_54;
+ and.b32 %r30, %r3, 2147483647;
+ setp.ne.s32 %p37, %r30, 2146435072;
+ @%p37 bra BB11_38;
-BB7_26:
- setp.eq.s32 %p15, %r13, 10;
- @%p15 bra BB7_27;
- bra.uni BB7_54;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r31, %temp}, %fd1;
+ }
+ setp.eq.s32 %p38, %r31, 0;
+ @%p38 bra BB11_41; // element bit pattern is +/-infinity
-BB7_27:
- setp.neu.f64 %p26, %fd1, %fd2;
- selp.f64 %fd39, 0d3FF0000000000000, 0d0000000000000000, %p26;
- bra.uni BB7_54;
+BB11_38:
+ and.b32 %r32, %r2, 2147483647;
+ setp.ne.s32 %p39, %r32, 2146435072;
+ mov.f64 %fd61, %fd16;
+ mov.f64 %fd64, %fd61;
+ @%p39 bra BB11_42;
-BB7_29:
- setp.ne.s32 %p13, %r13, 12;
- @%p13 bra BB7_54;
+ {
+ .reg .b32 %temp;
+ mov.b64 {%r33, %temp}, %fd52;
+ }
+ setp.ne.s32 %p40, %r33, 0;
+ mov.f64 %fd64, %fd16;
+ @%p40 bra BB11_42;
- max.f64 %fd39, %fd1, %fd2;
- bra.uni BB7_54;
+ shr.s32 %r34, %r3, 31; // scalar bit pattern is +/-infinity: build the result high word from the element's sign
+ and.b32 %r35, %r34, -2146435072;
+ add.s32 %r36, %r35, 2146435072;
+ or.b32 %r37, %r36, -2147483648;
+ selp.b32 %r38, %r37, %r36, %p1;
+ mov.u32 %r39, 0;
+ mov.b64 %fd64, {%r39, %r38};
+ bra.uni BB11_42;
-BB7_38:
- setp.gt.s32 %p35, %r8, -1;
- @%p35 bra BB7_41;
+BB11_75:
+ setp.gt.s32 %p72, %r4, -1;
+ @%p72 bra BB11_78;
- cvt.rzi.f64.f64 %fd29, %fd2;
- setp.neu.f64 %p36, %fd29, %fd2;
- selp.f64 %fd37, 0dFFF8000000000000, %fd37, %p36;
+ cvt.rzi.f64.f64 %fd57, %fd52; // element has its sign bit set: compare the scalar with its truncation
+ setp.neu.f64 %p73, %fd57, %fd52;
+ selp.f64 %fd74, 0dFFF8000000000000, %fd74, %p73; // scalar is not an integer: result becomes NaN
-BB7_41:
- mov.f64 %fd17, %fd37;
- add.f64 %fd18, %fd1, %fd2;
+BB11_78:
+ mov.f64 %fd41, %fd74;
+ add.f64 %fd42, %fd1, %fd52;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r30}, %fd18;
+ mov.b64 {%temp, %r54}, %fd42;
}
- and.b32 %r31, %r30, 2146435072;
- setp.ne.s32 %p39, %r31, 2146435072;
- mov.f64 %fd36, %fd17;
- @%p39 bra BB7_50;
+ and.b32 %r55, %r54, 2146435072;
+ setp.ne.s32 %p76, %r55, 2146435072; // exponent bits of (element + scalar) not all ones: keep %fd41
+ mov.f64 %fd73, %fd41;
+ @%p76 bra BB11_87;
- setp.gtu.f64 %p40, %fd11, 0d7FF0000000000000;
- mov.f64 %fd36, %fd18;
- @%p40 bra BB7_50;
+ setp.gtu.f64 %p77, %fd35, 0d7FF0000000000000; // true when |element| is NaN: result is element + scalar
+ mov.f64 %fd73, %fd42;
+ @%p77 bra BB11_87;
- abs.f64 %fd30, %fd2;
- setp.gtu.f64 %p41, %fd30, 0d7FF0000000000000;
- mov.f64 %fd35, %fd18;
- mov.f64 %fd36, %fd35;
- @%p41 bra BB7_50;
+ abs.f64 %fd58, %fd52;
+ setp.gtu.f64 %p78, %fd58, 0d7FF0000000000000; // true when |scalar| is NaN: result is element + scalar
+ mov.f64 %fd72, %fd42;
+ mov.f64 %fd73, %fd72;
+ @%p78 bra BB11_87;
- and.b32 %r32, %r9, 2147483647;
- setp.ne.s32 %p42, %r32, 2146435072;
- @%p42 bra BB7_46;
+ and.b32 %r56, %r5, 2147483647;
+ setp.ne.s32 %p79, %r56, 2146435072;
+ @%p79 bra BB11_83;
{
.reg .b32 %temp;
- mov.b64 {%r33, %temp}, %fd2;
+ mov.b64 {%r57, %temp}, %fd52;
}
- setp.eq.s32 %p43, %r33, 0;
- @%p43 bra BB7_49;
+ setp.eq.s32 %p80, %r57, 0;
+ @%p80 bra BB11_86; // scalar bit pattern is +/-infinity
-BB7_46:
- and.b32 %r34, %r8, 2147483647;
- setp.ne.s32 %p44, %r34, 2146435072;
- mov.f64 %fd33, %fd17;
- mov.f64 %fd36, %fd33;
- @%p44 bra BB7_50;
+BB11_83:
+ and.b32 %r58, %r4, 2147483647;
+ setp.ne.s32 %p81, %r58, 2146435072;
+ mov.f64 %fd70, %fd41;
+ mov.f64 %fd73, %fd70;
+ @%p81 bra BB11_87;
{
.reg .b32 %temp;
- mov.b64 {%r35, %temp}, %fd1;
+ mov.b64 {%r59, %temp}, %fd1;
}
- setp.ne.s32 %p45, %r35, 0;
- mov.f64 %fd36, %fd17;
- @%p45 bra BB7_50;
+ setp.ne.s32 %p82, %r59, 0;
+ mov.f64 %fd73, %fd41;
+ @%p82 bra BB11_87;
- shr.s32 %r36, %r9, 31;
- and.b32 %r37, %r36, -2146435072;
- add.s32 %r38, %r37, 2146435072;
- or.b32 %r39, %r38, -2147483648;
- selp.b32 %r40, %r39, %r38, %p1;
- mov.u32 %r41, 0;
- mov.b64 %fd36, {%r41, %r40};
- bra.uni BB7_50;
+ shr.s32 %r60, %r5, 31; // element bit pattern is +/-infinity: build the result high word from the scalar's sign
+ and.b32 %r61, %r60, -2146435072;
+ add.s32 %r62, %r61, 2146435072;
+ or.b32 %r63, %r62, -2147483648;
+ selp.b32 %r64, %r63, %r62, %p2;
+ mov.u32 %r65, 0;
+ mov.b64 %fd73, {%r65, %r64};
+ bra.uni BB11_87;
-BB7_49:
- setp.gt.f64 %p46, %fd11, 0d3FF0000000000000;
- selp.b32 %r42, 2146435072, 0, %p46;
- xor.b32 %r43, %r42, 2146435072;
- setp.lt.s32 %p47, %r9, 0;
- selp.b32 %r44, %r43, %r42, %p47;
- setp.eq.f64 %p48, %fd1, 0dBFF0000000000000;
- selp.b32 %r45, 1072693248, %r44, %p48;
- mov.u32 %r46, 0;
- mov.b64 %fd36, {%r46, %r45};
+BB11_41:
+ setp.gt.f64 %p41, %fd10, 0d3FF0000000000000; // |scalar| > 1.0 selects an infinite high word, otherwise zero
+ selp.b32 %r40, 2146435072, 0, %p41;
+ xor.b32 %r41, %r40, 2146435072;
+ setp.lt.s32 %p42, %r3, 0;
+ selp.b32 %r42, %r41, %r40, %p42; // swapped when the element has its sign bit set
+ setp.eq.f64 %p43, %fd52, 0dBFF0000000000000;
+ selp.b32 %r43, 1072693248, %r42, %p43; // scalar == -1.0: high word of 1.0 (0x3FF00000)
+ mov.u32 %r44, 0;
+ mov.b64 %fd64, {%r44, %r43};
-BB7_50:
- setp.eq.f64 %p49, %fd2, 0d0000000000000000;
- setp.eq.f64 %p50, %fd1, 0d3FF0000000000000;
- or.pred %p51, %p50, %p49;
- selp.f64 %fd39, 0d3FF0000000000000, %fd36, %p51;
+BB11_42:
+ setp.eq.f64 %p44, %fd1, 0d0000000000000000;
+ setp.eq.f64 %p45, %fd52, 0d3FF0000000000000;
+ or.pred %p46, %p45, %p44;
+ selp.f64 %fd67, 0d3FF0000000000000, %fd64, %p46; // result is 1.0 when scalar == 1.0 or element == 0.0
+ bra.uni BB11_46;
-BB7_54:
- cvta.to.global.u64 %rd12, %rd4;
- mul.wide.s32 %rd13, %r3, 8;
- add.s64 %rd14, %rd12, %rd13;
- st.global.f64 [%rd14], %fd39;
+BB11_86:
+ setp.gt.f64 %p83, %fd35, 0d3FF0000000000000; // |element| > 1.0 selects an infinite high word, otherwise zero
+ selp.b32 %r66, 2146435072, 0, %p83;
+ xor.b32 %r67, %r66, 2146435072;
+ setp.lt.s32 %p84, %r5, 0;
+ selp.b32 %r68, %r67, %r66, %p84; // swapped when the scalar has its sign bit set
+ setp.eq.f64 %p85, %fd1, 0dBFF0000000000000;
+ selp.b32 %r69, 1072693248, %r68, %p85; // element == -1.0: high word of 1.0 (0x3FF00000)
+ mov.u32 %r70, 0;
+ mov.b64 %fd73, {%r70, %r69};
-BB7_55:
- ret;
+BB11_87:
+ setp.eq.f64 %p86, %fd52, 0d0000000000000000;
+ setp.eq.f64 %p87, %fd1, 0d3FF0000000000000;
+ or.pred %p88, %p87, %p86;
+ selp.f64 %fd76, 0d3FF0000000000000, %fd73, %p88; // result is 1.0 when element == 1.0 or scalar == 0.0
+ bra.uni BB11_91;
}
- // .globl binCellScalarOp
-.visible .entry binCellScalarOp(
- .param .u64 binCellScalarOp_param_0,
- .param .f64 binCellScalarOp_param_1,
- .param .u64 binCellScalarOp_param_2,
- .param .u32 binCellScalarOp_param_3,
- .param .u32 binCellScalarOp_param_4,
- .param .u32 binCellScalarOp_param_5,
- .param .u32 binCellScalarOp_param_6
+ // .globl fill
+.visible .entry fill( // kernel: each in-range thread stores the scalar param_1 at its own index
+ .param .u64 fill_param_0, // destination pointer (f64 elements)
+ .param .f64 fill_param_1, // value to store
+ .param .u32 fill_param_2 // exclusive upper bound on the element index
 )
 {
- .reg .pred %p<89>;
- .reg .b32 %r<71>;
- .reg .f64 %fd<77>;
- .reg .b64 %rd<12>;
-
-
- ld.param.u64 %rd4, [binCellScalarOp_param_0];
- ld.param.f64 %fd52, [binCellScalarOp_param_1];
- ld.param.u64 %rd5, [binCellScalarOp_param_2];
- ld.param.u32 %r8, [binCellScalarOp_param_3];
- ld.param.u32 %r9, [binCellScalarOp_param_4];
- ld.param.u32 %r6, [binCellScalarOp_param_5];
- ld.param.u32 %r7, [binCellScalarOp_param_6];
- mov.u32 %r10, %ctaid.x;
- mov.u32 %r11, %ntid.x;
- mov.u32 %r12, %tid.x;
- mad.lo.s32 %r13, %r11, %r10, %r12;
- mov.u32 %r14, %ntid.y;
- mov.u32 %r15, %ctaid.y;
- mov.u32 %r16, %tid.y;
- mad.lo.s32 %r17, %r13, %r9, %r16;
- mad.lo.s32 %r1, %r14, %r15, %r17;
- mul.lo.s32 %r18, %r9, %r8;
- setp.ge.s32 %p3, %r1, %r18;
- @%p3 bra BB8_92;
-
- cvta.to.global.u64 %rd6, %rd5;
- cvta.to.global.u64 %rd7, %rd4;
- mul.wide.s32 %rd8, %r1, 8;
- add.s64 %rd9, %rd7, %rd8;
- ld.global.f64 %fd1, [%rd9];
- add.s64 %rd1, %rd6, %rd8;
- setp.eq.s32 %p4, %r7, 0;
- @%p4 bra BB8_47;
-
- setp.eq.s32 %p5, %r6, 0;
- @%p5 bra BB8_45;
-
- mov.f64 %fd67, 0dC08F380000000000;
- setp.gt.s32 %p6, %r6, 6;
- @%p6 bra BB8_13;
-
- setp.gt.s32 %p14, %r6, 3;
- @%p14 bra BB8_9;
-
- setp.eq.s32 %p18, %r6, 1;
- @%p18 bra BB8_44;
-
- setp.eq.s32 %p19, %r6, 2;
- @%p19 bra BB8_43;
- bra.uni BB8_7;
-
-BB8_43:
- mul.f64 %fd67, %fd1, %fd52;
- bra.uni BB8_46;
-
-BB8_47:
- setp.eq.s32 %p47, %r6, 0;
- @%p47 bra BB8_90;
-
- mov.f64 %fd76, 0dC08F380000000000;
- setp.gt.s32 %p48, %r6, 6;
- @%p48 bra BB8_58;
-
- setp.gt.s32 %p56, %r6, 3;
- @%p56 bra BB8_54;
-
- setp.eq.s32 %p60, %r6, 1;
- @%p60 bra BB8_89;
-
- setp.eq.s32 %p61, %r6, 2;
- @%p61 bra BB8_88;
- bra.uni BB8_52;
-
-BB8_88:
- mul.f64 %fd76, %fd1, %fd52;
- bra.uni BB8_91;
-
-BB8_45:
- add.f64 %fd67, %fd1, %fd52;
-
-BB8_46:
- st.global.f64 [%rd1], %fd67;
- bra.uni BB8_92;
-
-BB8_13:
- setp.gt.s32 %p7, %r6, 9;
- @%p7 bra BB8_18;
-
- setp.eq.s32 %p11, %r6, 7;
- @%p11 bra BB8_25;
-
- setp.eq.s32 %p12, %r6, 8;
- @%p12 bra BB8_24;
- bra.uni BB8_16;
+ .reg .pred %p<2>;
+ .reg .b32 %r<6>;
+ .reg .f64 %fd<2>;
+ .reg .b64 %rd<5>;
-BB8_24:
- setp.le.f64 %p23, %fd1, %fd52;
- selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p23;
- bra.uni BB8_46;
-BB8_90:
- add.f64 %fd76, %fd1, %fd52;
+ ld.param.u64 %rd1, [fill_param_0];
+ ld.param.f64 %fd1, [fill_param_1];
+ ld.param.u32 %r2, [fill_param_2];
+ mov.u32 %r3, %ctaid.x;
+ mov.u32 %r4, %ntid.x;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r1, %r4, %r3, %r5; // %r1 = ntid.x * ctaid.x + tid.x
+ setp.ge.s32 %p1, %r1, %r2; // signed compare against param_2
+ @%p1 bra BB12_2; // index >= param_2: nothing to store
-BB8_91:
- st.global.f64 [%rd1], %fd76;
+ cvta.to.global.u64 %rd2, %rd1;
+ mul.wide.s32 %rd3, %r1, 8; // byte offset = index * 8 (f64)
+ add.s64 %rd4, %rd2, %rd3;
+ st.global.f64 [%rd4], %fd1; // destination[index] = param_1
-BB8_92:
+BB12_2:
 ret;
+}
-BB8_58:
- setp.gt.s32 %p49, %r6, 9;
- @%p49 bra BB8_63;
+ // .globl reduce_row
+.visible .entry reduce_row( // kernel: block ctaid.x sums param_3 consecutive f64 values and thread 0 stores the total at output[ctaid.x]
+ .param .u64 reduce_row_param_0, // input pointer (f64 elements)
+ .param .u64 reduce_row_param_1, // output pointer (f64 elements)
+ .param .u32 reduce_row_param_2, // blocks with ctaid.x >= this value return immediately
+ .param .u32 reduce_row_param_3 // number of elements summed per block and stride between block segments
+)
+{
+ .reg .pred %p<18>;
+ .reg .b32 %r<36>;
+ .reg .f64 %fd<65>;
+ .reg .b64 %rd<39>;
- setp.eq.s32 %p53, %r6, 7;
- @%p53 bra BB8_70;
- setp.eq.s32 %p54, %r6, 8;
- @%p54 bra BB8_69;
- bra.uni BB8_61;
+ ld.param.u64 %rd1, [reduce_row_param_0];
+ ld.param.u64 %rd2, [reduce_row_param_1];
+ ld.param.u32 %r5, [reduce_row_param_2];
+ ld.param.u32 %r4, [reduce_row_param_3];
+ mov.u32 %r6, %ctaid.x;
+ setp.ge.u32 %p1, %r6, %r5;
+ @%p1 bra BB13_31; // ctaid.x >= param_2: whole block exits
-BB8_69:
- setp.ge.f64 %p65, %fd1, %fd52;
- selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p65;
- bra.uni BB8_91;
+ mov.u32 %r35, %tid.x; // %r35 = loop index, starts at tid.x
+ mov.f64 %fd63, 0d0000000000000000; // per-thread partial sum starts at 0.0
+ mov.f64 %fd64, %fd63;
+ setp.ge.u32 %p2, %r35, %r4;
+ @%p2 bra BB13_4; // tid.x >= param_3: this thread contributes 0.0
-BB8_9:
- setp.eq.s32 %p15, %r6, 4;
- @%p15 bra BB8_27;
+ cvta.to.global.u64 %rd3, %rd1;
- setp.eq.s32 %p16, %r6, 5;
- @%p16 bra BB8_26;
- bra.uni BB8_11;
+BB13_3:
+ mad.lo.s32 %r8, %r6, %r4, %r35; // element index = ctaid.x * param_3 + loop index
+ mul.wide.u32 %rd4, %r8, 8;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.f64 %fd25, [%rd5];
+ add.f64 %fd64, %fd64, %fd25; // accumulate into the partial sum
+ mov.u32 %r9, %ntid.x;
+ add.s32 %r35, %r9, %r35; // advance the loop index by ntid.x
+ setp.lt.u32 %p3, %r35, %r4;
+ mov.f64 %fd63, %fd64;
+ @%p3 bra BB13_3; // loop while index < param_3
-BB8_26:
- setp.gt.f64 %p26, %fd1, %fd52;
- selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p26;
- bra.uni BB8_46;
+BB13_4:
+ mov.f64 %fd61, %fd63;
+ mov.u32 %r10, %tid.x;
+ mul.wide.u32 %rd6, %r10, 8;
+ mov.u64 %rd7, sdata; // sdata is declared outside this function
+ add.s64 %rd8, %rd7, %rd6; // %rd8 = address of sdata[tid.x]
+ st.shared.f64 [%rd8], %fd61; // publish the partial sum in shared memory
+ bar.sync 0;
+ mov.u32 %r11, %ntid.x;
+ setp.lt.u32 %p4, %r11, 512;
+ @%p4 bra BB13_8; // skip the 256-element step when ntid.x < 512
-BB8_18:
- setp.eq.s32 %p8, %r6, 10;
- @%p8 bra BB8_23;
+ setp.gt.u32 %p5, %r10, 255;
+ mov.f64 %fd62, %fd61;
+ @%p5 bra BB13_7;
- setp.eq.s32 %p9, %r6, 11;
- @%p9 bra BB8_22;
- bra.uni BB8_20;
+ ld.shared.f64 %fd26, [%rd8+2048]; // tid.x < 256: add sdata[tid.x + 256] (2048 bytes ahead)
+ add.f64 %fd62, %fd61, %fd26;
+ st.shared.f64 [%rd8], %fd62;
-BB8_22:
- min.f64 %fd67, %fd52, %fd1;
- bra.uni BB8_46;
+BB13_7:
+ mov.f64 %fd61, %fd62;
+ bar.sync 0;
-BB8_54:
- setp.eq.s32 %p57, %r6, 4;
- @%p57 bra BB8_72;
+BB13_8:
+ mov.f64 %fd59, %fd61;
+ setp.lt.u32 %p6, %r11, 256;
+ @%p6 bra BB13_12; // skip the 128-element step when ntid.x < 256
- setp.eq.s32 %p58, %r6, 5;
- @%p58 bra BB8_71;
- bra.uni BB8_56;
+ setp.gt.u32 %p7, %r10, 127;
+ mov.f64 %fd60, %fd59;
+ @%p7 bra BB13_11;
-BB8_71:
- setp.lt.f64 %p68, %fd1, %fd52;
- selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p68;
- bra.uni BB8_91;
+ ld.shared.f64 %fd27, [%rd8+1024]; // tid.x < 128: add sdata[tid.x + 128]
+ add.f64 %fd60, %fd59, %fd27;
+ st.shared.f64 [%rd8], %fd60;
-BB8_63:
- setp.eq.s32 %p50, %r6, 10;
- @%p50 bra BB8_68;
+BB13_11:
+ mov.f64 %fd59, %fd60;
+ bar.sync 0;
- setp.eq.s32 %p51, %r6, 11;
- @%p51 bra BB8_67;
- bra.uni BB8_65;
+BB13_12:
+ mov.f64 %fd57, %fd59;
+ setp.lt.u32 %p8, %r11, 128;
+ @%p8 bra BB13_16; // skip the 64-element step when ntid.x < 128
-BB8_67:
- min.f64 %fd76, %fd1, %fd52;
- bra.uni BB8_91;
+ setp.gt.u32 %p9, %r10, 63;
+ mov.f64 %fd58, %fd57;
+ @%p9 bra BB13_15;
-BB8_44:
- sub.f64 %fd67, %fd52, %fd1;
- bra.uni BB8_46;
+ ld.shared.f64 %fd28, [%rd8+512]; // tid.x < 64: add sdata[tid.x + 64]
+ add.f64 %fd58, %fd57, %fd28;
+ st.shared.f64 [%rd8], %fd58;
-BB8_7:
- setp.eq.s32 %p20, %r6, 3;
- @%p20 bra BB8_8;
- bra.uni BB8_46;
+BB13_15:
+ mov.f64 %fd57, %fd58;
+ bar.sync 0;
-BB8_8:
- div.rn.f64 %fd67, %fd52, %fd1;
- bra.uni BB8_46;
+BB13_16:
+ mov.f64 %fd56, %fd57;
+ setp.gt.u32 %p10, %r10, 31;
+ @%p10 bra BB13_29; // only threads with tid.x <= 31 run the remaining steps; no bar.sync is issued between them
-BB8_25:
- setp.lt.f64 %p24, %fd1, %fd52;
- selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p24;
- bra.uni BB8_46;
+ setp.lt.u32 %p11, %r11, 64;
+ @%p11 bra BB13_19;
-BB8_16:
- setp.eq.s32 %p13, %r6, 9;
- @%p13 bra BB8_17;
- bra.uni BB8_46;
+ ld.volatile.shared.f64 %fd29, [%rd8+256]; // ntid.x >= 64: add sdata[tid.x + 32]
+ add.f64 %fd56, %fd56, %fd29;
+ st.volatile.shared.f64 [%rd8], %fd56;
-BB8_17:
- setp.eq.f64 %p22, %fd1, %fd52;
- selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p22;
- bra.uni BB8_46;
+BB13_19:
+ mov.f64 %fd55, %fd56;
+ setp.lt.u32 %p12, %r11, 32;
+ @%p12 bra BB13_21;
-BB8_27:
- {
- .reg .b32 %temp;
- mov.b64 {%temp, %r2}, %fd52;
- }
- {
- .reg .b32 %temp;
- mov.b64 {%temp, %r3}, %fd1;
- }
- bfe.u32 %r19, %r3, 20, 11;
- add.s32 %r20, %r19, -1012;
- mov.b64 %rd10, %fd1;
- shl.b64 %rd2, %rd10, %r20;
- setp.eq.s64 %p27, %rd2, -9223372036854775808;
- abs.f64 %fd10, %fd52;
- // Callseq Start 2
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.f64 [param0+0], %fd10;
- .param .b64 param1;
- st.param.f64 [param1+0], %fd1;
- .param .b64 retval0;
- call.uni (retval0),
- __internal_accurate_pow,
- (
- param0,
- param1
- );
- ld.param.f64 %fd66, [retval0+0];
-
- //{
- }// Callseq End 2
- setp.lt.s32 %p28, %r2, 0;
- and.pred %p1, %p28, %p27;
- @!%p1 bra BB8_29;
- bra.uni BB8_28;
+ ld.volatile.shared.f64 %fd30, [%rd8+128]; // ntid.x >= 32: add sdata[tid.x + 16]
+ add.f64 %fd55, %fd55, %fd30;
+ st.volatile.shared.f64 [%rd8], %fd55;
-BB8_28:
- {
- .reg .b32 %temp;
- mov.b64 {%temp, %r21}, %fd66;
- }
- xor.b32 %r22, %r21, -2147483648;
- {
- .reg .b32 %temp;
- mov.b64 {%r23, %temp}, %fd66;
- }
- mov.b64 %fd66, {%r23, %r22};
+BB13_21:
+ mov.f64 %fd54, %fd55;
+ setp.lt.u32 %p13, %r11, 16;
+ @%p13 bra BB13_23;
-BB8_29:
- mov.f64 %fd65, %fd66;
- setp.eq.f64 %p29, %fd52, 0d0000000000000000;
- @%p29 bra BB8_32;
- bra.uni BB8_30;
+ ld.volatile.shared.f64 %fd31, [%rd8+64]; // ntid.x >= 16: add sdata[tid.x + 8]
+ add.f64 %fd54, %fd54, %fd31;
+ st.volatile.shared.f64 [%rd8], %fd54;
-BB8_32:
- selp.b32 %r24, %r2, 0, %p27;
- or.b32 %r25, %r24, 2146435072;
- setp.lt.s32 %p33, %r3, 0;
- selp.b32 %r26, %r25, %r24, %p33;
- mov.u32 %r27, 0;
- mov.b64 %fd65, {%r27, %r26};
- bra.uni BB8_33;
+BB13_23:
+ mov.f64 %fd53, %fd54;
+ setp.lt.u32 %p14, %r11, 8;
+ @%p14 bra BB13_25;
-BB8_11:
- setp.eq.s32 %p17, %r6, 6;
- @%p17 bra BB8_12;
- bra.uni BB8_46;
+ ld.volatile.shared.f64 %fd32, [%rd8+32]; // ntid.x >= 8: add sdata[tid.x + 4]
+ add.f64 %fd53, %fd53, %fd32;
+ st.volatile.shared.f64 [%rd8], %fd53;
-BB8_12:
- setp.ge.f64 %p25, %fd1, %fd52;
- selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p25;
- bra.uni BB8_46;
+BB13_25:
+ mov.f64 %fd52, %fd53;
+ setp.lt.u32 %p15, %r11, 4;
+ @%p15 bra BB13_27;
-BB8_23:
- setp.neu.f64 %p21, %fd1, %fd52;
- selp.f64 %fd67, 0d3FF0000000000000, 0d0000000000000000, %p21;
- bra.uni BB8_46;
+ ld.volatile.shared.f64 %fd33, [%rd8+16]; // ntid.x >= 4: add sdata[tid.x + 2]
+ add.f64 %fd52, %fd52, %fd33;
+ st.volatile.shared.f64 [%rd8], %fd52;
-BB8_20:
- setp.ne.s32 %p10, %r6, 12;
- @%p10 bra BB8_46;
+BB13_27:
+ setp.lt.u32 %p16, %r11, 2;
+ @%p16 bra BB13_29;
- max.f64 %fd67, %fd52, %fd1;
- bra.uni BB8_46;
+ ld.volatile.shared.f64 %fd34, [%rd8+8]; // ntid.x >= 2: add sdata[tid.x + 1]
+ add.f64 %fd35, %fd52, %fd34;
+ st.volatile.shared.f64 [%rd8], %fd35;
-BB8_89:
- sub.f64 %fd76, %fd1, %fd52;
- bra.uni BB8_91;
+BB13_29:
+ setp.ne.s32 %p17, %r10, 0;
+ @%p17 bra BB13_31; // only tid.x == 0 writes the result
-BB8_52:
- setp.eq.s32 %p62, %r6, 3;
- @%p62 bra BB8_53;
- bra.uni BB8_91;
+ ld.shared.f64 %fd36, [sdata]; // sdata[0] holds the block total
+ cvta.to.global.u64 %rd36, %rd2;
+ mul.wide.u32 %rd37, %r6, 8;
+ add.s64 %rd38, %rd36, %rd37;
+ st.global.f64 [%rd38], %fd36; // output[ctaid.x] = block total
-BB8_53:
- div.rn.f64 %fd76, %fd1, %fd52;
- bra.uni BB8_91;
+BB13_31:
+ ret;
+}
-BB8_70:
- setp.gt.f64 %p66, %fd1, %fd52;
- selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p66;
- bra.uni BB8_91;
+ // .globl reduce_col
+.visible .entry reduce_col( // kernel: each in-range thread sums input values at stride param_3 starting at its own index and stores the sum at that index
+ .param .u64 reduce_col_param_0, // input pointer (f64 elements)
+ .param .u64 reduce_col_param_1, // output pointer (f64 elements)
+ .param .u32 reduce_col_param_2, // multiplied with param_3 to form the exclusive index bound
+ .param .u32 reduce_col_param_3 // stride between summed elements; also the bound on the thread index
+)
+{
+ .reg .pred %p<4>;
+ .reg .b32 %r<11>;
+ .reg .f64 %fd<10>;
+ .reg .b64 %rd<9>;
-BB8_61:
- setp.eq.s32 %p55, %r6, 9;
- @%p55 bra BB8_62;
- bra.uni BB8_91;
-BB8_62:
- setp.eq.f64 %p64, %fd1, %fd52;
- selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p64;
- bra.uni BB8_91;
+ ld.param.u64 %rd2, [reduce_col_param_0];
+ ld.param.u64 %rd3, [reduce_col_param_1];
+ ld.param.u32 %r5, [reduce_col_param_2];
+ ld.param.u32 %r6, [reduce_col_param_3];
+ mov.u32 %r7, %ntid.x;
+ mov.u32 %r8, %ctaid.x;
+ mov.u32 %r9, %tid.x;
+ mad.lo.s32 %r1, %r7, %r8, %r9; // %r1 = ntid.x * ctaid.x + tid.x
+ setp.ge.u32 %p1, %r1, %r6;
+ @%p1 bra BB14_5; // thread index >= param_3: return without storing
-BB8_72:
- {
- .reg .b32 %temp;
- mov.b64 {%temp, %r4}, %fd1;
- }
- {
- .reg .b32 %temp;
- mov.b64 {%temp, %r5}, %fd52;
- }
- bfe.u32 %r45, %r5, 20, 11;
- add.s32 %r46, %r45, -1012;
- mov.b64 %rd11, %fd52;
- shl.b64 %rd3, %rd11, %r46;
- setp.eq.s64 %p69, %rd3, -9223372036854775808;
- abs.f64 %fd35, %fd1;
- // Callseq Start 3
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.f64 [param0+0], %fd35;
- .param .b64 param1;
- st.param.f64 [param1+0], %fd52;
- .param .b64 retval0;
- call.uni (retval0),
- __internal_accurate_pow,
- (
- param0,
- param1
- );
- ld.param.f64 %fd75, [retval0+0];
-
- //{
- }// Callseq End 3
- setp.lt.s32 %p70, %r4, 0;
- and.pred %p2, %p70, %p69;
- @!%p2 bra BB8_74;
- bra.uni BB8_73;
+ cvta.to.global.u64 %rd1, %rd2;
+ mul.lo.s32 %r2, %r6, %r5; // bound = param_3 * param_2
+ mov.f64 %fd8, 0d0000000000000000; // running sum starts at 0.0
+ mov.f64 %fd9, %fd8;
+ setp.ge.u32 %p2, %r1, %r2;
+ @%p2 bra BB14_4; // nothing to sum: store 0.0
-BB8_73:
- {
- .reg .b32 %temp;
- mov.b64 {%temp, %r47}, %fd75;
- }
- xor.b32 %r48, %r47, -2147483648;
- {
- .reg .b32 %temp;
- mov.b64 {%r49, %temp}, %fd75;
- }
- mov.b64 %fd75, {%r49, %r48};
+ mov.u32 %r10, %r1; // loop index starts at the thread index
-BB8_74:
- mov.f64 %fd74, %fd75;
- setp.eq.f64 %p71, %fd1, 0d0000000000000000;
- @%p71 bra BB8_77;
- bra.uni BB8_75;
+BB14_3:
+ mov.u32 %r3, %r10;
+ mul.wide.u32 %rd4, %r3, 8; // byte offset = index * 8 (f64)
+ add.s64 %rd5, %rd1, %rd4;
+ ld.global.f64 %fd6, [%rd5];
+ add.f64 %fd9, %fd9, %fd6; // accumulate input[index]
+ add.s32 %r4, %r3, %r6; // advance the index by param_3
+ setp.lt.u32 %p3, %r4, %r2;
+ mov.u32 %r10, %r4;
+ mov.f64 %fd8, %fd9;
+ @%p3 bra BB14_3; // loop while index < param_3 * param_2
-BB8_77:
- selp.b32 %r50, %r4, 0, %p69;
- or.b32 %r51, %r50, 2146435072;
- setp.lt.s32 %p75, %r5, 0;
- selp.b32 %r52, %r51, %r50, %p75;
- mov.u32 %r53, 0;
- mov.b64 %fd74, {%r53, %r52};
- bra.uni BB8_78;
+BB14_4:
+ cvta.to.global.u64 %rd6, %rd3;
+ mul.wide.u32 %rd7, %r1, 8;
+ add.s64 %rd8, %rd6, %rd7;
+ st.global.f64 [%rd8], %fd8; // output[thread index] = sum
-BB8_56:
- setp.eq.s32 %p59, %r6, 6;
- @%p59 bra BB8_57;
- bra.uni BB8_91;
+BB14_5:
+ ret;
+}
-BB8_57:
- setp.le.f64 %p67, %fd1, %fd52;
- selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p67;
- bra.uni BB8_91;
+ // .globl reduce_sum -- each thread block accumulates part of an f64 input array and thread 0 stores one partial sum at output[ctaid.x]
+.visible .entry reduce_sum(
+ .param .u64 reduce_sum_param_0,  // input pointer; elements are loaded as f64 from global memory
+ .param .u64 reduce_sum_param_1,  // output pointer; receives one f64 per thread block
+ .param .u32 reduce_sum_param_2  // element count n; all index comparisons against it are unsigned
+)
+{
+ .reg .pred %p<18>;
+ .reg .b32 %r<31>;
+ .reg .f64 %fd<70>;
+ .reg .b64 %rd<15>;
-BB8_68:
- setp.neu.f64 %p63, %fd1, %fd52;
- selp.f64 %fd76, 0d3FF0000000000000, 0d0000000000000000, %p63;
- bra.uni BB8_91;
-BB8_65:
- setp.ne.s32 %p52, %r6, 12;
- @%p52 bra BB8_91;
+ ld.param.u64 %rd2, [reduce_sum_param_0];
+ ld.param.u64 %rd3, [reduce_sum_param_1];
+ ld.param.u32 %r5, [reduce_sum_param_2];
+ mov.u32 %r6, %tid.x;
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
+ mov.u32 %r9, %ntid.x;
+ mad.lo.s32 %r30, %r8, %r9, %r6;  // first index for this thread: 2*ctaid.x*ntid.x + tid.x
+ mov.f64 %fd67, 0d0000000000000000;  // per-thread accumulator starts at 0.0
+ mov.f64 %fd68, %fd67;
+ setp.ge.u32 %p1, %r30, %r5;
+ @%p1 bra BB15_4;  // nothing to load for this thread; go straight to the shared-memory stage
- max.f64 %fd76, %fd1, %fd52;
- bra.uni BB8_91;
+BB15_1:
+ mov.f64 %fd1, %fd68;
+ cvta.to.global.u64 %rd4, %rd2;
+ mul.wide.u32 %rd5, %r30, 8;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.global.f64 %fd27, [%rd6];
+ add.f64 %fd69, %fd1, %fd27;  // acc += in[i]
+ add.s32 %r3, %r30, %r9;  // second index of the pair: i + ntid.x
+ setp.ge.u32 %p2, %r3, %r5;
+ @%p2 bra BB15_3;  // second element would be out of range; skip it
-BB8_30:
- setp.gt.s32 %p30, %r2, -1;
- @%p30 bra BB8_33;
+ mul.wide.u32 %rd8, %r3, 8;
+ add.s64 %rd9, %rd4, %rd8;
+ ld.global.f64 %fd28, [%rd9];
+ add.f64 %fd69, %fd69, %fd28;  // acc += in[i + ntid.x]
- cvt.rzi.f64.f64 %fd54, %fd1;
- setp.neu.f64 %p31, %fd54, %fd1;
- selp.f64 %fd65, 0dFFF8000000000000, %fd65, %p31;
+BB15_3:
+ mov.f64 %fd68, %fd69;
+ shl.b32 %r12, %r9, 1;
+ mov.u32 %r13, %nctaid.x;
+ mad.lo.s32 %r30, %r12, %r13, %r30;  // i += 2*ntid.x*nctaid.x
+ setp.lt.u32 %p3, %r30, %r5;
+ mov.f64 %fd67, %fd68;
+ @%p3 bra BB15_1;  // loop while i < n
-BB8_33:
- mov.f64 %fd16, %fd65;
- add.f64 %fd17, %fd1, %fd52;
- {
- .reg .b32 %temp;
- mov.b64 {%temp, %r28}, %fd17;
- }
- and.b32 %r29, %r28, 2146435072;
- setp.ne.s32 %p34, %r29, 2146435072;
- mov.f64 %fd64, %fd16;
- @%p34 bra BB8_42;
+BB15_4:
+ mov.f64 %fd65, %fd67;
+ mul.wide.u32 %rd10, %r6, 8;
+ mov.u64 %rd11, sdata;  // sdata is declared outside this function
+ add.s64 %rd1, %rd11, %rd10;  // %rd1 = address of sdata[tid.x]
+ st.shared.f64 [%rd1], %fd65;  // publish this thread's partial sum
+ bar.sync 0;
+ setp.lt.u32 %p4, %r9, 512;
+ @%p4 bra BB15_8;  // step below only runs when ntid.x >= 512
- setp.gtu.f64 %p35, %fd10, 0d7FF0000000000000;
- mov.f64 %fd64, %fd17;
- @%p35 bra BB8_42;
+ setp.gt.u32 %p5, %r6, 255;
+ mov.f64 %fd66, %fd65;
+ @%p5 bra BB15_7;
- abs.f64 %fd55, %fd1;
- setp.gtu.f64 %p36, %fd55, 0d7FF0000000000000;
- mov.f64 %fd63, %fd17;
- mov.f64 %fd64, %fd63;
- @%p36 bra BB8_42;
+ ld.shared.f64 %fd29, [%rd1+2048];  // 2048 bytes = 256 doubles: sdata[tid.x + 256]
+ add.f64 %fd66, %fd65, %fd29;
+ st.shared.f64 [%rd1], %fd66;
- and.b32 %r30, %r3, 2147483647;
- setp.ne.s32 %p37, %r30, 2146435072;
- @%p37 bra BB8_38;
+BB15_7:
+ mov.f64 %fd65, %fd66;
+ bar.sync 0;
- {
- .reg .b32 %temp;
- mov.b64 {%r31, %temp}, %fd1;
- }
- setp.eq.s32 %p38, %r31, 0;
- @%p38 bra BB8_41;
+BB15_8:
+ mov.f64 %fd63, %fd65;
+ setp.lt.u32 %p6, %r9, 256;
+ @%p6 bra BB15_12;  // step below only runs when ntid.x >= 256
- BB8_38:
- and.b32 %r32, %r2, 2147483647;
- setp.ne.s32 %p39, %r32, 2146435072;
- mov.f64 %fd61, %fd16;
- mov.f64 %fd64, %fd61;
- @%p39 bra BB8_42;
+ setp.gt.u32 %p7, %r6, 127;
+ mov.f64 %fd64, %fd63;
+ @%p7 bra BB15_11;
- {
- .reg .b32 %temp;
- mov.b64 {%r33, %temp}, %fd52;
- }
- setp.ne.s32 %p40, %r33, 0;
- mov.f64 %fd64, %fd16;
- @%p40 bra BB8_42;
+ ld.shared.f64 %fd30, [%rd1+1024];  // sdata[tid.x + 128]
+ add.f64 %fd64, %fd63, %fd30;
+ st.shared.f64 [%rd1], %fd64;
- shr.s32 %r34, %r3, 31;
- and.b32 %r35, %r34, -2146435072;
- add.s32 %r36, %r35, 2146435072;
- or.b32 %r37, %r36, -2147483648;
- selp.b32 %r38, %r37, %r36, %p1;
- mov.u32 %r39, 0;
- mov.b64 %fd64, {%r39, %r38};
- bra.uni BB8_42;
+BB15_11:
+ mov.f64 %fd63, %fd64;
+ bar.sync 0;
-BB8_75:
- setp.gt.s32 %p72, %r4, -1;
- @%p72 bra BB8_78;
+BB15_12:
+ mov.f64 %fd61, %fd63;
+ setp.lt.u32 %p8, %r9, 128;
+ @%p8 bra BB15_16;  // step below only runs when ntid.x >= 128
- cvt.rzi.f64.f64 %fd57, %fd52;
- setp.neu.f64 %p73, %fd57, %fd52;
- selp.f64 %fd74, 0dFFF8000000000000, %fd74, %p73;
+ setp.gt.u32 %p9, %r6, 63;
+ mov.f64 %fd62, %fd61;
+ @%p9 bra BB15_15;
-BB8_78:
- mov.f64 %fd41, %fd74;
- add.f64 %fd42, %fd1, %fd52;
- {
- .reg .b32 %temp;
- mov.b64 {%temp, %r54}, %fd42;
- }
- and.b32 %r55, %r54, 2146435072;
- setp.ne.s32 %p76, %r55, 2146435072;
- mov.f64 %fd73, %fd41;
- @%p76 bra BB8_87;
+ ld.shared.f64 %fd31, [%rd1+512];  // sdata[tid.x + 64]
+ add.f64 %fd62, %fd61, %fd31;
+ st.shared.f64 [%rd1], %fd62;
- setp.gtu.f64 %p77, %fd35, 0d7FF0000000000000;
- mov.f64 %fd73, %fd42;
- @%p77 bra BB8_87;
+BB15_15:
+ mov.f64 %fd61, %fd62;
+ bar.sync 0;
- abs.f64 %fd58, %fd52;
- setp.gtu.f64 %p78, %fd58, 0d7FF0000000000000;
- mov.f64 %fd72, %fd42;
- mov.f64 %fd73, %fd72;
- @%p78 bra BB8_87;
+BB15_16:
+ mov.f64 %fd60, %fd61;
+ setp.gt.u32 %p10, %r6, 31;
+ @%p10 bra BB15_29;  // only threads with tid.x <= 31 run the remaining steps; no bar.sync is issued between them
- and.b32 %r56, %r5, 2147483647;
- setp.ne.s32 %p79, %r56, 2146435072;
- @%p79 bra BB8_83;
+ setp.lt.u32 %p11, %r9, 64;
+ @%p11 bra BB15_19;
- {
- .reg .b32 %temp;
- mov.b64 {%r57, %temp}, %fd52;
- }
- setp.eq.s32 %p80, %r57, 0;
- @%p80 bra BB8_86;
+ ld.volatile.shared.f64 %fd32, [%rd1+256];  // sdata[tid.x + 32]; NOTE(review): these unsynchronized steps presumably rely on the 32 threads running in lockstep -- confirm for the target architecture
+ add.f64 %fd60, %fd60, %fd32;
+ st.volatile.shared.f64 [%rd1], %fd60;
-BB8_83:
- and.b32 %r58, %r4, 2147483647;
- setp.ne.s32 %p81, %r58, 2146435072;
- mov.f64 %fd70, %fd41;
- mov.f64 %fd73, %fd70;
- @%p81 bra BB8_87;
+BB15_19:
+ mov.f64 %fd59, %fd60;
+ setp.lt.u32 %p12, %r9, 32;
+ @%p12 bra BB15_21;
- {
- .reg .b32 %temp;
- mov.b64 {%r59, %temp}, %fd1;
- }
- setp.ne.s32 %p82, %r59, 0;
- mov.f64 %fd73, %fd41;
- @%p82 bra BB8_87;
+ ld.volatile.shared.f64 %fd33, [%rd1+128];  // sdata[tid.x + 16]
+ add.f64 %fd59, %fd59, %fd33;
+ st.volatile.shared.f64 [%rd1], %fd59;
- shr.s32 %r60, %r5, 31;
- and.b32 %r61, %r60, -2146435072;
- add.s32 %r62, %r61, 2146435072;
- or.b32 %r63, %r62, -2147483648;
- selp.b32 %r64, %r63, %r62, %p2;
- mov.u32 %r65, 0;
- mov.b64 %fd73, {%r65, %r64};
- bra.uni BB8_87;
+BB15_21:
+ mov.f64 %fd58, %fd59;
+ setp.lt.u32 %p13, %r9, 16;
+ @%p13 bra BB15_23;
-BB8_41:
- setp.gt.f64 %p41, %fd10, 0d3FF0000000000000;
- selp.b32 %r40, 2146435072, 0, %p41;
- xor.b32 %r41, %r40, 2146435072;
- setp.lt.s32 %p42, %r3, 0;
- selp.b32 %r42, %r41, %r40, %p42;
- setp.eq.f64 %p43, %fd52, 0dBFF0000000000000;
- selp.b32 %r43, 1072693248, %r42, %p43;
- mov.u32 %r44, 0;
- mov.b64 %fd64, {%r44, %r43};
+ ld.volatile.shared.f64 %fd34, [%rd1+64];  // sdata[tid.x + 8]
+ add.f64 %fd58, %fd58, %fd34;
+ st.volatile.shared.f64 [%rd1], %fd58;
-BB8_42:
- setp.eq.f64 %p44, %fd1, 0d0000000000000000;
- setp.eq.f64 %p45, %fd52, 0d3FF0000000000000;
- or.pred %p46, %p45, %p44;
- selp.f64 %fd67, 0d3FF0000000000000, %fd64, %p46;
- bra.uni BB8_46;
+BB15_23:
+ mov.f64 %fd57, %fd58;
+ setp.lt.u32 %p14, %r9, 8;
+ @%p14 bra BB15_25;
-BB8_86:
- setp.gt.f64 %p83, %fd35, 0d3FF0000000000000;
- selp.b32 %r66, 2146435072, 0, %p83;
- xor.b32 %r67, %r66, 2146435072;
- setp.lt.s32 %p84, %r5, 0;
- selp.b32 %r68, %r67, %r66, %p84;
- setp.eq.f64 %p85, %fd1, 0dBFF0000000000000;
- selp.b32 %r69, 1072693248, %r68, %p85;
- mov.u32 %r70, 0;
- mov.b64 %fd73, {%r70, %r69};
+ ld.volatile.shared.f64 %fd35, [%rd1+32];  // sdata[tid.x + 4]
+ add.f64 %fd57, %fd57, %fd35;
+ st.volatile.shared.f64 [%rd1], %fd57;
-BB8_87:
- setp.eq.f64 %p86, %fd52, 0d0000000000000000;
- setp.eq.f64 %p87, %fd1, 0d3FF0000000000000;
- or.pred %p88, %p87, %p86;
- selp.f64 %fd76, 0d3FF0000000000000, %fd73, %p88;
- bra.uni BB8_91;
-}
+BB15_25:
+ mov.f64 %fd56, %fd57;
+ setp.lt.u32 %p15, %r9, 4;
+ @%p15 bra BB15_27;
- // .globl fill
-.visible .entry fill(
- .param .u64 fill_param_0,
- .param .f64 fill_param_1,
- .param .u32 fill_param_2
-)
-{
- .reg .pred %p<2>;
- .reg .b32 %r<6>;
- .reg .f64 %fd<2>;
- .reg .b64 %rd<5>;
+ ld.volatile.shared.f64 %fd36, [%rd1+16];  // sdata[tid.x + 2]
+ add.f64 %fd56, %fd56, %fd36;
+ st.volatile.shared.f64 [%rd1], %fd56;
+BB15_27:
+ setp.lt.u32 %p16, %r9, 2;
+ @%p16 bra BB15_29;
- ld.param.u64 %rd1, [fill_param_0];
- ld.param.f64 %fd1, [fill_param_1];
- ld.param.u32 %r2, [fill_param_2];
- mov.u32 %r3, %ctaid.x;
- mov.u32 %r4, %ntid.x;
- mov.u32 %r5, %tid.x;
- mad.lo.s32 %r1, %r4, %r3, %r5;
- setp.ge.s32 %p1, %r1, %r2;
- @%p1 bra BB9_2;
+ ld.volatile.shared.f64 %fd37, [%rd1+8];  // sdata[tid.x + 1]
+ add.f64 %fd38, %fd56, %fd37;
+ st.volatile.shared.f64 [%rd1], %fd38;
- cvta.to.global.u64 %rd2, %rd1;
- mul.wide.s32 %rd3, %r1, 8;
- add.s64 %rd4, %rd2, %rd3;
- st.global.f64 [%rd4], %fd1;
+BB15_29:
+ setp.ne.s32 %p17, %r6, 0;
+ @%p17 bra BB15_31;  // only tid.x == 0 writes the block result
-BB9_2:
+ ld.shared.f64 %fd39, [sdata];  // sdata[0] holds the block's accumulated sum
+ cvta.to.global.u64 %rd12, %rd3;
+ mul.wide.u32 %rd13, %r7, 8;
+ add.s64 %rd14, %rd12, %rd13;
+ st.global.f64 [%rd14], %fd39;  // out[ctaid.x] = block partial sum
+
+BB15_31:
ret;
}
- // .globl reduce
-.visible .entry reduce(
- .param .u64 reduce_param_0,
- .param .u64 reduce_param_1,
- .param .u32 reduce_param_2
+ // .globl reduce_max -- each thread block folds part of an f64 input array with max.f64 and thread 0 stores the block's result through the output pointer
+.visible .entry reduce_max(
+ .param .u64 reduce_max_param_0,  // input pointer; elements are loaded as f64 from global memory
+ .param .u64 reduce_max_param_1,  // output pointer; receives the per-block maximum
+ .param .u32 reduce_max_param_2  // element count n; all index comparisons against it are unsigned
)
{
.reg .pred %p<18>;
@@ -1567,45 +2504,45 @@ BB9_2:
.reg .b64 %rd<15>;
- ld.param.u64 %rd2, [reduce_param_0];
- ld.param.u64 %rd3, [reduce_param_1];
- ld.param.u32 %r5, [reduce_param_2];
+ ld.param.u64 %rd2, [reduce_max_param_0];
+ ld.param.u64 %rd3, [reduce_max_param_1];
+ ld.param.u32 %r5, [reduce_max_param_2];
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
mad.lo.s32 %r30, %r8, %r9, %r6;
- mov.f64 %fd67, 0d0000000000000000;
+ mov.f64 %fd67, 0dFFEFFFFFFFFFFFFF;  // start at -DBL_MAX (mirrors reduce_min's +DBL_MAX); the previous 0d0010000000000000 is DBL_MIN (~2.2e-308, positive), which wrongly won against zero/negative inputs. NOTE(review): this PTX looks compiler-generated -- presumably the CUDA source needs the same change; confirm
mov.f64 %fd68, %fd67;
setp.ge.u32 %p1, %r30, %r5;
- @%p1 bra BB10_4;
+ @%p1 bra BB16_4;  // nothing to load for this thread; it contributes the start value only
-BB10_1:
+BB16_1:
mov.f64 %fd1, %fd68;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.u32 %rd5, %r30, 8;
add.s64 %rd6, %rd4, %rd5;
ld.global.f64 %fd27, [%rd6];
- add.f64 %fd69, %fd1, %fd27;
+ max.f64 %fd69, %fd1, %fd27;  // acc = max(acc, in[i])
add.s32 %r3, %r30, %r9;
setp.ge.u32 %p2, %r3, %r5;
- @%p2 bra BB10_3;
+ @%p2 bra BB16_3;  // second element of the pair (i + ntid.x) would be out of range
mul.wide.u32 %rd8, %r3, 8;
add.s64 %rd9, %rd4, %rd8;
ld.global.f64 %fd28, [%rd9];
- add.f64 %fd69, %fd69, %fd28;
+ max.f64 %fd69, %fd69, %fd28;  // acc = max(acc, in[i + ntid.x])
-BB10_3:
+BB16_3:
mov.f64 %fd68, %fd69;
shl.b32 %r12, %r9, 1;
mov.u32 %r13, %nctaid.x;
mad.lo.s32 %r30, %r12, %r13, %r30;
setp.lt.u32 %p3, %r30, %r5;
mov.f64 %fd67, %fd68;
- @%p3 bra BB10_1;
+ @%p3 bra BB16_1;  // loop while i < n, with i advanced by 2*ntid.x*nctaid.x
-BB10_4:
+BB16_4:
mov.f64 %fd65, %fd67;
mul.wide.u32 %rd10, %r6, 8;
mov.u64 %rd11, sdata;
@@ -1613,113 +2550,113 @@ BB10_4:
st.shared.f64 [%rd1], %fd65;
bar.sync 0;
setp.lt.u32 %p4, %r9, 512;
- @%p4 bra BB10_8;
+ @%p4 bra BB16_8;
setp.gt.u32 %p5, %r6, 255;
mov.f64 %fd66, %fd65;
- @%p5 bra BB10_7;
+ @%p5 bra BB16_7;
ld.shared.f64 %fd29, [%rd1+2048];
- add.f64 %fd66, %fd65, %fd29;
+ max.f64 %fd66, %fd65, %fd29;  // combine with the shared slot 2048 bytes (256 doubles) further on
st.shared.f64 [%rd1], %fd66;
-BB10_7:
+BB16_7:
mov.f64 %fd65, %fd66;
bar.sync 0;
-BB10_8:
+BB16_8:
mov.f64 %fd63, %fd65;
setp.lt.u32 %p6, %r9, 256;
- @%p6 bra BB10_12;
+ @%p6 bra BB16_12;
setp.gt.u32 %p7, %r6, 127;
mov.f64 %fd64, %fd63;
- @%p7 bra BB10_11;
+ @%p7 bra BB16_11;
ld.shared.f64 %fd30, [%rd1+1024];
- add.f64 %fd64, %fd63, %fd30;
+ max.f64 %fd64, %fd63, %fd30;
st.shared.f64 [%rd1], %fd64;
-BB10_11:
+BB16_11:
mov.f64 %fd63, %fd64;
bar.sync 0;
-BB10_12:
+BB16_12:
mov.f64 %fd61, %fd63;
setp.lt.u32 %p8, %r9, 128;
- @%p8 bra BB10_16;
+ @%p8 bra BB16_16;
setp.gt.u32 %p9, %r6, 63;
mov.f64 %fd62, %fd61;
- @%p9 bra BB10_15;
+ @%p9 bra BB16_15;
ld.shared.f64 %fd31, [%rd1+512];
- add.f64 %fd62, %fd61, %fd31;
+ max.f64 %fd62, %fd61, %fd31;
st.shared.f64 [%rd1], %fd62;
-BB10_15:
+BB16_15:
mov.f64 %fd61, %fd62;
bar.sync 0;
-BB10_16:
+BB16_16:
mov.f64 %fd60, %fd61;
setp.gt.u32 %p10, %r6, 31;
- @%p10 bra BB10_29;
+ @%p10 bra BB16_29;  // only threads with tid.x <= 31 run the remaining steps; no bar.sync is issued between them
setp.lt.u32 %p11, %r9, 64;
- @%p11 bra BB10_19;
+ @%p11 bra BB16_19;
ld.volatile.shared.f64 %fd32, [%rd1+256];
- add.f64 %fd60, %fd60, %fd32;
+ max.f64 %fd60, %fd60, %fd32;
st.volatile.shared.f64 [%rd1], %fd60;
-BB10_19:
+BB16_19:
mov.f64 %fd59, %fd60;
setp.lt.u32 %p12, %r9, 32;
- @%p12 bra BB10_21;
+ @%p12 bra BB16_21;
ld.volatile.shared.f64 %fd33, [%rd1+128];
- add.f64 %fd59, %fd59, %fd33;
+ max.f64 %fd59, %fd59, %fd33;
st.volatile.shared.f64 [%rd1], %fd59;
-BB10_21:
+BB16_21:
mov.f64 %fd58, %fd59;
setp.lt.u32 %p13, %r9, 16;
- @%p13 bra BB10_23;
+ @%p13 bra BB16_23;
ld.volatile.shared.f64 %fd34, [%rd1+64];
- add.f64 %fd58, %fd58, %fd34;
+ max.f64 %fd58, %fd58, %fd34;
st.volatile.shared.f64 [%rd1], %fd58;
-BB10_23:
+BB16_23:
mov.f64 %fd57, %fd58;
setp.lt.u32 %p14, %r9, 8;
- @%p14 bra BB10_25;
+ @%p14 bra BB16_25;
ld.volatile.shared.f64 %fd35, [%rd1+32];
- add.f64 %fd57, %fd57, %fd35;
+ max.f64 %fd57, %fd57, %fd35;
st.volatile.shared.f64 [%rd1], %fd57;
-BB10_25:
+BB16_25:
mov.f64 %fd56, %fd57;
setp.lt.u32 %p15, %r9, 4;
- @%p15 bra BB10_27;
+ @%p15 bra BB16_27;
ld.volatile.shared.f64 %fd36, [%rd1+16];
- add.f64 %fd56, %fd56, %fd36;
+ max.f64 %fd56, %fd56, %fd36;
st.volatile.shared.f64 [%rd1], %fd56;
-BB10_27:
+BB16_27:
setp.lt.u32 %p16, %r9, 2;
- @%p16 bra BB10_29;
+ @%p16 bra BB16_29;
ld.volatile.shared.f64 %fd37, [%rd1+8];
- add.f64 %fd38, %fd56, %fd37;
+ max.f64 %fd38, %fd56, %fd37;
st.volatile.shared.f64 [%rd1], %fd38;
-BB10_29:
+BB16_29:
setp.ne.s32 %p17, %r6, 0;
- @%p17 bra BB10_31;
+ @%p17 bra BB16_31;  // only tid.x == 0 writes the block result
ld.shared.f64 %fd39, [sdata];
cvta.to.global.u64 %rd12, %rd3;
@@ -1727,233 +2664,184 @@ BB10_29:
add.s64 %rd14, %rd12, %rd13;
st.global.f64 [%rd14], %fd39;
-BB10_31:
+BB16_31:
ret;
}
- // .globl reduce_row
-.visible .entry reduce_row(
- .param .u64 reduce_row_param_0,
- .param .u64 reduce_row_param_1,
- .param .u32 reduce_row_param_2,
- .param .u32 reduce_row_param_3
+ // .globl reduce_min -- each thread block folds part of an f64 input array with min.f64 and thread 0 stores one partial minimum at output[ctaid.x]
+.visible .entry reduce_min(
+ .param .u64 reduce_min_param_0,  // input pointer; elements are loaded as f64 from global memory
+ .param .u64 reduce_min_param_1,  // output pointer; receives one f64 per thread block
+ .param .u32 reduce_min_param_2  // element count n; all index comparisons against it are unsigned
)
{
.reg .pred %p<18>;
- .reg .b32 %r<36>;
- .reg .f64 %fd<65>;
- .reg .b64 %rd<39>;
-
-
- ld.param.u64 %rd1, [reduce_row_param_0];
- ld.param.u64 %rd2, [reduce_row_param_1];
- ld.param.u32 %r5, [reduce_row_param_2];
- ld.param.u32 %r4, [reduce_row_param_3];
- mov.u32 %r6, %ctaid.x;
- setp.ge.u32 %p1, %r6, %r5;
- @%p1 bra BB11_31;
-
- mov.u32 %r35, %tid.x;
- mov.f64 %fd63, 0d0000000000000000;
- mov.f64 %fd64, %fd63;
- setp.ge.u32 %p2, %r35, %r4;
- @%p2 bra BB11_4;
+ .reg .b32 %r<31>;
+ .reg .f64 %fd<70>;
+ .reg .b64 %rd<15>;
- cvta.to.global.u64 %rd3, %rd1;
-BB11_3:
- mad.lo.s32 %r8, %r6, %r4, %r35;
- mul.wide.u32 %rd4, %r8, 8;
- add.s64 %rd5, %rd3, %rd4;
- ld.global.f64 %fd25, [%rd5];
- add.f64 %fd64, %fd64, %fd25;
+ ld.param.u64 %rd2, [reduce_min_param_0];
+ ld.param.u64 %rd3, [reduce_min_param_1];
+ ld.param.u32 %r5, [reduce_min_param_2];
+ mov.u32 %r6, %tid.x;
+ mov.u32 %r7, %ctaid.x;
+ shl.b32 %r8, %r7, 1;
mov.u32 %r9, %ntid.x;
- add.s32 %r35, %r9, %r35;
- setp.lt.u32 %p3, %r35, %r4;
- mov.f64 %fd63, %fd64;
- @%p3 bra BB11_3;
+ mad.lo.s32 %r30, %r8, %r9, %r6;  // first index for this thread: 2*ctaid.x*ntid.x + tid.x
+ mov.f64 %fd67, 0d7FEFFFFFFFFFFFFF;  // accumulator starts at the largest finite double (DBL_MAX)
+ mov.f64 %fd68, %fd67;
+ setp.ge.u32 %p1, %r30, %r5;
+ @%p1 bra BB17_4;  // nothing to load for this thread; it contributes the start value only
-BB11_4:
- mov.f64 %fd61, %fd63;
- mov.u32 %r10, %tid.x;
- mul.wide.u32 %rd6, %r10, 8;
- mov.u64 %rd7, sdata;
- add.s64 %rd8, %rd7, %rd6;
- st.shared.f64 [%rd8], %fd61;
- bar.sync 0;
- mov.u32 %r11, %ntid.x;
- setp.lt.u32 %p4, %r11, 512;
- @%p4 bra BB11_8;
+BB17_1:
+ mov.f64 %fd1, %fd68;
+ cvta.to.global.u64 %rd4, %rd2;
+ mul.wide.u32 %rd5, %r30, 8;
+ add.s64 %rd6, %rd4, %rd5;
+ ld.global.f64 %fd27, [%rd6];
+ min.f64 %fd69, %fd1, %fd27;  // acc = min(acc, in[i])
+ add.s32 %r3, %r30, %r9;  // second index of the pair: i + ntid.x
+ setp.ge.u32 %p2, %r3, %r5;
+ @%p2 bra BB17_3;  // second element would be out of range; skip it
- setp.gt.u32 %p5, %r10, 255;
- mov.f64 %fd62, %fd61;
- @%p5 bra BB11_7;
+ mul.wide.u32 %rd8, %r3, 8;
+ add.s64 %rd9, %rd4, %rd8;
+ ld.global.f64 %fd28, [%rd9];
+ min.f64 %fd69, %fd69, %fd28;  // acc = min(acc, in[i + ntid.x])
- ld.shared.f64 %fd26, [%rd8+2048];
- add.f64 %fd62, %fd61, %fd26;
- st.shared.f64 [%rd8], %fd62;
+BB17_3:
+ mov.f64 %fd68, %fd69;
+ shl.b32 %r12, %r9, 1;
+ mov.u32 %r13, %nctaid.x;
+ mad.lo.s32 %r30, %r12, %r13, %r30;  // i += 2*ntid.x*nctaid.x
+ setp.lt.u32 %p3, %r30, %r5;
+ mov.f64 %fd67, %fd68;
+ @%p3 bra BB17_1;  // loop while i < n
-BB11_7:
- mov.f64 %fd61, %fd62;
+BB17_4:
+ mov.f64 %fd65, %fd67;
+ mul.wide.u32 %rd10, %r6, 8;
+ mov.u64 %rd11, sdata;  // sdata is declared outside this function
+ add.s64 %rd1, %rd11, %rd10;  // %rd1 = address of sdata[tid.x]
+ st.shared.f64 [%rd1], %fd65;  // publish this thread's partial minimum
bar.sync 0;
+ setp.lt.u32 %p4, %r9, 512;
+ @%p4 bra BB17_8;  // step below only runs when ntid.x >= 512
-BB11_8:
- mov.f64 %fd59, %fd61;
- setp.lt.u32 %p6, %r11, 256;
- @%p6 bra BB11_12;
-
- setp.gt.u32 %p7, %r10, 127;
- mov.f64 %fd60, %fd59;
- @%p7 bra BB11_11;
+ setp.gt.u32 %p5, %r6, 255;
+ mov.f64 %fd66, %fd65;
+ @%p5 bra BB17_7;
- ld.shared.f64 %fd27, [%rd8+1024];
- add.f64 %fd60, %fd59, %fd27;
- st.shared.f64 [%rd8], %fd60;
+ ld.shared.f64 %fd29, [%rd1+2048];  // 2048 bytes = 256 doubles: sdata[tid.x + 256]
+ min.f64 %fd66, %fd65, %fd29;
+ st.shared.f64 [%rd1], %fd66;
-BB11_11:
- mov.f64 %fd59, %fd60;
+BB17_7:
+ mov.f64 %fd65, %fd66;
bar.sync 0;
-BB11_12:
- mov.f64 %fd57, %fd59;
- setp.lt.u32 %p8, %r11, 128;
- @%p8 bra BB11_16;
+BB17_8:
+ mov.f64 %fd63, %fd65;
+ setp.lt.u32 %p6, %r9, 256;
+ @%p6 bra BB17_12;  // step below only runs when ntid.x >= 256
- setp.gt.u32 %p9, %r10, 63;
- mov.f64 %fd58, %fd57;
- @%p9 bra BB11_15;
+ setp.gt.u32 %p7, %r6, 127;
+ mov.f64 %fd64, %fd63;
+ @%p7 bra BB17_11;
- ld.shared.f64 %fd28, [%rd8+512];
- add.f64 %fd58, %fd57, %fd28;
- st.shared.f64 [%rd8], %fd58;
+ ld.shared.f64 %fd30, [%rd1+1024];  // sdata[tid.x + 128]
+ min.f64 %fd64, %fd63, %fd30;
+ st.shared.f64 [%rd1], %fd64;
-BB11_15:
- mov.f64 %fd57, %fd58;
+BB17_11:
+ mov.f64 %fd63, %fd64;
bar.sync 0;
-BB11_16:
- mov.f64 %fd56, %fd57;
- setp.gt.u32 %p10, %r10, 31;
- @%p10 bra BB11_29;
-
- setp.lt.u32 %p11, %r11, 64;
- @%p11 bra BB11_19;
-
- ld.volatile.shared.f64 %fd29, [%rd8+256];
- add.f64 %fd56, %fd56, %fd29;
- st.volatile.shared.f64 [%rd8], %fd56;
-
-BB11_19:
- mov.f64 %fd55, %fd56;
- setp.lt.u32 %p12, %r11, 32;
- @%p12 bra BB11_21;
-
- ld.volatile.shared.f64 %fd30, [%rd8+128];
- add.f64 %fd55, %fd55, %fd30;
- st.volatile.shared.f64 [%rd8], %fd55;
+BB17_12:
+ mov.f64 %fd61, %fd63;
+ setp.lt.u32 %p8, %r9, 128;
+ @%p8 bra BB17_16;  // step below only runs when ntid.x >= 128
-BB11_21:
- mov.f64 %fd54, %fd55;
- setp.lt.u32 %p13, %r11, 16;
- @%p13 bra BB11_23;
+ setp.gt.u32 %p9, %r6, 63;
+ mov.f64 %fd62, %fd61;
+ @%p9 bra BB17_15;
- ld.volatile.shared.f64 %fd31, [%rd8+64];
- add.f64 %fd54, %fd54, %fd31;
- st.volatile.shared.f64 [%rd8], %fd54;
+ ld.shared.f64 %fd31, [%rd1+512];  // sdata[tid.x + 64]
+ min.f64 %fd62, %fd61, %fd31;
+ st.shared.f64 [%rd1], %fd62;
-BB11_23:
- mov.f64 %fd53, %fd54;
- setp.lt.u32 %p14, %r11, 8;
- @%p14 bra BB11_25;
+BB17_15:
+ mov.f64 %fd61, %fd62;
+ bar.sync 0;
- ld.volatile.shared.f64 %fd32, [%rd8+32];
- add.f64 %fd53, %fd53, %fd32;
- st.volatile.shared.f64 [%rd8], %fd53;
+BB17_16:
+ mov.f64 %fd60, %fd61;
+ setp.gt.u32 %p10, %r6, 31;
+ @%p10 bra BB17_29;  // only threads with tid.x <= 31 run the remaining steps; no bar.sync is issued between them
-BB11_25:
- mov.f64 %fd52, %fd53;
- setp.lt.u32 %p15, %r11, 4;
- @%p15 bra BB11_27;
+ setp.lt.u32 %p11, %r9, 64;
+ @%p11 bra BB17_19;
- ld.volatile.shared.f64 %fd33, [%rd8+16];
- add.f64 %fd52, %fd52, %fd33;
- st.volatile.shared.f64 [%rd8], %fd52;
+ ld.volatile.shared.f64 %fd32, [%rd1+256];  // sdata[tid.x + 32]; NOTE(review): these unsynchronized steps presumably rely on the 32 threads running in lockstep -- confirm for the target architecture
+ min.f64 %fd60, %fd60, %fd32;
+ st.volatile.shared.f64 [%rd1], %fd60;
-BB11_27:
- setp.lt.u32 %p16, %r11, 2;
- @%p16 bra BB11_29;
+BB17_19:
+ mov.f64 %fd59, %fd60;
+ setp.lt.u32 %p12, %r9, 32;
+ @%p12 bra BB17_21;
- ld.volatile.shared.f64 %fd34, [%rd8+8];
- add.f64 %fd35, %fd52, %fd34;
- st.volatile.shared.f64 [%rd8], %fd35;
+ ld.volatile.shared.f64 %fd33, [%rd1+128];  // sdata[tid.x + 16]
+ min.f64 %fd59, %fd59, %fd33;
+ st.volatile.shared.f64 [%rd1], %fd59;
-BB11_29:
- setp.ne.s32 %p17, %r10, 0;
- @%p17 bra BB11_31;
+BB17_21:
+ mov.f64 %fd58, %fd59;
+ setp.lt.u32 %p13, %r9, 16;
+ @%p13 bra BB17_23;
- ld.shared.f64 %fd36, [sdata];
- cvta.to.global.u64 %rd36, %rd2;
- mul.wide.u32 %rd37, %r6, 8;
- add.s64 %rd38, %rd36, %rd37;
- st.global.f64 [%rd38], %fd36;
+ ld.volatile.shared.f64 %fd34, [%rd1+64];  // sdata[tid.x + 8]
+ min.f64 %fd58, %fd58, %fd34;
+ st.volatile.shared.f64 [%rd1], %fd58;
-BB11_31:
- ret;
-}
+BB17_23:
+ mov.f64 %fd57, %fd58;
+ setp.lt.u32 %p14, %r9, 8;
+ @%p14 bra BB17_25;
- // .globl reduce_col
-.visible .entry reduce_col(
- .param .u64 reduce_col_param_0,
- .param .u64 reduce_col_param_1,
- .param .u32 reduce_col_param_2,
- .param .u32 reduce_col_param_3
-)
-{
- .reg .pred %p<4>;
- .reg .b32 %r<11>;
- .reg .f64 %fd<10>;
- .reg .b64 %rd<9>;
+ ld.volatile.shared.f64 %fd35, [%rd1+32];  // sdata[tid.x + 4]
+ min.f64 %fd57, %fd57, %fd35;
+ st.volatile.shared.f64 [%rd1], %fd57;
+BB17_25:
+ mov.f64 %fd56, %fd57;
+ setp.lt.u32 %p15, %r9, 4;
+ @%p15 bra BB17_27;
- ld.param.u64 %rd2, [reduce_col_param_0];
- ld.param.u64 %rd3, [reduce_col_param_1];
- ld.param.u32 %r5, [reduce_col_param_2];
- ld.param.u32 %r6, [reduce_col_param_3];
- mov.u32 %r7, %ntid.x;
- mov.u32 %r8, %ctaid.x;
- mov.u32 %r9, %tid.x;
- mad.lo.s32 %r1, %r7, %r8, %r9;
- setp.ge.u32 %p1, %r1, %r6;
- @%p1 bra BB12_5;
+ ld.volatile.shared.f64 %fd36, [%rd1+16];  // sdata[tid.x + 2]
+ min.f64 %fd56, %fd56, %fd36;
+ st.volatile.shared.f64 [%rd1], %fd56;
- cvta.to.global.u64 %rd1, %rd2;
- mul.lo.s32 %r2, %r6, %r5;
- mov.f64 %fd8, 0d0000000000000000;
- mov.f64 %fd9, %fd8;
- setp.ge.u32 %p2, %r1, %r2;
- @%p2 bra BB12_4;
+BB17_27:
+ setp.lt.u32 %p16, %r9, 2;
+ @%p16 bra BB17_29;
- mov.u32 %r10, %r1;
+ ld.volatile.shared.f64 %fd37, [%rd1+8];  // sdata[tid.x + 1]
+ min.f64 %fd38, %fd56, %fd37;
+ st.volatile.shared.f64 [%rd1], %fd38;
-BB12_3:
- mov.u32 %r3, %r10;
- mul.wide.u32 %rd4, %r3, 8;
- add.s64 %rd5, %rd1, %rd4;
- ld.global.f64 %fd6, [%rd5];
- add.f64 %fd9, %fd9, %fd6;
- add.s32 %r4, %r3, %r6;
- setp.lt.u32 %p3, %r4, %r2;
- mov.u32 %r10, %r4;
- mov.f64 %fd8, %fd9;
- @%p3 bra BB12_3;
+BB17_29:
+ setp.ne.s32 %p17, %r6, 0;
+ @%p17 bra BB17_31;  // only tid.x == 0 writes the block result
-BB12_4:
- cvta.to.global.u64 %rd6, %rd3;
- mul.wide.u32 %rd7, %r1, 8;
- add.s64 %rd8, %rd6, %rd7;
- st.global.f64 [%rd8], %fd8;
+ ld.shared.f64 %fd39, [sdata];  // sdata[0] holds the block's accumulated minimum
+ cvta.to.global.u64 %rd12, %rd3;
+ mul.wide.u32 %rd13, %r7, 8;
+ add.s64 %rd14, %rd12, %rd13;
+ st.global.f64 [%rd14], %fd39;  // out[ctaid.x] = block partial minimum
-BB12_5:
+BB17_31:
ret;
}
@@ -1980,7 +2868,7 @@ BB12_5:
}
shr.u32 %r50, %r49, 20;
setp.ne.s32 %p1, %r50, 0;
- @%p1 bra BB13_2;
+ @%p1 bra BB18_2;
mul.f64 %fd14, %fd12, 0d4350000000000000;
{
@@ -1994,13 +2882,13 @@ BB12_5:
shr.u32 %r16, %r49, 20;
add.s32 %r50, %r16, -54;
-BB13_2:
+BB18_2:
add.s32 %r51, %r50, -1023;
and.b32 %r17, %r49, -2146435073;
or.b32 %r18, %r17, 1072693248;
mov.b64 %fd133, {%r48, %r18};
setp.lt.u32 %p2, %r18, 1073127583;
- @%p2 bra BB13_4;
+ @%p2 bra BB18_4;
{
.reg .b32 %temp;
@@ -2014,7 +2902,7 @@ BB13_2:
mov.b64 %fd133, {%r19, %r21};
add.s32 %r51, %r50, -1022;
-BB13_4:
+BB18_4:
add.f64 %fd16, %fd133, 0d3FF0000000000000;
// inline asm
rcp.approx.ftz.f64 %fd15,%fd16;
@@ -2180,13 +3068,13 @@ BB13_4:
mov.b32 %f2, %r35;
abs.f32 %f1, %f2;
setp.lt.f32 %p4, %f1, 0f4086232B;
- @%p4 bra BB13_7;
+ @%p4 bra BB18_7;
setp.lt.f64 %p5, %fd4, 0d0000000000000000;
add.f64 %fd130, %fd4, 0d7FF0000000000000;
selp.f64 %fd134, 0d0000000000000000, %fd130, %p5;
setp.geu.f32 %p6, %f1, 0f40874800;
- @%p6 bra BB13_7;
+ @%p6 bra BB18_7;
shr.u32 %r36, %r13, 31;
add.s32 %r37, %r13, %r36;
@@ -2201,26 +3089,26 @@ BB13_4:
mov.b64 %fd132, {%r44, %r43};
mul.f64 %fd134, %fd131, %fd132;
-BB13_7:
+BB18_7:
{
.reg .b32 %temp;
mov.b64 {%temp, %r45}, %fd134;
}
and.b32 %r46, %r45, 2147483647;
setp.ne.s32 %p7, %r46, 2146435072;
- @%p7 bra BB13_9;
+ @%p7 bra BB18_9;
{
.reg .b32 %temp;
mov.b64 {%r47, %temp}, %fd134;
}
setp.eq.s32 %p8, %r47, 0;
- @%p8 bra BB13_10;
+ @%p8 bra BB18_10;
-BB13_9:
+BB18_9:
fma.rn.f64 %fd134, %fd134, %fd5, %fd134;
-BB13_10:
+BB18_10:
st.param.f64 [func_retval0+0], %fd134;
ret;
}