This is an automated email from the ASF dual-hosted git repository.
markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 7dcf6fe [MINOR] CUDA 10.2 PTX and spoof cuda helper binaries (Win/Lin/64)
7dcf6fe is described below
commit 7dcf6fe424e80901852785a4a0bf7065c15973bb
Author: Mark Dokter <[email protected]>
AuthorDate: Thu Jun 17 01:42:42 2021 +0200
[MINOR] CUDA 10.2 PTX and spoof cuda helper binaries (Win/Lin/64)
Build command for your reference:
rm -rf target/build-cuda ; cmake -S src/main/cuda -B target/build-cuda -G "Ninja Multi-Config" ; cmake --build target/build-cuda/ --target install --config Release
---
.../cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so | Bin 303688 -> 302880 bytes
.../lib/libsystemds_spoof_cuda-Windows-AMD64.dll | Bin 244736 -> 244736 bytes
src/main/cuda/kernels/SystemDS.ptx | 449 +--
src/main/cuda/kernels/reduction.ptx | 3737 ++++++--------------
4 files changed, 1220 insertions(+), 2966 deletions(-)
diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so
index 5bb044f..ec5be11 100644
Binary files a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so differ
diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll b/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll
index bdf0a4f..b005c1b 100644
Binary files a/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll and b/src/main/cpp/lib/libsystemds_spoof_cuda-Windows-AMD64.dll differ
diff --git a/src/main/cuda/kernels/SystemDS.ptx b/src/main/cuda/kernels/SystemDS.ptx
index b5ca8de..ee355bf 100644
--- a/src/main/cuda/kernels/SystemDS.ptx
+++ b/src/main/cuda/kernels/SystemDS.ptx
@@ -9190,7 +9190,7 @@ BB75_35:
.reg .pred %p<20>;
.reg .b32 %r<72>;
.reg .f64 %fd<58>;
- .reg .b64 %rd<9>;
+ .reg .b64 %rd<10>;
ld.param.u64 %rd1, [reduce_row_mean_d_param_0];
@@ -9338,12 +9338,13 @@ BB76_33:
@%p19 bra BB76_35;
ld.shared.f64 %fd40, [memory];
- cvt.rn.f64.s32 %fd41, %r4;
+ cvt.u64.u32 %rd6, %r4;
+ cvt.rn.f64.s64 %fd41, %rd6;
div.rn.f64 %fd42, %fd40, %fd41;
- cvta.to.global.u64 %rd6, %rd2;
- mul.wide.u32 %rd7, %r6, 8;
- add.s64 %rd8, %rd6, %rd7;
- st.global.f64 [%rd8], %fd42;
+ cvta.to.global.u64 %rd7, %rd2;
+ mul.wide.u32 %rd8, %r6, 8;
+ add.s64 %rd9, %rd7, %rd8;
+ st.global.f64 [%rd9], %fd42;
BB76_35:
ret;
@@ -9360,7 +9361,7 @@ BB76_35:
.reg .pred %p<20>;
.reg .f32 %f<58>;
.reg .b32 %r<72>;
- .reg .b64 %rd<9>;
+ .reg .b64 %rd<10>;
ld.param.u64 %rd1, [reduce_row_mean_f_param_0];
@@ -9508,12 +9509,13 @@ BB77_33:
@%p19 bra BB77_35;
ld.shared.f32 %f40, [memory];
- cvt.rn.f32.s32 %f41, %r4;
+ cvt.u64.u32 %rd6, %r4;
+ cvt.rn.f32.s64 %f41, %rd6;
div.rn.f32 %f42, %f40, %f41;
- cvta.to.global.u64 %rd6, %rd2;
- mul.wide.u32 %rd7, %r6, 4;
- add.s64 %rd8, %rd6, %rd7;
- st.global.f32 [%rd8], %f42;
+ cvta.to.global.u64 %rd7, %rd2;
+ mul.wide.u32 %rd8, %r6, 4;
+ add.s64 %rd9, %rd7, %rd8;
+ st.global.f32 [%rd9], %f42;
BB77_35:
ret;
@@ -9530,7 +9532,7 @@ BB77_35:
.reg .pred %p<4>;
.reg .b32 %r<11>;
.reg .f64 %fd<11>;
- .reg .b64 %rd<9>;
+ .reg .b64 %rd<10>;
ld.param.u64 %rd2, [reduce_col_mean_d_param_0];
@@ -9562,12 +9564,13 @@ BB78_3:
@%p3 bra BB78_3;
BB78_4:
- cvt.rn.f64.s32 %fd7, %r5;
+ cvt.u64.u32 %rd6, %r5;
+ cvt.rn.f64.s64 %fd7, %rd6;
div.rn.f64 %fd8, %fd10, %fd7;
- cvta.to.global.u64 %rd6, %rd3;
- mul.wide.u32 %rd7, %r1, 8;
- add.s64 %rd8, %rd6, %rd7;
- st.global.f64 [%rd8], %fd8;
+ cvta.to.global.u64 %rd7, %rd3;
+ mul.wide.u32 %rd8, %r1, 8;
+ add.s64 %rd9, %rd7, %rd8;
+ st.global.f64 [%rd9], %fd8;
BB78_5:
ret;
@@ -9584,7 +9587,7 @@ BB78_5:
.reg .pred %p<4>;
.reg .f32 %f<11>;
.reg .b32 %r<11>;
- .reg .b64 %rd<9>;
+ .reg .b64 %rd<10>;
ld.param.u64 %rd2, [reduce_col_mean_f_param_0];
@@ -9616,12 +9619,13 @@ BB79_3:
@%p3 bra BB79_3;
BB79_4:
- cvt.rn.f32.s32 %f7, %r5;
+ cvt.u64.u32 %rd6, %r5;
+ cvt.rn.f32.s64 %f7, %rd6;
div.rn.f32 %f8, %f10, %f7;
- cvta.to.global.u64 %rd6, %rd3;
- mul.wide.u32 %rd7, %r1, 4;
- add.s64 %rd8, %rd6, %rd7;
- st.global.f32 [%rd8], %f8;
+ cvta.to.global.u64 %rd7, %rd3;
+ mul.wide.u32 %rd8, %r1, 4;
+ add.s64 %rd9, %rd7, %rd8;
+ st.global.f32 [%rd9], %f8;
BB79_5:
ret;
@@ -10594,7 +10598,7 @@ BB94_11:
.reg .b64 %SPL;
.reg .pred %p<13>;
.reg .f32 %f<38>;
- .reg .b32 %r<70>;
+ .reg .b32 %r<69>;
.reg .f64 %fd<3>;
.reg .b64 %rd<24>;
@@ -10602,12 +10606,12 @@ BB94_11:
mov.u64 %SPL, __local_depot95;
ld.param.u64 %rd7, [matrix_sin_f_param_0];
ld.param.u64 %rd8, [matrix_sin_f_param_1];
- ld.param.u32 %r30, [matrix_sin_f_param_2];
- mov.u32 %r31, %ntid.x;
- mov.u32 %r32, %ctaid.x;
- mov.u32 %r33, %tid.x;
- mad.lo.s32 %r1, %r31, %r32, %r33;
- setp.ge.u32 %p1, %r1, %r30;
+ ld.param.u32 %r29, [matrix_sin_f_param_2];
+ mov.u32 %r30, %ntid.x;
+ mov.u32 %r31, %ctaid.x;
+ mov.u32 %r32, %tid.x;
+ mad.lo.s32 %r1, %r30, %r31, %r32;
+ setp.ge.u32 %p1, %r1, %r29;
@%p1 bra BB95_17;
cvta.to.global.u64 %rd9, %rd7;
@@ -10616,8 +10620,8 @@ BB94_11:
add.u64 %rd1, %SPL, 0;
ld.global.f32 %f1, [%rd11];
mul.f32 %f15, %f1, 0f3F22F983;
- cvt.rni.s32.f32 %r69, %f15;
- cvt.rn.f32.s32 %f16, %r69;
+ cvt.rni.s32.f32 %r68, %f15;
+ cvt.rn.f32.s32 %f16, %r68;
mov.f32 %f17, 0fBFC90FDA;
fma.rn.f32 %f18, %f16, %f17, %f1;
mov.f32 %f19, 0fB3A22168;
@@ -10639,96 +10643,95 @@ BB95_11:
BB95_3:
mov.b32 %r3, %f1;
- shr.u32 %r4, %r3, 23;
- shl.b32 %r36, %r3, 8;
- or.b32 %r5, %r36, -2147483648;
- mov.u32 %r63, 0;
+ shl.b32 %r35, %r3, 8;
+ or.b32 %r4, %r35, -2147483648;
+ mov.u32 %r62, 0;
mov.u64 %rd22, __cudart_i2opi_f;
- mov.u32 %r62, -6;
+ mov.u32 %r61, -6;
mov.u64 %rd23, %rd1;
BB95_4:
.pragma "nounroll";
- ld.const.u32 %r39, [%rd22];
+ ld.const.u32 %r38, [%rd22];
// inline asm
{
- mad.lo.cc.u32 %r37, %r39, %r5, %r63;
- madc.hi.u32 %r63, %r39, %r5, 0;
+ mad.lo.cc.u32 %r36, %r38, %r4, %r62;
+ madc.hi.u32 %r62, %r38, %r4, 0;
}
// inline asm
- st.local.u32 [%rd23], %r37;
+ st.local.u32 [%rd23], %r36;
add.s64 %rd23, %rd23, 4;
add.s64 %rd22, %rd22, 4;
- add.s32 %r62, %r62, 1;
- setp.ne.s32 %p4, %r62, 0;
+ add.s32 %r61, %r61, 1;
+ setp.ne.s32 %p4, %r61, 0;
@%p4 bra BB95_4;
- and.b32 %r42, %r4, 255;
- add.s32 %r43, %r42, -128;
- shr.u32 %r44, %r43, 5;
- and.b32 %r10, %r3, -2147483648;
- st.local.u32 [%rd1+24], %r63;
- mov.u32 %r45, 6;
- sub.s32 %r46, %r45, %r44;
- mul.wide.s32 %rd14, %r46, 4;
+ bfe.u32 %r41, %r3, 23, 8;
+ add.s32 %r42, %r41, -128;
+ shr.u32 %r43, %r42, 5;
+ and.b32 %r9, %r3, -2147483648;
+ st.local.u32 [%rd1+24], %r62;
+ bfe.u32 %r10, %r3, 23, 5;
+ mov.u32 %r44, 6;
+ sub.s32 %r45, %r44, %r43;
+ mul.wide.s32 %rd14, %r45, 4;
add.s64 %rd6, %rd1, %rd14;
- ld.local.u32 %r65, [%rd6];
- ld.local.u32 %r64, [%rd6+-4];
- and.b32 %r13, %r4, 31;
- setp.eq.s32 %p5, %r13, 0;
+ ld.local.u32 %r64, [%rd6];
+ ld.local.u32 %r63, [%rd6+-4];
+ setp.eq.s32 %p5, %r10, 0;
@%p5 bra BB95_7;
- mov.u32 %r47, 32;
- sub.s32 %r48, %r47, %r13;
- shr.u32 %r49, %r64, %r48;
- shl.b32 %r50, %r65, %r13;
- add.s32 %r65, %r49, %r50;
- ld.local.u32 %r51, [%rd6+-8];
- shr.u32 %r52, %r51, %r48;
- shl.b32 %r53, %r64, %r13;
- add.s32 %r64, %r52, %r53;
+ mov.u32 %r46, 32;
+ sub.s32 %r47, %r46, %r10;
+ shr.u32 %r48, %r63, %r47;
+ shl.b32 %r49, %r64, %r10;
+ add.s32 %r64, %r48, %r49;
+ ld.local.u32 %r50, [%rd6+-8];
+ shr.u32 %r51, %r50, %r47;
+ shl.b32 %r52, %r63, %r10;
+ add.s32 %r63, %r51, %r52;
BB95_7:
- shr.u32 %r54, %r64, 30;
- shl.b32 %r55, %r65, 2;
- add.s32 %r67, %r55, %r54;
- shl.b32 %r19, %r64, 2;
- shr.u32 %r56, %r67, 31;
- shr.u32 %r57, %r65, 30;
- add.s32 %r20, %r56, %r57;
- setp.eq.s32 %p6, %r56, 0;
+ shr.u32 %r53, %r63, 30;
+ shl.b32 %r54, %r64, 2;
+ add.s32 %r66, %r54, %r53;
+ shl.b32 %r18, %r63, 2;
+ shr.u32 %r55, %r66, 31;
+ shr.u32 %r56, %r64, 30;
+ add.s32 %r19, %r55, %r56;
+ setp.eq.s32 %p6, %r55, 0;
@%p6 bra BB95_8;
- not.b32 %r58, %r67;
- neg.s32 %r66, %r19;
- setp.eq.s32 %p7, %r19, 0;
- selp.u32 %r59, 1, 0, %p7;
- add.s32 %r67, %r59, %r58;
- xor.b32 %r68, %r10, -2147483648;
+ not.b32 %r57, %r66;
+ neg.s32 %r65, %r18;
+ setp.eq.s32 %p7, %r18, 0;
+ selp.u32 %r58, 1, 0, %p7;
+ add.s32 %r66, %r58, %r57;
+ xor.b32 %r67, %r9, -2147483648;
bra.uni BB95_10;
BB95_8:
- mov.u32 %r66, %r19;
- mov.u32 %r68, %r10;
+ mov.u32 %r65, %r18;
+ mov.u32 %r67, %r9;
BB95_10:
- cvt.u64.u32 %rd15, %r67;
+ cvt.u64.u32 %rd15, %r66;
shl.b64 %rd16, %rd15, 32;
- cvt.u64.u32 %rd17, %r66;
+ cvt.u64.u32 %rd17, %r65;
or.b64 %rd18, %rd16, %rd17;
cvt.rn.f64.s64 %fd1, %rd18;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f22, %fd2;
neg.f32 %f23, %f22;
- setp.eq.s32 %p8, %r68, 0;
+ setp.eq.s32 %p8, %r67, 0;
selp.f32 %f35, %f22, %f23, %p8;
- setp.eq.s32 %p9, %r10, 0;
- neg.s32 %r60, %r20;
- selp.b32 %r69, %r20, %r60, %p9;
+ setp.eq.s32 %p9, %r9, 0;
+ neg.s32 %r59, %r19;
+ selp.b32 %r68, %r19, %r59, %p9;
BB95_12:
- and.b32 %r29, %r69, 1;
- setp.eq.s32 %p10, %r29, 0;
+ and.b32 %r28, %r68, 1;
+ setp.eq.s32 %p10, %r28, 0;
selp.f32 %f7, %f35, 0f3F800000, %p10;
mul.rn.f32 %f8, %f35, %f35;
mov.f32 %f26, 0f00000000;
@@ -10746,8 +10749,8 @@ BB95_14:
selp.f32 %f31, 0fBE2AAAA8, 0fBEFFFFFF, %p10;
fma.rn.f32 %f32, %f30, %f8, %f31;
fma.rn.f32 %f37, %f32, %f9, %f7;
- and.b32 %r61, %r69, 2;
- setp.eq.s32 %p12, %r61, 0;
+ and.b32 %r60, %r68, 2;
+ setp.eq.s32 %p12, %r60, 0;
@%p12 bra BB95_16;
mov.f32 %f34, 0fBF800000;
@@ -11142,7 +11145,7 @@ BB98_11:
.reg .b64 %SPL;
.reg .pred %p<13>;
.reg .f32 %f<38>;
- .reg .b32 %r<71>;
+ .reg .b32 %r<70>;
.reg .f64 %fd<3>;
.reg .b64 %rd<24>;
@@ -11150,12 +11153,12 @@ BB98_11:
mov.u64 %SPL, __local_depot99;
ld.param.u64 %rd7, [matrix_cos_f_param_0];
ld.param.u64 %rd8, [matrix_cos_f_param_1];
- ld.param.u32 %r31, [matrix_cos_f_param_2];
- mov.u32 %r32, %ntid.x;
- mov.u32 %r33, %ctaid.x;
- mov.u32 %r34, %tid.x;
- mad.lo.s32 %r1, %r32, %r33, %r34;
- setp.ge.u32 %p1, %r1, %r31;
+ ld.param.u32 %r30, [matrix_cos_f_param_2];
+ mov.u32 %r31, %ntid.x;
+ mov.u32 %r32, %ctaid.x;
+ mov.u32 %r33, %tid.x;
+ mad.lo.s32 %r1, %r31, %r32, %r33;
+ setp.ge.u32 %p1, %r1, %r30;
@%p1 bra BB99_17;
cvta.to.global.u64 %rd9, %rd7;
@@ -11164,8 +11167,8 @@ BB98_11:
add.u64 %rd1, %SPL, 0;
ld.global.f32 %f1, [%rd11];
mul.f32 %f15, %f1, 0f3F22F983;
- cvt.rni.s32.f32 %r70, %f15;
- cvt.rn.f32.s32 %f16, %r70;
+ cvt.rni.s32.f32 %r69, %f15;
+ cvt.rn.f32.s32 %f16, %r69;
mov.f32 %f17, 0fBFC90FDA;
fma.rn.f32 %f18, %f16, %f17, %f1;
mov.f32 %f19, 0fB3A22168;
@@ -11187,97 +11190,96 @@ BB99_11:
BB99_3:
mov.b32 %r3, %f1;
- shr.u32 %r4, %r3, 23;
- shl.b32 %r37, %r3, 8;
- or.b32 %r5, %r37, -2147483648;
- mov.u32 %r64, 0;
+ shl.b32 %r36, %r3, 8;
+ or.b32 %r4, %r36, -2147483648;
+ mov.u32 %r63, 0;
mov.u64 %rd22, __cudart_i2opi_f;
- mov.u32 %r63, -6;
+ mov.u32 %r62, -6;
mov.u64 %rd23, %rd1;
BB99_4:
.pragma "nounroll";
- ld.const.u32 %r40, [%rd22];
+ ld.const.u32 %r39, [%rd22];
// inline asm
{
- mad.lo.cc.u32 %r38, %r40, %r5, %r64;
- madc.hi.u32 %r64, %r40, %r5, 0;
+ mad.lo.cc.u32 %r37, %r39, %r4, %r63;
+ madc.hi.u32 %r63, %r39, %r4, 0;
}
// inline asm
- st.local.u32 [%rd23], %r38;
+ st.local.u32 [%rd23], %r37;
add.s64 %rd23, %rd23, 4;
add.s64 %rd22, %rd22, 4;
- add.s32 %r63, %r63, 1;
- setp.ne.s32 %p4, %r63, 0;
+ add.s32 %r62, %r62, 1;
+ setp.ne.s32 %p4, %r62, 0;
@%p4 bra BB99_4;
- and.b32 %r43, %r4, 255;
- add.s32 %r44, %r43, -128;
- shr.u32 %r45, %r44, 5;
- and.b32 %r10, %r3, -2147483648;
- st.local.u32 [%rd1+24], %r64;
- mov.u32 %r46, 6;
- sub.s32 %r47, %r46, %r45;
- mul.wide.s32 %rd14, %r47, 4;
+ bfe.u32 %r42, %r3, 23, 8;
+ add.s32 %r43, %r42, -128;
+ shr.u32 %r44, %r43, 5;
+ and.b32 %r9, %r3, -2147483648;
+ st.local.u32 [%rd1+24], %r63;
+ bfe.u32 %r10, %r3, 23, 5;
+ mov.u32 %r45, 6;
+ sub.s32 %r46, %r45, %r44;
+ mul.wide.s32 %rd14, %r46, 4;
add.s64 %rd6, %rd1, %rd14;
- ld.local.u32 %r66, [%rd6];
- ld.local.u32 %r65, [%rd6+-4];
- and.b32 %r13, %r4, 31;
- setp.eq.s32 %p5, %r13, 0;
+ ld.local.u32 %r65, [%rd6];
+ ld.local.u32 %r64, [%rd6+-4];
+ setp.eq.s32 %p5, %r10, 0;
@%p5 bra BB99_7;
- mov.u32 %r48, 32;
- sub.s32 %r49, %r48, %r13;
- shr.u32 %r50, %r65, %r49;
- shl.b32 %r51, %r66, %r13;
- add.s32 %r66, %r50, %r51;
- ld.local.u32 %r52, [%rd6+-8];
- shr.u32 %r53, %r52, %r49;
- shl.b32 %r54, %r65, %r13;
- add.s32 %r65, %r53, %r54;
+ mov.u32 %r47, 32;
+ sub.s32 %r48, %r47, %r10;
+ shr.u32 %r49, %r64, %r48;
+ shl.b32 %r50, %r65, %r10;
+ add.s32 %r65, %r49, %r50;
+ ld.local.u32 %r51, [%rd6+-8];
+ shr.u32 %r52, %r51, %r48;
+ shl.b32 %r53, %r64, %r10;
+ add.s32 %r64, %r52, %r53;
BB99_7:
- shr.u32 %r55, %r65, 30;
- shl.b32 %r56, %r66, 2;
- add.s32 %r68, %r56, %r55;
- shl.b32 %r19, %r65, 2;
- shr.u32 %r57, %r68, 31;
- shr.u32 %r58, %r66, 30;
- add.s32 %r20, %r57, %r58;
- setp.eq.s32 %p6, %r57, 0;
+ shr.u32 %r54, %r64, 30;
+ shl.b32 %r55, %r65, 2;
+ add.s32 %r67, %r55, %r54;
+ shl.b32 %r18, %r64, 2;
+ shr.u32 %r56, %r67, 31;
+ shr.u32 %r57, %r65, 30;
+ add.s32 %r19, %r56, %r57;
+ setp.eq.s32 %p6, %r56, 0;
@%p6 bra BB99_8;
- not.b32 %r59, %r68;
- neg.s32 %r67, %r19;
- setp.eq.s32 %p7, %r19, 0;
- selp.u32 %r60, 1, 0, %p7;
- add.s32 %r68, %r60, %r59;
- xor.b32 %r69, %r10, -2147483648;
+ not.b32 %r58, %r67;
+ neg.s32 %r66, %r18;
+ setp.eq.s32 %p7, %r18, 0;
+ selp.u32 %r59, 1, 0, %p7;
+ add.s32 %r67, %r59, %r58;
+ xor.b32 %r68, %r9, -2147483648;
bra.uni BB99_10;
BB99_8:
- mov.u32 %r67, %r19;
- mov.u32 %r69, %r10;
+ mov.u32 %r66, %r18;
+ mov.u32 %r68, %r9;
BB99_10:
- cvt.u64.u32 %rd15, %r68;
+ cvt.u64.u32 %rd15, %r67;
shl.b64 %rd16, %rd15, 32;
- cvt.u64.u32 %rd17, %r67;
+ cvt.u64.u32 %rd17, %r66;
or.b64 %rd18, %rd16, %rd17;
cvt.rn.f64.s64 %fd1, %rd18;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f22, %fd2;
neg.f32 %f23, %f22;
- setp.eq.s32 %p8, %r69, 0;
+ setp.eq.s32 %p8, %r68, 0;
selp.f32 %f35, %f22, %f23, %p8;
- setp.eq.s32 %p9, %r10, 0;
- neg.s32 %r61, %r20;
- selp.b32 %r70, %r20, %r61, %p9;
+ setp.eq.s32 %p9, %r9, 0;
+ neg.s32 %r60, %r19;
+ selp.b32 %r69, %r19, %r60, %p9;
BB99_12:
- add.s32 %r29, %r70, 1;
- and.b32 %r30, %r29, 1;
- setp.eq.s32 %p10, %r30, 0;
+ add.s32 %r28, %r69, 1;
+ and.b32 %r29, %r28, 1;
+ setp.eq.s32 %p10, %r29, 0;
selp.f32 %f7, %f35, 0f3F800000, %p10;
mul.rn.f32 %f8, %f35, %f35;
mov.f32 %f26, 0f00000000;
@@ -11295,8 +11297,8 @@ BB99_14:
selp.f32 %f31, 0fBE2AAAA8, 0fBEFFFFFF, %p10;
fma.rn.f32 %f32, %f30, %f8, %f31;
fma.rn.f32 %f37, %f32, %f9, %f7;
- and.b32 %r62, %r29, 2;
- setp.eq.s32 %p12, %r62, 0;
+ and.b32 %r61, %r28, 2;
+ setp.eq.s32 %p12, %r61, 0;
@%p12 bra BB99_16;
mov.f32 %f34, 0fBF800000;
@@ -11647,7 +11649,7 @@ BB102_9:
.reg .b64 %SPL;
.reg .pred %p<12>;
.reg .f32 %f<39>;
- .reg .b32 %r<69>;
+ .reg .b32 %r<68>;
.reg .f64 %fd<3>;
.reg .b64 %rd<24>;
@@ -11655,12 +11657,12 @@ BB102_9:
mov.u64 %SPL, __local_depot103;
ld.param.u64 %rd7, [matrix_tan_f_param_0];
ld.param.u64 %rd8, [matrix_tan_f_param_1];
- ld.param.u32 %r29, [matrix_tan_f_param_2];
- mov.u32 %r30, %ntid.x;
- mov.u32 %r31, %ctaid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r1, %r30, %r31, %r32;
- setp.ge.u32 %p1, %r1, %r29;
+ ld.param.u32 %r28, [matrix_tan_f_param_2];
+ mov.u32 %r29, %ntid.x;
+ mov.u32 %r30, %ctaid.x;
+ mov.u32 %r31, %tid.x;
+ mad.lo.s32 %r1, %r29, %r30, %r31;
+ setp.ge.u32 %p1, %r1, %r28;
@%p1 bra BB103_15;
cvta.to.global.u64 %rd9, %rd7;
@@ -11669,8 +11671,8 @@ BB102_9:
add.u64 %rd1, %SPL, 0;
ld.global.f32 %f1, [%rd11];
mul.f32 %f10, %f1, 0f3F22F983;
- cvt.rni.s32.f32 %r68, %f10;
- cvt.rn.f32.s32 %f11, %r68;
+ cvt.rni.s32.f32 %r67, %f10;
+ cvt.rn.f32.s32 %f11, %r67;
mov.f32 %f12, 0fBFC90FDA;
fma.rn.f32 %f13, %f11, %f12, %f1;
mov.f32 %f14, 0fB3A22168;
@@ -11692,92 +11694,91 @@ BB103_11:
BB103_3:
mov.b32 %r3, %f1;
- shr.u32 %r4, %r3, 23;
- shl.b32 %r35, %r3, 8;
- or.b32 %r5, %r35, -2147483648;
- mov.u32 %r62, 0;
+ shl.b32 %r34, %r3, 8;
+ or.b32 %r4, %r34, -2147483648;
+ mov.u32 %r61, 0;
mov.u64 %rd22, __cudart_i2opi_f;
- mov.u32 %r61, -6;
+ mov.u32 %r60, -6;
mov.u64 %rd23, %rd1;
BB103_4:
.pragma "nounroll";
- ld.const.u32 %r38, [%rd22];
+ ld.const.u32 %r37, [%rd22];
// inline asm
{
- mad.lo.cc.u32 %r36, %r38, %r5, %r62;
- madc.hi.u32 %r62, %r38, %r5, 0;
+ mad.lo.cc.u32 %r35, %r37, %r4, %r61;
+ madc.hi.u32 %r61, %r37, %r4, 0;
}
// inline asm
- st.local.u32 [%rd23], %r36;
+ st.local.u32 [%rd23], %r35;
add.s64 %rd23, %rd23, 4;
add.s64 %rd22, %rd22, 4;
- add.s32 %r61, %r61, 1;
- setp.ne.s32 %p4, %r61, 0;
+ add.s32 %r60, %r60, 1;
+ setp.ne.s32 %p4, %r60, 0;
@%p4 bra BB103_4;
- and.b32 %r41, %r4, 255;
- add.s32 %r42, %r41, -128;
- shr.u32 %r43, %r42, 5;
- and.b32 %r10, %r3, -2147483648;
- st.local.u32 [%rd1+24], %r62;
- mov.u32 %r44, 6;
- sub.s32 %r45, %r44, %r43;
- mul.wide.s32 %rd14, %r45, 4;
+ bfe.u32 %r40, %r3, 23, 8;
+ add.s32 %r41, %r40, -128;
+ shr.u32 %r42, %r41, 5;
+ and.b32 %r9, %r3, -2147483648;
+ st.local.u32 [%rd1+24], %r61;
+ bfe.u32 %r10, %r3, 23, 5;
+ mov.u32 %r43, 6;
+ sub.s32 %r44, %r43, %r42;
+ mul.wide.s32 %rd14, %r44, 4;
add.s64 %rd6, %rd1, %rd14;
- ld.local.u32 %r64, [%rd6];
- ld.local.u32 %r63, [%rd6+-4];
- and.b32 %r13, %r4, 31;
- setp.eq.s32 %p5, %r13, 0;
+ ld.local.u32 %r63, [%rd6];
+ ld.local.u32 %r62, [%rd6+-4];
+ setp.eq.s32 %p5, %r10, 0;
@%p5 bra BB103_7;
- mov.u32 %r46, 32;
- sub.s32 %r47, %r46, %r13;
- shr.u32 %r48, %r63, %r47;
- shl.b32 %r49, %r64, %r13;
- add.s32 %r64, %r48, %r49;
- ld.local.u32 %r50, [%rd6+-8];
- shr.u32 %r51, %r50, %r47;
- shl.b32 %r52, %r63, %r13;
- add.s32 %r63, %r51, %r52;
+ mov.u32 %r45, 32;
+ sub.s32 %r46, %r45, %r10;
+ shr.u32 %r47, %r62, %r46;
+ shl.b32 %r48, %r63, %r10;
+ add.s32 %r63, %r47, %r48;
+ ld.local.u32 %r49, [%rd6+-8];
+ shr.u32 %r50, %r49, %r46;
+ shl.b32 %r51, %r62, %r10;
+ add.s32 %r62, %r50, %r51;
BB103_7:
- shr.u32 %r53, %r63, 30;
- shl.b32 %r54, %r64, 2;
- add.s32 %r66, %r54, %r53;
- shl.b32 %r19, %r63, 2;
- shr.u32 %r55, %r66, 31;
- shr.u32 %r56, %r64, 30;
- add.s32 %r20, %r55, %r56;
- setp.eq.s32 %p6, %r55, 0;
+ shr.u32 %r52, %r62, 30;
+ shl.b32 %r53, %r63, 2;
+ add.s32 %r65, %r53, %r52;
+ shl.b32 %r18, %r62, 2;
+ shr.u32 %r54, %r65, 31;
+ shr.u32 %r55, %r63, 30;
+ add.s32 %r19, %r54, %r55;
+ setp.eq.s32 %p6, %r54, 0;
@%p6 bra BB103_8;
- not.b32 %r57, %r66;
- neg.s32 %r65, %r19;
- setp.eq.s32 %p7, %r19, 0;
- selp.u32 %r58, 1, 0, %p7;
- add.s32 %r66, %r58, %r57;
- xor.b32 %r67, %r10, -2147483648;
+ not.b32 %r56, %r65;
+ neg.s32 %r64, %r18;
+ setp.eq.s32 %p7, %r18, 0;
+ selp.u32 %r57, 1, 0, %p7;
+ add.s32 %r65, %r57, %r56;
+ xor.b32 %r66, %r9, -2147483648;
bra.uni BB103_10;
BB103_8:
- mov.u32 %r65, %r19;
- mov.u32 %r67, %r10;
+ mov.u32 %r64, %r18;
+ mov.u32 %r66, %r9;
BB103_10:
- cvt.u64.u32 %rd15, %r66;
+ cvt.u64.u32 %rd15, %r65;
shl.b64 %rd16, %rd15, 32;
- cvt.u64.u32 %rd17, %r65;
+ cvt.u64.u32 %rd17, %r64;
or.b64 %rd18, %rd16, %rd17;
cvt.rn.f64.s64 %fd1, %rd18;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f17, %fd2;
neg.f32 %f18, %f17;
- setp.eq.s32 %p8, %r67, 0;
+ setp.eq.s32 %p8, %r66, 0;
selp.f32 %f37, %f17, %f18, %p8;
- setp.eq.s32 %p9, %r10, 0;
- neg.s32 %r59, %r20;
- selp.b32 %r68, %r20, %r59, %p9;
+ setp.eq.s32 %p9, %r9, 0;
+ neg.s32 %r58, %r19;
+ selp.b32 %r67, %r19, %r58, %p9;
BB103_12:
mul.f32 %f20, %f37, %f37;
@@ -11797,8 +11798,8 @@ BB103_12:
abs.f32 %f34, %f37;
setp.eq.f32 %p10, %f34, 0f3A00B43C;
selp.f32 %f38, %f37, %f33, %p10;
- and.b32 %r60, %r68, 1;
- setp.eq.b32 %p11, %r60, 1;
+ and.b32 %r59, %r67, 1;
+ setp.eq.b32 %p11, %r59, 1;
@!%p11 bra BB103_14;
bra.uni BB103_13;
diff --git a/src/main/cuda/kernels/reduction.ptx b/src/main/cuda/kernels/reduction.ptx
index 31038f5..72b9225 100644
--- a/src/main/cuda/kernels/reduction.ptx
+++ b/src/main/cuda/kernels/reduction.ptx
@@ -10,649 +10,9 @@
.target sm_30
.address_size 64
-.extern .func (.param .b32 func_retval0) vprintf
-(
- .param .b64 vprintf_param_0,
- .param .b64 vprintf_param_1
-)
-;
+ // .globl double2float_f
.extern .shared .align 1 .b8 memory[];
-.global .align 1 .b8 $str[78] = {69, 82, 82, 79, 82, 58, 32, 110, 111, 32, 99, 111, 108, 117, 109, 110, 32, 105, 110, 100, 105, 99, 101, 115, 32, 97, 114, 114, 97, 121, 32, 105, 110, 32, 97, 32, 100, 101, 110, 115, 101, 32, 109, 97, 116, 114, 105, 120, 33, 32, 84, 104, 105, 115, 32, 119, 105, 108, 108, 32, 108, 105, 107, 101, 108, 121, 32, 99, 114, 97, 115, 104, 32, 58, 45, 47, 10, 0};
-
-.func (.param .b32 func_retval0) _ZN14MatrixAccessorIfE9len_denseEv(
- .param .b64 _ZN14MatrixAccessorIfE9len_denseEv_param_0
-)
-{
- .reg .b32 %r<4>;
- .reg .b64 %rd<3>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE9len_denseEv_param_0];
- ld.u64 %rd2, [%rd1];
- ld.u32 %r1, [%rd2+4];
- ld.u32 %r2, [%rd2+8];
- mul.lo.s32 %r3, %r2, %r1;
- st.param.b32 [func_retval0+0], %r3;
- ret;
-}
-
-.func (.param .b32 func_retval0) _ZN14MatrixAccessorIfE9pos_denseEj(
- .param .b64 _ZN14MatrixAccessorIfE9pos_denseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIfE9pos_denseEj_param_1
-)
-{
- .reg .b32 %r<4>;
- .reg .b64 %rd<3>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE9pos_denseEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIfE9pos_denseEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u32 %r2, [%rd2+8];
- mul.lo.s32 %r3, %r2, %r1;
- st.param.b32 [func_retval0+0], %r3;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE10cols_denseEj(
- .param .b64 _ZN14MatrixAccessorIfE10cols_denseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIfE10cols_denseEj_param_1
-)
-{
- .reg .b32 %r<2>;
- .reg .b64 %rd<4>;
-
-
- mov.u64 %rd1, $str;
- cvta.global.u64 %rd2, %rd1;
- mov.u64 %rd3, 0;
- // Callseq Start 0
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd2;
- .param .b64 param1;
- st.param.b64 [param1+0], %rd3;
- .param .b32 retval0;
- call.uni (retval0),
- vprintf,
- (
- param0,
- param1
- );
- ld.param.b32 %r1, [retval0+0];
-
- //{
- }// Callseq End 0
- st.param.b64 [func_retval0+0], %rd3;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE12val_dense_rcEjj(
- .param .b64 _ZN14MatrixAccessorIfE12val_dense_rcEjj_param_0,
- .param .b32 _ZN14MatrixAccessorIfE12val_dense_rcEjj_param_1,
- .param .b32 _ZN14MatrixAccessorIfE12val_dense_rcEjj_param_2
-)
-{
- .reg .b32 %r<5>;
- .reg .b64 %rd<6>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE12val_dense_rcEjj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIfE12val_dense_rcEjj_param_1];
- ld.param.u32 %r2, [_ZN14MatrixAccessorIfE12val_dense_rcEjj_param_2];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- ld.u32 %r3, [%rd2+8];
- mad.lo.s32 %r4, %r3, %r1, %r2;
- mul.wide.u32 %rd4, %r4, 4;
- add.s64 %rd5, %rd3, %rd4;
- st.param.b64 [func_retval0+0], %rd5;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE10vals_denseEj(
- .param .b64 _ZN14MatrixAccessorIfE10vals_denseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIfE10vals_denseEj_param_1
-)
-{
- .reg .b32 %r<2>;
- .reg .b64 %rd<6>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE10vals_denseEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIfE10vals_denseEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- mul.wide.u32 %rd4, %r1, 4;
- add.s64 %rd5, %rd3, %rd4;
- st.param.b64 [func_retval0+0], %rd5;
- ret;
-}
-
-.func (.param .b32 func_retval0) _ZN14MatrixAccessorIfE13row_len_denseEj(
- .param .b64 _ZN14MatrixAccessorIfE13row_len_denseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIfE13row_len_denseEj_param_1
-)
-{
- .reg .b32 %r<2>;
- .reg .b64 %rd<3>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE13row_len_denseEj_param_0];
- ld.u64 %rd2, [%rd1];
- ld.u32 %r1, [%rd2+4];
- st.param.b32 [func_retval0+0], %r1;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE11val_dense_iEj(
- .param .b64 _ZN14MatrixAccessorIfE11val_dense_iEj_param_0,
- .param .b32 _ZN14MatrixAccessorIfE11val_dense_iEj_param_1
-)
-{
- .reg .b32 %r<2>;
- .reg .b64 %rd<6>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE11val_dense_iEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIfE11val_dense_iEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- mul.wide.u32 %rd4, %r1, 4;
- add.s64 %rd5, %rd3, %rd4;
- st.param.b64 [func_retval0+0], %rd5;
- ret;
-}
-
-.func (.param .b32 func_retval0) _ZN14MatrixAccessorIfE10len_sparseEv(
- .param .b64 _ZN14MatrixAccessorIfE10len_sparseEv_param_0
-)
-{
- .reg .b32 %r<2>;
- .reg .b64 %rd<3>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE10len_sparseEv_param_0];
- ld.u64 %rd2, [%rd1];
- ld.u32 %r1, [%rd2];
- st.param.b32 [func_retval0+0], %r1;
- ret;
-}
-
-.func (.param .b32 func_retval0) _ZN14MatrixAccessorIfE10pos_sparseEj(
- .param .b64 _ZN14MatrixAccessorIfE10pos_sparseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIfE10pos_sparseEj_param_1
-)
-{
- .reg .b32 %r<3>;
- .reg .b64 %rd<6>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE10pos_sparseEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIfE10pos_sparseEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+16];
- mul.wide.u32 %rd4, %r1, 4;
- add.s64 %rd5, %rd3, %rd4;
- ld.u32 %r2, [%rd5];
- st.param.b32 [func_retval0+0], %r2;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE11cols_sparseEj(
- .param .b64 _ZN14MatrixAccessorIfE11cols_sparseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIfE11cols_sparseEj_param_1
-)
-{
- .reg .b32 %r<3>;
- .reg .b64 %rd<9>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE11cols_sparseEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIfE11cols_sparseEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+24];
- ld.u64 %rd4, [%rd2+16];
- mul.wide.u32 %rd5, %r1, 4;
- add.s64 %rd6, %rd4, %rd5;
- ld.u32 %r2, [%rd6];
- mul.wide.u32 %rd7, %r2, 4;
- add.s64 %rd8, %rd3, %rd7;
- st.param.b64 [func_retval0+0], %rd8;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE13val_sparse_rcEjj(
- .param .b64 _ZN14MatrixAccessorIfE13val_sparse_rcEjj_param_0,
- .param .b32 _ZN14MatrixAccessorIfE13val_sparse_rcEjj_param_1,
- .param .b32 _ZN14MatrixAccessorIfE13val_sparse_rcEjj_param_2
-)
-{
- .reg .b64 %rd<4>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE13val_sparse_rcEjj_param_0];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- st.param.b64 [func_retval0+0], %rd3;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE11vals_sparseEj(
- .param .b64 _ZN14MatrixAccessorIfE11vals_sparseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIfE11vals_sparseEj_param_1
-)
-{
- .reg .b32 %r<3>;
- .reg .b64 %rd<9>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE11vals_sparseEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIfE11vals_sparseEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- ld.u64 %rd4, [%rd2+16];
- mul.wide.u32 %rd5, %r1, 4;
- add.s64 %rd6, %rd4, %rd5;
- ld.u32 %r2, [%rd6];
- mul.wide.u32 %rd7, %r2, 4;
- add.s64 %rd8, %rd3, %rd7;
- st.param.b64 [func_retval0+0], %rd8;
- ret;
-}
-
-.func (.param .b32 func_retval0) _ZN14MatrixAccessorIfE14row_len_sparseEj(
- .param .b64 _ZN14MatrixAccessorIfE14row_len_sparseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIfE14row_len_sparseEj_param_1
-)
-{
- .reg .b32 %r<6>;
- .reg .b64 %rd<8>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE14row_len_sparseEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIfE14row_len_sparseEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+16];
- add.s32 %r2, %r1, 1;
- mul.wide.u32 %rd4, %r2, 4;
- add.s64 %rd5, %rd3, %rd4;
- ld.u32 %r3, [%rd5];
- mul.wide.u32 %rd6, %r1, 4;
- add.s64 %rd7, %rd3, %rd6;
- ld.u32 %r4, [%rd7];
- sub.s32 %r5, %r3, %r4;
- st.param.b32 [func_retval0+0], %r5;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIfE12val_sparse_iEj(
- .param .b64 _ZN14MatrixAccessorIfE12val_sparse_iEj_param_0,
- .param .b32 _ZN14MatrixAccessorIfE12val_sparse_iEj_param_1
-)
-{
- .reg .b32 %r<2>;
- .reg .b64 %rd<6>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE12val_sparse_iEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIfE12val_sparse_iEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- mul.wide.u32 %rd4, %r1, 4;
- add.s64 %rd5, %rd3, %rd4;
- st.param.b64 [func_retval0+0], %rd5;
- ret;
-}
-
-.func _ZN14MatrixAccessorIfE10set_sparseEjjf(
- .param .b64 _ZN14MatrixAccessorIfE10set_sparseEjjf_param_0,
- .param .b32 _ZN14MatrixAccessorIfE10set_sparseEjjf_param_1,
- .param .b32 _ZN14MatrixAccessorIfE10set_sparseEjjf_param_2,
- .param .b32 _ZN14MatrixAccessorIfE10set_sparseEjjf_param_3
-)
-{
- .reg .f32 %f<2>;
- .reg .b32 %r<3>;
- .reg .b64 %rd<9>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIfE10set_sparseEjjf_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIfE10set_sparseEjjf_param_1];
- ld.param.u32 %r2, [_ZN14MatrixAccessorIfE10set_sparseEjjf_param_2];
- ld.param.f32 %f1, [_ZN14MatrixAccessorIfE10set_sparseEjjf_param_3];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- mul.wide.u32 %rd4, %r1, 4;
- add.s64 %rd5, %rd3, %rd4;
- st.f32 [%rd5], %f1;
- ld.u64 %rd6, [%rd1];
- ld.u64 %rd7, [%rd6+24];
- add.s64 %rd8, %rd7, %rd4;
- st.u32 [%rd8], %r2;
- ret;
-}
-
-.func (.param .b32 func_retval0) _ZN14MatrixAccessorIdE9len_denseEv(
- .param .b64 _ZN14MatrixAccessorIdE9len_denseEv_param_0
-)
-{
- .reg .b32 %r<4>;
- .reg .b64 %rd<3>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE9len_denseEv_param_0];
- ld.u64 %rd2, [%rd1];
- ld.u32 %r1, [%rd2+4];
- ld.u32 %r2, [%rd2+8];
- mul.lo.s32 %r3, %r2, %r1;
- st.param.b32 [func_retval0+0], %r3;
- ret;
-}
-
-.func (.param .b32 func_retval0) _ZN14MatrixAccessorIdE9pos_denseEj(
- .param .b64 _ZN14MatrixAccessorIdE9pos_denseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIdE9pos_denseEj_param_1
-)
-{
- .reg .b32 %r<4>;
- .reg .b64 %rd<3>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE9pos_denseEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIdE9pos_denseEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u32 %r2, [%rd2+8];
- mul.lo.s32 %r3, %r2, %r1;
- st.param.b32 [func_retval0+0], %r3;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIdE10cols_denseEj(
- .param .b64 _ZN14MatrixAccessorIdE10cols_denseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIdE10cols_denseEj_param_1
-)
-{
- .reg .b32 %r<2>;
- .reg .b64 %rd<4>;
-
-
- mov.u64 %rd1, $str;
- cvta.global.u64 %rd2, %rd1;
- mov.u64 %rd3, 0;
- // Callseq Start 1
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd2;
- .param .b64 param1;
- st.param.b64 [param1+0], %rd3;
- .param .b32 retval0;
- call.uni (retval0),
- vprintf,
- (
- param0,
- param1
- );
- ld.param.b32 %r1, [retval0+0];
-
- //{
- }// Callseq End 1
- st.param.b64 [func_retval0+0], %rd3;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIdE12val_dense_rcEjj(
- .param .b64 _ZN14MatrixAccessorIdE12val_dense_rcEjj_param_0,
- .param .b32 _ZN14MatrixAccessorIdE12val_dense_rcEjj_param_1,
- .param .b32 _ZN14MatrixAccessorIdE12val_dense_rcEjj_param_2
-)
-{
- .reg .b32 %r<5>;
- .reg .b64 %rd<6>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE12val_dense_rcEjj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIdE12val_dense_rcEjj_param_1];
- ld.param.u32 %r2, [_ZN14MatrixAccessorIdE12val_dense_rcEjj_param_2];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- ld.u32 %r3, [%rd2+8];
- mad.lo.s32 %r4, %r3, %r1, %r2;
- mul.wide.u32 %rd4, %r4, 8;
- add.s64 %rd5, %rd3, %rd4;
- st.param.b64 [func_retval0+0], %rd5;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIdE10vals_denseEj(
- .param .b64 _ZN14MatrixAccessorIdE10vals_denseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIdE10vals_denseEj_param_1
-)
-{
- .reg .b32 %r<2>;
- .reg .b64 %rd<6>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE10vals_denseEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIdE10vals_denseEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- mul.wide.u32 %rd4, %r1, 8;
- add.s64 %rd5, %rd3, %rd4;
- st.param.b64 [func_retval0+0], %rd5;
- ret;
-}
-
-.func (.param .b32 func_retval0) _ZN14MatrixAccessorIdE13row_len_denseEj(
- .param .b64 _ZN14MatrixAccessorIdE13row_len_denseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIdE13row_len_denseEj_param_1
-)
-{
- .reg .b32 %r<2>;
- .reg .b64 %rd<3>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE13row_len_denseEj_param_0];
- ld.u64 %rd2, [%rd1];
- ld.u32 %r1, [%rd2+4];
- st.param.b32 [func_retval0+0], %r1;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIdE11val_dense_iEj(
- .param .b64 _ZN14MatrixAccessorIdE11val_dense_iEj_param_0,
- .param .b32 _ZN14MatrixAccessorIdE11val_dense_iEj_param_1
-)
-{
- .reg .b32 %r<2>;
- .reg .b64 %rd<6>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE11val_dense_iEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIdE11val_dense_iEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- mul.wide.u32 %rd4, %r1, 8;
- add.s64 %rd5, %rd3, %rd4;
- st.param.b64 [func_retval0+0], %rd5;
- ret;
-}
-
-.func (.param .b32 func_retval0) _ZN14MatrixAccessorIdE10len_sparseEv(
- .param .b64 _ZN14MatrixAccessorIdE10len_sparseEv_param_0
-)
-{
- .reg .b32 %r<2>;
- .reg .b64 %rd<3>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE10len_sparseEv_param_0];
- ld.u64 %rd2, [%rd1];
- ld.u32 %r1, [%rd2];
- st.param.b32 [func_retval0+0], %r1;
- ret;
-}
-
-.func (.param .b32 func_retval0) _ZN14MatrixAccessorIdE10pos_sparseEj(
- .param .b64 _ZN14MatrixAccessorIdE10pos_sparseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIdE10pos_sparseEj_param_1
-)
-{
- .reg .b32 %r<3>;
- .reg .b64 %rd<6>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE10pos_sparseEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIdE10pos_sparseEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+16];
- mul.wide.u32 %rd4, %r1, 4;
- add.s64 %rd5, %rd3, %rd4;
- ld.u32 %r2, [%rd5];
- st.param.b32 [func_retval0+0], %r2;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIdE11cols_sparseEj(
- .param .b64 _ZN14MatrixAccessorIdE11cols_sparseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIdE11cols_sparseEj_param_1
-)
-{
- .reg .b32 %r<3>;
- .reg .b64 %rd<9>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE11cols_sparseEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIdE11cols_sparseEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+24];
- ld.u64 %rd4, [%rd2+16];
- mul.wide.u32 %rd5, %r1, 4;
- add.s64 %rd6, %rd4, %rd5;
- ld.u32 %r2, [%rd6];
- mul.wide.u32 %rd7, %r2, 4;
- add.s64 %rd8, %rd3, %rd7;
- st.param.b64 [func_retval0+0], %rd8;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIdE13val_sparse_rcEjj(
- .param .b64 _ZN14MatrixAccessorIdE13val_sparse_rcEjj_param_0,
- .param .b32 _ZN14MatrixAccessorIdE13val_sparse_rcEjj_param_1,
- .param .b32 _ZN14MatrixAccessorIdE13val_sparse_rcEjj_param_2
-)
-{
- .reg .b64 %rd<4>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE13val_sparse_rcEjj_param_0];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- st.param.b64 [func_retval0+0], %rd3;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIdE11vals_sparseEj(
- .param .b64 _ZN14MatrixAccessorIdE11vals_sparseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIdE11vals_sparseEj_param_1
-)
-{
- .reg .b32 %r<3>;
- .reg .b64 %rd<9>;
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE11vals_sparseEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIdE11vals_sparseEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- ld.u64 %rd4, [%rd2+16];
- mul.wide.u32 %rd5, %r1, 4;
- add.s64 %rd6, %rd4, %rd5;
- ld.u32 %r2, [%rd6];
- mul.wide.u32 %rd7, %r2, 8;
- add.s64 %rd8, %rd3, %rd7;
- st.param.b64 [func_retval0+0], %rd8;
- ret;
-}
-
-.func (.param .b32 func_retval0) _ZN14MatrixAccessorIdE14row_len_sparseEj(
- .param .b64 _ZN14MatrixAccessorIdE14row_len_sparseEj_param_0,
- .param .b32 _ZN14MatrixAccessorIdE14row_len_sparseEj_param_1
-)
-{
- .reg .b32 %r<6>;
- .reg .b64 %rd<8>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE14row_len_sparseEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIdE14row_len_sparseEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+16];
- add.s32 %r2, %r1, 1;
- mul.wide.u32 %rd4, %r2, 4;
- add.s64 %rd5, %rd3, %rd4;
- ld.u32 %r3, [%rd5];
- mul.wide.u32 %rd6, %r1, 4;
- add.s64 %rd7, %rd3, %rd6;
- ld.u32 %r4, [%rd7];
- sub.s32 %r5, %r3, %r4;
- st.param.b32 [func_retval0+0], %r5;
- ret;
-}
-
-.func (.param .b64 func_retval0) _ZN14MatrixAccessorIdE12val_sparse_iEj(
- .param .b64 _ZN14MatrixAccessorIdE12val_sparse_iEj_param_0,
- .param .b32 _ZN14MatrixAccessorIdE12val_sparse_iEj_param_1
-)
-{
- .reg .b32 %r<2>;
- .reg .b64 %rd<6>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE12val_sparse_iEj_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIdE12val_sparse_iEj_param_1];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- mul.wide.u32 %rd4, %r1, 8;
- add.s64 %rd5, %rd3, %rd4;
- st.param.b64 [func_retval0+0], %rd5;
- ret;
-}
-
-.func _ZN14MatrixAccessorIdE10set_sparseEjjd(
- .param .b64 _ZN14MatrixAccessorIdE10set_sparseEjjd_param_0,
- .param .b32 _ZN14MatrixAccessorIdE10set_sparseEjjd_param_1,
- .param .b32 _ZN14MatrixAccessorIdE10set_sparseEjjd_param_2,
- .param .b64 _ZN14MatrixAccessorIdE10set_sparseEjjd_param_3
-)
-{
- .reg .b32 %r<3>;
- .reg .f64 %fd<2>;
- .reg .b64 %rd<10>;
-
-
- ld.param.u64 %rd1, [_ZN14MatrixAccessorIdE10set_sparseEjjd_param_0];
- ld.param.u32 %r1, [_ZN14MatrixAccessorIdE10set_sparseEjjd_param_1];
- ld.param.u32 %r2, [_ZN14MatrixAccessorIdE10set_sparseEjjd_param_2];
- ld.param.f64 %fd1, [_ZN14MatrixAccessorIdE10set_sparseEjjd_param_3];
- ld.u64 %rd2, [%rd1];
- ld.u64 %rd3, [%rd2+32];
- mul.wide.u32 %rd4, %r1, 8;
- add.s64 %rd5, %rd3, %rd4;
- st.f64 [%rd5], %fd1;
- ld.u64 %rd6, [%rd1];
- ld.u64 %rd7, [%rd6+24];
- mul.wide.u32 %rd8, %r1, 4;
- add.s64 %rd9, %rd7, %rd8;
- st.u32 [%rd9], %r2;
- ret;
-}
-
- // .globl double2float_f
.visible .entry double2float_f(
.param .u64 double2float_f_param_0,
.param .u64 double2float_f_param_1,
@@ -674,7 +34,7 @@
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.s32 %p1, %r1, %r2;
- @%p1 bra BB30_2;
+ @%p1 bra BB0_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 8;
@@ -686,7 +46,7 @@
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f1;
-BB30_2:
+BB0_2:
ret;
}
@@ -712,7 +72,7 @@ BB30_2:
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r4, %r3, %r5;
setp.ge.s32 %p1, %r1, %r2;
- @%p1 bra BB31_2;
+ @%p1 bra BB1_2;
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
@@ -724,7 +84,7 @@ BB30_2:
add.s64 %rd8, %rd6, %rd7;
st.global.f64 [%rd8], %fd1;
-BB31_2:
+BB1_2:
ret;
}
@@ -735,400 +95,225 @@ BB31_2:
.param .u32 reduce_sum_f_param_2
)
{
- .local .align 8 .b8 __local_depot32[272];
- .reg .b64 %SP;
- .reg .b64 %SPL;
.reg .pred %p<25>;
- .reg .f32 %f<60>;
- .reg .b32 %r<44>;
- .reg .b64 %rd<123>;
-
-
- mov.u64 %SPL, __local_depot32;
- cvta.local.u64 %SP, %SPL;
- ld.param.u64 %rd17, [reduce_sum_f_param_0];
- ld.param.u64 %rd16, [reduce_sum_f_param_1];
- ld.param.u32 %r5, [reduce_sum_f_param_2];
- add.u64 %rd18, %SP, 0;
- add.u64 %rd1, %SPL, 0;
- st.local.u64 [%rd1], %rd17;
- cvta.to.global.u64 %rd19, %rd17;
- ld.global.u64 %rd20, [%rd19+16];
- setp.eq.s64 %p1, %rd20, 0;
- @%p1 bra BB32_2;
-
- mov.u64 %rd21, _ZN14MatrixAccessorIfE10len_sparseEv;
- st.local.u64 [%rd1+8], %rd21;
- mov.u64 %rd23, 0;
- st.local.u64 [%rd1+16], %rd23;
- mov.u64 %rd24, _ZN14MatrixAccessorIfE10pos_sparseEj;
- st.local.u64 [%rd1+40], %rd24;
- st.local.u64 [%rd1+48], %rd23;
- mov.u64 %rd26, _ZN14MatrixAccessorIfE11cols_sparseEj;
- st.local.u64 [%rd1+56], %rd26;
- st.local.u64 [%rd1+64], %rd23;
- mov.u64 %rd28, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
- st.local.u64 [%rd1+88], %rd28;
- st.local.u64 [%rd1+96], %rd23;
- mov.u64 %rd30, _ZN14MatrixAccessorIfE11vals_sparseEj;
- st.local.u64 [%rd1+104], %rd30;
- st.local.u64 [%rd1+112], %rd23;
- mov.u64 %rd32, _ZN14MatrixAccessorIfE14row_len_sparseEj;
- st.local.u64 [%rd1+24], %rd32;
- st.local.u64 [%rd1+32], %rd23;
- mov.u64 %rd34, _ZN14MatrixAccessorIfE12val_sparse_iEj;
- st.local.u64 [%rd1+72], %rd34;
- st.local.u64 [%rd1+80], %rd23;
- mov.u64 %rd36, _ZN14MatrixAccessorIfE10set_sparseEjjf;
- st.local.u64 [%rd1+120], %rd36;
- st.local.u64 [%rd1+128], %rd23;
- bra.uni BB32_3;
-
-BB32_2:
- mov.u64 %rd38, _ZN14MatrixAccessorIfE9len_denseEv;
- st.local.u64 [%rd1+8], %rd38;
- mov.u64 %rd40, 0;
- st.local.u64 [%rd1+16], %rd40;
- mov.u64 %rd41, _ZN14MatrixAccessorIfE9pos_denseEj;
- st.local.u64 [%rd1+40], %rd41;
- st.local.u64 [%rd1+48], %rd40;
- mov.u64 %rd43, _ZN14MatrixAccessorIfE10cols_denseEj;
- st.local.u64 [%rd1+56], %rd43;
- st.local.u64 [%rd1+64], %rd40;
- mov.u64 %rd45, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
- st.local.u64 [%rd1+88], %rd45;
- st.local.u64 [%rd1+96], %rd40;
- mov.u64 %rd47, _ZN14MatrixAccessorIfE10vals_denseEj;
- st.local.u64 [%rd1+104], %rd47;
- st.local.u64 [%rd1+112], %rd40;
- mov.u64 %rd49, _ZN14MatrixAccessorIfE13row_len_denseEj;
- st.local.u64 [%rd1+24], %rd49;
- st.local.u64 [%rd1+32], %rd40;
- mov.u64 %rd51, _ZN14MatrixAccessorIfE11val_dense_iEj;
- st.local.u64 [%rd1+72], %rd51;
- st.local.u64 [%rd1+80], %rd40;
-
-BB32_3:
- add.u64 %rd53, %SP, 136;
- add.u64 %rd2, %SPL, 136;
- st.local.u64 [%rd2], %rd16;
- cvta.to.global.u64 %rd54, %rd16;
- ld.global.u64 %rd55, [%rd54+16];
- setp.eq.s64 %p2, %rd55, 0;
- @%p2 bra BB32_5;
-
- mov.u64 %rd56, _ZN14MatrixAccessorIfE10len_sparseEv;
- st.local.u64 [%rd2+8], %rd56;
- mov.u64 %rd58, 0;
- st.local.u64 [%rd2+16], %rd58;
- mov.u64 %rd59, _ZN14MatrixAccessorIfE10pos_sparseEj;
- st.local.u64 [%rd2+40], %rd59;
- st.local.u64 [%rd2+48], %rd58;
- mov.u64 %rd61, _ZN14MatrixAccessorIfE11cols_sparseEj;
- st.local.u64 [%rd2+56], %rd61;
- st.local.u64 [%rd2+64], %rd58;
- mov.u64 %rd63, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
- st.local.u64 [%rd2+88], %rd63;
- st.local.u64 [%rd2+96], %rd58;
- mov.u64 %rd65, _ZN14MatrixAccessorIfE11vals_sparseEj;
- st.local.u64 [%rd2+104], %rd65;
- st.local.u64 [%rd2+112], %rd58;
- mov.u64 %rd67, _ZN14MatrixAccessorIfE14row_len_sparseEj;
- st.local.u64 [%rd2+24], %rd67;
- st.local.u64 [%rd2+32], %rd58;
- mov.u64 %rd69, _ZN14MatrixAccessorIfE12val_sparse_iEj;
- st.local.u64 [%rd2+72], %rd69;
- st.local.u64 [%rd2+80], %rd58;
- mov.u64 %rd71, _ZN14MatrixAccessorIfE10set_sparseEjjf;
- st.local.u64 [%rd2+120], %rd71;
- st.local.u64 [%rd2+128], %rd58;
- bra.uni BB32_6;
-
-BB32_5:
- mov.u64 %rd73, _ZN14MatrixAccessorIfE9len_denseEv;
- st.local.u64 [%rd2+8], %rd73;
- mov.u64 %rd75, 0;
- st.local.u64 [%rd2+16], %rd75;
- mov.u64 %rd76, _ZN14MatrixAccessorIfE9pos_denseEj;
- st.local.u64 [%rd2+40], %rd76;
- st.local.u64 [%rd2+48], %rd75;
- mov.u64 %rd78, _ZN14MatrixAccessorIfE10cols_denseEj;
- st.local.u64 [%rd2+56], %rd78;
- st.local.u64 [%rd2+64], %rd75;
- mov.u64 %rd80, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
- st.local.u64 [%rd2+88], %rd80;
- st.local.u64 [%rd2+96], %rd75;
- mov.u64 %rd82, _ZN14MatrixAccessorIfE10vals_denseEj;
- st.local.u64 [%rd2+104], %rd82;
- st.local.u64 [%rd2+112], %rd75;
- mov.u64 %rd84, _ZN14MatrixAccessorIfE13row_len_denseEj;
- st.local.u64 [%rd2+24], %rd84;
- st.local.u64 [%rd2+32], %rd75;
- mov.u64 %rd86, _ZN14MatrixAccessorIfE11val_dense_iEj;
- st.local.u64 [%rd2+72], %rd86;
- st.local.u64 [%rd2+80], %rd75;
-
-BB32_6:
- mov.u32 %r6, %tid.x;
- mov.u32 %r7, %ctaid.x;
- shl.b32 %r8, %r7, 1;
- mov.u32 %r9, %ntid.x;
- mad.lo.s32 %r43, %r8, %r9, %r6;
- mov.f32 %f44, 0f00000000;
- setp.ge.u32 %p3, %r43, %r5;
- @%p3 bra BB32_15;
-
- mov.f32 %f44, 0f00000000;
-
-BB32_8:
- ld.local.u64 %rd3, [%rd1+112];
- ld.local.u64 %rd120, [%rd1+104];
- and.b64 %rd90, %rd120, 1;
- setp.eq.b64 %p4, %rd90, 1;
- @!%p4 bra BB32_10;
- bra.uni BB32_9;
-
-BB32_9:
- add.s64 %rd93, %rd1, %rd3;
- ld.local.u64 %rd94, [%rd93];
- add.s64 %rd95, %rd120, %rd94;
- ld.u64 %rd120, [%rd95+-1];
-
-BB32_10:
- add.s64 %rd97, %rd18, %rd3;
- // Callseq Start 2
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd97;
- .param .b32 param1;
- st.param.b32 [param1+0], %r43;
- .param .b64 retval0;
- prototype_2 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
- call (retval0),
- %rd120,
- (
- param0,
- param1
- )
- , prototype_2;
- ld.param.b64 %rd99, [retval0+0];
-
- //{
- }// Callseq End 2
- ld.f32 %f31, [%rd99];
- add.f32 %f44, %f44, %f31;
- add.s32 %r16, %r43, %r9;
- setp.ge.u32 %p5, %r16, %r5;
- @%p5 bra BB32_14;
-
- ld.local.u64 %rd121, [%rd1+104];
- and.b64 %rd102, %rd121, 1;
- setp.eq.b64 %p6, %rd102, 1;
- ld.local.u64 %rd8, [%rd1+112];
- @!%p6 bra BB32_13;
- bra.uni BB32_12;
-
-BB32_12:
- add.s64 %rd105, %rd1, %rd8;
- ld.local.u64 %rd106, [%rd105];
- add.s64 %rd107, %rd121, %rd106;
- ld.u64 %rd121, [%rd107+-1];
-
-BB32_13:
- add.s64 %rd109, %rd18, %rd8;
- // Callseq Start 3
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd109;
- .param .b32 param1;
- st.param.b32 [param1+0], %r16;
- .param .b64 retval0;
- prototype_3 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
- call (retval0),
- %rd121,
- (
- param0,
- param1
- )
- , prototype_3;
- ld.param.b64 %rd111, [retval0+0];
-
- //{
- }// Callseq End 3
- ld.f32 %f32, [%rd111];
- add.f32 %f44, %f44, %f32;
-
-BB32_14:
- shl.b32 %r20, %r9, 1;
- mov.u32 %r21, %nctaid.x;
- mad.lo.s32 %r43, %r20, %r21, %r43;
- setp.lt.u32 %p7, %r43, %r5;
- @%p7 bra BB32_8;
-
-BB32_15:
- shl.b32 %r23, %r6, 2;
- mov.u32 %r24, memory;
- add.s32 %r4, %r24, %r23;
- st.shared.f32 [%r4], %f44;
+ .reg .f32 %f<69>;
+ .reg .b32 %r<57>;
+ .reg .b64 %rd<36>;
+
+
+ ld.param.u64 %rd9, [reduce_sum_f_param_0];
+ ld.param.u64 %rd10, [reduce_sum_f_param_1];
+ ld.param.u32 %r13, [reduce_sum_f_param_2];
+ mov.u32 %r14, %ctaid.x;
+ shl.b32 %r15, %r14, 1;
+ mov.u32 %r16, %ntid.x;
+ mov.u32 %r17, %tid.x;
+ mad.lo.s32 %r56, %r15, %r16, %r17;
+ mov.f32 %f51, 0f00000000;
+ setp.ge.u32 %p1, %r56, %r13;
+ @%p1 bra BB2_11;
+
+ cvta.to.global.u64 %rd11, %rd9;
+ ld.global.u64 %rd1, [%rd11+16];
+ setp.eq.s64 %p2, %rd1, 0;
+ ld.global.u64 %rd12, [%rd11+32];
+ cvta.to.global.u64 %rd2, %rd12;
+ mov.f32 %f51, 0f00000000;
+ @%p2 bra BB2_8;
+
+ mad.lo.s32 %r54, %r15, %r16, %r17;
+ mov.f32 %f51, 0f00000000;
+ mov.u64 %rd32, %rd1;
+
+BB2_3:
+ cvta.to.global.u64 %rd13, %rd32;
+ mul.wide.u32 %rd14, %r54, 4;
+ add.s64 %rd15, %rd13, %rd14;
+ ld.global.u32 %r27, [%rd15];
+ mul.wide.u32 %rd16, %r27, 4;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.f32 %f36, [%rd17];
+ add.f32 %f51, %f51, %f36;
+ add.s32 %r55, %r54, %r16;
+ setp.ge.u32 %p3, %r55, %r13;
+ @%p3 bra BB2_7;
+
+ setp.eq.s64 %p4, %rd32, 0;
+ mov.u64 %rd32, 0;
+ @%p4 bra BB2_6;
+
+ cvta.to.global.u64 %rd19, %rd1;
+ mul.wide.u32 %rd20, %r55, 4;
+ add.s64 %rd21, %rd19, %rd20;
+ ld.global.u32 %r55, [%rd21];
+ mov.u64 %rd32, %rd1;
+
+BB2_6:
+ mul.wide.u32 %rd22, %r55, 4;
+ add.s64 %rd23, %rd2, %rd22;
+ ld.global.f32 %f37, [%rd23];
+ add.f32 %f51, %f51, %f37;
+
+BB2_7:
+ shl.b32 %r30, %r16, 1;
+ mov.u32 %r31, %nctaid.x;
+ mad.lo.s32 %r54, %r30, %r31, %r54;
+ setp.lt.u32 %p5, %r54, %r13;
+ @%p5 bra BB2_3;
+ bra.uni BB2_11;
+
+BB2_8:
+ mul.wide.u32 %rd24, %r56, 4;
+ add.s64 %rd25, %rd2, %rd24;
+ ld.global.f32 %f38, [%rd25];
+ add.f32 %f51, %f51, %f38;
+ add.s32 %r10, %r56, %r16;
+ setp.ge.u32 %p6, %r10, %r13;
+ @%p6 bra BB2_10;
+
+ mul.wide.u32 %rd26, %r10, 4;
+ add.s64 %rd27, %rd2, %rd26;
+ ld.global.f32 %f39, [%rd27];
+ add.f32 %f51, %f51, %f39;
+
+BB2_10:
+ mov.u32 %r32, %nctaid.x;
+ shl.b32 %r33, %r16, 1;
+ mad.lo.s32 %r56, %r33, %r32, %r56;
+ setp.lt.u32 %p7, %r56, %r13;
+ @%p7 bra BB2_8;
+
+BB2_11:
+ shl.b32 %r35, %r17, 2;
+ mov.u32 %r36, memory;
+ add.s32 %r12, %r36, %r35;
+ st.shared.f32 [%r12], %f51;
bar.sync 0;
- setp.lt.u32 %p8, %r9, 1024;
- @%p8 bra BB32_19;
+ setp.lt.u32 %p8, %r16, 1024;
+ @%p8 bra BB2_15;
- setp.gt.u32 %p9, %r6, 511;
- @%p9 bra BB32_18;
+ setp.gt.u32 %p9, %r17, 511;
+ @%p9 bra BB2_14;
- ld.shared.f32 %f33, [%r4+2048];
- add.f32 %f44, %f44, %f33;
- st.shared.f32 [%r4], %f44;
+ ld.shared.f32 %f40, [%r12+2048];
+ add.f32 %f51, %f51, %f40;
+ st.shared.f32 [%r12], %f51;
-BB32_18:
+BB2_14:
bar.sync 0;
-BB32_19:
- setp.lt.u32 %p10, %r9, 512;
- @%p10 bra BB32_23;
+BB2_15:
+ setp.lt.u32 %p10, %r16, 512;
+ @%p10 bra BB2_19;
- setp.gt.u32 %p11, %r6, 255;
- @%p11 bra BB32_22;
+ setp.gt.u32 %p11, %r17, 255;
+ @%p11 bra BB2_18;
- ld.shared.f32 %f34, [%r4+1024];
- add.f32 %f44, %f44, %f34;
- st.shared.f32 [%r4], %f44;
+ ld.shared.f32 %f41, [%r12+1024];
+ add.f32 %f51, %f51, %f41;
+ st.shared.f32 [%r12], %f51;
-BB32_22:
+BB2_18:
bar.sync 0;
-BB32_23:
- setp.lt.u32 %p12, %r9, 256;
- @%p12 bra BB32_27;
+BB2_19:
+ setp.lt.u32 %p12, %r16, 256;
+ @%p12 bra BB2_23;
- setp.gt.u32 %p13, %r6, 127;
- @%p13 bra BB32_26;
+ setp.gt.u32 %p13, %r17, 127;
+ @%p13 bra BB2_22;
- ld.shared.f32 %f35, [%r4+512];
- add.f32 %f44, %f44, %f35;
- st.shared.f32 [%r4], %f44;
+ ld.shared.f32 %f42, [%r12+512];
+ add.f32 %f51, %f51, %f42;
+ st.shared.f32 [%r12], %f51;
-BB32_26:
+BB2_22:
bar.sync 0;
-BB32_27:
- setp.lt.u32 %p14, %r9, 128;
- @%p14 bra BB32_31;
+BB2_23:
+ setp.lt.u32 %p14, %r16, 128;
+ @%p14 bra BB2_27;
- setp.gt.u32 %p15, %r6, 63;
- @%p15 bra BB32_30;
+ setp.gt.u32 %p15, %r17, 63;
+ @%p15 bra BB2_26;
- ld.shared.f32 %f36, [%r4+256];
- add.f32 %f44, %f44, %f36;
- st.shared.f32 [%r4], %f44;
+ ld.shared.f32 %f43, [%r12+256];
+ add.f32 %f51, %f51, %f43;
+ st.shared.f32 [%r12], %f51;
-BB32_30:
+BB2_26:
bar.sync 0;
-BB32_31:
- setp.gt.u32 %p16, %r6, 31;
- @%p16 bra BB32_44;
+BB2_27:
+ setp.gt.u32 %p16, %r17, 31;
+ @%p16 bra BB2_40;
- setp.lt.u32 %p17, %r9, 64;
- @%p17 bra BB32_34;
+ setp.lt.u32 %p17, %r16, 64;
+ @%p17 bra BB2_30;
- ld.volatile.shared.f32 %f37, [%r4+128];
- add.f32 %f44, %f44, %f37;
- st.volatile.shared.f32 [%r4], %f44;
+ ld.volatile.shared.f32 %f44, [%r12+128];
+ add.f32 %f51, %f51, %f44;
+ st.volatile.shared.f32 [%r12], %f51;
-BB32_34:
- setp.lt.u32 %p18, %r9, 32;
- @%p18 bra BB32_36;
+BB2_30:
+ setp.lt.u32 %p18, %r16, 32;
+ @%p18 bra BB2_32;
- ld.volatile.shared.f32 %f38, [%r4+64];
- add.f32 %f44, %f44, %f38;
- st.volatile.shared.f32 [%r4], %f44;
+ ld.volatile.shared.f32 %f45, [%r12+64];
+ add.f32 %f51, %f51, %f45;
+ st.volatile.shared.f32 [%r12], %f51;
-BB32_36:
- setp.lt.u32 %p19, %r9, 16;
- @%p19 bra BB32_38;
+BB2_32:
+ setp.lt.u32 %p19, %r16, 16;
+ @%p19 bra BB2_34;
- ld.volatile.shared.f32 %f39, [%r4+32];
- add.f32 %f44, %f44, %f39;
- st.volatile.shared.f32 [%r4], %f44;
+ ld.volatile.shared.f32 %f46, [%r12+32];
+ add.f32 %f51, %f51, %f46;
+ st.volatile.shared.f32 [%r12], %f51;
-BB32_38:
- setp.lt.u32 %p20, %r9, 8;
- @%p20 bra BB32_40;
+BB2_34:
+ setp.lt.u32 %p20, %r16, 8;
+ @%p20 bra BB2_36;
- ld.volatile.shared.f32 %f40, [%r4+16];
- add.f32 %f44, %f44, %f40;
- st.volatile.shared.f32 [%r4], %f44;
+ ld.volatile.shared.f32 %f47, [%r12+16];
+ add.f32 %f51, %f51, %f47;
+ st.volatile.shared.f32 [%r12], %f51;
-BB32_40:
- setp.lt.u32 %p21, %r9, 4;
- @%p21 bra BB32_42;
+BB2_36:
+ setp.lt.u32 %p21, %r16, 4;
+ @%p21 bra BB2_38;
- ld.volatile.shared.f32 %f41, [%r4+8];
- add.f32 %f44, %f44, %f41;
- st.volatile.shared.f32 [%r4], %f44;
+ ld.volatile.shared.f32 %f48, [%r12+8];
+ add.f32 %f51, %f51, %f48;
+ st.volatile.shared.f32 [%r12], %f51;
-BB32_42:
- setp.lt.u32 %p22, %r9, 2;
- @%p22 bra BB32_44;
+BB2_38:
+ setp.lt.u32 %p22, %r16, 2;
+ @%p22 bra BB2_40;
- ld.volatile.shared.f32 %f42, [%r4+4];
- add.f32 %f43, %f44, %f42;
- st.volatile.shared.f32 [%r4], %f43;
+ ld.volatile.shared.f32 %f49, [%r12+4];
+ add.f32 %f50, %f51, %f49;
+ st.volatile.shared.f32 [%r12], %f50;
-BB32_44:
- setp.ne.s32 %p23, %r6, 0;
- @%p23 bra BB32_48;
+BB2_40:
+ setp.ne.s32 %p23, %r17, 0;
+ @%p23 bra BB2_44;
- ld.shared.f32 %f28, [memory];
- ld.local.u64 %rd114, [%rd2+96];
- add.s64 %rd11, %rd2, %rd114;
- add.s64 %rd12, %rd53, %rd114;
- ld.local.u64 %rd122, [%rd2+88];
- and.b64 %rd115, %rd122, 1;
- setp.eq.b64 %p24, %rd115, 1;
- @!%p24 bra BB32_47;
- bra.uni BB32_46;
-
-BB32_46:
- ld.local.u64 %rd116, [%rd11];
- add.s64 %rd117, %rd122, %rd116;
- ld.u64 %rd122, [%rd117+-1];
-
-BB32_47:
- mov.u32 %r42, 0;
- // Callseq Start 4
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd12;
- .param .b32 param1;
- st.param.b32 [param1+0], %r42;
- .param .b32 param2;
- st.param.b32 [param2+0], %r7;
- .param .b64 retval0;
- prototype_4 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
- call (retval0),
- %rd122,
- (
- param0,
- param1,
- param2
- )
- , prototype_4;
- ld.param.b64 %rd119, [retval0+0];
-
- //{
- }// Callseq End 4
- st.f32 [%rd119], %f28;
-
-BB32_48:
+ ld.shared.f32 %f32, [memory];
+ cvta.to.global.u64 %rd28, %rd10;
+ ld.global.u64 %rd29, [%rd28+16];
+ ld.global.u64 %rd30, [%rd28+32];
+ cvta.to.global.u64 %rd35, %rd30;
+ setp.ne.s64 %p24, %rd29, 0;
+ @%p24 bra BB2_43;
+
+ mul.wide.u32 %rd31, %r14, 4;
+ add.s64 %rd35, %rd35, %rd31;
+
+BB2_43:
+ st.global.f32 [%rd35], %f32;
+
+BB2_44:
ret;
}
@@ -1139,400 +324,225 @@ BB32_48:
.param .u32 reduce_sum_d_param_2
)
{
- .local .align 8 .b8 __local_depot33[272];
- .reg .b64 %SP;
- .reg .b64 %SPL;
.reg .pred %p<25>;
- .reg .b32 %r<44>;
- .reg .f64 %fd<60>;
- .reg .b64 %rd<123>;
-
-
- mov.u64 %SPL, __local_depot33;
- cvta.local.u64 %SP, %SPL;
- ld.param.u64 %rd17, [reduce_sum_d_param_0];
- ld.param.u64 %rd16, [reduce_sum_d_param_1];
- ld.param.u32 %r5, [reduce_sum_d_param_2];
- add.u64 %rd18, %SP, 0;
- add.u64 %rd1, %SPL, 0;
- st.local.u64 [%rd1], %rd17;
- cvta.to.global.u64 %rd19, %rd17;
- ld.global.u64 %rd20, [%rd19+16];
- setp.eq.s64 %p1, %rd20, 0;
- @%p1 bra BB33_2;
-
- mov.u64 %rd21, _ZN14MatrixAccessorIdE10len_sparseEv;
- st.local.u64 [%rd1+8], %rd21;
- mov.u64 %rd23, 0;
- st.local.u64 [%rd1+16], %rd23;
- mov.u64 %rd24, _ZN14MatrixAccessorIdE10pos_sparseEj;
- st.local.u64 [%rd1+40], %rd24;
- st.local.u64 [%rd1+48], %rd23;
- mov.u64 %rd26, _ZN14MatrixAccessorIdE11cols_sparseEj;
- st.local.u64 [%rd1+56], %rd26;
- st.local.u64 [%rd1+64], %rd23;
- mov.u64 %rd28, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
- st.local.u64 [%rd1+88], %rd28;
- st.local.u64 [%rd1+96], %rd23;
- mov.u64 %rd30, _ZN14MatrixAccessorIdE11vals_sparseEj;
- st.local.u64 [%rd1+104], %rd30;
- st.local.u64 [%rd1+112], %rd23;
- mov.u64 %rd32, _ZN14MatrixAccessorIdE14row_len_sparseEj;
- st.local.u64 [%rd1+24], %rd32;
- st.local.u64 [%rd1+32], %rd23;
- mov.u64 %rd34, _ZN14MatrixAccessorIdE12val_sparse_iEj;
- st.local.u64 [%rd1+72], %rd34;
- st.local.u64 [%rd1+80], %rd23;
- mov.u64 %rd36, _ZN14MatrixAccessorIdE10set_sparseEjjd;
- st.local.u64 [%rd1+120], %rd36;
- st.local.u64 [%rd1+128], %rd23;
- bra.uni BB33_3;
-
-BB33_2:
- mov.u64 %rd38, _ZN14MatrixAccessorIdE9len_denseEv;
- st.local.u64 [%rd1+8], %rd38;
- mov.u64 %rd40, 0;
- st.local.u64 [%rd1+16], %rd40;
- mov.u64 %rd41, _ZN14MatrixAccessorIdE9pos_denseEj;
- st.local.u64 [%rd1+40], %rd41;
- st.local.u64 [%rd1+48], %rd40;
- mov.u64 %rd43, _ZN14MatrixAccessorIdE10cols_denseEj;
- st.local.u64 [%rd1+56], %rd43;
- st.local.u64 [%rd1+64], %rd40;
- mov.u64 %rd45, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
- st.local.u64 [%rd1+88], %rd45;
- st.local.u64 [%rd1+96], %rd40;
- mov.u64 %rd47, _ZN14MatrixAccessorIdE10vals_denseEj;
- st.local.u64 [%rd1+104], %rd47;
- st.local.u64 [%rd1+112], %rd40;
- mov.u64 %rd49, _ZN14MatrixAccessorIdE13row_len_denseEj;
- st.local.u64 [%rd1+24], %rd49;
- st.local.u64 [%rd1+32], %rd40;
- mov.u64 %rd51, _ZN14MatrixAccessorIdE11val_dense_iEj;
- st.local.u64 [%rd1+72], %rd51;
- st.local.u64 [%rd1+80], %rd40;
-
-BB33_3:
- add.u64 %rd53, %SP, 136;
- add.u64 %rd2, %SPL, 136;
- st.local.u64 [%rd2], %rd16;
- cvta.to.global.u64 %rd54, %rd16;
- ld.global.u64 %rd55, [%rd54+16];
- setp.eq.s64 %p2, %rd55, 0;
- @%p2 bra BB33_5;
-
- mov.u64 %rd56, _ZN14MatrixAccessorIdE10len_sparseEv;
- st.local.u64 [%rd2+8], %rd56;
- mov.u64 %rd58, 0;
- st.local.u64 [%rd2+16], %rd58;
- mov.u64 %rd59, _ZN14MatrixAccessorIdE10pos_sparseEj;
- st.local.u64 [%rd2+40], %rd59;
- st.local.u64 [%rd2+48], %rd58;
- mov.u64 %rd61, _ZN14MatrixAccessorIdE11cols_sparseEj;
- st.local.u64 [%rd2+56], %rd61;
- st.local.u64 [%rd2+64], %rd58;
- mov.u64 %rd63, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
- st.local.u64 [%rd2+88], %rd63;
- st.local.u64 [%rd2+96], %rd58;
- mov.u64 %rd65, _ZN14MatrixAccessorIdE11vals_sparseEj;
- st.local.u64 [%rd2+104], %rd65;
- st.local.u64 [%rd2+112], %rd58;
- mov.u64 %rd67, _ZN14MatrixAccessorIdE14row_len_sparseEj;
- st.local.u64 [%rd2+24], %rd67;
- st.local.u64 [%rd2+32], %rd58;
- mov.u64 %rd69, _ZN14MatrixAccessorIdE12val_sparse_iEj;
- st.local.u64 [%rd2+72], %rd69;
- st.local.u64 [%rd2+80], %rd58;
- mov.u64 %rd71, _ZN14MatrixAccessorIdE10set_sparseEjjd;
- st.local.u64 [%rd2+120], %rd71;
- st.local.u64 [%rd2+128], %rd58;
- bra.uni BB33_6;
-
-BB33_5:
- mov.u64 %rd73, _ZN14MatrixAccessorIdE9len_denseEv;
- st.local.u64 [%rd2+8], %rd73;
- mov.u64 %rd75, 0;
- st.local.u64 [%rd2+16], %rd75;
- mov.u64 %rd76, _ZN14MatrixAccessorIdE9pos_denseEj;
- st.local.u64 [%rd2+40], %rd76;
- st.local.u64 [%rd2+48], %rd75;
- mov.u64 %rd78, _ZN14MatrixAccessorIdE10cols_denseEj;
- st.local.u64 [%rd2+56], %rd78;
- st.local.u64 [%rd2+64], %rd75;
- mov.u64 %rd80, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
- st.local.u64 [%rd2+88], %rd80;
- st.local.u64 [%rd2+96], %rd75;
- mov.u64 %rd82, _ZN14MatrixAccessorIdE10vals_denseEj;
- st.local.u64 [%rd2+104], %rd82;
- st.local.u64 [%rd2+112], %rd75;
- mov.u64 %rd84, _ZN14MatrixAccessorIdE13row_len_denseEj;
- st.local.u64 [%rd2+24], %rd84;
- st.local.u64 [%rd2+32], %rd75;
- mov.u64 %rd86, _ZN14MatrixAccessorIdE11val_dense_iEj;
- st.local.u64 [%rd2+72], %rd86;
- st.local.u64 [%rd2+80], %rd75;
-
-BB33_6:
- mov.u32 %r6, %tid.x;
- mov.u32 %r7, %ctaid.x;
- shl.b32 %r8, %r7, 1;
- mov.u32 %r9, %ntid.x;
- mad.lo.s32 %r43, %r8, %r9, %r6;
- mov.f64 %fd44, 0d0000000000000000;
- setp.ge.u32 %p3, %r43, %r5;
- @%p3 bra BB33_15;
-
- mov.f64 %fd44, 0d0000000000000000;
-
-BB33_8:
- ld.local.u64 %rd3, [%rd1+112];
- ld.local.u64 %rd120, [%rd1+104];
- and.b64 %rd90, %rd120, 1;
- setp.eq.b64 %p4, %rd90, 1;
- @!%p4 bra BB33_10;
- bra.uni BB33_9;
-
-BB33_9:
- add.s64 %rd93, %rd1, %rd3;
- ld.local.u64 %rd94, [%rd93];
- add.s64 %rd95, %rd120, %rd94;
- ld.u64 %rd120, [%rd95+-1];
-
-BB33_10:
- add.s64 %rd97, %rd18, %rd3;
- // Callseq Start 5
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd97;
- .param .b32 param1;
- st.param.b32 [param1+0], %r43;
- .param .b64 retval0;
- prototype_5 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
- call (retval0),
- %rd120,
- (
- param0,
- param1
- )
- , prototype_5;
- ld.param.b64 %rd99, [retval0+0];
-
- //{
- }// Callseq End 5
- ld.f64 %fd31, [%rd99];
- add.f64 %fd44, %fd44, %fd31;
- add.s32 %r16, %r43, %r9;
- setp.ge.u32 %p5, %r16, %r5;
- @%p5 bra BB33_14;
-
- ld.local.u64 %rd121, [%rd1+104];
- and.b64 %rd102, %rd121, 1;
- setp.eq.b64 %p6, %rd102, 1;
- ld.local.u64 %rd8, [%rd1+112];
- @!%p6 bra BB33_13;
- bra.uni BB33_12;
-
-BB33_12:
- add.s64 %rd105, %rd1, %rd8;
- ld.local.u64 %rd106, [%rd105];
- add.s64 %rd107, %rd121, %rd106;
- ld.u64 %rd121, [%rd107+-1];
-
-BB33_13:
- add.s64 %rd109, %rd18, %rd8;
- // Callseq Start 6
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd109;
- .param .b32 param1;
- st.param.b32 [param1+0], %r16;
- .param .b64 retval0;
- prototype_6 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
- call (retval0),
- %rd121,
- (
- param0,
- param1
- )
- , prototype_6;
- ld.param.b64 %rd111, [retval0+0];
-
- //{
- }// Callseq End 6
- ld.f64 %fd32, [%rd111];
- add.f64 %fd44, %fd44, %fd32;
-
-BB33_14:
- shl.b32 %r20, %r9, 1;
- mov.u32 %r21, %nctaid.x;
- mad.lo.s32 %r43, %r20, %r21, %r43;
- setp.lt.u32 %p7, %r43, %r5;
- @%p7 bra BB33_8;
-
-BB33_15:
- shl.b32 %r23, %r6, 3;
- mov.u32 %r24, memory;
- add.s32 %r4, %r24, %r23;
- st.shared.f64 [%r4], %fd44;
+ .reg .b32 %r<57>;
+ .reg .f64 %fd<69>;
+ .reg .b64 %rd<36>;
+
+
+ ld.param.u64 %rd9, [reduce_sum_d_param_0];
+ ld.param.u64 %rd10, [reduce_sum_d_param_1];
+ ld.param.u32 %r13, [reduce_sum_d_param_2];
+ mov.u32 %r14, %ctaid.x;
+ shl.b32 %r15, %r14, 1;
+ mov.u32 %r16, %ntid.x;
+ mov.u32 %r17, %tid.x;
+ mad.lo.s32 %r56, %r15, %r16, %r17;
+ mov.f64 %fd51, 0d0000000000000000;
+ setp.ge.u32 %p1, %r56, %r13;
+ @%p1 bra BB3_11;
+
+ cvta.to.global.u64 %rd11, %rd9;
+ ld.global.u64 %rd1, [%rd11+16];
+ setp.eq.s64 %p2, %rd1, 0;
+ ld.global.u64 %rd12, [%rd11+32];
+ cvta.to.global.u64 %rd2, %rd12;
+ mov.f64 %fd51, 0d0000000000000000;
+ @%p2 bra BB3_8;
+
+ mad.lo.s32 %r54, %r15, %r16, %r17;
+ mov.f64 %fd51, 0d0000000000000000;
+ mov.u64 %rd32, %rd1;
+
+BB3_3:
+ cvta.to.global.u64 %rd13, %rd32;
+ mul.wide.u32 %rd14, %r54, 4;
+ add.s64 %rd15, %rd13, %rd14;
+ ld.global.u32 %r27, [%rd15];
+ mul.wide.u32 %rd16, %r27, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.f64 %fd36, [%rd17];
+ add.f64 %fd51, %fd51, %fd36;
+ add.s32 %r55, %r54, %r16;
+ setp.ge.u32 %p3, %r55, %r13;
+ @%p3 bra BB3_7;
+
+ setp.eq.s64 %p4, %rd32, 0;
+ mov.u64 %rd32, 0;
+ @%p4 bra BB3_6;
+
+ cvta.to.global.u64 %rd19, %rd1;
+ mul.wide.u32 %rd20, %r55, 4;
+ add.s64 %rd21, %rd19, %rd20;
+ ld.global.u32 %r55, [%rd21];
+ mov.u64 %rd32, %rd1;
+
+BB3_6:
+ mul.wide.u32 %rd22, %r55, 8;
+ add.s64 %rd23, %rd2, %rd22;
+ ld.global.f64 %fd37, [%rd23];
+ add.f64 %fd51, %fd51, %fd37;
+
+BB3_7:
+ shl.b32 %r30, %r16, 1;
+ mov.u32 %r31, %nctaid.x;
+ mad.lo.s32 %r54, %r30, %r31, %r54;
+ setp.lt.u32 %p5, %r54, %r13;
+ @%p5 bra BB3_3;
+ bra.uni BB3_11;
+
+BB3_8:
+ mul.wide.u32 %rd24, %r56, 8;
+ add.s64 %rd25, %rd2, %rd24;
+ ld.global.f64 %fd38, [%rd25];
+ add.f64 %fd51, %fd51, %fd38;
+ add.s32 %r10, %r56, %r16;
+ setp.ge.u32 %p6, %r10, %r13;
+ @%p6 bra BB3_10;
+
+ mul.wide.u32 %rd26, %r10, 8;
+ add.s64 %rd27, %rd2, %rd26;
+ ld.global.f64 %fd39, [%rd27];
+ add.f64 %fd51, %fd51, %fd39;
+
+BB3_10:
+ mov.u32 %r32, %nctaid.x;
+ shl.b32 %r33, %r16, 1;
+ mad.lo.s32 %r56, %r33, %r32, %r56;
+ setp.lt.u32 %p7, %r56, %r13;
+ @%p7 bra BB3_8;
+
+BB3_11:
+ shl.b32 %r35, %r17, 3;
+ mov.u32 %r36, memory;
+ add.s32 %r12, %r36, %r35;
+ st.shared.f64 [%r12], %fd51;
bar.sync 0;
- setp.lt.u32 %p8, %r9, 1024;
- @%p8 bra BB33_19;
+ setp.lt.u32 %p8, %r16, 1024;
+ @%p8 bra BB3_15;
- setp.gt.u32 %p9, %r6, 511;
- @%p9 bra BB33_18;
+ setp.gt.u32 %p9, %r17, 511;
+ @%p9 bra BB3_14;
- ld.shared.f64 %fd33, [%r4+4096];
- add.f64 %fd44, %fd44, %fd33;
- st.shared.f64 [%r4], %fd44;
+ ld.shared.f64 %fd40, [%r12+4096];
+ add.f64 %fd51, %fd51, %fd40;
+ st.shared.f64 [%r12], %fd51;
-BB33_18:
+BB3_14:
bar.sync 0;
-BB33_19:
- setp.lt.u32 %p10, %r9, 512;
- @%p10 bra BB33_23;
+BB3_15:
+ setp.lt.u32 %p10, %r16, 512;
+ @%p10 bra BB3_19;
- setp.gt.u32 %p11, %r6, 255;
- @%p11 bra BB33_22;
+ setp.gt.u32 %p11, %r17, 255;
+ @%p11 bra BB3_18;
- ld.shared.f64 %fd34, [%r4+2048];
- add.f64 %fd44, %fd44, %fd34;
- st.shared.f64 [%r4], %fd44;
+ ld.shared.f64 %fd41, [%r12+2048];
+ add.f64 %fd51, %fd51, %fd41;
+ st.shared.f64 [%r12], %fd51;
-BB33_22:
+BB3_18:
bar.sync 0;
-BB33_23:
- setp.lt.u32 %p12, %r9, 256;
- @%p12 bra BB33_27;
+BB3_19:
+ setp.lt.u32 %p12, %r16, 256;
+ @%p12 bra BB3_23;
- setp.gt.u32 %p13, %r6, 127;
- @%p13 bra BB33_26;
+ setp.gt.u32 %p13, %r17, 127;
+ @%p13 bra BB3_22;
- ld.shared.f64 %fd35, [%r4+1024];
- add.f64 %fd44, %fd44, %fd35;
- st.shared.f64 [%r4], %fd44;
+ ld.shared.f64 %fd42, [%r12+1024];
+ add.f64 %fd51, %fd51, %fd42;
+ st.shared.f64 [%r12], %fd51;
-BB33_26:
+BB3_22:
bar.sync 0;
-BB33_27:
- setp.lt.u32 %p14, %r9, 128;
- @%p14 bra BB33_31;
+BB3_23:
+ setp.lt.u32 %p14, %r16, 128;
+ @%p14 bra BB3_27;
- setp.gt.u32 %p15, %r6, 63;
- @%p15 bra BB33_30;
+ setp.gt.u32 %p15, %r17, 63;
+ @%p15 bra BB3_26;
- ld.shared.f64 %fd36, [%r4+512];
- add.f64 %fd44, %fd44, %fd36;
- st.shared.f64 [%r4], %fd44;
+ ld.shared.f64 %fd43, [%r12+512];
+ add.f64 %fd51, %fd51, %fd43;
+ st.shared.f64 [%r12], %fd51;
-BB33_30:
+BB3_26:
bar.sync 0;
-BB33_31:
- setp.gt.u32 %p16, %r6, 31;
- @%p16 bra BB33_44;
+BB3_27:
+ setp.gt.u32 %p16, %r17, 31;
+ @%p16 bra BB3_40;
- setp.lt.u32 %p17, %r9, 64;
- @%p17 bra BB33_34;
+ setp.lt.u32 %p17, %r16, 64;
+ @%p17 bra BB3_30;
- ld.volatile.shared.f64 %fd37, [%r4+256];
- add.f64 %fd44, %fd44, %fd37;
- st.volatile.shared.f64 [%r4], %fd44;
+ ld.volatile.shared.f64 %fd44, [%r12+256];
+ add.f64 %fd51, %fd51, %fd44;
+ st.volatile.shared.f64 [%r12], %fd51;
-BB33_34:
- setp.lt.u32 %p18, %r9, 32;
- @%p18 bra BB33_36;
+BB3_30:
+ setp.lt.u32 %p18, %r16, 32;
+ @%p18 bra BB3_32;
- ld.volatile.shared.f64 %fd38, [%r4+128];
- add.f64 %fd44, %fd44, %fd38;
- st.volatile.shared.f64 [%r4], %fd44;
+ ld.volatile.shared.f64 %fd45, [%r12+128];
+ add.f64 %fd51, %fd51, %fd45;
+ st.volatile.shared.f64 [%r12], %fd51;
-BB33_36:
- setp.lt.u32 %p19, %r9, 16;
- @%p19 bra BB33_38;
+BB3_32:
+ setp.lt.u32 %p19, %r16, 16;
+ @%p19 bra BB3_34;
- ld.volatile.shared.f64 %fd39, [%r4+64];
- add.f64 %fd44, %fd44, %fd39;
- st.volatile.shared.f64 [%r4], %fd44;
+ ld.volatile.shared.f64 %fd46, [%r12+64];
+ add.f64 %fd51, %fd51, %fd46;
+ st.volatile.shared.f64 [%r12], %fd51;
-BB33_38:
- setp.lt.u32 %p20, %r9, 8;
- @%p20 bra BB33_40;
+BB3_34:
+ setp.lt.u32 %p20, %r16, 8;
+ @%p20 bra BB3_36;
- ld.volatile.shared.f64 %fd40, [%r4+32];
- add.f64 %fd44, %fd44, %fd40;
- st.volatile.shared.f64 [%r4], %fd44;
+ ld.volatile.shared.f64 %fd47, [%r12+32];
+ add.f64 %fd51, %fd51, %fd47;
+ st.volatile.shared.f64 [%r12], %fd51;
-BB33_40:
- setp.lt.u32 %p21, %r9, 4;
- @%p21 bra BB33_42;
+BB3_36:
+ setp.lt.u32 %p21, %r16, 4;
+ @%p21 bra BB3_38;
- ld.volatile.shared.f64 %fd41, [%r4+16];
- add.f64 %fd44, %fd44, %fd41;
- st.volatile.shared.f64 [%r4], %fd44;
+ ld.volatile.shared.f64 %fd48, [%r12+16];
+ add.f64 %fd51, %fd51, %fd48;
+ st.volatile.shared.f64 [%r12], %fd51;
-BB33_42:
- setp.lt.u32 %p22, %r9, 2;
- @%p22 bra BB33_44;
+BB3_38:
+ setp.lt.u32 %p22, %r16, 2;
+ @%p22 bra BB3_40;
- ld.volatile.shared.f64 %fd42, [%r4+8];
- add.f64 %fd43, %fd44, %fd42;
- st.volatile.shared.f64 [%r4], %fd43;
+ ld.volatile.shared.f64 %fd49, [%r12+8];
+ add.f64 %fd50, %fd51, %fd49;
+ st.volatile.shared.f64 [%r12], %fd50;
-BB33_44:
- setp.ne.s32 %p23, %r6, 0;
- @%p23 bra BB33_48;
+BB3_40:
+ setp.ne.s32 %p23, %r17, 0;
+ @%p23 bra BB3_44;
- ld.shared.f64 %fd28, [memory];
- ld.local.u64 %rd114, [%rd2+96];
- add.s64 %rd11, %rd2, %rd114;
- add.s64 %rd12, %rd53, %rd114;
- ld.local.u64 %rd122, [%rd2+88];
- and.b64 %rd115, %rd122, 1;
- setp.eq.b64 %p24, %rd115, 1;
- @!%p24 bra BB33_47;
- bra.uni BB33_46;
-
-BB33_46:
- ld.local.u64 %rd116, [%rd11];
- add.s64 %rd117, %rd122, %rd116;
- ld.u64 %rd122, [%rd117+-1];
-
-BB33_47:
- mov.u32 %r42, 0;
- // Callseq Start 7
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd12;
- .param .b32 param1;
- st.param.b32 [param1+0], %r42;
- .param .b32 param2;
- st.param.b32 [param2+0], %r7;
- .param .b64 retval0;
- prototype_7 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
- call (retval0),
- %rd122,
- (
- param0,
- param1,
- param2
- )
- , prototype_7;
- ld.param.b64 %rd119, [retval0+0];
-
- //{
- }// Callseq End 7
- st.f64 [%rd119], %fd28;
-
-BB33_48:
+ ld.shared.f64 %fd32, [memory];
+ cvta.to.global.u64 %rd28, %rd10;
+ ld.global.u64 %rd29, [%rd28+16];
+ ld.global.u64 %rd30, [%rd28+32];
+ cvta.to.global.u64 %rd35, %rd30;
+ setp.ne.s64 %p24, %rd29, 0;
+ @%p24 bra BB3_43;
+
+ mul.wide.u32 %rd31, %r14, 8;
+ add.s64 %rd35, %rd35, %rd31;
+
+BB3_43:
+ st.global.f64 [%rd35], %fd32;
+
+BB3_44:
ret;
}
@@ -1543,400 +553,225 @@ BB33_48:
.param .u32 reduce_max_f_param_2
)
{
- .local .align 8 .b8 __local_depot34[272];
- .reg .b64 %SP;
- .reg .b64 %SPL;
.reg .pred %p<25>;
- .reg .f32 %f<60>;
- .reg .b32 %r<44>;
- .reg .b64 %rd<123>;
-
-
- mov.u64 %SPL, __local_depot34;
- cvta.local.u64 %SP, %SPL;
- ld.param.u64 %rd17, [reduce_max_f_param_0];
- ld.param.u64 %rd16, [reduce_max_f_param_1];
- ld.param.u32 %r5, [reduce_max_f_param_2];
- add.u64 %rd18, %SP, 0;
- add.u64 %rd1, %SPL, 0;
- st.local.u64 [%rd1], %rd17;
- cvta.to.global.u64 %rd19, %rd17;
- ld.global.u64 %rd20, [%rd19+16];
- setp.eq.s64 %p1, %rd20, 0;
- @%p1 bra BB34_2;
-
- mov.u64 %rd21, _ZN14MatrixAccessorIfE10len_sparseEv;
- st.local.u64 [%rd1+8], %rd21;
- mov.u64 %rd23, 0;
- st.local.u64 [%rd1+16], %rd23;
- mov.u64 %rd24, _ZN14MatrixAccessorIfE10pos_sparseEj;
- st.local.u64 [%rd1+40], %rd24;
- st.local.u64 [%rd1+48], %rd23;
- mov.u64 %rd26, _ZN14MatrixAccessorIfE11cols_sparseEj;
- st.local.u64 [%rd1+56], %rd26;
- st.local.u64 [%rd1+64], %rd23;
- mov.u64 %rd28, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
- st.local.u64 [%rd1+88], %rd28;
- st.local.u64 [%rd1+96], %rd23;
- mov.u64 %rd30, _ZN14MatrixAccessorIfE11vals_sparseEj;
- st.local.u64 [%rd1+104], %rd30;
- st.local.u64 [%rd1+112], %rd23;
- mov.u64 %rd32, _ZN14MatrixAccessorIfE14row_len_sparseEj;
- st.local.u64 [%rd1+24], %rd32;
- st.local.u64 [%rd1+32], %rd23;
- mov.u64 %rd34, _ZN14MatrixAccessorIfE12val_sparse_iEj;
- st.local.u64 [%rd1+72], %rd34;
- st.local.u64 [%rd1+80], %rd23;
- mov.u64 %rd36, _ZN14MatrixAccessorIfE10set_sparseEjjf;
- st.local.u64 [%rd1+120], %rd36;
- st.local.u64 [%rd1+128], %rd23;
- bra.uni BB34_3;
-
-BB34_2:
- mov.u64 %rd38, _ZN14MatrixAccessorIfE9len_denseEv;
- st.local.u64 [%rd1+8], %rd38;
- mov.u64 %rd40, 0;
- st.local.u64 [%rd1+16], %rd40;
- mov.u64 %rd41, _ZN14MatrixAccessorIfE9pos_denseEj;
- st.local.u64 [%rd1+40], %rd41;
- st.local.u64 [%rd1+48], %rd40;
- mov.u64 %rd43, _ZN14MatrixAccessorIfE10cols_denseEj;
- st.local.u64 [%rd1+56], %rd43;
- st.local.u64 [%rd1+64], %rd40;
- mov.u64 %rd45, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
- st.local.u64 [%rd1+88], %rd45;
- st.local.u64 [%rd1+96], %rd40;
- mov.u64 %rd47, _ZN14MatrixAccessorIfE10vals_denseEj;
- st.local.u64 [%rd1+104], %rd47;
- st.local.u64 [%rd1+112], %rd40;
- mov.u64 %rd49, _ZN14MatrixAccessorIfE13row_len_denseEj;
- st.local.u64 [%rd1+24], %rd49;
- st.local.u64 [%rd1+32], %rd40;
- mov.u64 %rd51, _ZN14MatrixAccessorIfE11val_dense_iEj;
- st.local.u64 [%rd1+72], %rd51;
- st.local.u64 [%rd1+80], %rd40;
-
-BB34_3:
- add.u64 %rd53, %SP, 136;
- add.u64 %rd2, %SPL, 136;
- st.local.u64 [%rd2], %rd16;
- cvta.to.global.u64 %rd54, %rd16;
- ld.global.u64 %rd55, [%rd54+16];
- setp.eq.s64 %p2, %rd55, 0;
- @%p2 bra BB34_5;
-
- mov.u64 %rd56, _ZN14MatrixAccessorIfE10len_sparseEv;
- st.local.u64 [%rd2+8], %rd56;
- mov.u64 %rd58, 0;
- st.local.u64 [%rd2+16], %rd58;
- mov.u64 %rd59, _ZN14MatrixAccessorIfE10pos_sparseEj;
- st.local.u64 [%rd2+40], %rd59;
- st.local.u64 [%rd2+48], %rd58;
- mov.u64 %rd61, _ZN14MatrixAccessorIfE11cols_sparseEj;
- st.local.u64 [%rd2+56], %rd61;
- st.local.u64 [%rd2+64], %rd58;
- mov.u64 %rd63, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
- st.local.u64 [%rd2+88], %rd63;
- st.local.u64 [%rd2+96], %rd58;
- mov.u64 %rd65, _ZN14MatrixAccessorIfE11vals_sparseEj;
- st.local.u64 [%rd2+104], %rd65;
- st.local.u64 [%rd2+112], %rd58;
- mov.u64 %rd67, _ZN14MatrixAccessorIfE14row_len_sparseEj;
- st.local.u64 [%rd2+24], %rd67;
- st.local.u64 [%rd2+32], %rd58;
- mov.u64 %rd69, _ZN14MatrixAccessorIfE12val_sparse_iEj;
- st.local.u64 [%rd2+72], %rd69;
- st.local.u64 [%rd2+80], %rd58;
- mov.u64 %rd71, _ZN14MatrixAccessorIfE10set_sparseEjjf;
- st.local.u64 [%rd2+120], %rd71;
- st.local.u64 [%rd2+128], %rd58;
- bra.uni BB34_6;
-
-BB34_5:
- mov.u64 %rd73, _ZN14MatrixAccessorIfE9len_denseEv;
- st.local.u64 [%rd2+8], %rd73;
- mov.u64 %rd75, 0;
- st.local.u64 [%rd2+16], %rd75;
- mov.u64 %rd76, _ZN14MatrixAccessorIfE9pos_denseEj;
- st.local.u64 [%rd2+40], %rd76;
- st.local.u64 [%rd2+48], %rd75;
- mov.u64 %rd78, _ZN14MatrixAccessorIfE10cols_denseEj;
- st.local.u64 [%rd2+56], %rd78;
- st.local.u64 [%rd2+64], %rd75;
- mov.u64 %rd80, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
- st.local.u64 [%rd2+88], %rd80;
- st.local.u64 [%rd2+96], %rd75;
- mov.u64 %rd82, _ZN14MatrixAccessorIfE10vals_denseEj;
- st.local.u64 [%rd2+104], %rd82;
- st.local.u64 [%rd2+112], %rd75;
- mov.u64 %rd84, _ZN14MatrixAccessorIfE13row_len_denseEj;
- st.local.u64 [%rd2+24], %rd84;
- st.local.u64 [%rd2+32], %rd75;
- mov.u64 %rd86, _ZN14MatrixAccessorIfE11val_dense_iEj;
- st.local.u64 [%rd2+72], %rd86;
- st.local.u64 [%rd2+80], %rd75;
-
-BB34_6:
- mov.u32 %r6, %tid.x;
- mov.u32 %r7, %ctaid.x;
- shl.b32 %r8, %r7, 1;
- mov.u32 %r9, %ntid.x;
- mad.lo.s32 %r43, %r8, %r9, %r6;
- mov.f32 %f44, 0fFF800000;
- setp.ge.u32 %p3, %r43, %r5;
- @%p3 bra BB34_15;
-
- mov.f32 %f44, 0fFF800000;
-
-BB34_8:
- ld.local.u64 %rd3, [%rd1+112];
- ld.local.u64 %rd120, [%rd1+104];
- and.b64 %rd90, %rd120, 1;
- setp.eq.b64 %p4, %rd90, 1;
- @!%p4 bra BB34_10;
- bra.uni BB34_9;
-
-BB34_9:
- add.s64 %rd93, %rd1, %rd3;
- ld.local.u64 %rd94, [%rd93];
- add.s64 %rd95, %rd120, %rd94;
- ld.u64 %rd120, [%rd95+-1];
-
-BB34_10:
- add.s64 %rd97, %rd18, %rd3;
- // Callseq Start 8
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd97;
- .param .b32 param1;
- st.param.b32 [param1+0], %r43;
- .param .b64 retval0;
- prototype_8 : .callprototype (.param .b64 _) _ (.param .b64 _, .param
.b32 _) ;
- call (retval0),
- %rd120,
- (
- param0,
- param1
- )
- , prototype_8;
- ld.param.b64 %rd99, [retval0+0];
-
- //{
- }// Callseq End 8
- ld.f32 %f31, [%rd99];
- max.f32 %f44, %f44, %f31;
- add.s32 %r16, %r43, %r9;
- setp.ge.u32 %p5, %r16, %r5;
- @%p5 bra BB34_14;
-
- ld.local.u64 %rd121, [%rd1+104];
- and.b64 %rd102, %rd121, 1;
- setp.eq.b64 %p6, %rd102, 1;
- ld.local.u64 %rd8, [%rd1+112];
- @!%p6 bra BB34_13;
- bra.uni BB34_12;
-
-BB34_12:
- add.s64 %rd105, %rd1, %rd8;
- ld.local.u64 %rd106, [%rd105];
- add.s64 %rd107, %rd121, %rd106;
- ld.u64 %rd121, [%rd107+-1];
-
-BB34_13:
- add.s64 %rd109, %rd18, %rd8;
- // Callseq Start 9
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd109;
- .param .b32 param1;
- st.param.b32 [param1+0], %r16;
- .param .b64 retval0;
- prototype_9 : .callprototype (.param .b64 _) _ (.param .b64 _, .param
.b32 _) ;
- call (retval0),
- %rd121,
- (
- param0,
- param1
- )
- , prototype_9;
- ld.param.b64 %rd111, [retval0+0];
-
- //{
- }// Callseq End 9
- ld.f32 %f32, [%rd111];
- max.f32 %f44, %f44, %f32;
-
-BB34_14:
- shl.b32 %r20, %r9, 1;
- mov.u32 %r21, %nctaid.x;
- mad.lo.s32 %r43, %r20, %r21, %r43;
- setp.lt.u32 %p7, %r43, %r5;
- @%p7 bra BB34_8;
-
-BB34_15:
- shl.b32 %r23, %r6, 2;
- mov.u32 %r24, memory;
- add.s32 %r4, %r24, %r23;
- st.shared.f32 [%r4], %f44;
+ .reg .f32 %f<69>;
+ .reg .b32 %r<57>;
+ .reg .b64 %rd<36>;
+
+
+ ld.param.u64 %rd9, [reduce_max_f_param_0];
+ ld.param.u64 %rd10, [reduce_max_f_param_1];
+ ld.param.u32 %r13, [reduce_max_f_param_2];
+ mov.u32 %r14, %ctaid.x;
+ shl.b32 %r15, %r14, 1;
+ mov.u32 %r16, %ntid.x;
+ mov.u32 %r17, %tid.x;
+ mad.lo.s32 %r56, %r15, %r16, %r17;
+ mov.f32 %f51, 0fFF800000;
+ setp.ge.u32 %p1, %r56, %r13;
+ @%p1 bra BB4_11;
+
+ cvta.to.global.u64 %rd11, %rd9;
+ ld.global.u64 %rd1, [%rd11+16];
+ setp.eq.s64 %p2, %rd1, 0;
+ ld.global.u64 %rd12, [%rd11+32];
+ cvta.to.global.u64 %rd2, %rd12;
+ mov.f32 %f51, 0fFF800000;
+ @%p2 bra BB4_8;
+
+ mad.lo.s32 %r54, %r15, %r16, %r17;
+ mov.f32 %f51, 0fFF800000;
+ mov.u64 %rd32, %rd1;
+
+BB4_3:
+ cvta.to.global.u64 %rd13, %rd32;
+ mul.wide.u32 %rd14, %r54, 4;
+ add.s64 %rd15, %rd13, %rd14;
+ ld.global.u32 %r27, [%rd15];
+ mul.wide.u32 %rd16, %r27, 4;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.f32 %f36, [%rd17];
+ max.f32 %f51, %f51, %f36;
+ add.s32 %r55, %r54, %r16;
+ setp.ge.u32 %p3, %r55, %r13;
+ @%p3 bra BB4_7;
+
+ setp.eq.s64 %p4, %rd32, 0;
+ mov.u64 %rd32, 0;
+ @%p4 bra BB4_6;
+
+ cvta.to.global.u64 %rd19, %rd1;
+ mul.wide.u32 %rd20, %r55, 4;
+ add.s64 %rd21, %rd19, %rd20;
+ ld.global.u32 %r55, [%rd21];
+ mov.u64 %rd32, %rd1;
+
+BB4_6:
+ mul.wide.u32 %rd22, %r55, 4;
+ add.s64 %rd23, %rd2, %rd22;
+ ld.global.f32 %f37, [%rd23];
+ max.f32 %f51, %f51, %f37;
+
+BB4_7:
+ shl.b32 %r30, %r16, 1;
+ mov.u32 %r31, %nctaid.x;
+ mad.lo.s32 %r54, %r30, %r31, %r54;
+ setp.lt.u32 %p5, %r54, %r13;
+ @%p5 bra BB4_3;
+ bra.uni BB4_11;
+
+BB4_8:
+ mul.wide.u32 %rd24, %r56, 4;
+ add.s64 %rd25, %rd2, %rd24;
+ ld.global.f32 %f38, [%rd25];
+ max.f32 %f51, %f51, %f38;
+ add.s32 %r10, %r56, %r16;
+ setp.ge.u32 %p6, %r10, %r13;
+ @%p6 bra BB4_10;
+
+ mul.wide.u32 %rd26, %r10, 4;
+ add.s64 %rd27, %rd2, %rd26;
+ ld.global.f32 %f39, [%rd27];
+ max.f32 %f51, %f51, %f39;
+
+BB4_10:
+ mov.u32 %r32, %nctaid.x;
+ shl.b32 %r33, %r16, 1;
+ mad.lo.s32 %r56, %r33, %r32, %r56;
+ setp.lt.u32 %p7, %r56, %r13;
+ @%p7 bra BB4_8;
+
+BB4_11:
+ shl.b32 %r35, %r17, 2;
+ mov.u32 %r36, memory;
+ add.s32 %r12, %r36, %r35;
+ st.shared.f32 [%r12], %f51;
bar.sync 0;
- setp.lt.u32 %p8, %r9, 1024;
- @%p8 bra BB34_19;
+ setp.lt.u32 %p8, %r16, 1024;
+ @%p8 bra BB4_15;
- setp.gt.u32 %p9, %r6, 511;
- @%p9 bra BB34_18;
+ setp.gt.u32 %p9, %r17, 511;
+ @%p9 bra BB4_14;
- ld.shared.f32 %f33, [%r4+2048];
- max.f32 %f44, %f44, %f33;
- st.shared.f32 [%r4], %f44;
+ ld.shared.f32 %f40, [%r12+2048];
+ max.f32 %f51, %f51, %f40;
+ st.shared.f32 [%r12], %f51;
-BB34_18:
+BB4_14:
bar.sync 0;
-BB34_19:
- setp.lt.u32 %p10, %r9, 512;
- @%p10 bra BB34_23;
+BB4_15:
+ setp.lt.u32 %p10, %r16, 512;
+ @%p10 bra BB4_19;
- setp.gt.u32 %p11, %r6, 255;
- @%p11 bra BB34_22;
+ setp.gt.u32 %p11, %r17, 255;
+ @%p11 bra BB4_18;
- ld.shared.f32 %f34, [%r4+1024];
- max.f32 %f44, %f44, %f34;
- st.shared.f32 [%r4], %f44;
+ ld.shared.f32 %f41, [%r12+1024];
+ max.f32 %f51, %f51, %f41;
+ st.shared.f32 [%r12], %f51;
-BB34_22:
+BB4_18:
bar.sync 0;
-BB34_23:
- setp.lt.u32 %p12, %r9, 256;
- @%p12 bra BB34_27;
+BB4_19:
+ setp.lt.u32 %p12, %r16, 256;
+ @%p12 bra BB4_23;
- setp.gt.u32 %p13, %r6, 127;
- @%p13 bra BB34_26;
+ setp.gt.u32 %p13, %r17, 127;
+ @%p13 bra BB4_22;
- ld.shared.f32 %f35, [%r4+512];
- max.f32 %f44, %f44, %f35;
- st.shared.f32 [%r4], %f44;
+ ld.shared.f32 %f42, [%r12+512];
+ max.f32 %f51, %f51, %f42;
+ st.shared.f32 [%r12], %f51;
-BB34_26:
+BB4_22:
bar.sync 0;
-BB34_27:
- setp.lt.u32 %p14, %r9, 128;
- @%p14 bra BB34_31;
+BB4_23:
+ setp.lt.u32 %p14, %r16, 128;
+ @%p14 bra BB4_27;
- setp.gt.u32 %p15, %r6, 63;
- @%p15 bra BB34_30;
+ setp.gt.u32 %p15, %r17, 63;
+ @%p15 bra BB4_26;
- ld.shared.f32 %f36, [%r4+256];
- max.f32 %f44, %f44, %f36;
- st.shared.f32 [%r4], %f44;
+ ld.shared.f32 %f43, [%r12+256];
+ max.f32 %f51, %f51, %f43;
+ st.shared.f32 [%r12], %f51;
-BB34_30:
+BB4_26:
bar.sync 0;
-BB34_31:
- setp.gt.u32 %p16, %r6, 31;
- @%p16 bra BB34_44;
+BB4_27:
+ setp.gt.u32 %p16, %r17, 31;
+ @%p16 bra BB4_40;
- setp.lt.u32 %p17, %r9, 64;
- @%p17 bra BB34_34;
+ setp.lt.u32 %p17, %r16, 64;
+ @%p17 bra BB4_30;
- ld.volatile.shared.f32 %f37, [%r4+128];
- max.f32 %f44, %f44, %f37;
- st.volatile.shared.f32 [%r4], %f44;
+ ld.volatile.shared.f32 %f44, [%r12+128];
+ max.f32 %f51, %f51, %f44;
+ st.volatile.shared.f32 [%r12], %f51;
-BB34_34:
- setp.lt.u32 %p18, %r9, 32;
- @%p18 bra BB34_36;
+BB4_30:
+ setp.lt.u32 %p18, %r16, 32;
+ @%p18 bra BB4_32;
- ld.volatile.shared.f32 %f38, [%r4+64];
- max.f32 %f44, %f44, %f38;
- st.volatile.shared.f32 [%r4], %f44;
+ ld.volatile.shared.f32 %f45, [%r12+64];
+ max.f32 %f51, %f51, %f45;
+ st.volatile.shared.f32 [%r12], %f51;
-BB34_36:
- setp.lt.u32 %p19, %r9, 16;
- @%p19 bra BB34_38;
+BB4_32:
+ setp.lt.u32 %p19, %r16, 16;
+ @%p19 bra BB4_34;
- ld.volatile.shared.f32 %f39, [%r4+32];
- max.f32 %f44, %f44, %f39;
- st.volatile.shared.f32 [%r4], %f44;
+ ld.volatile.shared.f32 %f46, [%r12+32];
+ max.f32 %f51, %f51, %f46;
+ st.volatile.shared.f32 [%r12], %f51;
-BB34_38:
- setp.lt.u32 %p20, %r9, 8;
- @%p20 bra BB34_40;
+BB4_34:
+ setp.lt.u32 %p20, %r16, 8;
+ @%p20 bra BB4_36;
- ld.volatile.shared.f32 %f40, [%r4+16];
- max.f32 %f44, %f44, %f40;
- st.volatile.shared.f32 [%r4], %f44;
+ ld.volatile.shared.f32 %f47, [%r12+16];
+ max.f32 %f51, %f51, %f47;
+ st.volatile.shared.f32 [%r12], %f51;
-BB34_40:
- setp.lt.u32 %p21, %r9, 4;
- @%p21 bra BB34_42;
+BB4_36:
+ setp.lt.u32 %p21, %r16, 4;
+ @%p21 bra BB4_38;
- ld.volatile.shared.f32 %f41, [%r4+8];
- max.f32 %f44, %f44, %f41;
- st.volatile.shared.f32 [%r4], %f44;
+ ld.volatile.shared.f32 %f48, [%r12+8];
+ max.f32 %f51, %f51, %f48;
+ st.volatile.shared.f32 [%r12], %f51;
-BB34_42:
- setp.lt.u32 %p22, %r9, 2;
- @%p22 bra BB34_44;
+BB4_38:
+ setp.lt.u32 %p22, %r16, 2;
+ @%p22 bra BB4_40;
- ld.volatile.shared.f32 %f42, [%r4+4];
- max.f32 %f43, %f44, %f42;
- st.volatile.shared.f32 [%r4], %f43;
+ ld.volatile.shared.f32 %f49, [%r12+4];
+ max.f32 %f50, %f51, %f49;
+ st.volatile.shared.f32 [%r12], %f50;
-BB34_44:
- setp.ne.s32 %p23, %r6, 0;
- @%p23 bra BB34_48;
+BB4_40:
+ setp.ne.s32 %p23, %r17, 0;
+ @%p23 bra BB4_44;
- ld.shared.f32 %f28, [memory];
- ld.local.u64 %rd114, [%rd2+96];
- add.s64 %rd11, %rd2, %rd114;
- add.s64 %rd12, %rd53, %rd114;
- ld.local.u64 %rd122, [%rd2+88];
- and.b64 %rd115, %rd122, 1;
- setp.eq.b64 %p24, %rd115, 1;
- @!%p24 bra BB34_47;
- bra.uni BB34_46;
-
-BB34_46:
- ld.local.u64 %rd116, [%rd11];
- add.s64 %rd117, %rd122, %rd116;
- ld.u64 %rd122, [%rd117+-1];
-
-BB34_47:
- mov.u32 %r42, 0;
- // Callseq Start 10
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd12;
- .param .b32 param1;
- st.param.b32 [param1+0], %r42;
- .param .b32 param2;
- st.param.b32 [param2+0], %r7;
- .param .b64 retval0;
- prototype_10 : .callprototype (.param .b64 _) _ (.param .b64 _, .param
.b32 _, .param .b32 _) ;
- call (retval0),
- %rd122,
- (
- param0,
- param1,
- param2
- )
- , prototype_10;
- ld.param.b64 %rd119, [retval0+0];
-
- //{
- }// Callseq End 10
- st.f32 [%rd119], %f28;
-
-BB34_48:
+ ld.shared.f32 %f32, [memory];
+ cvta.to.global.u64 %rd28, %rd10;
+ ld.global.u64 %rd29, [%rd28+16];
+ ld.global.u64 %rd30, [%rd28+32];
+ cvta.to.global.u64 %rd35, %rd30;
+ setp.ne.s64 %p24, %rd29, 0;
+ @%p24 bra BB4_43;
+
+ mul.wide.u32 %rd31, %r14, 4;
+ add.s64 %rd35, %rd35, %rd31;
+
+BB4_43:
+ st.global.f32 [%rd35], %f32;
+
+BB4_44:
ret;
}
@@ -1947,400 +782,206 @@ BB34_48:
.param .u32 reduce_max_d_param_2
)
{
- .local .align 8 .b8 __local_depot35[272];
- .reg .b64 %SP;
- .reg .b64 %SPL;
- .reg .pred %p<25>;
- .reg .b32 %r<44>;
+ .reg .pred %p<23>;
+ .reg .b32 %r<46>;
.reg .f64 %fd<60>;
- .reg .b64 %rd<123>;
-
-
- mov.u64 %SPL, __local_depot35;
- cvta.local.u64 %SP, %SPL;
- ld.param.u64 %rd17, [reduce_max_d_param_0];
- ld.param.u64 %rd16, [reduce_max_d_param_1];
- ld.param.u32 %r5, [reduce_max_d_param_2];
- add.u64 %rd18, %SP, 0;
- add.u64 %rd1, %SPL, 0;
- st.local.u64 [%rd1], %rd17;
- cvta.to.global.u64 %rd19, %rd17;
- ld.global.u64 %rd20, [%rd19+16];
- setp.eq.s64 %p1, %rd20, 0;
- @%p1 bra BB35_2;
-
- mov.u64 %rd21, _ZN14MatrixAccessorIdE10len_sparseEv;
- st.local.u64 [%rd1+8], %rd21;
- mov.u64 %rd23, 0;
- st.local.u64 [%rd1+16], %rd23;
- mov.u64 %rd24, _ZN14MatrixAccessorIdE10pos_sparseEj;
- st.local.u64 [%rd1+40], %rd24;
- st.local.u64 [%rd1+48], %rd23;
- mov.u64 %rd26, _ZN14MatrixAccessorIdE11cols_sparseEj;
- st.local.u64 [%rd1+56], %rd26;
- st.local.u64 [%rd1+64], %rd23;
- mov.u64 %rd28, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
- st.local.u64 [%rd1+88], %rd28;
- st.local.u64 [%rd1+96], %rd23;
- mov.u64 %rd30, _ZN14MatrixAccessorIdE11vals_sparseEj;
- st.local.u64 [%rd1+104], %rd30;
- st.local.u64 [%rd1+112], %rd23;
- mov.u64 %rd32, _ZN14MatrixAccessorIdE14row_len_sparseEj;
- st.local.u64 [%rd1+24], %rd32;
- st.local.u64 [%rd1+32], %rd23;
- mov.u64 %rd34, _ZN14MatrixAccessorIdE12val_sparse_iEj;
- st.local.u64 [%rd1+72], %rd34;
- st.local.u64 [%rd1+80], %rd23;
- mov.u64 %rd36, _ZN14MatrixAccessorIdE10set_sparseEjjd;
- st.local.u64 [%rd1+120], %rd36;
- st.local.u64 [%rd1+128], %rd23;
- bra.uni BB35_3;
-
-BB35_2:
- mov.u64 %rd38, _ZN14MatrixAccessorIdE9len_denseEv;
- st.local.u64 [%rd1+8], %rd38;
- mov.u64 %rd40, 0;
- st.local.u64 [%rd1+16], %rd40;
- mov.u64 %rd41, _ZN14MatrixAccessorIdE9pos_denseEj;
- st.local.u64 [%rd1+40], %rd41;
- st.local.u64 [%rd1+48], %rd40;
- mov.u64 %rd43, _ZN14MatrixAccessorIdE10cols_denseEj;
- st.local.u64 [%rd1+56], %rd43;
- st.local.u64 [%rd1+64], %rd40;
- mov.u64 %rd45, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
- st.local.u64 [%rd1+88], %rd45;
- st.local.u64 [%rd1+96], %rd40;
- mov.u64 %rd47, _ZN14MatrixAccessorIdE10vals_denseEj;
- st.local.u64 [%rd1+104], %rd47;
- st.local.u64 [%rd1+112], %rd40;
- mov.u64 %rd49, _ZN14MatrixAccessorIdE13row_len_denseEj;
- st.local.u64 [%rd1+24], %rd49;
- st.local.u64 [%rd1+32], %rd40;
- mov.u64 %rd51, _ZN14MatrixAccessorIdE11val_dense_iEj;
- st.local.u64 [%rd1+72], %rd51;
- st.local.u64 [%rd1+80], %rd40;
-
-BB35_3:
- add.u64 %rd53, %SP, 136;
- add.u64 %rd2, %SPL, 136;
- st.local.u64 [%rd2], %rd16;
- cvta.to.global.u64 %rd54, %rd16;
- ld.global.u64 %rd55, [%rd54+16];
- setp.eq.s64 %p2, %rd55, 0;
- @%p2 bra BB35_5;
-
- mov.u64 %rd56, _ZN14MatrixAccessorIdE10len_sparseEv;
- st.local.u64 [%rd2+8], %rd56;
- mov.u64 %rd58, 0;
- st.local.u64 [%rd2+16], %rd58;
- mov.u64 %rd59, _ZN14MatrixAccessorIdE10pos_sparseEj;
- st.local.u64 [%rd2+40], %rd59;
- st.local.u64 [%rd2+48], %rd58;
- mov.u64 %rd61, _ZN14MatrixAccessorIdE11cols_sparseEj;
- st.local.u64 [%rd2+56], %rd61;
- st.local.u64 [%rd2+64], %rd58;
- mov.u64 %rd63, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
- st.local.u64 [%rd2+88], %rd63;
- st.local.u64 [%rd2+96], %rd58;
- mov.u64 %rd65, _ZN14MatrixAccessorIdE11vals_sparseEj;
- st.local.u64 [%rd2+104], %rd65;
- st.local.u64 [%rd2+112], %rd58;
- mov.u64 %rd67, _ZN14MatrixAccessorIdE14row_len_sparseEj;
- st.local.u64 [%rd2+24], %rd67;
- st.local.u64 [%rd2+32], %rd58;
- mov.u64 %rd69, _ZN14MatrixAccessorIdE12val_sparse_iEj;
- st.local.u64 [%rd2+72], %rd69;
- st.local.u64 [%rd2+80], %rd58;
- mov.u64 %rd71, _ZN14MatrixAccessorIdE10set_sparseEjjd;
- st.local.u64 [%rd2+120], %rd71;
- st.local.u64 [%rd2+128], %rd58;
- bra.uni BB35_6;
-
-BB35_5:
- mov.u64 %rd73, _ZN14MatrixAccessorIdE9len_denseEv;
- st.local.u64 [%rd2+8], %rd73;
- mov.u64 %rd75, 0;
- st.local.u64 [%rd2+16], %rd75;
- mov.u64 %rd76, _ZN14MatrixAccessorIdE9pos_denseEj;
- st.local.u64 [%rd2+40], %rd76;
- st.local.u64 [%rd2+48], %rd75;
- mov.u64 %rd78, _ZN14MatrixAccessorIdE10cols_denseEj;
- st.local.u64 [%rd2+56], %rd78;
- st.local.u64 [%rd2+64], %rd75;
- mov.u64 %rd80, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
- st.local.u64 [%rd2+88], %rd80;
- st.local.u64 [%rd2+96], %rd75;
- mov.u64 %rd82, _ZN14MatrixAccessorIdE10vals_denseEj;
- st.local.u64 [%rd2+104], %rd82;
- st.local.u64 [%rd2+112], %rd75;
- mov.u64 %rd84, _ZN14MatrixAccessorIdE13row_len_denseEj;
- st.local.u64 [%rd2+24], %rd84;
- st.local.u64 [%rd2+32], %rd75;
- mov.u64 %rd86, _ZN14MatrixAccessorIdE11val_dense_iEj;
- st.local.u64 [%rd2+72], %rd86;
- st.local.u64 [%rd2+80], %rd75;
-
-BB35_6:
- mov.u32 %r6, %tid.x;
- mov.u32 %r7, %ctaid.x;
- shl.b32 %r8, %r7, 1;
- mov.u32 %r9, %ntid.x;
- mad.lo.s32 %r43, %r8, %r9, %r6;
- mov.f64 %fd44, 0dFFF0000000000000;
- setp.ge.u32 %p3, %r43, %r5;
- @%p3 bra BB35_15;
+ .reg .b64 %rd<34>;
+
+ ld.param.u64 %rd10, [reduce_max_d_param_0];
+ ld.param.u64 %rd11, [reduce_max_d_param_1];
+ ld.param.u32 %r10, [reduce_max_d_param_2];
+ mov.u32 %r11, %tid.x;
+ mov.u32 %r12, %ctaid.x;
+ shl.b32 %r13, %r12, 1;
+ mov.u32 %r14, %ntid.x;
+ mad.lo.s32 %r43, %r13, %r14, %r11;
mov.f64 %fd44, 0dFFF0000000000000;
+ setp.ge.u32 %p1, %r43, %r10;
+ @%p1 bra BB5_9;
-BB35_8:
- ld.local.u64 %rd3, [%rd1+112];
- ld.local.u64 %rd120, [%rd1+104];
- and.b64 %rd90, %rd120, 1;
- setp.eq.b64 %p4, %rd90, 1;
- @!%p4 bra BB35_10;
- bra.uni BB35_9;
-
-BB35_9:
- add.s64 %rd93, %rd1, %rd3;
- ld.local.u64 %rd94, [%rd93];
- add.s64 %rd95, %rd120, %rd94;
- ld.u64 %rd120, [%rd95+-1];
-
-BB35_10:
- add.s64 %rd97, %rd18, %rd3;
- // Callseq Start 11
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd97;
- .param .b32 param1;
- st.param.b32 [param1+0], %r43;
- .param .b64 retval0;
- prototype_11 : .callprototype (.param .b64 _) _ (.param .b64 _, .param
.b32 _) ;
- call (retval0),
- %rd120,
- (
- param0,
- param1
- )
- , prototype_11;
- ld.param.b64 %rd99, [retval0+0];
-
- //{
- }// Callseq End 11
- ld.f64 %fd31, [%rd99];
+ cvta.to.global.u64 %rd12, %rd10;
+ ld.global.u64 %rd1, [%rd12+16];
+ ld.global.u64 %rd13, [%rd12+32];
+ cvta.to.global.u64 %rd2, %rd13;
+ mov.f64 %fd44, 0dFFF0000000000000;
+ mov.u64 %rd30, %rd1;
+
+BB5_2:
+ setp.eq.s64 %p2, %rd1, 0;
+ mov.u32 %r44, %r43;
+ @%p2 bra BB5_4;
+
+ cvta.to.global.u64 %rd14, %rd1;
+ mul.wide.u32 %rd15, %r43, 4;
+ add.s64 %rd16, %rd14, %rd15;
+ ld.global.u32 %r44, [%rd16];
+ mov.u64 %rd30, %rd1;
+
+BB5_4:
+ mul.wide.u32 %rd17, %r44, 8;
+ add.s64 %rd18, %rd2, %rd17;
+ ld.global.f64 %fd31, [%rd18];
max.f64 %fd44, %fd44, %fd31;
- add.s32 %r16, %r43, %r9;
- setp.ge.u32 %p5, %r16, %r5;
- @%p5 bra BB35_14;
-
- ld.local.u64 %rd121, [%rd1+104];
- and.b64 %rd102, %rd121, 1;
- setp.eq.b64 %p6, %rd102, 1;
- ld.local.u64 %rd8, [%rd1+112];
- @!%p6 bra BB35_13;
- bra.uni BB35_12;
-
-BB35_12:
- add.s64 %rd105, %rd1, %rd8;
- ld.local.u64 %rd106, [%rd105];
- add.s64 %rd107, %rd121, %rd106;
- ld.u64 %rd121, [%rd107+-1];
-
-BB35_13:
- add.s64 %rd109, %rd18, %rd8;
- // Callseq Start 12
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd109;
- .param .b32 param1;
- st.param.b32 [param1+0], %r16;
- .param .b64 retval0;
- prototype_12 : .callprototype (.param .b64 _) _ (.param .b64 _, .param
.b32 _) ;
- call (retval0),
- %rd121,
- (
- param0,
- param1
- )
- , prototype_12;
- ld.param.b64 %rd111, [retval0+0];
-
- //{
- }// Callseq End 12
- ld.f64 %fd32, [%rd111];
+ add.s32 %r45, %r43, %r14;
+ setp.ge.u32 %p3, %r45, %r10;
+ @%p3 bra BB5_8;
+
+ setp.eq.s64 %p4, %rd30, 0;
+ mov.u64 %rd30, 0;
+ @%p4 bra BB5_7;
+
+ cvta.to.global.u64 %rd20, %rd1;
+ add.s32 %r19, %r43, %r14;
+ mul.wide.u32 %rd21, %r19, 4;
+ add.s64 %rd22, %rd20, %rd21;
+ ld.global.u32 %r45, [%rd22];
+ mov.u64 %rd30, %rd1;
+
+BB5_7:
+ mul.wide.u32 %rd23, %r45, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.f64 %fd32, [%rd24];
max.f64 %fd44, %fd44, %fd32;
-BB35_14:
- shl.b32 %r20, %r9, 1;
- mov.u32 %r21, %nctaid.x;
- mad.lo.s32 %r43, %r20, %r21, %r43;
- setp.lt.u32 %p7, %r43, %r5;
- @%p7 bra BB35_8;
-
-BB35_15:
- shl.b32 %r23, %r6, 3;
- mov.u32 %r24, memory;
- add.s32 %r4, %r24, %r23;
- st.shared.f64 [%r4], %fd44;
+BB5_8:
+ shl.b32 %r21, %r14, 1;
+ mov.u32 %r22, %nctaid.x;
+ mad.lo.s32 %r43, %r21, %r22, %r43;
+ setp.lt.u32 %p5, %r43, %r10;
+ @%p5 bra BB5_2;
+
+BB5_9:
+ shl.b32 %r24, %r11, 3;
+ mov.u32 %r25, memory;
+ add.s32 %r9, %r25, %r24;
+ st.shared.f64 [%r9], %fd44;
bar.sync 0;
- setp.lt.u32 %p8, %r9, 1024;
- @%p8 bra BB35_19;
+ setp.lt.u32 %p6, %r14, 1024;
+ @%p6 bra BB5_13;
- setp.gt.u32 %p9, %r6, 511;
- @%p9 bra BB35_18;
+ setp.gt.u32 %p7, %r11, 511;
+ @%p7 bra BB5_12;
- ld.shared.f64 %fd33, [%r4+4096];
+ ld.shared.f64 %fd33, [%r9+4096];
max.f64 %fd44, %fd44, %fd33;
- st.shared.f64 [%r4], %fd44;
+ st.shared.f64 [%r9], %fd44;
-BB35_18:
+BB5_12:
bar.sync 0;
-BB35_19:
- setp.lt.u32 %p10, %r9, 512;
- @%p10 bra BB35_23;
+BB5_13:
+ setp.lt.u32 %p8, %r14, 512;
+ @%p8 bra BB5_17;
- setp.gt.u32 %p11, %r6, 255;
- @%p11 bra BB35_22;
+ setp.gt.u32 %p9, %r11, 255;
+ @%p9 bra BB5_16;
- ld.shared.f64 %fd34, [%r4+2048];
+ ld.shared.f64 %fd34, [%r9+2048];
max.f64 %fd44, %fd44, %fd34;
- st.shared.f64 [%r4], %fd44;
+ st.shared.f64 [%r9], %fd44;
-BB35_22:
+BB5_16:
bar.sync 0;
-BB35_23:
- setp.lt.u32 %p12, %r9, 256;
- @%p12 bra BB35_27;
+BB5_17:
+ setp.lt.u32 %p10, %r14, 256;
+ @%p10 bra BB5_21;
- setp.gt.u32 %p13, %r6, 127;
- @%p13 bra BB35_26;
+ setp.gt.u32 %p11, %r11, 127;
+ @%p11 bra BB5_20;
- ld.shared.f64 %fd35, [%r4+1024];
+ ld.shared.f64 %fd35, [%r9+1024];
max.f64 %fd44, %fd44, %fd35;
- st.shared.f64 [%r4], %fd44;
+ st.shared.f64 [%r9], %fd44;
-BB35_26:
+BB5_20:
bar.sync 0;
-BB35_27:
- setp.lt.u32 %p14, %r9, 128;
- @%p14 bra BB35_31;
+BB5_21:
+ setp.lt.u32 %p12, %r14, 128;
+ @%p12 bra BB5_25;
- setp.gt.u32 %p15, %r6, 63;
- @%p15 bra BB35_30;
+ setp.gt.u32 %p13, %r11, 63;
+ @%p13 bra BB5_24;
- ld.shared.f64 %fd36, [%r4+512];
+ ld.shared.f64 %fd36, [%r9+512];
max.f64 %fd44, %fd44, %fd36;
- st.shared.f64 [%r4], %fd44;
+ st.shared.f64 [%r9], %fd44;
-BB35_30:
+BB5_24:
bar.sync 0;
-BB35_31:
- setp.gt.u32 %p16, %r6, 31;
- @%p16 bra BB35_44;
+BB5_25:
+ setp.gt.u32 %p14, %r11, 31;
+ @%p14 bra BB5_38;
- setp.lt.u32 %p17, %r9, 64;
- @%p17 bra BB35_34;
+ setp.lt.u32 %p15, %r14, 64;
+ @%p15 bra BB5_28;
- ld.volatile.shared.f64 %fd37, [%r4+256];
+ ld.volatile.shared.f64 %fd37, [%r9+256];
max.f64 %fd44, %fd44, %fd37;
- st.volatile.shared.f64 [%r4], %fd44;
+ st.volatile.shared.f64 [%r9], %fd44;
-BB35_34:
- setp.lt.u32 %p18, %r9, 32;
- @%p18 bra BB35_36;
+BB5_28:
+ setp.lt.u32 %p16, %r14, 32;
+ @%p16 bra BB5_30;
- ld.volatile.shared.f64 %fd38, [%r4+128];
+ ld.volatile.shared.f64 %fd38, [%r9+128];
max.f64 %fd44, %fd44, %fd38;
- st.volatile.shared.f64 [%r4], %fd44;
+ st.volatile.shared.f64 [%r9], %fd44;
-BB35_36:
- setp.lt.u32 %p19, %r9, 16;
- @%p19 bra BB35_38;
+BB5_30:
+ setp.lt.u32 %p17, %r14, 16;
+ @%p17 bra BB5_32;
- ld.volatile.shared.f64 %fd39, [%r4+64];
+ ld.volatile.shared.f64 %fd39, [%r9+64];
max.f64 %fd44, %fd44, %fd39;
- st.volatile.shared.f64 [%r4], %fd44;
+ st.volatile.shared.f64 [%r9], %fd44;
-BB35_38:
- setp.lt.u32 %p20, %r9, 8;
- @%p20 bra BB35_40;
+BB5_32:
+ setp.lt.u32 %p18, %r14, 8;
+ @%p18 bra BB5_34;
- ld.volatile.shared.f64 %fd40, [%r4+32];
+ ld.volatile.shared.f64 %fd40, [%r9+32];
max.f64 %fd44, %fd44, %fd40;
- st.volatile.shared.f64 [%r4], %fd44;
+ st.volatile.shared.f64 [%r9], %fd44;
-BB35_40:
- setp.lt.u32 %p21, %r9, 4;
- @%p21 bra BB35_42;
+BB5_34:
+ setp.lt.u32 %p19, %r14, 4;
+ @%p19 bra BB5_36;
- ld.volatile.shared.f64 %fd41, [%r4+16];
+ ld.volatile.shared.f64 %fd41, [%r9+16];
max.f64 %fd44, %fd44, %fd41;
- st.volatile.shared.f64 [%r4], %fd44;
+ st.volatile.shared.f64 [%r9], %fd44;
-BB35_42:
- setp.lt.u32 %p22, %r9, 2;
- @%p22 bra BB35_44;
+BB5_36:
+ setp.lt.u32 %p20, %r14, 2;
+ @%p20 bra BB5_38;
- ld.volatile.shared.f64 %fd42, [%r4+8];
+ ld.volatile.shared.f64 %fd42, [%r9+8];
max.f64 %fd43, %fd44, %fd42;
- st.volatile.shared.f64 [%r4], %fd43;
+ st.volatile.shared.f64 [%r9], %fd43;
-BB35_44:
- setp.ne.s32 %p23, %r6, 0;
- @%p23 bra BB35_48;
+BB5_38:
+ setp.ne.s32 %p21, %r11, 0;
+ @%p21 bra BB5_42;
ld.shared.f64 %fd28, [memory];
- ld.local.u64 %rd114, [%rd2+96];
- add.s64 %rd11, %rd2, %rd114;
- add.s64 %rd12, %rd53, %rd114;
- ld.local.u64 %rd122, [%rd2+88];
- and.b64 %rd115, %rd122, 1;
- setp.eq.b64 %p24, %rd115, 1;
- @!%p24 bra BB35_47;
- bra.uni BB35_46;
-
-BB35_46:
- ld.local.u64 %rd116, [%rd11];
- add.s64 %rd117, %rd122, %rd116;
- ld.u64 %rd122, [%rd117+-1];
-
-BB35_47:
- mov.u32 %r42, 0;
- // Callseq Start 13
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd12;
- .param .b32 param1;
- st.param.b32 [param1+0], %r42;
- .param .b32 param2;
- st.param.b32 [param2+0], %r7;
- .param .b64 retval0;
- prototype_13 : .callprototype (.param .b64 _) _ (.param .b64 _, .param
.b32 _, .param .b32 _) ;
- call (retval0),
- %rd122,
- (
- param0,
- param1,
- param2
- )
- , prototype_13;
- ld.param.b64 %rd119, [retval0+0];
-
- //{
- }// Callseq End 13
- st.f64 [%rd119], %fd28;
-
-BB35_48:
+ cvta.to.global.u64 %rd25, %rd11;
+ ld.global.u64 %rd26, [%rd25+16];
+ ld.global.u64 %rd27, [%rd25+32];
+ cvta.to.global.u64 %rd33, %rd27;
+ setp.ne.s64 %p22, %rd26, 0;
+ @%p22 bra BB5_41;
+
+ mul.wide.u32 %rd28, %r12, 8;
+ add.s64 %rd33, %rd33, %rd28;
+
+BB5_41:
+ st.global.f64 [%rd33], %fd28;
+
+BB5_42:
ret;
}
@@ -2351,400 +992,206 @@ BB35_48:
.param .u32 reduce_min_f_param_2
)
{
- .local .align 8 .b8 __local_depot36[272];
- .reg .b64 %SP;
- .reg .b64 %SPL;
- .reg .pred %p<25>;
+ .reg .pred %p<23>;
.reg .f32 %f<60>;
- .reg .b32 %r<44>;
- .reg .b64 %rd<123>;
-
-
- mov.u64 %SPL, __local_depot36;
- cvta.local.u64 %SP, %SPL;
- ld.param.u64 %rd17, [reduce_min_f_param_0];
- ld.param.u64 %rd16, [reduce_min_f_param_1];
- ld.param.u32 %r5, [reduce_min_f_param_2];
- add.u64 %rd18, %SP, 0;
- add.u64 %rd1, %SPL, 0;
- st.local.u64 [%rd1], %rd17;
- cvta.to.global.u64 %rd19, %rd17;
- ld.global.u64 %rd20, [%rd19+16];
- setp.eq.s64 %p1, %rd20, 0;
- @%p1 bra BB36_2;
-
- mov.u64 %rd21, _ZN14MatrixAccessorIfE10len_sparseEv;
- st.local.u64 [%rd1+8], %rd21;
- mov.u64 %rd23, 0;
- st.local.u64 [%rd1+16], %rd23;
- mov.u64 %rd24, _ZN14MatrixAccessorIfE10pos_sparseEj;
- st.local.u64 [%rd1+40], %rd24;
- st.local.u64 [%rd1+48], %rd23;
- mov.u64 %rd26, _ZN14MatrixAccessorIfE11cols_sparseEj;
- st.local.u64 [%rd1+56], %rd26;
- st.local.u64 [%rd1+64], %rd23;
- mov.u64 %rd28, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
- st.local.u64 [%rd1+88], %rd28;
- st.local.u64 [%rd1+96], %rd23;
- mov.u64 %rd30, _ZN14MatrixAccessorIfE11vals_sparseEj;
- st.local.u64 [%rd1+104], %rd30;
- st.local.u64 [%rd1+112], %rd23;
- mov.u64 %rd32, _ZN14MatrixAccessorIfE14row_len_sparseEj;
- st.local.u64 [%rd1+24], %rd32;
- st.local.u64 [%rd1+32], %rd23;
- mov.u64 %rd34, _ZN14MatrixAccessorIfE12val_sparse_iEj;
- st.local.u64 [%rd1+72], %rd34;
- st.local.u64 [%rd1+80], %rd23;
- mov.u64 %rd36, _ZN14MatrixAccessorIfE10set_sparseEjjf;
- st.local.u64 [%rd1+120], %rd36;
- st.local.u64 [%rd1+128], %rd23;
- bra.uni BB36_3;
-
-BB36_2:
- mov.u64 %rd38, _ZN14MatrixAccessorIfE9len_denseEv;
- st.local.u64 [%rd1+8], %rd38;
- mov.u64 %rd40, 0;
- st.local.u64 [%rd1+16], %rd40;
- mov.u64 %rd41, _ZN14MatrixAccessorIfE9pos_denseEj;
- st.local.u64 [%rd1+40], %rd41;
- st.local.u64 [%rd1+48], %rd40;
- mov.u64 %rd43, _ZN14MatrixAccessorIfE10cols_denseEj;
- st.local.u64 [%rd1+56], %rd43;
- st.local.u64 [%rd1+64], %rd40;
- mov.u64 %rd45, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
- st.local.u64 [%rd1+88], %rd45;
- st.local.u64 [%rd1+96], %rd40;
- mov.u64 %rd47, _ZN14MatrixAccessorIfE10vals_denseEj;
- st.local.u64 [%rd1+104], %rd47;
- st.local.u64 [%rd1+112], %rd40;
- mov.u64 %rd49, _ZN14MatrixAccessorIfE13row_len_denseEj;
- st.local.u64 [%rd1+24], %rd49;
- st.local.u64 [%rd1+32], %rd40;
- mov.u64 %rd51, _ZN14MatrixAccessorIfE11val_dense_iEj;
- st.local.u64 [%rd1+72], %rd51;
- st.local.u64 [%rd1+80], %rd40;
-
-BB36_3:
- add.u64 %rd53, %SP, 136;
- add.u64 %rd2, %SPL, 136;
- st.local.u64 [%rd2], %rd16;
- cvta.to.global.u64 %rd54, %rd16;
- ld.global.u64 %rd55, [%rd54+16];
- setp.eq.s64 %p2, %rd55, 0;
- @%p2 bra BB36_5;
-
- mov.u64 %rd56, _ZN14MatrixAccessorIfE10len_sparseEv;
- st.local.u64 [%rd2+8], %rd56;
- mov.u64 %rd58, 0;
- st.local.u64 [%rd2+16], %rd58;
- mov.u64 %rd59, _ZN14MatrixAccessorIfE10pos_sparseEj;
- st.local.u64 [%rd2+40], %rd59;
- st.local.u64 [%rd2+48], %rd58;
- mov.u64 %rd61, _ZN14MatrixAccessorIfE11cols_sparseEj;
- st.local.u64 [%rd2+56], %rd61;
- st.local.u64 [%rd2+64], %rd58;
- mov.u64 %rd63, _ZN14MatrixAccessorIfE13val_sparse_rcEjj;
- st.local.u64 [%rd2+88], %rd63;
- st.local.u64 [%rd2+96], %rd58;
- mov.u64 %rd65, _ZN14MatrixAccessorIfE11vals_sparseEj;
- st.local.u64 [%rd2+104], %rd65;
- st.local.u64 [%rd2+112], %rd58;
- mov.u64 %rd67, _ZN14MatrixAccessorIfE14row_len_sparseEj;
- st.local.u64 [%rd2+24], %rd67;
- st.local.u64 [%rd2+32], %rd58;
- mov.u64 %rd69, _ZN14MatrixAccessorIfE12val_sparse_iEj;
- st.local.u64 [%rd2+72], %rd69;
- st.local.u64 [%rd2+80], %rd58;
- mov.u64 %rd71, _ZN14MatrixAccessorIfE10set_sparseEjjf;
- st.local.u64 [%rd2+120], %rd71;
- st.local.u64 [%rd2+128], %rd58;
- bra.uni BB36_6;
-
-BB36_5:
- mov.u64 %rd73, _ZN14MatrixAccessorIfE9len_denseEv;
- st.local.u64 [%rd2+8], %rd73;
- mov.u64 %rd75, 0;
- st.local.u64 [%rd2+16], %rd75;
- mov.u64 %rd76, _ZN14MatrixAccessorIfE9pos_denseEj;
- st.local.u64 [%rd2+40], %rd76;
- st.local.u64 [%rd2+48], %rd75;
- mov.u64 %rd78, _ZN14MatrixAccessorIfE10cols_denseEj;
- st.local.u64 [%rd2+56], %rd78;
- st.local.u64 [%rd2+64], %rd75;
- mov.u64 %rd80, _ZN14MatrixAccessorIfE12val_dense_rcEjj;
- st.local.u64 [%rd2+88], %rd80;
- st.local.u64 [%rd2+96], %rd75;
- mov.u64 %rd82, _ZN14MatrixAccessorIfE10vals_denseEj;
- st.local.u64 [%rd2+104], %rd82;
- st.local.u64 [%rd2+112], %rd75;
- mov.u64 %rd84, _ZN14MatrixAccessorIfE13row_len_denseEj;
- st.local.u64 [%rd2+24], %rd84;
- st.local.u64 [%rd2+32], %rd75;
- mov.u64 %rd86, _ZN14MatrixAccessorIfE11val_dense_iEj;
- st.local.u64 [%rd2+72], %rd86;
- st.local.u64 [%rd2+80], %rd75;
-
-BB36_6:
- mov.u32 %r6, %tid.x;
- mov.u32 %r7, %ctaid.x;
- shl.b32 %r8, %r7, 1;
- mov.u32 %r9, %ntid.x;
- mad.lo.s32 %r43, %r8, %r9, %r6;
+ .reg .b32 %r<46>;
+ .reg .b64 %rd<34>;
+
+
+ ld.param.u64 %rd10, [reduce_min_f_param_0];
+ ld.param.u64 %rd11, [reduce_min_f_param_1];
+ ld.param.u32 %r10, [reduce_min_f_param_2];
+ mov.u32 %r11, %tid.x;
+ mov.u32 %r12, %ctaid.x;
+ shl.b32 %r13, %r12, 1;
+ mov.u32 %r14, %ntid.x;
+ mad.lo.s32 %r43, %r13, %r14, %r11;
mov.f32 %f44, 0f7F800000;
- setp.ge.u32 %p3, %r43, %r5;
- @%p3 bra BB36_15;
+ setp.ge.u32 %p1, %r43, %r10;
+ @%p1 bra BB6_9;
+ cvta.to.global.u64 %rd12, %rd10;
+ ld.global.u64 %rd1, [%rd12+16];
+ ld.global.u64 %rd13, [%rd12+32];
+ cvta.to.global.u64 %rd2, %rd13;
mov.f32 %f44, 0f7F800000;
-
-BB36_8:
- ld.local.u64 %rd3, [%rd1+112];
- ld.local.u64 %rd120, [%rd1+104];
- and.b64 %rd90, %rd120, 1;
- setp.eq.b64 %p4, %rd90, 1;
- @!%p4 bra BB36_10;
- bra.uni BB36_9;
-
-BB36_9:
- add.s64 %rd93, %rd1, %rd3;
- ld.local.u64 %rd94, [%rd93];
- add.s64 %rd95, %rd120, %rd94;
- ld.u64 %rd120, [%rd95+-1];
-
-BB36_10:
- add.s64 %rd97, %rd18, %rd3;
- // Callseq Start 14
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd97;
- .param .b32 param1;
- st.param.b32 [param1+0], %r43;
- .param .b64 retval0;
- prototype_14 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
- call (retval0),
- %rd120,
- (
- param0,
- param1
- )
- , prototype_14;
- ld.param.b64 %rd99, [retval0+0];
-
- //{
- }// Callseq End 14
- ld.f32 %f31, [%rd99];
+ mov.u64 %rd30, %rd1;
+
+BB6_2:
+ setp.eq.s64 %p2, %rd1, 0;
+ mov.u32 %r44, %r43;
+ @%p2 bra BB6_4;
+
+ cvta.to.global.u64 %rd14, %rd1;
+ mul.wide.u32 %rd15, %r43, 4;
+ add.s64 %rd16, %rd14, %rd15;
+ ld.global.u32 %r44, [%rd16];
+ mov.u64 %rd30, %rd1;
+
+BB6_4:
+ mul.wide.u32 %rd17, %r44, 4;
+ add.s64 %rd18, %rd2, %rd17;
+ ld.global.f32 %f31, [%rd18];
min.f32 %f44, %f44, %f31;
- add.s32 %r16, %r43, %r9;
- setp.ge.u32 %p5, %r16, %r5;
- @%p5 bra BB36_14;
-
- ld.local.u64 %rd121, [%rd1+104];
- and.b64 %rd102, %rd121, 1;
- setp.eq.b64 %p6, %rd102, 1;
- ld.local.u64 %rd8, [%rd1+112];
- @!%p6 bra BB36_13;
- bra.uni BB36_12;
-
-BB36_12:
- add.s64 %rd105, %rd1, %rd8;
- ld.local.u64 %rd106, [%rd105];
- add.s64 %rd107, %rd121, %rd106;
- ld.u64 %rd121, [%rd107+-1];
-
-BB36_13:
- add.s64 %rd109, %rd18, %rd8;
- // Callseq Start 15
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd109;
- .param .b32 param1;
- st.param.b32 [param1+0], %r16;
- .param .b64 retval0;
- prototype_15 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
- call (retval0),
- %rd121,
- (
- param0,
- param1
- )
- , prototype_15;
- ld.param.b64 %rd111, [retval0+0];
-
- //{
- }// Callseq End 15
- ld.f32 %f32, [%rd111];
+ add.s32 %r45, %r43, %r14;
+ setp.ge.u32 %p3, %r45, %r10;
+ @%p3 bra BB6_8;
+
+ setp.eq.s64 %p4, %rd30, 0;
+ mov.u64 %rd30, 0;
+ @%p4 bra BB6_7;
+
+ cvta.to.global.u64 %rd20, %rd1;
+ add.s32 %r19, %r43, %r14;
+ mul.wide.u32 %rd21, %r19, 4;
+ add.s64 %rd22, %rd20, %rd21;
+ ld.global.u32 %r45, [%rd22];
+ mov.u64 %rd30, %rd1;
+
+BB6_7:
+ mul.wide.u32 %rd23, %r45, 4;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.f32 %f32, [%rd24];
min.f32 %f44, %f44, %f32;
-BB36_14:
- shl.b32 %r20, %r9, 1;
- mov.u32 %r21, %nctaid.x;
- mad.lo.s32 %r43, %r20, %r21, %r43;
- setp.lt.u32 %p7, %r43, %r5;
- @%p7 bra BB36_8;
-
-BB36_15:
- shl.b32 %r23, %r6, 2;
- mov.u32 %r24, memory;
- add.s32 %r4, %r24, %r23;
- st.shared.f32 [%r4], %f44;
+BB6_8:
+ shl.b32 %r21, %r14, 1;
+ mov.u32 %r22, %nctaid.x;
+ mad.lo.s32 %r43, %r21, %r22, %r43;
+ setp.lt.u32 %p5, %r43, %r10;
+ @%p5 bra BB6_2;
+
+BB6_9:
+ shl.b32 %r24, %r11, 2;
+ mov.u32 %r25, memory;
+ add.s32 %r9, %r25, %r24;
+ st.shared.f32 [%r9], %f44;
bar.sync 0;
- setp.lt.u32 %p8, %r9, 1024;
- @%p8 bra BB36_19;
+ setp.lt.u32 %p6, %r14, 1024;
+ @%p6 bra BB6_13;
- setp.gt.u32 %p9, %r6, 511;
- @%p9 bra BB36_18;
+ setp.gt.u32 %p7, %r11, 511;
+ @%p7 bra BB6_12;
- ld.shared.f32 %f33, [%r4+2048];
+ ld.shared.f32 %f33, [%r9+2048];
min.f32 %f44, %f44, %f33;
- st.shared.f32 [%r4], %f44;
+ st.shared.f32 [%r9], %f44;
-BB36_18:
+BB6_12:
bar.sync 0;
-BB36_19:
- setp.lt.u32 %p10, %r9, 512;
- @%p10 bra BB36_23;
+BB6_13:
+ setp.lt.u32 %p8, %r14, 512;
+ @%p8 bra BB6_17;
- setp.gt.u32 %p11, %r6, 255;
- @%p11 bra BB36_22;
+ setp.gt.u32 %p9, %r11, 255;
+ @%p9 bra BB6_16;
- ld.shared.f32 %f34, [%r4+1024];
+ ld.shared.f32 %f34, [%r9+1024];
min.f32 %f44, %f44, %f34;
- st.shared.f32 [%r4], %f44;
+ st.shared.f32 [%r9], %f44;
-BB36_22:
+BB6_16:
bar.sync 0;
-BB36_23:
- setp.lt.u32 %p12, %r9, 256;
- @%p12 bra BB36_27;
+BB6_17:
+ setp.lt.u32 %p10, %r14, 256;
+ @%p10 bra BB6_21;
- setp.gt.u32 %p13, %r6, 127;
- @%p13 bra BB36_26;
+ setp.gt.u32 %p11, %r11, 127;
+ @%p11 bra BB6_20;
- ld.shared.f32 %f35, [%r4+512];
+ ld.shared.f32 %f35, [%r9+512];
min.f32 %f44, %f44, %f35;
- st.shared.f32 [%r4], %f44;
+ st.shared.f32 [%r9], %f44;
-BB36_26:
+BB6_20:
bar.sync 0;
-BB36_27:
- setp.lt.u32 %p14, %r9, 128;
- @%p14 bra BB36_31;
+BB6_21:
+ setp.lt.u32 %p12, %r14, 128;
+ @%p12 bra BB6_25;
- setp.gt.u32 %p15, %r6, 63;
- @%p15 bra BB36_30;
+ setp.gt.u32 %p13, %r11, 63;
+ @%p13 bra BB6_24;
- ld.shared.f32 %f36, [%r4+256];
+ ld.shared.f32 %f36, [%r9+256];
min.f32 %f44, %f44, %f36;
- st.shared.f32 [%r4], %f44;
+ st.shared.f32 [%r9], %f44;
-BB36_30:
+BB6_24:
bar.sync 0;
-BB36_31:
- setp.gt.u32 %p16, %r6, 31;
- @%p16 bra BB36_44;
+BB6_25:
+ setp.gt.u32 %p14, %r11, 31;
+ @%p14 bra BB6_38;
- setp.lt.u32 %p17, %r9, 64;
- @%p17 bra BB36_34;
+ setp.lt.u32 %p15, %r14, 64;
+ @%p15 bra BB6_28;
- ld.volatile.shared.f32 %f37, [%r4+128];
+ ld.volatile.shared.f32 %f37, [%r9+128];
min.f32 %f44, %f44, %f37;
- st.volatile.shared.f32 [%r4], %f44;
+ st.volatile.shared.f32 [%r9], %f44;
-BB36_34:
- setp.lt.u32 %p18, %r9, 32;
- @%p18 bra BB36_36;
+BB6_28:
+ setp.lt.u32 %p16, %r14, 32;
+ @%p16 bra BB6_30;
- ld.volatile.shared.f32 %f38, [%r4+64];
+ ld.volatile.shared.f32 %f38, [%r9+64];
min.f32 %f44, %f44, %f38;
- st.volatile.shared.f32 [%r4], %f44;
+ st.volatile.shared.f32 [%r9], %f44;
-BB36_36:
- setp.lt.u32 %p19, %r9, 16;
- @%p19 bra BB36_38;
+BB6_30:
+ setp.lt.u32 %p17, %r14, 16;
+ @%p17 bra BB6_32;
- ld.volatile.shared.f32 %f39, [%r4+32];
+ ld.volatile.shared.f32 %f39, [%r9+32];
min.f32 %f44, %f44, %f39;
- st.volatile.shared.f32 [%r4], %f44;
+ st.volatile.shared.f32 [%r9], %f44;
-BB36_38:
- setp.lt.u32 %p20, %r9, 8;
- @%p20 bra BB36_40;
+BB6_32:
+ setp.lt.u32 %p18, %r14, 8;
+ @%p18 bra BB6_34;
- ld.volatile.shared.f32 %f40, [%r4+16];
+ ld.volatile.shared.f32 %f40, [%r9+16];
min.f32 %f44, %f44, %f40;
- st.volatile.shared.f32 [%r4], %f44;
+ st.volatile.shared.f32 [%r9], %f44;
-BB36_40:
- setp.lt.u32 %p21, %r9, 4;
- @%p21 bra BB36_42;
+BB6_34:
+ setp.lt.u32 %p19, %r14, 4;
+ @%p19 bra BB6_36;
- ld.volatile.shared.f32 %f41, [%r4+8];
+ ld.volatile.shared.f32 %f41, [%r9+8];
min.f32 %f44, %f44, %f41;
- st.volatile.shared.f32 [%r4], %f44;
+ st.volatile.shared.f32 [%r9], %f44;
-BB36_42:
- setp.lt.u32 %p22, %r9, 2;
- @%p22 bra BB36_44;
+BB6_36:
+ setp.lt.u32 %p20, %r14, 2;
+ @%p20 bra BB6_38;
- ld.volatile.shared.f32 %f42, [%r4+4];
+ ld.volatile.shared.f32 %f42, [%r9+4];
min.f32 %f43, %f44, %f42;
- st.volatile.shared.f32 [%r4], %f43;
+ st.volatile.shared.f32 [%r9], %f43;
-BB36_44:
- setp.ne.s32 %p23, %r6, 0;
- @%p23 bra BB36_48;
+BB6_38:
+ setp.ne.s32 %p21, %r11, 0;
+ @%p21 bra BB6_42;
ld.shared.f32 %f28, [memory];
- ld.local.u64 %rd114, [%rd2+96];
- add.s64 %rd11, %rd2, %rd114;
- add.s64 %rd12, %rd53, %rd114;
- ld.local.u64 %rd122, [%rd2+88];
- and.b64 %rd115, %rd122, 1;
- setp.eq.b64 %p24, %rd115, 1;
- @!%p24 bra BB36_47;
- bra.uni BB36_46;
-
-BB36_46:
- ld.local.u64 %rd116, [%rd11];
- add.s64 %rd117, %rd122, %rd116;
- ld.u64 %rd122, [%rd117+-1];
-
-BB36_47:
- mov.u32 %r42, 0;
- // Callseq Start 16
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd12;
- .param .b32 param1;
- st.param.b32 [param1+0], %r42;
- .param .b32 param2;
- st.param.b32 [param2+0], %r7;
- .param .b64 retval0;
- prototype_16 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
- call (retval0),
- %rd122,
- (
- param0,
- param1,
- param2
- )
- , prototype_16;
- ld.param.b64 %rd119, [retval0+0];
-
- //{
- }// Callseq End 16
- st.f32 [%rd119], %f28;
-
-BB36_48:
+ cvta.to.global.u64 %rd25, %rd11;
+ ld.global.u64 %rd26, [%rd25+16];
+ ld.global.u64 %rd27, [%rd25+32];
+ cvta.to.global.u64 %rd33, %rd27;
+ setp.ne.s64 %p22, %rd26, 0;
+ @%p22 bra BB6_41;
+
+ mul.wide.u32 %rd28, %r12, 4;
+ add.s64 %rd33, %rd33, %rd28;
+
+BB6_41:
+ st.global.f32 [%rd33], %f28;
+
+BB6_42:
ret;
}
@@ -2755,400 +1202,206 @@ BB36_48:
.param .u32 reduce_min_d_param_2
)
{
- .local .align 8 .b8 __local_depot37[272];
- .reg .b64 %SP;
- .reg .b64 %SPL;
- .reg .pred %p<25>;
- .reg .b32 %r<44>;
+ .reg .pred %p<23>;
+ .reg .b32 %r<46>;
.reg .f64 %fd<60>;
- .reg .b64 %rd<123>;
-
-
- mov.u64 %SPL, __local_depot37;
- cvta.local.u64 %SP, %SPL;
- ld.param.u64 %rd17, [reduce_min_d_param_0];
- ld.param.u64 %rd16, [reduce_min_d_param_1];
- ld.param.u32 %r5, [reduce_min_d_param_2];
- add.u64 %rd18, %SP, 0;
- add.u64 %rd1, %SPL, 0;
- st.local.u64 [%rd1], %rd17;
- cvta.to.global.u64 %rd19, %rd17;
- ld.global.u64 %rd20, [%rd19+16];
- setp.eq.s64 %p1, %rd20, 0;
- @%p1 bra BB37_2;
-
- mov.u64 %rd21, _ZN14MatrixAccessorIdE10len_sparseEv;
- st.local.u64 [%rd1+8], %rd21;
- mov.u64 %rd23, 0;
- st.local.u64 [%rd1+16], %rd23;
- mov.u64 %rd24, _ZN14MatrixAccessorIdE10pos_sparseEj;
- st.local.u64 [%rd1+40], %rd24;
- st.local.u64 [%rd1+48], %rd23;
- mov.u64 %rd26, _ZN14MatrixAccessorIdE11cols_sparseEj;
- st.local.u64 [%rd1+56], %rd26;
- st.local.u64 [%rd1+64], %rd23;
- mov.u64 %rd28, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
- st.local.u64 [%rd1+88], %rd28;
- st.local.u64 [%rd1+96], %rd23;
- mov.u64 %rd30, _ZN14MatrixAccessorIdE11vals_sparseEj;
- st.local.u64 [%rd1+104], %rd30;
- st.local.u64 [%rd1+112], %rd23;
- mov.u64 %rd32, _ZN14MatrixAccessorIdE14row_len_sparseEj;
- st.local.u64 [%rd1+24], %rd32;
- st.local.u64 [%rd1+32], %rd23;
- mov.u64 %rd34, _ZN14MatrixAccessorIdE12val_sparse_iEj;
- st.local.u64 [%rd1+72], %rd34;
- st.local.u64 [%rd1+80], %rd23;
- mov.u64 %rd36, _ZN14MatrixAccessorIdE10set_sparseEjjd;
- st.local.u64 [%rd1+120], %rd36;
- st.local.u64 [%rd1+128], %rd23;
- bra.uni BB37_3;
-
-BB37_2:
- mov.u64 %rd38, _ZN14MatrixAccessorIdE9len_denseEv;
- st.local.u64 [%rd1+8], %rd38;
- mov.u64 %rd40, 0;
- st.local.u64 [%rd1+16], %rd40;
- mov.u64 %rd41, _ZN14MatrixAccessorIdE9pos_denseEj;
- st.local.u64 [%rd1+40], %rd41;
- st.local.u64 [%rd1+48], %rd40;
- mov.u64 %rd43, _ZN14MatrixAccessorIdE10cols_denseEj;
- st.local.u64 [%rd1+56], %rd43;
- st.local.u64 [%rd1+64], %rd40;
- mov.u64 %rd45, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
- st.local.u64 [%rd1+88], %rd45;
- st.local.u64 [%rd1+96], %rd40;
- mov.u64 %rd47, _ZN14MatrixAccessorIdE10vals_denseEj;
- st.local.u64 [%rd1+104], %rd47;
- st.local.u64 [%rd1+112], %rd40;
- mov.u64 %rd49, _ZN14MatrixAccessorIdE13row_len_denseEj;
- st.local.u64 [%rd1+24], %rd49;
- st.local.u64 [%rd1+32], %rd40;
- mov.u64 %rd51, _ZN14MatrixAccessorIdE11val_dense_iEj;
- st.local.u64 [%rd1+72], %rd51;
- st.local.u64 [%rd1+80], %rd40;
-
-BB37_3:
- add.u64 %rd53, %SP, 136;
- add.u64 %rd2, %SPL, 136;
- st.local.u64 [%rd2], %rd16;
- cvta.to.global.u64 %rd54, %rd16;
- ld.global.u64 %rd55, [%rd54+16];
- setp.eq.s64 %p2, %rd55, 0;
- @%p2 bra BB37_5;
-
- mov.u64 %rd56, _ZN14MatrixAccessorIdE10len_sparseEv;
- st.local.u64 [%rd2+8], %rd56;
- mov.u64 %rd58, 0;
- st.local.u64 [%rd2+16], %rd58;
- mov.u64 %rd59, _ZN14MatrixAccessorIdE10pos_sparseEj;
- st.local.u64 [%rd2+40], %rd59;
- st.local.u64 [%rd2+48], %rd58;
- mov.u64 %rd61, _ZN14MatrixAccessorIdE11cols_sparseEj;
- st.local.u64 [%rd2+56], %rd61;
- st.local.u64 [%rd2+64], %rd58;
- mov.u64 %rd63, _ZN14MatrixAccessorIdE13val_sparse_rcEjj;
- st.local.u64 [%rd2+88], %rd63;
- st.local.u64 [%rd2+96], %rd58;
- mov.u64 %rd65, _ZN14MatrixAccessorIdE11vals_sparseEj;
- st.local.u64 [%rd2+104], %rd65;
- st.local.u64 [%rd2+112], %rd58;
- mov.u64 %rd67, _ZN14MatrixAccessorIdE14row_len_sparseEj;
- st.local.u64 [%rd2+24], %rd67;
- st.local.u64 [%rd2+32], %rd58;
- mov.u64 %rd69, _ZN14MatrixAccessorIdE12val_sparse_iEj;
- st.local.u64 [%rd2+72], %rd69;
- st.local.u64 [%rd2+80], %rd58;
- mov.u64 %rd71, _ZN14MatrixAccessorIdE10set_sparseEjjd;
- st.local.u64 [%rd2+120], %rd71;
- st.local.u64 [%rd2+128], %rd58;
- bra.uni BB37_6;
-
-BB37_5:
- mov.u64 %rd73, _ZN14MatrixAccessorIdE9len_denseEv;
- st.local.u64 [%rd2+8], %rd73;
- mov.u64 %rd75, 0;
- st.local.u64 [%rd2+16], %rd75;
- mov.u64 %rd76, _ZN14MatrixAccessorIdE9pos_denseEj;
- st.local.u64 [%rd2+40], %rd76;
- st.local.u64 [%rd2+48], %rd75;
- mov.u64 %rd78, _ZN14MatrixAccessorIdE10cols_denseEj;
- st.local.u64 [%rd2+56], %rd78;
- st.local.u64 [%rd2+64], %rd75;
- mov.u64 %rd80, _ZN14MatrixAccessorIdE12val_dense_rcEjj;
- st.local.u64 [%rd2+88], %rd80;
- st.local.u64 [%rd2+96], %rd75;
- mov.u64 %rd82, _ZN14MatrixAccessorIdE10vals_denseEj;
- st.local.u64 [%rd2+104], %rd82;
- st.local.u64 [%rd2+112], %rd75;
- mov.u64 %rd84, _ZN14MatrixAccessorIdE13row_len_denseEj;
- st.local.u64 [%rd2+24], %rd84;
- st.local.u64 [%rd2+32], %rd75;
- mov.u64 %rd86, _ZN14MatrixAccessorIdE11val_dense_iEj;
- st.local.u64 [%rd2+72], %rd86;
- st.local.u64 [%rd2+80], %rd75;
-
-BB37_6:
- mov.u32 %r6, %tid.x;
- mov.u32 %r7, %ctaid.x;
- shl.b32 %r8, %r7, 1;
- mov.u32 %r9, %ntid.x;
- mad.lo.s32 %r43, %r8, %r9, %r6;
- mov.f64 %fd44, 0d7FF0000000000000;
- setp.ge.u32 %p3, %r43, %r5;
- @%p3 bra BB37_15;
+ .reg .b64 %rd<34>;
+
+ ld.param.u64 %rd10, [reduce_min_d_param_0];
+ ld.param.u64 %rd11, [reduce_min_d_param_1];
+ ld.param.u32 %r10, [reduce_min_d_param_2];
+ mov.u32 %r11, %tid.x;
+ mov.u32 %r12, %ctaid.x;
+ shl.b32 %r13, %r12, 1;
+ mov.u32 %r14, %ntid.x;
+ mad.lo.s32 %r43, %r13, %r14, %r11;
mov.f64 %fd44, 0d7FF0000000000000;
+ setp.ge.u32 %p1, %r43, %r10;
+ @%p1 bra BB7_9;
-BB37_8:
- ld.local.u64 %rd3, [%rd1+112];
- ld.local.u64 %rd120, [%rd1+104];
- and.b64 %rd90, %rd120, 1;
- setp.eq.b64 %p4, %rd90, 1;
- @!%p4 bra BB37_10;
- bra.uni BB37_9;
-
-BB37_9:
- add.s64 %rd93, %rd1, %rd3;
- ld.local.u64 %rd94, [%rd93];
- add.s64 %rd95, %rd120, %rd94;
- ld.u64 %rd120, [%rd95+-1];
-
-BB37_10:
- add.s64 %rd97, %rd18, %rd3;
- // Callseq Start 17
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd97;
- .param .b32 param1;
- st.param.b32 [param1+0], %r43;
- .param .b64 retval0;
- prototype_17 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
- call (retval0),
- %rd120,
- (
- param0,
- param1
- )
- , prototype_17;
- ld.param.b64 %rd99, [retval0+0];
-
- //{
- }// Callseq End 17
- ld.f64 %fd31, [%rd99];
+ cvta.to.global.u64 %rd12, %rd10;
+ ld.global.u64 %rd1, [%rd12+16];
+ ld.global.u64 %rd13, [%rd12+32];
+ cvta.to.global.u64 %rd2, %rd13;
+ mov.f64 %fd44, 0d7FF0000000000000;
+ mov.u64 %rd30, %rd1;
+
+BB7_2:
+ setp.eq.s64 %p2, %rd1, 0;
+ mov.u32 %r44, %r43;
+ @%p2 bra BB7_4;
+
+ cvta.to.global.u64 %rd14, %rd1;
+ mul.wide.u32 %rd15, %r43, 4;
+ add.s64 %rd16, %rd14, %rd15;
+ ld.global.u32 %r44, [%rd16];
+ mov.u64 %rd30, %rd1;
+
+BB7_4:
+ mul.wide.u32 %rd17, %r44, 8;
+ add.s64 %rd18, %rd2, %rd17;
+ ld.global.f64 %fd31, [%rd18];
min.f64 %fd44, %fd44, %fd31;
- add.s32 %r16, %r43, %r9;
- setp.ge.u32 %p5, %r16, %r5;
- @%p5 bra BB37_14;
-
- ld.local.u64 %rd121, [%rd1+104];
- and.b64 %rd102, %rd121, 1;
- setp.eq.b64 %p6, %rd102, 1;
- ld.local.u64 %rd8, [%rd1+112];
- @!%p6 bra BB37_13;
- bra.uni BB37_12;
-
-BB37_12:
- add.s64 %rd105, %rd1, %rd8;
- ld.local.u64 %rd106, [%rd105];
- add.s64 %rd107, %rd121, %rd106;
- ld.u64 %rd121, [%rd107+-1];
-
-BB37_13:
- add.s64 %rd109, %rd18, %rd8;
- // Callseq Start 18
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd109;
- .param .b32 param1;
- st.param.b32 [param1+0], %r16;
- .param .b64 retval0;
- prototype_18 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _) ;
- call (retval0),
- %rd121,
- (
- param0,
- param1
- )
- , prototype_18;
- ld.param.b64 %rd111, [retval0+0];
-
- //{
- }// Callseq End 18
- ld.f64 %fd32, [%rd111];
+ add.s32 %r45, %r43, %r14;
+ setp.ge.u32 %p3, %r45, %r10;
+ @%p3 bra BB7_8;
+
+ setp.eq.s64 %p4, %rd30, 0;
+ mov.u64 %rd30, 0;
+ @%p4 bra BB7_7;
+
+ cvta.to.global.u64 %rd20, %rd1;
+ add.s32 %r19, %r43, %r14;
+ mul.wide.u32 %rd21, %r19, 4;
+ add.s64 %rd22, %rd20, %rd21;
+ ld.global.u32 %r45, [%rd22];
+ mov.u64 %rd30, %rd1;
+
+BB7_7:
+ mul.wide.u32 %rd23, %r45, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.f64 %fd32, [%rd24];
min.f64 %fd44, %fd44, %fd32;
-BB37_14:
- shl.b32 %r20, %r9, 1;
- mov.u32 %r21, %nctaid.x;
- mad.lo.s32 %r43, %r20, %r21, %r43;
- setp.lt.u32 %p7, %r43, %r5;
- @%p7 bra BB37_8;
-
-BB37_15:
- shl.b32 %r23, %r6, 3;
- mov.u32 %r24, memory;
- add.s32 %r4, %r24, %r23;
- st.shared.f64 [%r4], %fd44;
+BB7_8:
+ shl.b32 %r21, %r14, 1;
+ mov.u32 %r22, %nctaid.x;
+ mad.lo.s32 %r43, %r21, %r22, %r43;
+ setp.lt.u32 %p5, %r43, %r10;
+ @%p5 bra BB7_2;
+
+BB7_9:
+ shl.b32 %r24, %r11, 3;
+ mov.u32 %r25, memory;
+ add.s32 %r9, %r25, %r24;
+ st.shared.f64 [%r9], %fd44;
bar.sync 0;
- setp.lt.u32 %p8, %r9, 1024;
- @%p8 bra BB37_19;
+ setp.lt.u32 %p6, %r14, 1024;
+ @%p6 bra BB7_13;
- setp.gt.u32 %p9, %r6, 511;
- @%p9 bra BB37_18;
+ setp.gt.u32 %p7, %r11, 511;
+ @%p7 bra BB7_12;
- ld.shared.f64 %fd33, [%r4+4096];
+ ld.shared.f64 %fd33, [%r9+4096];
min.f64 %fd44, %fd44, %fd33;
- st.shared.f64 [%r4], %fd44;
+ st.shared.f64 [%r9], %fd44;
-BB37_18:
+BB7_12:
bar.sync 0;
-BB37_19:
- setp.lt.u32 %p10, %r9, 512;
- @%p10 bra BB37_23;
+BB7_13:
+ setp.lt.u32 %p8, %r14, 512;
+ @%p8 bra BB7_17;
- setp.gt.u32 %p11, %r6, 255;
- @%p11 bra BB37_22;
+ setp.gt.u32 %p9, %r11, 255;
+ @%p9 bra BB7_16;
- ld.shared.f64 %fd34, [%r4+2048];
+ ld.shared.f64 %fd34, [%r9+2048];
min.f64 %fd44, %fd44, %fd34;
- st.shared.f64 [%r4], %fd44;
+ st.shared.f64 [%r9], %fd44;
-BB37_22:
+BB7_16:
bar.sync 0;
-BB37_23:
- setp.lt.u32 %p12, %r9, 256;
- @%p12 bra BB37_27;
+BB7_17:
+ setp.lt.u32 %p10, %r14, 256;
+ @%p10 bra BB7_21;
- setp.gt.u32 %p13, %r6, 127;
- @%p13 bra BB37_26;
+ setp.gt.u32 %p11, %r11, 127;
+ @%p11 bra BB7_20;
- ld.shared.f64 %fd35, [%r4+1024];
+ ld.shared.f64 %fd35, [%r9+1024];
min.f64 %fd44, %fd44, %fd35;
- st.shared.f64 [%r4], %fd44;
+ st.shared.f64 [%r9], %fd44;
-BB37_26:
+BB7_20:
bar.sync 0;
-BB37_27:
- setp.lt.u32 %p14, %r9, 128;
- @%p14 bra BB37_31;
+BB7_21:
+ setp.lt.u32 %p12, %r14, 128;
+ @%p12 bra BB7_25;
- setp.gt.u32 %p15, %r6, 63;
- @%p15 bra BB37_30;
+ setp.gt.u32 %p13, %r11, 63;
+ @%p13 bra BB7_24;
- ld.shared.f64 %fd36, [%r4+512];
+ ld.shared.f64 %fd36, [%r9+512];
min.f64 %fd44, %fd44, %fd36;
- st.shared.f64 [%r4], %fd44;
+ st.shared.f64 [%r9], %fd44;
-BB37_30:
+BB7_24:
bar.sync 0;
-BB37_31:
- setp.gt.u32 %p16, %r6, 31;
- @%p16 bra BB37_44;
+BB7_25:
+ setp.gt.u32 %p14, %r11, 31;
+ @%p14 bra BB7_38;
- setp.lt.u32 %p17, %r9, 64;
- @%p17 bra BB37_34;
+ setp.lt.u32 %p15, %r14, 64;
+ @%p15 bra BB7_28;
- ld.volatile.shared.f64 %fd37, [%r4+256];
+ ld.volatile.shared.f64 %fd37, [%r9+256];
min.f64 %fd44, %fd44, %fd37;
- st.volatile.shared.f64 [%r4], %fd44;
+ st.volatile.shared.f64 [%r9], %fd44;
-BB37_34:
- setp.lt.u32 %p18, %r9, 32;
- @%p18 bra BB37_36;
+BB7_28:
+ setp.lt.u32 %p16, %r14, 32;
+ @%p16 bra BB7_30;
- ld.volatile.shared.f64 %fd38, [%r4+128];
+ ld.volatile.shared.f64 %fd38, [%r9+128];
min.f64 %fd44, %fd44, %fd38;
- st.volatile.shared.f64 [%r4], %fd44;
+ st.volatile.shared.f64 [%r9], %fd44;
-BB37_36:
- setp.lt.u32 %p19, %r9, 16;
- @%p19 bra BB37_38;
+BB7_30:
+ setp.lt.u32 %p17, %r14, 16;
+ @%p17 bra BB7_32;
- ld.volatile.shared.f64 %fd39, [%r4+64];
+ ld.volatile.shared.f64 %fd39, [%r9+64];
min.f64 %fd44, %fd44, %fd39;
- st.volatile.shared.f64 [%r4], %fd44;
+ st.volatile.shared.f64 [%r9], %fd44;
-BB37_38:
- setp.lt.u32 %p20, %r9, 8;
- @%p20 bra BB37_40;
+BB7_32:
+ setp.lt.u32 %p18, %r14, 8;
+ @%p18 bra BB7_34;
- ld.volatile.shared.f64 %fd40, [%r4+32];
+ ld.volatile.shared.f64 %fd40, [%r9+32];
min.f64 %fd44, %fd44, %fd40;
- st.volatile.shared.f64 [%r4], %fd44;
+ st.volatile.shared.f64 [%r9], %fd44;
-BB37_40:
- setp.lt.u32 %p21, %r9, 4;
- @%p21 bra BB37_42;
+BB7_34:
+ setp.lt.u32 %p19, %r14, 4;
+ @%p19 bra BB7_36;
- ld.volatile.shared.f64 %fd41, [%r4+16];
+ ld.volatile.shared.f64 %fd41, [%r9+16];
min.f64 %fd44, %fd44, %fd41;
- st.volatile.shared.f64 [%r4], %fd44;
+ st.volatile.shared.f64 [%r9], %fd44;
-BB37_42:
- setp.lt.u32 %p22, %r9, 2;
- @%p22 bra BB37_44;
+BB7_36:
+ setp.lt.u32 %p20, %r14, 2;
+ @%p20 bra BB7_38;
- ld.volatile.shared.f64 %fd42, [%r4+8];
+ ld.volatile.shared.f64 %fd42, [%r9+8];
min.f64 %fd43, %fd44, %fd42;
- st.volatile.shared.f64 [%r4], %fd43;
+ st.volatile.shared.f64 [%r9], %fd43;
-BB37_44:
- setp.ne.s32 %p23, %r6, 0;
- @%p23 bra BB37_48;
+BB7_38:
+ setp.ne.s32 %p21, %r11, 0;
+ @%p21 bra BB7_42;
ld.shared.f64 %fd28, [memory];
- ld.local.u64 %rd114, [%rd2+96];
- add.s64 %rd11, %rd2, %rd114;
- add.s64 %rd12, %rd53, %rd114;
- ld.local.u64 %rd122, [%rd2+88];
- and.b64 %rd115, %rd122, 1;
- setp.eq.b64 %p24, %rd115, 1;
- @!%p24 bra BB37_47;
- bra.uni BB37_46;
-
-BB37_46:
- ld.local.u64 %rd116, [%rd11];
- add.s64 %rd117, %rd122, %rd116;
- ld.u64 %rd122, [%rd117+-1];
-
-BB37_47:
- mov.u32 %r42, 0;
- // Callseq Start 19
- {
- .reg .b32 temp_param_reg;
- // <end>}
- .param .b64 param0;
- st.param.b64 [param0+0], %rd12;
- .param .b32 param1;
- st.param.b32 [param1+0], %r42;
- .param .b32 param2;
- st.param.b32 [param2+0], %r7;
- .param .b64 retval0;
- prototype_19 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b32 _, .param .b32 _) ;
- call (retval0),
- %rd122,
- (
- param0,
- param1,
- param2
- )
- , prototype_19;
- ld.param.b64 %rd119, [retval0+0];
-
- //{
- }// Callseq End 19
- st.f64 [%rd119], %fd28;
-
-BB37_48:
+ cvta.to.global.u64 %rd25, %rd11;
+ ld.global.u64 %rd26, [%rd25+16];
+ ld.global.u64 %rd27, [%rd25+32];
+ cvta.to.global.u64 %rd33, %rd27;
+ setp.ne.s64 %p22, %rd26, 0;
+ @%p22 bra BB7_41;
+
+ mul.wide.u32 %rd28, %r12, 8;
+ add.s64 %rd33, %rd33, %rd28;
+
+BB7_41:
+ st.global.f64 [%rd33], %fd28;
+
+BB7_42:
ret;
}