http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index 5f72887..1865e18 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -1,12 +1,12 @@ // // Generated by NVIDIA NVVM Compiler // -// Compiler Build ID: CL-21124049 -// Cuda compilation tools, release 8.0, V8.0.44 +// Compiler Build ID: CL-22781540 +// Cuda compilation tools, release 9.0, V9.0.176 // Based on LLVM 3.4svn // -.version 5.0 +.version 6.0 .target sm_30 .address_size 64 @@ -128,15 +128,15 @@ BB1_2: ) { .reg .pred %p<13>; - .reg .b32 %r<74>; + .reg .b32 %r<72>; .reg .f64 %fd<2>; - .reg .b64 %rd<18>; + .reg .b64 %rd<17>; - ld.param.u64 %rd4, [sparse_dense_im2col_d_param_0]; - ld.param.u64 %rd5, [sparse_dense_im2col_d_param_1]; - ld.param.u64 %rd6, [sparse_dense_im2col_d_param_2]; - ld.param.u64 %rd7, [sparse_dense_im2col_d_param_3]; + ld.param.u64 %rd3, [sparse_dense_im2col_d_param_0]; + ld.param.u64 %rd4, [sparse_dense_im2col_d_param_1]; + ld.param.u64 %rd5, [sparse_dense_im2col_d_param_2]; + ld.param.u64 %rd6, [sparse_dense_im2col_d_param_3]; ld.param.u32 %r35, [sparse_dense_im2col_d_param_4]; ld.param.u32 %r22, [sparse_dense_im2col_d_param_7]; ld.param.u32 %r23, [sparse_dense_im2col_d_param_8]; @@ -158,27 +158,26 @@ BB1_2: setp.ge.s32 %p1, %r1, %r35; @%p1 bra BB2_11; - cvta.to.global.u64 %rd1, %rd6; - cvta.to.global.u64 %rd2, %rd5; - cvta.to.global.u64 %rd8, %rd4; - cvt.s64.s32 %rd3, %r1; - mul.wide.s32 %rd9, %r1, 8; - add.s64 %rd10, %rd8, %rd9; - ld.global.f64 %fd1, [%rd10]; + cvta.to.global.u64 %rd1, %rd5; + cvta.to.global.u64 %rd2, %rd4; + cvta.to.global.u64 %rd7, %rd3; + mul.wide.s32 %rd8, %r1, 8; + add.s64 %rd9, %rd7, %rd8; + ld.global.f64 %fd1, [%rd9]; mov.u32 %r67, 0; BB2_2: mov.u32 %r2, %r67; add.s32 %r67, %r2, 1; - mul.wide.s32 %rd11, %r67, 4; - add.s64 %rd12, %rd2, %rd11; - ld.global.u32 %r40, [%rd12]; + mul.wide.s32 %rd10, %r67, 4; + add.s64 %rd11, %rd2, %rd10; + ld.global.u32 %r40, [%rd11]; setp.le.s32 %p2, %r40, %r1; @%p2 bra BB2_2; - shl.b64 %rd13, %rd3, 2; - add.s64 %rd14, %rd1, %rd13; - ld.global.u32 %r41, [%rd14]; + mul.wide.s32 %rd12, %r1, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.u32 %r41, [%rd13]; div.s32 %r4, %r41, %r22; rem.s32 %r42, %r41, %r22; div.s32 %r43, %r42, %r23; @@ -189,71 +188,69 @@ BB2_2: sub.s32 %r47, %r46, %r45; add.s32 %r48, %r47, %r5; mov.u32 %r49, 0; - max.s32 %r70, %r49, %r48; + max.s32 %r68, %r49, %r48; add.s32 %r50, %r24, -1; min.s32 %r7, %r50, %r5; add.s32 %r8, %r44, %r34; mul.lo.s32 %r51, %r32, %r27; sub.s32 %r52, %r46, %r51; add.s32 %r53, %r52, %r8; - max.s32 %r73, %r49, %r53; + max.s32 %r69, %r49, %r53; add.s32 %r54, %r25, -1; min.s32 %r10, %r54, %r8; BB2_4: - mov.u32 %r69, %r70; - sub.s32 %r55, %r5, %r69; + mov.u32 %r70, %r68; + sub.s32 %r55, %r5, %r70; rem.s32 %r56, %r55, %r31; setp.ne.s32 %p3, %r56, 0; - setp.le.s32 %p4, %r69, %r7; - and.pred %p5, %p3, %p4; - add.s32 %r70, %r69, 1; + setp.le.s32 %p4, %r70, %r7; + and.pred %p5, %p4, %p3; + add.s32 %r68, %r70, 1; @%p5 bra BB2_4; BB2_5: - mov.u32 %r13, %r73; + mov.u32 %r13, %r69; sub.s32 %r57, %r8, %r13; rem.s32 %r58, %r57, %r32; setp.ne.s32 %p6, %r58, 0; setp.le.s32 %p7, %r13, %r10; - and.pred %p8, %p6, %p7; - add.s32 %r73, %r13, 1; + and.pred %p8, %p7, %p6; + add.s32 %r69, %r13, 1; @%p8 bra BB2_5; - setp.gt.s32 %p9, %r69, %r7; + setp.gt.s32 %p9, %r70, %r7; @%p9 bra BB2_11; mul.lo.s32 %r15, %r2, %r28; mul.lo.s32 %r16, %r4, %r29; - cvta.to.global.u64 %rd15, %rd7; + cvta.to.global.u64 %rd14, %rd6; BB2_8: - sub.s32 %r59, %r5, %r69; + sub.s32 %r59, %r5, %r70; div.s32 %r60, %r59, %r31; mad.lo.s32 %r18, %r60, %r27, %r15; setp.gt.s32 %p10, %r13, %r10; - mov.u32 %r72, %r13; + mov.u32 %r71, %r13; @%p10 bra BB2_10; BB2_9: - mov.u32 %r19, %r72; - sub.s32 %r61, %r8, %r19; + sub.s32 %r61, %r8, %r71; div.s32 %r62, %r61, %r32; - mad.lo.s32 %r63, %r69, %r25, %r16; - add.s32 %r64, %r63, %r19; + mad.lo.s32 %r63, %r70, %r25, %r16; + add.s32 %r64, %r63, %r71; mad.lo.s32 %r65, %r64, %r30, %r18; add.s32 %r66, %r65, %r62; - mul.wide.s32 %rd16, %r66, 8; - add.s64 %rd17, %rd15, %rd16; - st.global.f64 [%rd17], %fd1; - add.s32 %r20, %r19, %r32; - setp.le.s32 %p11, %r20, %r10; - mov.u32 %r72, %r20; + mul.wide.s32 %rd15, %r66, 8; + add.s64 %rd16, %rd14, %rd15; + st.global.f64 [%rd16], %fd1; + add.s32 %r71, %r71, %r32; + setp.le.s32 %p11, %r71, %r10; @%p11 bra BB2_9; BB2_10: - add.s32 %r69, %r69, %r31; - setp.le.s32 %p12, %r69, %r7; + add.s32 %r70, %r70, %r31; + setp.le.s32 %p12, %r70, %r7; @%p12 bra BB2_8; BB2_11: @@ -286,14 +283,14 @@ BB2_11: { .reg .pred %p<13>; .reg .f32 %f<2>; - .reg .b32 %r<74>; - .reg .b64 %rd<18>; + .reg .b32 %r<72>; + .reg .b64 %rd<17>; - ld.param.u64 %rd4, [sparse_dense_im2col_f_param_0]; - ld.param.u64 %rd5, [sparse_dense_im2col_f_param_1]; - ld.param.u64 %rd6, [sparse_dense_im2col_f_param_2]; - ld.param.u64 %rd7, [sparse_dense_im2col_f_param_3]; + ld.param.u64 %rd3, [sparse_dense_im2col_f_param_0]; + ld.param.u64 %rd4, [sparse_dense_im2col_f_param_1]; + ld.param.u64 %rd5, [sparse_dense_im2col_f_param_2]; + ld.param.u64 %rd6, [sparse_dense_im2col_f_param_3]; ld.param.u32 %r35, [sparse_dense_im2col_f_param_4]; ld.param.u32 %r22, [sparse_dense_im2col_f_param_7]; ld.param.u32 %r23, [sparse_dense_im2col_f_param_8]; @@ -315,27 +312,25 @@ BB2_11: setp.ge.s32 %p1, %r1, %r35; @%p1 bra BB3_11; - cvta.to.global.u64 %rd1, %rd6; - cvta.to.global.u64 %rd2, %rd5; - cvta.to.global.u64 %rd8, %rd4; - cvt.s64.s32 %rd3, %r1; - mul.wide.s32 %rd9, %r1, 4; - add.s64 %rd10, %rd8, %rd9; - ld.global.f32 %f1, [%rd10]; + cvta.to.global.u64 %rd1, %rd5; + cvta.to.global.u64 %rd2, %rd4; + cvta.to.global.u64 %rd7, %rd3; + mul.wide.s32 %rd8, %r1, 4; + add.s64 %rd9, %rd7, %rd8; + ld.global.f32 %f1, [%rd9]; mov.u32 %r67, 0; BB3_2: mov.u32 %r2, %r67; add.s32 %r67, %r2, 1; - mul.wide.s32 %rd11, %r67, 4; - add.s64 %rd12, %rd2, %rd11; - ld.global.u32 %r40, [%rd12]; + mul.wide.s32 %rd10, %r67, 4; + add.s64 %rd11, %rd2, %rd10; + ld.global.u32 %r40, [%rd11]; setp.le.s32 %p2, %r40, %r1; @%p2 bra BB3_2; - shl.b64 %rd13, %rd3, 2; - add.s64 %rd14, %rd1, %rd13; - ld.global.u32 %r41, [%rd14]; + add.s64 %rd13, %rd1, %rd8; + ld.global.u32 %r41, [%rd13]; div.s32 %r4, %r41, %r22; rem.s32 %r42, %r41, %r22; div.s32 %r43, %r42, %r23; @@ -346,71 +341,69 @@ BB3_2: sub.s32 %r47, %r46, %r45; add.s32 %r48, %r47, %r5; mov.u32 %r49, 0; - max.s32 %r70, %r49, %r48; + max.s32 %r68, %r49, %r48; add.s32 %r50, %r24, -1; min.s32 %r7, %r50, %r5; add.s32 %r8, %r44, %r34; mul.lo.s32 %r51, %r32, %r27; sub.s32 %r52, %r46, %r51; add.s32 %r53, %r52, %r8; - max.s32 %r73, %r49, %r53; + max.s32 %r69, %r49, %r53; add.s32 %r54, %r25, -1; min.s32 %r10, %r54, %r8; BB3_4: - mov.u32 %r69, %r70; - sub.s32 %r55, %r5, %r69; + mov.u32 %r70, %r68; + sub.s32 %r55, %r5, %r70; rem.s32 %r56, %r55, %r31; setp.ne.s32 %p3, %r56, 0; - setp.le.s32 %p4, %r69, %r7; - and.pred %p5, %p3, %p4; - add.s32 %r70, %r69, 1; + setp.le.s32 %p4, %r70, %r7; + and.pred %p5, %p4, %p3; + add.s32 %r68, %r70, 1; @%p5 bra BB3_4; BB3_5: - mov.u32 %r13, %r73; + mov.u32 %r13, %r69; sub.s32 %r57, %r8, %r13; rem.s32 %r58, %r57, %r32; setp.ne.s32 %p6, %r58, 0; setp.le.s32 %p7, %r13, %r10; - and.pred %p8, %p6, %p7; - add.s32 %r73, %r13, 1; + and.pred %p8, %p7, %p6; + add.s32 %r69, %r13, 1; @%p8 bra BB3_5; - setp.gt.s32 %p9, %r69, %r7; + setp.gt.s32 %p9, %r70, %r7; @%p9 bra BB3_11; mul.lo.s32 %r15, %r2, %r28; mul.lo.s32 %r16, %r4, %r29; - cvta.to.global.u64 %rd15, %rd7; + cvta.to.global.u64 %rd14, %rd6; BB3_8: - sub.s32 %r59, %r5, %r69; + sub.s32 %r59, %r5, %r70; div.s32 %r60, %r59, %r31; mad.lo.s32 %r18, %r60, %r27, %r15; setp.gt.s32 %p10, %r13, %r10; - mov.u32 %r72, %r13; + mov.u32 %r71, %r13; @%p10 bra BB3_10; BB3_9: - mov.u32 %r19, %r72; - sub.s32 %r61, %r8, %r19; + sub.s32 %r61, %r8, %r71; div.s32 %r62, %r61, %r32; - mad.lo.s32 %r63, %r69, %r25, %r16; - add.s32 %r64, %r63, %r19; + mad.lo.s32 %r63, %r70, %r25, %r16; + add.s32 %r64, %r63, %r71; mad.lo.s32 %r65, %r64, %r30, %r18; add.s32 %r66, %r65, %r62; - mul.wide.s32 %rd16, %r66, 4; - add.s64 %rd17, %rd15, %rd16; - st.global.f32 [%rd17], %f1; - add.s32 %r20, %r19, %r32; - setp.le.s32 %p11, %r20, %r10; - mov.u32 %r72, %r20; + mul.wide.s32 %rd15, %r66, 4; + add.s64 %rd16, %rd14, %rd15; + st.global.f32 [%rd16], %f1; + add.s32 %r71, %r71, %r32; + setp.le.s32 %p11, %r71, %r10; @%p11 bra BB3_9; BB3_10: - add.s32 %r69, %r69, %r31; - setp.le.s32 %p12, %r69, %r7; + add.s32 %r70, %r70, %r31; + setp.le.s32 %p12, %r70, %r7; @%p12 bra BB3_8; BB3_11: @@ -439,7 +432,7 @@ BB3_11: ) { .reg .pred %p<12>; - .reg .b32 %r<71>; + .reg .b32 %r<69>; .reg .f64 %fd<2>; .reg .b64 %rd<9>; @@ -484,38 +477,38 @@ BB3_11: sub.s32 %r45, %r44, %r43; add.s32 %r46, %r45, %r4; mov.u32 %r47, 0; - max.s32 %r67, %r47, %r46; + max.s32 %r65, %r47, %r46; add.s32 %r48, %r24, -1; min.s32 %r6, %r48, %r4; add.s32 %r7, %r42, %r34; mul.lo.s32 %r49, %r32, %r27; sub.s32 %r50, %r44, %r49; add.s32 %r51, %r50, %r7; - max.s32 %r70, %r47, %r51; + max.s32 %r66, %r47, %r51; add.s32 %r52, %r25, -1; min.s32 %r9, %r52, %r7; BB4_2: - mov.u32 %r66, %r67; - sub.s32 %r53, %r4, %r66; + mov.u32 %r67, %r65; + sub.s32 %r53, %r4, %r67; rem.s32 %r54, %r53, %r31; setp.ne.s32 %p2, %r54, 0; - setp.le.s32 %p3, %r66, %r6; - and.pred %p4, %p2, %p3; - add.s32 %r67, %r66, 1; + setp.le.s32 %p3, %r67, %r6; + and.pred %p4, %p3, %p2; + add.s32 %r65, %r67, 1; @%p4 bra BB4_2; BB4_3: - mov.u32 %r12, %r70; + mov.u32 %r12, %r66; sub.s32 %r55, %r7, %r12; rem.s32 %r56, %r55, %r32; setp.ne.s32 %p5, %r56, 0; setp.le.s32 %p6, %r12, %r9; - and.pred %p7, %p5, %p6; - add.s32 %r70, %r12, 1; + and.pred %p7, %p6, %p5; + add.s32 %r66, %r12, 1; @%p7 bra BB4_3; - setp.gt.s32 %p8, %r66, %r6; + setp.gt.s32 %p8, %r67, %r6; @%p8 bra BB4_9; mul.lo.s32 %r14, %r2, %r28; @@ -523,32 +516,30 @@ BB4_3: cvta.to.global.u64 %rd6, %rd2; BB4_6: - sub.s32 %r57, %r4, %r66; + sub.s32 %r57, %r4, %r67; div.s32 %r58, %r57, %r31; mad.lo.s32 %r17, %r58, %r27, %r14; setp.gt.s32 %p9, %r12, %r9; - mov.u32 %r69, %r12; + mov.u32 %r68, %r12; @%p9 bra BB4_8; BB4_7: - mov.u32 %r18, %r69; - sub.s32 %r59, %r7, %r18; + sub.s32 %r59, %r7, %r68; div.s32 %r60, %r59, %r32; - mad.lo.s32 %r61, %r66, %r25, %r15; - add.s32 %r62, %r61, %r18; + mad.lo.s32 %r61, %r67, %r25, %r15; + add.s32 %r62, %r61, %r68; mad.lo.s32 %r63, %r62, %r30, %r17; add.s32 %r64, %r63, %r60; mul.wide.s32 %rd7, %r64, 8; add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd1; - add.s32 %r19, %r18, %r32; - setp.le.s32 %p10, %r19, %r9; - mov.u32 %r69, %r19; + add.s32 %r68, %r68, %r32; + setp.le.s32 %p10, %r68, %r9; @%p10 bra BB4_7; BB4_8: - add.s32 %r66, %r66, %r31; - setp.le.s32 %p11, %r66, %r6; + add.s32 %r67, %r67, %r31; + setp.le.s32 %p11, %r67, %r6; @%p11 bra BB4_6; BB4_9: @@ -578,7 +569,7 @@ BB4_9: { .reg .pred %p<12>; .reg .f32 %f<2>; - .reg .b32 %r<71>; + .reg .b32 %r<69>; .reg .b64 %rd<9>; @@ -622,38 +613,38 @@ BB4_9: sub.s32 %r45, %r44, %r43; add.s32 %r46, %r45, %r4; mov.u32 %r47, 0; - max.s32 %r67, %r47, %r46; + max.s32 %r65, %r47, %r46; add.s32 %r48, %r24, -1; min.s32 %r6, %r48, %r4; add.s32 %r7, %r42, %r34; mul.lo.s32 %r49, %r32, %r27; sub.s32 %r50, %r44, %r49; add.s32 %r51, %r50, %r7; - max.s32 %r70, %r47, %r51; + max.s32 %r66, %r47, %r51; add.s32 %r52, %r25, -1; min.s32 %r9, %r52, %r7; BB5_2: - mov.u32 %r66, %r67; - sub.s32 %r53, %r4, %r66; + mov.u32 %r67, %r65; + sub.s32 %r53, %r4, %r67; rem.s32 %r54, %r53, %r31; setp.ne.s32 %p2, %r54, 0; - setp.le.s32 %p3, %r66, %r6; - and.pred %p4, %p2, %p3; - add.s32 %r67, %r66, 1; + setp.le.s32 %p3, %r67, %r6; + and.pred %p4, %p3, %p2; + add.s32 %r65, %r67, 1; @%p4 bra BB5_2; BB5_3: - mov.u32 %r12, %r70; + mov.u32 %r12, %r66; sub.s32 %r55, %r7, %r12; rem.s32 %r56, %r55, %r32; setp.ne.s32 %p5, %r56, 0; setp.le.s32 %p6, %r12, %r9; - and.pred %p7, %p5, %p6; - add.s32 %r70, %r12, 1; + and.pred %p7, %p6, %p5; + add.s32 %r66, %r12, 1; @%p7 bra BB5_3; - setp.gt.s32 %p8, %r66, %r6; + setp.gt.s32 %p8, %r67, %r6; @%p8 bra BB5_9; mul.lo.s32 %r14, %r2, %r28; @@ -661,32 +652,30 @@ BB5_3: cvta.to.global.u64 %rd6, %rd2; BB5_6: - sub.s32 %r57, %r4, %r66; + sub.s32 %r57, %r4, %r67; div.s32 %r58, %r57, %r31; mad.lo.s32 %r17, %r58, %r27, %r14; setp.gt.s32 %p9, %r12, %r9; - mov.u32 %r69, %r12; + mov.u32 %r68, %r12; @%p9 bra BB5_8; BB5_7: - mov.u32 %r18, %r69; - sub.s32 %r59, %r7, %r18; + sub.s32 %r59, %r7, %r68; div.s32 %r60, %r59, %r32; - mad.lo.s32 %r61, %r66, %r25, %r15; - add.s32 %r62, %r61, %r18; + mad.lo.s32 %r61, %r67, %r25, %r15; + add.s32 %r62, %r61, %r68; mad.lo.s32 %r63, %r62, %r30, %r17; add.s32 %r64, %r63, %r60; mul.wide.s32 %rd7, %r64, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; - add.s32 %r19, %r18, %r32; - setp.le.s32 %p10, %r19, %r9; - mov.u32 %r69, %r19; + add.s32 %r68, %r68, %r32; + setp.le.s32 %p10, %r68, %r9; @%p10 bra BB5_7; BB5_8: - add.s32 %r66, %r66, %r31; - setp.le.s32 %p11, %r66, %r6; + add.s32 %r67, %r67, %r31; + setp.le.s32 %p11, %r67, %r6; @%p11 bra BB5_6; BB5_9: @@ -805,7 +794,7 @@ BB7_2: ) { .reg .pred %p<7>; - .reg .b32 %r<24>; + .reg .b32 %r<25>; .reg .f64 %fd<2>; .reg .b64 %rd<23>; @@ -831,8 +820,8 @@ BB7_2: mul.wide.s32 %rd14, %r2, 4; add.s64 %rd1, %rd13, %rd14; ld.global.u32 %r23, [%rd1]; - ld.global.u32 %r22, [%rd1+4]; - setp.ge.s32 %p2, %r23, %r22; + ld.global.u32 %r24, [%rd1+4]; + setp.ge.s32 %p2, %r23, %r24; @%p2 bra BB8_6; cvta.to.global.u64 %rd2, %rd12; @@ -857,13 +846,13 @@ BB8_3: mul.wide.s32 %rd19, %r21, 8; add.s64 %rd20, %rd2, %rd19; st.global.f64 [%rd20], %fd1; - ld.global.u32 %r22, [%rd1+4]; + ld.global.u32 %r24, [%rd1+4]; BB8_5: add.s64 %rd22, %rd22, 8; add.s64 %rd21, %rd21, 4; add.s32 %r23, %r23, 1; - setp.lt.s32 %p6, %r23, %r22; + setp.lt.s32 %p6, %r23, %r24; @%p6 bra BB8_3; BB8_6: @@ -885,7 +874,7 @@ BB8_6: { .reg .pred %p<7>; .reg .f32 %f<2>; - .reg .b32 %r<24>; + .reg .b32 %r<25>; .reg .b64 %rd<22>; @@ -910,8 +899,8 @@ BB8_6: mul.wide.s32 %rd14, %r2, 4; add.s64 %rd1, %rd13, %rd14; ld.global.u32 %r23, [%rd1]; - ld.global.u32 %r22, [%rd1+4]; - setp.ge.s32 %p2, %r23, %r22; + ld.global.u32 %r24, [%rd1+4]; + setp.ge.s32 %p2, %r23, %r24; @%p2 bra BB9_6; cvta.to.global.u64 %rd2, %rd12; @@ -935,13 +924,13 @@ BB9_3: mul.wide.s32 %rd18, %r21, 4; add.s64 %rd19, %rd2, %rd18; st.global.f32 [%rd19], %f1; - ld.global.u32 %r22, [%rd1+4]; + ld.global.u32 %r24, [%rd1+4]; BB9_5: add.s64 %rd21, %rd21, 4; add.s64 %rd20, %rd20, 4; add.s32 %r23, %r23, 1; - setp.lt.s32 %p6, %r23, %r22; + setp.lt.s32 %p6, %r23, %r24; @%p6 bra BB9_3; BB9_6: @@ -964,13 +953,13 @@ BB9_6: .reg .pred %p<6>; .reg .b32 %r<22>; .reg .f64 %fd<2>; - .reg .b64 %rd<22>; + .reg .b64 %rd<21>; - ld.param.u64 %rd5, [slice_sparse_dense_nnz_d_param_0]; - ld.param.u64 %rd8, [slice_sparse_dense_nnz_d_param_1]; - ld.param.u64 %rd6, [slice_sparse_dense_nnz_d_param_2]; - ld.param.u64 %rd7, [slice_sparse_dense_nnz_d_param_3]; + ld.param.u64 %rd4, [slice_sparse_dense_nnz_d_param_0]; + ld.param.u64 %rd7, [slice_sparse_dense_nnz_d_param_1]; + ld.param.u64 %rd5, [slice_sparse_dense_nnz_d_param_2]; + ld.param.u64 %rd6, [slice_sparse_dense_nnz_d_param_3]; ld.param.u32 %r5, [slice_sparse_dense_nnz_d_param_4]; ld.param.u32 %r9, [slice_sparse_dense_nnz_d_param_5]; ld.param.u32 %r6, [slice_sparse_dense_nnz_d_param_6]; @@ -980,24 +969,23 @@ BB9_6: mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r13, %r10, %r11, %r12; - cvta.to.global.u64 %rd1, %rd8; - mul.wide.s32 %rd9, %r5, 4; - add.s64 %rd10, %rd1, %rd9; - ld.global.u32 %r14, [%rd10]; + cvta.to.global.u64 %rd1, %rd7; + mul.wide.s32 %rd8, %r5, 4; + add.s64 %rd9, %rd1, %rd8; + ld.global.u32 %r14, [%rd9]; add.s32 %r1, %r13, %r14; - mul.wide.s32 %rd11, %r9, 4; - add.s64 %rd12, %rd1, %rd11; - ld.global.u32 %r15, [%rd12+4]; + mul.wide.s32 %rd10, %r9, 4; + add.s64 %rd11, %rd1, %rd10; + ld.global.u32 %r15, [%rd11+4]; setp.ge.s32 %p1, %r1, %r15; @%p1 bra BB10_5; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd5; - cvta.to.global.u64 %rd13, %rd6; - cvt.s64.s32 %rd4, %r1; - mul.wide.s32 %rd14, %r1, 4; - add.s64 %rd15, %rd13, %rd14; - ld.global.u32 %r2, [%rd15]; + cvta.to.global.u64 %rd2, %rd6; + cvta.to.global.u64 %rd3, %rd4; + cvta.to.global.u64 %rd12, %rd5; + mul.wide.s32 %rd13, %r1, 4; + add.s64 %rd14, %rd12, %rd13; + ld.global.u32 %r2, [%rd14]; setp.lt.s32 %p2, %r2, %r6; setp.gt.s32 %p3, %r2, %r7; or.pred %p4, %p2, %p3; @@ -1007,24 +995,23 @@ BB9_6: BB10_3: mov.u32 %r3, %r21; - add.s32 %r4, %r3, 1; - mul.wide.s32 %rd16, %r4, 4; - add.s64 %rd17, %rd1, %rd16; - ld.global.u32 %r16, [%rd17]; + add.s32 %r21, %r3, 1; + mul.wide.s32 %rd15, %r21, 4; + add.s64 %rd16, %rd1, %rd15; + ld.global.u32 %r16, [%rd16]; setp.le.s32 %p5, %r16, %r1; - mov.u32 %r21, %r4; @%p5 bra BB10_3; - shl.b64 %rd18, %rd4, 3; - add.s64 %rd19, %rd3, %rd18; - ld.global.f64 %fd1, [%rd19]; + mul.wide.s32 %rd17, %r1, 8; + add.s64 %rd18, %rd3, %rd17; + ld.global.f64 %fd1, [%rd18]; sub.s32 %r17, %r3, %r5; mul.lo.s32 %r18, %r17, %r8; sub.s32 %r19, %r18, %r6; add.s32 %r20, %r19, %r2; - mul.wide.s32 %rd20, %r20, 8; - add.s64 %rd21, %rd2, %rd20; - st.global.f64 [%rd21], %fd1; + mul.wide.s32 %rd19, %r20, 8; + add.s64 %rd20, %rd2, %rd19; + st.global.f64 [%rd20], %fd1; BB10_5: ret; @@ -1046,13 +1033,13 @@ BB10_5: .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; - .reg .b64 %rd<22>; + .reg .b64 %rd<21>; - ld.param.u64 %rd5, [slice_sparse_dense_nnz_f_param_0]; - ld.param.u64 %rd8, [slice_sparse_dense_nnz_f_param_1]; - ld.param.u64 %rd6, [slice_sparse_dense_nnz_f_param_2]; - ld.param.u64 %rd7, [slice_sparse_dense_nnz_f_param_3]; + ld.param.u64 %rd4, [slice_sparse_dense_nnz_f_param_0]; + ld.param.u64 %rd7, [slice_sparse_dense_nnz_f_param_1]; + ld.param.u64 %rd5, [slice_sparse_dense_nnz_f_param_2]; + ld.param.u64 %rd6, [slice_sparse_dense_nnz_f_param_3]; ld.param.u32 %r5, [slice_sparse_dense_nnz_f_param_4]; ld.param.u32 %r9, [slice_sparse_dense_nnz_f_param_5]; ld.param.u32 %r6, [slice_sparse_dense_nnz_f_param_6]; @@ -1062,24 +1049,23 @@ BB10_5: mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r13, %r10, %r11, %r12; - cvta.to.global.u64 %rd1, %rd8; - mul.wide.s32 %rd9, %r5, 4; - add.s64 %rd10, %rd1, %rd9; - ld.global.u32 %r14, [%rd10]; + cvta.to.global.u64 %rd1, %rd7; + mul.wide.s32 %rd8, %r5, 4; + add.s64 %rd9, %rd1, %rd8; + ld.global.u32 %r14, [%rd9]; add.s32 %r1, %r13, %r14; - mul.wide.s32 %rd11, %r9, 4; - add.s64 %rd12, %rd1, %rd11; - ld.global.u32 %r15, [%rd12+4]; + mul.wide.s32 %rd10, %r9, 4; + add.s64 %rd11, %rd1, %rd10; + ld.global.u32 %r15, [%rd11+4]; setp.ge.s32 %p1, %r1, %r15; @%p1 bra BB11_5; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd5; - cvta.to.global.u64 %rd13, %rd6; - cvt.s64.s32 %rd4, %r1; - mul.wide.s32 %rd14, %r1, 4; - add.s64 %rd15, %rd13, %rd14; - ld.global.u32 %r2, [%rd15]; + cvta.to.global.u64 %rd2, %rd6; + cvta.to.global.u64 %rd3, %rd4; + cvta.to.global.u64 %rd12, %rd5; + mul.wide.s32 %rd13, %r1, 4; + add.s64 %rd14, %rd12, %rd13; + ld.global.u32 %r2, [%rd14]; setp.lt.s32 %p2, %r2, %r6; setp.gt.s32 %p3, %r2, %r7; or.pred %p4, %p2, %p3; @@ -1089,24 +1075,22 @@ BB10_5: BB11_3: mov.u32 %r3, %r21; - add.s32 %r4, %r3, 1; - mul.wide.s32 %rd16, %r4, 4; - add.s64 %rd17, %rd1, %rd16; - ld.global.u32 %r16, [%rd17]; + add.s32 %r21, %r3, 1; + mul.wide.s32 %rd15, %r21, 4; + add.s64 %rd16, %rd1, %rd15; + ld.global.u32 %r16, [%rd16]; setp.le.s32 %p5, %r16, %r1; - mov.u32 %r21, %r4; @%p5 bra BB11_3; - shl.b64 %rd18, %rd4, 2; - add.s64 %rd19, %rd3, %rd18; - ld.global.f32 %f1, [%rd19]; + add.s64 %rd18, %rd3, %rd13; + ld.global.f32 %f1, [%rd18]; sub.s32 %r17, %r3, %r5; mul.lo.s32 %r18, %r17, %r8; sub.s32 %r19, %r18, %r6; add.s32 %r20, %r19, %r2; - mul.wide.s32 %rd20, %r20, 4; - add.s64 %rd21, %rd2, %rd20; - st.global.f32 [%rd21], %f1; + mul.wide.s32 %rd19, %r20, 4; + add.s64 %rd20, %rd2, %rd19; + st.global.f32 [%rd20], %f1; BB11_5: ret; @@ -1409,12 +1393,12 @@ BB17_2: .reg .pred %p<5>; .reg .b32 %r<8>; .reg .f64 %fd<6>; - .reg .b64 %rd<14>; + .reg .b64 %rd<13>; - ld.param.u64 %rd2, [relu_backward_d_param_0]; - ld.param.u64 %rd3, [relu_backward_d_param_1]; - ld.param.u64 %rd4, [relu_backward_d_param_2]; + ld.param.u64 %rd1, [relu_backward_d_param_0]; + ld.param.u64 %rd2, [relu_backward_d_param_1]; + ld.param.u64 %rd3, [relu_backward_d_param_2]; ld.param.u32 %r2, [relu_backward_d_param_3]; ld.param.u32 %r3, [relu_backward_d_param_4]; mov.u32 %r4, %ntid.x; @@ -1429,25 +1413,22 @@ BB17_2: bra.uni BB18_1; BB18_1: - cvta.to.global.u64 %rd5, %rd2; - cvt.s64.s32 %rd1, %r1; - mul.wide.s32 %rd6, %r1, 8; - add.s64 %rd7, %rd5, %rd6; - ld.global.f64 %fd4, [%rd7]; + cvta.to.global.u64 %rd4, %rd1; + mul.wide.s32 %rd5, %r1, 8; + add.s64 %rd6, %rd4, %rd5; + ld.global.f64 %fd4, [%rd6]; mov.f64 %fd5, 0d0000000000000000; setp.leu.f64 %p4, %fd4, 0d0000000000000000; @%p4 bra BB18_3; - cvta.to.global.u64 %rd8, %rd3; - shl.b64 %rd9, %rd1, 3; - add.s64 %rd10, %rd8, %rd9; - ld.global.f64 %fd5, [%rd10]; + cvta.to.global.u64 %rd7, %rd2; + add.s64 %rd9, %rd7, %rd5; + ld.global.f64 %fd5, [%rd9]; BB18_3: - cvta.to.global.u64 %rd11, %rd4; - shl.b64 %rd12, %rd1, 3; - add.s64 %rd13, %rd11, %rd12; - st.global.f64 [%rd13], %fd5; + cvta.to.global.u64 %rd10, %rd3; + add.s64 %rd12, %rd10, %rd5; + st.global.f64 [%rd12], %fd5; BB18_4: ret; @@ -1465,12 +1446,12 @@ BB18_4: .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<8>; - .reg .b64 %rd<14>; + .reg .b64 %rd<13>; - ld.param.u64 %rd2, [relu_backward_f_param_0]; - ld.param.u64 %rd3, [relu_backward_f_param_1]; - ld.param.u64 %rd4, [relu_backward_f_param_2]; + ld.param.u64 %rd1, [relu_backward_f_param_0]; + ld.param.u64 %rd2, [relu_backward_f_param_1]; + ld.param.u64 %rd3, [relu_backward_f_param_2]; ld.param.u32 %r2, [relu_backward_f_param_3]; ld.param.u32 %r3, [relu_backward_f_param_4]; mov.u32 %r4, %ntid.x; @@ -1485,25 +1466,22 @@ BB18_4: bra.uni BB19_1; BB19_1: - cvta.to.global.u64 %rd5, %rd2; - cvt.s64.s32 %rd1, %r1; - mul.wide.s32 %rd6, %r1, 4; - add.s64 %rd7, %rd5, %rd6; - ld.global.f32 %f4, [%rd7]; + cvta.to.global.u64 %rd4, %rd1; + mul.wide.s32 %rd5, %r1, 4; + add.s64 %rd6, %rd4, %rd5; + ld.global.f32 %f4, [%rd6]; mov.f32 %f5, 0f00000000; setp.leu.f32 %p4, %f4, 0f00000000; @%p4 bra BB19_3; - cvta.to.global.u64 %rd8, %rd3; - shl.b64 %rd9, %rd1, 2; - add.s64 %rd10, %rd8, %rd9; - ld.global.f32 %f5, [%rd10]; + cvta.to.global.u64 %rd7, %rd2; + add.s64 %rd9, %rd7, %rd5; + ld.global.f32 %f5, [%rd9]; BB19_3: - cvta.to.global.u64 %rd11, %rd4; - shl.b64 %rd12, %rd1, 2; - add.s64 %rd13, %rd11, %rd12; - st.global.f32 [%rd13], %f5; + cvta.to.global.u64 %rd10, %rd3; + add.s64 %rd12, %rd10, %rd5; + st.global.f32 [%rd12], %f5; BB19_4: ret; @@ -1965,106 +1943,119 @@ BB27_2: .param .u32 matrix_matrix_cellwise_op_d_param_7 ) { - .reg .pred %p<77>; - .reg .b32 %r<56>; - .reg .f64 %fd<55>; + .reg .pred %p<73>; + .reg .b32 %r<61>; + .reg .f64 %fd<51>; .reg .b64 %rd<19>; ld.param.u64 %rd2, [matrix_matrix_cellwise_op_d_param_0]; ld.param.u64 %rd3, [matrix_matrix_cellwise_op_d_param_1]; ld.param.u64 %rd4, [matrix_matrix_cellwise_op_d_param_2]; - ld.param.u32 %r10, [matrix_matrix_cellwise_op_d_param_3]; - ld.param.u32 %r6, [matrix_matrix_cellwise_op_d_param_4]; - ld.param.u32 %r7, [matrix_matrix_cellwise_op_d_param_5]; - ld.param.u32 %r8, [matrix_matrix_cellwise_op_d_param_6]; - ld.param.u32 %r9, [matrix_matrix_cellwise_op_d_param_7]; - mov.u32 %r11, %ctaid.x; - mov.u32 %r12, %ntid.x; - mov.u32 %r13, %tid.x; - mad.lo.s32 %r1, %r12, %r11, %r13; - div.s32 %r2, %r1, %r6; - setp.lt.s32 %p2, %r2, %r10; - setp.gt.s32 %p3, %r6, -1; + ld.param.u32 %r14, [matrix_matrix_cellwise_op_d_param_3]; + ld.param.u32 %r10, [matrix_matrix_cellwise_op_d_param_4]; + ld.param.u32 %r11, [matrix_matrix_cellwise_op_d_param_5]; + ld.param.u32 %r12, [matrix_matrix_cellwise_op_d_param_6]; + ld.param.u32 %r13, [matrix_matrix_cellwise_op_d_param_7]; + mov.u32 %r15, %ntid.x; + mov.u32 %r16, %ctaid.x; + mov.u32 %r17, %tid.x; + mad.lo.s32 %r18, %r15, %r16, %r17; + div.s32 %r60, %r18, %r10; + rem.s32 %r2, %r18, %r10; + setp.lt.s32 %p2, %r60, %r14; + setp.gt.s32 %p3, %r10, -1; and.pred %p4, %p2, %p3; - @!%p4 bra BB28_65; + @!%p4 bra BB28_77; bra.uni BB28_1; BB28_1: - rem.s32 %r14, %r1, %r6; - cvta.to.global.u64 %rd5, %rd2; - mad.lo.s32 %r3, %r2, %r6, %r14; - setp.eq.s32 %p5, %r7, 2; - selp.b32 %r15, %r14, %r3, %p5; - setp.eq.s32 %p6, %r7, 1; - selp.b32 %r16, %r2, %r15, %p6; - setp.eq.s32 %p7, %r8, 2; - selp.b32 %r17, %r14, %r3, %p7; - setp.eq.s32 %p8, %r8, 1; - selp.b32 %r18, %r2, %r17, %p8; - mul.wide.s32 %rd6, %r16, 8; - add.s64 %rd7, %rd5, %rd6; - ld.global.f64 %fd1, [%rd7]; - cvta.to.global.u64 %rd8, %rd3; - mul.wide.s32 %rd9, %r18, 8; - add.s64 %rd10, %rd8, %rd9; + mad.lo.s32 %r3, %r60, %r10, %r2; + setp.eq.s32 %p5, %r11, 1; + mov.u32 %r58, %r60; + @%p5 bra BB28_4; + + setp.ne.s32 %p6, %r11, 2; + mov.u32 %r58, %r3; + @%p6 bra BB28_4; + + mov.u32 %r58, %r2; + +BB28_4: + setp.eq.s32 %p7, %r12, 1; + @%p7 bra BB28_7; + + setp.ne.s32 %p8, %r12, 2; + mov.u32 %r60, %r3; + @%p8 bra BB28_7; + + mov.u32 %r60, %r2; + +BB28_7: + cvta.to.global.u64 %rd5, %rd3; + cvta.to.global.u64 %rd6, %rd2; + mul.wide.s32 %rd7, %r58, 8; + add.s64 %rd8, %rd6, %rd7; + ld.global.f64 %fd1, [%rd8]; + mul.wide.s32 %rd9, %r60, 8; + add.s64 %rd10, %rd5, %rd9; ld.global.f64 %fd2, [%rd10]; - mov.f64 %fd54, 0d7FEFFFFFFFFFFFFF; - setp.gt.s32 %p9, %r9, 8; - @%p9 bra BB28_18; + mov.f64 %fd50, 0d7FEFFFFFFFFFFFFF; + setp.gt.s32 %p9, %r13, 8; + @%p9 bra BB28_24; - setp.gt.s32 %p23, %r9, 3; - @%p23 bra BB28_10; + setp.gt.s32 %p23, %r13, 3; + @%p23 bra BB28_16; - setp.gt.s32 %p30, %r9, 1; - @%p30 bra BB28_7; + setp.gt.s32 %p30, %r13, 1; + @%p30 bra BB28_13; - setp.eq.s32 %p33, %r9, 0; - @%p33 bra BB28_63; - bra.uni BB28_5; + setp.eq.s32 %p33, %r13, 0; + @%p33 bra BB28_75; + bra.uni BB28_11; -BB28_63: - add.f64 %fd54, %fd1, %fd2; - bra.uni BB28_64; +BB28_75: + add.f64 %fd50, %fd1, %fd2; + bra.uni BB28_76; -BB28_18: - setp.gt.s32 %p10, %r9, 13; - @%p10 bra BB28_27; +BB28_24: + setp.gt.s32 %p10, %r13, 13; + @%p10 bra BB28_33; - setp.gt.s32 %p17, %r9, 10; - @%p17 bra BB28_23; + setp.gt.s32 %p17, %r13, 10; + @%p17 bra BB28_29; - setp.eq.s32 %p21, %r9, 9; - @%p21 bra BB28_45; - bra.uni BB28_21; + setp.eq.s32 %p21, %r13, 9; + @%p21 bra BB28_53; + bra.uni BB28_27; -BB28_45: - setp.eq.f64 %p50, %fd1, %fd2; - selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p50; - bra.uni BB28_64; +BB28_53: + setp.eq.f64 %p48, %fd1, %fd2; + selp.f64 %fd50, 0d3FF0000000000000, 0d0000000000000000, %p48; + bra.uni BB28_76; -BB28_10: - setp.gt.s32 %p24, %r9, 5; - @%p24 bra BB28_14; +BB28_16: + setp.gt.s32 %p24, %r13, 5; + @%p24 bra BB28_20; - setp.eq.s32 %p28, %r9, 4; - @%p28 bra BB28_48; - bra.uni BB28_12; + setp.eq.s32 %p28, %r13, 4; + @%p28 bra BB28_56; + bra.uni BB28_18; -BB28_48: +BB28_56: { .reg .b32 %temp; - mov.b64 {%temp, %r4}, %fd1; + mov.b64 {%temp, %r8}, %fd1; } { .reg .b32 %temp; - mov.b64 {%temp, %r5}, %fd2; + mov.b64 {%temp, %r9}, %fd2; } - bfe.u32 %r31, %r5, 20, 11; + bfe.u32 %r31, %r9, 20, 11; add.s32 %r32, %r31, -1012; mov.b64 %rd15, %fd2; shl.b64 %rd1, %rd15, %r32; - setp.eq.s64 %p55, %rd1, -9223372036854775808; + setp.eq.s64 %p53, %rd1, -9223372036854775808; abs.f64 %fd19, %fd1; // Callseq Start 0 { @@ -2081,341 +2072,343 @@ BB28_48: param0, param1 ); - ld.param.f64 %fd53, [retval0+0]; + ld.param.f64 %fd25, [retval0+0]; //{ }// Callseq End 0 - setp.lt.s32 %p56, %r4, 0; - and.pred %p1, %p56, %p55; - @!%p1 bra BB28_50; - bra.uni BB28_49; + setp.lt.s32 %p54, %r8, 0; + and.pred %p1, %p54, %p53; + @!%p1 bra BB28_58; + bra.uni BB28_57; -BB28_49: +BB28_57: { .reg .b32 %temp; - mov.b64 {%temp, %r33}, %fd53; + mov.b64 {%temp, %r33}, %fd25; } xor.b32 %r34, %r33, -2147483648; { .reg .b32 %temp; - mov.b64 {%r35, %temp}, %fd53; + mov.b64 {%r35, %temp}, %fd25; } - mov.b64 %fd53, {%r35, %r34}; + mov.b64 %fd25, {%r35, %r34}; -BB28_50: - mov.f64 %fd52, %fd53; - setp.eq.f64 %p57, %fd1, 0d0000000000000000; - @%p57 bra BB28_53; - bra.uni BB28_51; +BB28_58: + setp.eq.f64 %p55, %fd1, 0d0000000000000000; + @%p55 bra BB28_61; + bra.uni BB28_59; -BB28_53: - selp.b32 %r36, %r4, 0, %p55; +BB28_61: + selp.b32 %r36, %r8, 0, %p53; or.b32 %r37, %r36, 2146435072; - setp.lt.s32 %p61, %r5, 0; - selp.b32 %r38, %r37, %r36, %p61; + setp.lt.s32 %p59, %r9, 0; + selp.b32 %r38, %r37, %r36, %p59; mov.u32 %r39, 0; - mov.b64 %fd52, {%r39, %r38}; - bra.uni BB28_54; + mov.b64 %fd25, {%r39, %r38}; + bra.uni BB28_62; -BB28_27: - setp.gt.s32 %p11, %r9, 15; - @%p11 bra BB28_31; +BB28_33: + setp.gt.s32 %p11, %r13, 15; + @%p11 bra BB28_37; - setp.eq.s32 %p15, %r9, 14; - @%p15 bra BB28_42; - bra.uni BB28_29; + setp.eq.s32 %p15, %r13, 14; + @%p15 bra BB28_50; + bra.uni BB28_35; -BB28_42: +BB28_50: cvt.rni.s64.f64 %rd11, %fd1; - cvt.rni.s64.f64 %rd12, %fd2; cvt.u32.u64 %r25, %rd11; + cvt.rni.s64.f64 %rd12, %fd2; cvt.u32.u64 %r26, %rd12; or.b32 %r27, %r26, %r25; - setp.eq.s32 %p47, %r27, 0; - selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p47; - bra.uni BB28_64; + setp.eq.s32 %p45, %r27, 0; + selp.f64 %fd50, 0d0000000000000000, 0d3FF0000000000000, %p45; + bra.uni BB28_76; -BB28_7: - setp.eq.s32 %p31, %r9, 2; - @%p31 bra BB28_62; - bra.uni BB28_8; +BB28_13: + setp.eq.s32 %p31, %r13, 2; + @%p31 bra BB28_74; + bra.uni BB28_14; -BB28_62: - mul.f64 %fd54, %fd1, %fd2; - bra.uni BB28_64; +BB28_74: + mul.f64 %fd50, %fd1, %fd2; + bra.uni BB28_76; -BB28_23: - setp.eq.s32 %p18, %r9, 11; - @%p18 bra BB28_44; +BB28_29: + setp.eq.s32 %p18, %r13, 11; + @%p18 bra BB28_52; - setp.eq.s32 %p19, %r9, 12; - @%p19 bra BB28_43; - bra.uni BB28_25; + setp.eq.s32 %p19, %r13, 12; + @%p19 bra BB28_51; + bra.uni BB28_31; -BB28_43: - max.f64 %fd54, %fd1, %fd2; - bra.uni BB28_64; +BB28_51: + max.f64 %fd50, %fd1, %fd2; + bra.uni BB28_76; -BB28_14: - setp.eq.s32 %p25, %r9, 6; - @%p25 bra BB28_47; +BB28_20: + setp.eq.s32 %p25, %r13, 6; + @%p25 bra BB28_55; - setp.eq.s32 %p26, %r9, 7; - @%p26 bra BB28_46; - bra.uni BB28_16; + setp.eq.s32 %p26, %r13, 7; + @%p26 bra BB28_54; + bra.uni BB28_22; -BB28_46: - setp.gt.f64 %p52, %fd1, %fd2; - selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p52; - bra.uni BB28_64; +BB28_54: + setp.gt.f64 %p50, %fd1, %fd2; + selp.f64 %fd50, 0d3FF0000000000000, 0d0000000000000000, %p50; + bra.uni BB28_76; -BB28_31: - setp.eq.s32 %p12, %r9, 16; - @%p12 bra BB28_41; +BB28_37: + setp.eq.s32 %p12, %r13, 16; + @%p12 bra BB28_49; - setp.eq.s32 %p13, %r9, 17; - @%p13 bra BB28_37; - bra.uni BB28_33; + setp.eq.s32 %p13, %r13, 17; + @%p13 bra BB28_44; + bra.uni BB28_39; -BB28_37: - setp.eq.f64 %p39, %fd2, 0d0000000000000000; - setp.eq.f64 %p40, %fd2, 0d8000000000000000; - or.pred %p41, %p39, %p40; - mov.f64 %fd54, 0d7FF8000000000000; - @%p41 bra BB28_64; +BB28_44: + setp.eq.f64 %p38, %fd2, 0d0000000000000000; + setp.eq.f64 %p39, %fd2, 0d8000000000000000; + or.pred %p40, %p38, %p39; + mov.f64 %fd50, 0d7FF8000000000000; + @%p40 bra BB28_76; - div.rn.f64 %fd54, %fd1, %fd2; - abs.f64 %fd39, %fd54; - setp.gtu.f64 %p42, %fd39, 0d7FF0000000000000; - @%p42 bra BB28_64; + div.rn.f64 %fd50, %fd1, %fd2; + abs.f64 %fd39, %fd50; + setp.gtu.f64 %p41, %fd39, 0d7FF0000000000000; + @%p41 bra BB28_76; { .reg .b32 %temp; - mov.b64 {%r22, %temp}, %fd54; + mov.b64 {%temp, %r22}, %fd50; } + and.b32 %r23, %r22, 2147483647; + setp.ne.s32 %p42, %r23, 2146435072; + @%p42 bra BB28_48; + { .reg .b32 %temp; - mov.b64 {%temp, %r23}, %fd54; + mov.b64 {%r24, %temp}, %fd50; } - and.b32 %r24, %r23, 2147483647; - setp.ne.s32 %p43, %r24, 2146435072; - setp.ne.s32 %p44, %r22, 0; - or.pred %p45, %p43, %p44; - @!%p45 bra BB28_64; - bra.uni BB28_40; + setp.eq.s32 %p43, %r24, 0; + @%p43 bra BB28_76; -BB28_40: - cvt.rmi.f64.f64 %fd40, %fd54; +BB28_48: + cvt.rmi.f64.f64 %fd40, %fd50; mul.f64 %fd41, %fd2, %fd40; - sub.f64 %fd54, %fd1, %fd41; - bra.uni BB28_64; + sub.f64 %fd50, %fd1, %fd41; + bra.uni BB28_76; -BB28_5: - setp.eq.s32 %p34, %r9, 1; - @%p34 bra BB28_6; - bra.uni BB28_64; +BB28_11: + setp.eq.s32 %p34, %r13, 1; + @%p34 bra BB28_12; + bra.uni BB28_76; -BB28_6: - sub.f64 %fd54, %fd1, %fd2; - bra.uni BB28_64; +BB28_12: + sub.f64 %fd50, %fd1, %fd2; + bra.uni BB28_76; -BB28_21: - setp.eq.s32 %p22, %r9, 10; - @%p22 bra BB28_22; - bra.uni BB28_64; +BB28_27: + setp.eq.s32 %p22, %r13, 10; + @%p22 bra BB28_28; + bra.uni BB28_76; -BB28_22: - setp.neu.f64 %p49, %fd1, %fd2; - selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p49; - bra.uni BB28_64; +BB28_28: + setp.neu.f64 %p47, %fd1, %fd2; + selp.f64 %fd50, 0d3FF0000000000000, 0d0000000000000000, %p47; + bra.uni BB28_76; -BB28_12: - setp.eq.s32 %p29, %r9, 5; - @%p29 bra BB28_13; - bra.uni BB28_64; +BB28_18: + setp.eq.s32 %p29, %r13, 5; + @%p29 bra BB28_19; + bra.uni BB28_76; -BB28_13: - setp.lt.f64 %p54, %fd1, %fd2; - selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p54; - bra.uni BB28_64; +BB28_19: + setp.lt.f64 %p52, %fd1, %fd2; + selp.f64 %fd50, 0d3FF0000000000000, 0d0000000000000000, %p52; + bra.uni BB28_76; -BB28_29: - setp.eq.s32 %p16, %r9, 15; - @%p16 bra BB28_30; - bra.uni BB28_64; +BB28_35: + setp.eq.s32 %p16, %r13, 15; + @%p16 bra BB28_36; + bra.uni BB28_76; -BB28_30: +BB28_36: mul.f64 %fd43, %fd1, %fd2; mov.f64 %fd44, 0d3FF0000000000000; - sub.f64 %fd54, %fd44, %fd43; - bra.uni BB28_64; + sub.f64 %fd50, %fd44, %fd43; + bra.uni BB28_76; -BB28_8: - setp.eq.s32 %p32, %r9, 3; - @%p32 bra BB28_9; - bra.uni BB28_64; +BB28_14: + setp.eq.s32 %p32, %r13, 3; + @%p32 bra BB28_15; + bra.uni BB28_76; -BB28_9: - div.rn.f64 %fd54, %fd1, %fd2; - bra.uni BB28_64; +BB28_15: + div.rn.f64 %fd50, %fd1, %fd2; + bra.uni BB28_76; -BB28_44: - min.f64 %fd54, %fd1, %fd2; - bra.uni BB28_64; +BB28_52: + min.f64 %fd50, %fd1, %fd2; + bra.uni BB28_76; -BB28_25: - setp.eq.s32 %p20, %r9, 13; - @%p20 bra BB28_26; - bra.uni BB28_64; +BB28_31: + setp.eq.s32 %p20, %r13, 13; + @%p20 bra BB28_32; + bra.uni BB28_76; -BB28_26: +BB28_32: cvt.rni.s64.f64 %rd13, %fd1; - cvt.rni.s64.f64 %rd14, %fd2; cvt.u32.u64 %r28, %rd13; + cvt.rni.s64.f64 %rd14, %fd2; cvt.u32.u64 %r29, %rd14; and.b32 %r30, %r29, %r28; - setp.eq.s32 %p48, %r30, 0; - selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p48; - bra.uni BB28_64; + setp.eq.s32 %p46, %r30, 0; + selp.f64 %fd50, 0d0000000000000000, 0d3FF0000000000000, %p46; + bra.uni BB28_76; -BB28_47: - setp.le.f64 %p53, %fd1, %fd2; - selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p53; - bra.uni BB28_64; +BB28_55: + setp.gtu.f64 %p51, %fd1, %fd2; + selp.f64 %fd50, 0d0000000000000000, 0d3FF0000000000000, %p51; + bra.uni BB28_76; -BB28_16: - setp.eq.s32 %p27, %r9, 8; - @%p27 bra BB28_17; - bra.uni BB28_64; +BB28_22: + setp.eq.s32 %p27, %r13, 8; + @%p27 bra BB28_23; + bra.uni BB28_76; -BB28_17: - setp.ge.f64 %p51, %fd1, %fd2; - selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p51; - bra.uni BB28_64; +BB28_23: + setp.ltu.f64 %p49, %fd1, %fd2; + selp.f64 %fd50, 0d0000000000000000, 0d3FF0000000000000, %p49; + bra.uni BB28_76; -BB28_41: - setp.neu.f64 %p46, %fd1, 0d0000000000000000; +BB28_49: + setp.neu.f64 %p44, %fd1, 0d0000000000000000; sub.f64 %fd42, %fd1, %fd2; - selp.f64 %fd54, %fd42, 0d0000000000000000, %p46; - bra.uni BB28_64; + selp.f64 %fd50, %fd42, 0d0000000000000000, %p44; + bra.uni BB28_76; -BB28_33: - setp.ne.s32 %p14, %r9, 18; - @%p14 bra BB28_64; +BB28_39: + setp.ne.s32 %p14, %r13, 18; + @%p14 bra BB28_76; - div.rn.f64 %fd54, %fd1, %fd2; - abs.f64 %fd37, %fd54; + div.rn.f64 %fd50, %fd1, %fd2; + abs.f64 %fd37, %fd50; setp.gtu.f64 %p35, %fd37, 0d7FF0000000000000; - @%p35 bra BB28_64; + @%p35 bra BB28_76; { .reg .b32 %temp; - mov.b64 {%r19, %temp}, %fd54; + mov.b64 {%temp, %r19}, %fd50; } + and.b32 %r20, %r19, 2147483647; + setp.ne.s32 %p36, %r20, 2146435072; + @%p36 bra BB28_43; + { .reg .b32 %temp; - mov.b64 {%temp, %r20}, %fd54; + mov.b64 {%r21, %temp}, %fd50; } - and.b32 %r21, %r20, 2147483647; - setp.ne.s32 %p36, %r21, 2146435072; - setp.ne.s32 %p37, %r19, 0; - or.pred %p38, %p36, %p37; - @!%p38 bra BB28_64; - bra.uni BB28_36; + setp.eq.s32 %p37, %r21, 0; + @%p37 bra BB28_76; -BB28_36: - cvt.rmi.f64.f64 %fd54, %fd54; - bra.uni BB28_64; +BB28_43: + cvt.rmi.f64.f64 %fd50, %fd50; + bra.uni BB28_76; -BB28_51: - setp.gt.s32 %p58, %r4, -1; - @%p58 bra BB28_54; +BB28_59: + setp.gt.s32 %p56, %r8, -1; + @%p56 bra BB28_62; cvt.rzi.f64.f64 %fd45, %fd2; - setp.neu.f64 %p59, %fd45, %fd2; - selp.f64 %fd52, 0dFFF8000000000000, %fd52, %p59; + setp.neu.f64 %p57, %fd45, %fd2; + selp.f64 %fd25, 0dFFF8000000000000, %fd25, %p57; -BB28_54: - mov.f64 %fd25, %fd52; - add.f64 %fd26, %fd1, %fd2; +BB28_62: + add.f64 %fd49, %fd1, %fd2; { .reg .b32 %temp; - mov.b64 {%temp, %r40}, %fd26; + mov.b64 {%temp, %r40}, %fd49; } and.b32 %r41, %r40, 2146435072; - setp.ne.s32 %p62, %r41, 2146435072; - mov.f64 %fd51, %fd25; - @%p62 bra BB28_61; + setp.ne.s32 %p60, %r41, 2146435072; + @%p60 bra BB28_63; - setp.gtu.f64 %p63, %fd19, 0d7FF0000000000000; - mov.f64 %fd51, %fd26; - @%p63 bra BB28_61; + setp.gtu.f64 %p61, %fd19, 0d7FF0000000000000; + @%p61 bra BB28_73; abs.f64 %fd46, %fd2; - setp.gtu.f64 %p64, %fd46, 0d7FF0000000000000; - mov.f64 %fd50, %fd26; - mov.f64 %fd51, %fd50; - @%p64 bra BB28_61; + setp.gtu.f64 %p62, %fd46, 0d7FF0000000000000; + @%p62 bra BB28_73; + + and.b32 %r42, %r9, 2147483647; + setp.ne.s32 %p63, %r42, 2146435072; + @%p63 bra BB28_68; { .reg .b32 %temp; - mov.b64 {%r42, %temp}, %fd2; - } - and.b32 %r43, %r5, 2147483647; - setp.eq.s32 %p65, %r43, 2146435072; - setp.eq.s32 %p66, %r42, 0; - and.pred %p67, %p65, %p66; - @%p67 bra BB28_60; - bra.uni BB28_58; - -BB28_60: - setp.gt.f64 %p71, %fd19, 0d3FF0000000000000; - selp.b32 %r51, 2146435072, 0, %p71; - xor.b32 %r52, %r51, 2146435072; - setp.lt.s32 %p72, %r5, 0; - selp.b32 %r53, %r52, %r51, %p72; - setp.eq.f64 %p73, %fd1, 0dBFF0000000000000; - selp.b32 %r54, 1072693248, %r53, %p73; - mov.u32 %r55, 0; - mov.b64 %fd51, {%r55, %r54}; - bra.uni BB28_61; + mov.b64 {%r43, %temp}, %fd2; + } + setp.eq.s32 %p64, %r43, 0; + @%p64 bra BB28_72; + +BB28_68: + and.b32 %r44, %r8, 2147483647; + setp.ne.s32 %p65, %r44, 2146435072; + @%p65 bra BB28_69; -BB28_58: { .reg .b32 %temp; - mov.b64 {%r44, %temp}, %fd1; - } - and.b32 %r45, %r4, 2147483647; - setp.eq.s32 %p68, %r45, 2146435072; - setp.eq.s32 %p69, %r44, 0; - and.pred %p70, %p68, %p69; - mov.f64 %fd51, %fd25; - @!%p70 bra BB28_61; - bra.uni BB28_59; + mov.b64 {%r45, %temp}, %fd1; + } + setp.ne.s32 %p66, %r45, 0; + mov.f64 %fd49, %fd25; + @%p66 bra BB28_73; -BB28_59: - shr.s32 %r46, %r5, 31; + shr.s32 %r46, %r9, 31; and.b32 %r47, %r46, -2146435072; - selp.b32 %r48, -1048576, 2146435072, %p1; - add.s32 %r49, %r48, %r47; - mov.u32 %r50, 0; - mov.b64 %fd51, {%r50, %r49}; + add.s32 %r48, %r47, 2146435072; + or.b32 %r49, %r48, -2147483648; + selp.b32 %r50, %r49, %r48, %p1; + mov.u32 %r51, 0; + mov.b64 %fd49, {%r51, %r50}; + bra.uni BB28_73; -BB28_61: - setp.eq.f64 %p74, %fd2, 0d0000000000000000; - setp.eq.f64 %p75, %fd1, 0d3FF0000000000000; - or.pred %p76, %p75, %p74; - selp.f64 %fd54, 0d3FF0000000000000, %fd51, %p76; +BB28_63: + mov.f64 %fd49, %fd25; + +BB28_73: + setp.eq.f64 %p70, %fd2, 0d0000000000000000; + setp.eq.f64 %p71, %fd1, 0d3FF0000000000000; + or.pred %p72, %p71, %p70; + selp.f64 %fd50, 0d3FF0000000000000, %fd49, %p72; -BB28_64: +BB28_76: cvta.to.global.u64 %rd16, %rd4; mul.wide.s32 %rd17, %r3, 8; add.s64 %rd18, %rd16, %rd17; - st.global.f64 [%rd18], %fd54; + st.global.f64 [%rd18], %fd50; bar.sync 0; -BB28_65: +BB28_77: ret; + +BB28_69: + mov.f64 %fd49, %fd25; + bra.uni BB28_73; + +BB28_72: + setp.gt.f64 %p67, %fd19, 0d3FF0000000000000; + selp.b32 %r52, 2146435072, 0, %p67; + xor.b32 %r53, %r52, 2146435072; + setp.lt.s32 %p68, %r9, 0; + selp.b32 %r54, %r53, %r52, %p68; + setp.eq.f64 %p69, %fd1, 0dBFF0000000000000; + selp.b32 %r55, 1072693248, %r54, %p69; + mov.u32 %r56, 0; + mov.b64 %fd49, {%r56, %r55}; + bra.uni BB28_73; } // .globl matrix_matrix_cellwise_op_f @@ -2431,425 +2424,436 @@ BB28_65: ) { .reg .pred %p<76>; - .reg .f32 %f<134>; - .reg .b32 %r<42>; + .reg .f32 %f<135>; + .reg .b32 %r<46>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [matrix_matrix_cellwise_op_f_param_0]; ld.param.u64 %rd2, [matrix_matrix_cellwise_op_f_param_1]; ld.param.u64 %rd3, [matrix_matrix_cellwise_op_f_param_2]; - ld.param.u32 %r8, [matrix_matrix_cellwise_op_f_param_3]; - ld.param.u32 %r4, [matrix_matrix_cellwise_op_f_param_4]; - ld.param.u32 %r5, [matrix_matrix_cellwise_op_f_param_5]; - ld.param.u32 %r6, [matrix_matrix_cellwise_op_f_param_6]; - ld.param.u32 %r7, [matrix_matrix_cellwise_op_f_param_7]; - mov.u32 %r9, %ntid.x; - mov.u32 %r10, %ctaid.x; - mov.u32 %r11, %tid.x; - mad.lo.s32 %r1, %r9, %r10, %r11; - div.s32 %r2, %r1, %r4; - setp.lt.s32 %p2, %r2, %r8; - setp.gt.s32 %p3, %r4, -1; + ld.param.u32 %r12, [matrix_matrix_cellwise_op_f_param_3]; + ld.param.u32 %r8, [matrix_matrix_cellwise_op_f_param_4]; + ld.param.u32 %r9, [matrix_matrix_cellwise_op_f_param_5]; + ld.param.u32 %r10, [matrix_matrix_cellwise_op_f_param_6]; + ld.param.u32 %r11, [matrix_matrix_cellwise_op_f_param_7]; + mov.u32 %r13, %ntid.x; + mov.u32 %r14, %ctaid.x; + mov.u32 %r15, %tid.x; + mad.lo.s32 %r16, %r13, %r14, %r15; + div.s32 %r45, %r16, %r8; + rem.s32 %r2, %r16, %r8; + setp.lt.s32 %p2, %r45, %r12; + setp.gt.s32 %p3, %r8, -1; and.pred %p4, %p2, %p3; - @!%p4 bra BB29_63; + @!%p4 bra BB29_69; bra.uni BB29_1; BB29_1: - rem.s32 %r12, %r1, %r4; - cvta.to.global.u64 %rd4, %rd1; - mad.lo.s32 %r3, %r2, %r4, %r12; - setp.eq.s32 %p5, %r5, 2; - selp.b32 %r13, %r12, %r3, %p5; - setp.eq.s32 %p6, %r5, 1; - selp.b32 %r14, %r2, %r13, %p6; - setp.eq.s32 %p7, %r6, 2; - selp.b32 %r15, %r12, %r3, %p7; - setp.eq.s32 %p8, %r6, 1; - selp.b32 %r16, %r2, %r15, %p8; - mul.wide.s32 %rd5, %r14, 4; - add.s64 %rd6, %rd4, %rd5; - ld.global.f32 %f1, [%rd6]; - cvta.to.global.u64 %rd7, %rd2; - mul.wide.s32 %rd8, %r16, 4; - add.s64 %rd9, %rd7, %rd8; + mad.lo.s32 %r3, %r45, %r8, %r2; + setp.eq.s32 %p5, %r9, 1; + mov.u32 %r43, %r45; + @%p5 bra BB29_4; + + setp.ne.s32 %p6, %r9, 2; + mov.u32 %r43, %r3; + @%p6 bra BB29_4; + + mov.u32 %r43, %r2; + +BB29_4: + setp.eq.s32 %p7, %r10, 1; + @%p7 bra BB29_7; + + setp.ne.s32 %p8, %r10, 2; + mov.u32 %r45, %r3; + @%p8 bra BB29_7; + + mov.u32 %r45, %r2; + +BB29_7: + cvta.to.global.u64 %rd4, %rd2; + cvta.to.global.u64 %rd5, %rd1; + mul.wide.s32 %rd6, %r43, 4; + add.s64 %rd7, %rd5, %rd6; + ld.global.f32 %f1, [%rd7]; + mul.wide.s32 %rd8, %r45, 4; + add.s64 %rd9, %rd4, %rd8; ld.global.f32 %f2, [%rd9]; - mov.f32 %f133, 0f7F7FFFFF; - setp.gt.s32 %p9, %r7, 8; - @%p9 bra BB29_18; + mov.f32 %f134, 0f7F7FFFFF; + setp.gt.s32 %p9, %r11, 8; + @%p9 bra BB29_24; - setp.gt.s32 %p23, %r7, 3; - @%p23 bra BB29_10; + setp.gt.s32 %p23, %r11, 3; + @%p23 bra BB29_16; - setp.gt.s32 %p30, %r7, 1; - @%p30 bra BB29_7; + setp.gt.s32 %p30, %r11, 1; + @%p30 bra BB29_13; - setp.eq.s32 %p33, %r7, 0; - @%p33 bra BB29_61; - bra.uni BB29_5; + setp.eq.s32 %p33, %r11, 0; + @%p33 bra BB29_67; + bra.uni BB29_11; -BB29_61: - add.f32 %f133, %f1, %f2; - bra.uni BB29_62; +BB29_67: + add.f32 %f134, %f1, %f2; + bra.uni BB29_68; -BB29_18: - setp.gt.s32 %p10, %r7, 13; - @%p10 bra BB29_27; +BB29_24: + setp.gt.s32 %p10, %r11, 13; + @%p10 bra BB29_33; - setp.gt.s32 %p17, %r7, 10; - @%p17 bra BB29_23; + setp.gt.s32 %p17, %r11, 10; + @%p17 bra BB29_29; - setp.eq.s32 %p21, %r7, 9; - @%p21 bra BB29_43; - bra.uni BB29_21; + setp.eq.s32 %p21, %r11, 9; + @%p21 bra BB29_49; + bra.uni BB29_27; -BB29_43: +BB29_49: setp.eq.f32 %p44, %f1, %f2; - selp.f32 %f133, 0f3F800000, 0f00000000, %p44; - bra.uni BB29_62; + selp.f32 %f134, 0f3F800000, 0f00000000, %p44; + bra.uni BB29_68; -BB29_10: - setp.gt.s32 %p24, %r7, 5; - @%p24 bra BB29_14; +BB29_16: + setp.gt.s32 %p24, %r11, 5; + @%p24 bra BB29_20; - setp.eq.s32 %p28, %r7, 4; - @%p28 bra BB29_46; - bra.uni BB29_12; + setp.eq.s32 %p28, %r11, 4; + @%p28 bra BB29_52; + bra.uni BB29_18; -BB29_46: - mul.f32 %f53, %f2, 0f3F000000; - cvt.rzi.f32.f32 %f54, %f53; - fma.rn.f32 %f55, %f54, 0fC0000000, %f2; - abs.f32 %f19, %f55; +BB29_52: + mul.f32 %f51, %f2, 0f3F000000; + cvt.rzi.f32.f32 %f52, %f51; + fma.rn.f32 %f53, %f52, 0fC0000000, %f2; + abs.f32 %f19, %f53; abs.f32 %f20, %f1; setp.lt.f32 %p49, %f20, 0f00800000; - mul.f32 %f56, %f20, 0f4B800000; - selp.f32 %f57, 0fC3170000, 0fC2FE0000, %p49; - selp.f32 %f58, %f56, %f20, %p49; - mov.b32 %r23, %f58; + mul.f32 %f54, %f20, 0f4B800000; + selp.f32 %f55, 0fC3170000, 0fC2FE0000, %p49; + selp.f32 %f56, %f54, %f20, %p49; + mov.b32 %r23, %f56; and.b32 %r24, %r23, 8388607; or.b32 %r25, %r24, 1065353216; - mov.b32 %f59, %r25; + mov.b32 %f57, %r25; shr.u32 %r26, %r23, 23; - cvt.rn.f32.u32 %f60, %r26; - add.f32 %f61, %f57, %f60; - setp.gt.f32 %p50, %f59, 0f3FB504F3; - mul.f32 %f62, %f59, 0f3F000000; - add.f32 %f63, %f61, 0f3F800000; - selp.f32 %f64, %f62, %f59, %p50; - selp.f32 %f65, %f63, %f61, %p50; - add.f32 %f66, %f64, 0fBF800000; - add.f32 %f50, %f64, 0f3F800000; + cvt.rn.f32.u32 %f58, %r26; + add.f32 %f59, %f55, %f58; + setp.gt.f32 %p50, %f57, 0f3FB504F3; + mul.f32 %f60, %f57, 0f3F000000; + add.f32 %f61, %f59, 0f3F800000; + selp.f32 %f62, %f60, %f57, %p50; + selp.f32 %f63, %f61, %f59, %p50; + add.f32 %f64, %f62, 0fBF800000; + add.f32 %f50, %f62, 0f3F800000; // inline asm rcp.approx.ftz.f32 %f49,%f50; // inline asm - add.f32 %f67, %f66, %f66; - mul.f32 %f68, %f49, %f67; - mul.f32 %f69, %f68, %f68; - mov.f32 %f70, 0f3C4CAF63; - mov.f32 %f71, 0f3B18F0FE; - fma.rn.f32 %f72, %f71, %f69, %f70; - mov.f32 %f73, 0f3DAAAABD; - fma.rn.f32 %f74, %f72, %f69, %f73; - mul.rn.f32 %f75, %f74, %f69; - mul.rn.f32 %f76, %f75, %f68; - sub.f32 %f77, %f66, %f68; - neg.f32 %f78, %f68; - add.f32 %f79, %f77, %f77; - fma.rn.f32 %f80, %f78, %f66, %f79; - mul.rn.f32 %f81, %f49, %f80; - add.f32 %f82, %f76, %f68; - sub.f32 %f83, %f68, %f82; - add.f32 %f84, %f76, %f83; - add.f32 %f85, %f81, %f84; - add.f32 %f86, %f82, %f85; - sub.f32 %f87, %f82, %f86; - add.f32 %f88, %f85, %f87; - mov.f32 %f89, 0f3F317200; - mul.rn.f32 %f90, %f65, %f89; - mov.f32 %f91, 0f35BFBE8E; - mul.rn.f32 %f92, %f65, %f91; - add.f32 %f93, %f90, %f86; - sub.f32 %f94, %f90, %f93; - add.f32 %f95, %f86, %f94; - add.f32 %f96, %f88, %f95; - add.f32 %f97, %f92, %f96; - add.f32 %f98, %f93, %f97; - sub.f32 %f99, %f93, %f98; - add.f32 %f100, %f97, %f99; + add.f32 %f65, %f64, %f64; + mul.f32 %f66, %f49, %f65; + mul.f32 %f67, %f66, %f66; + mov.f32 %f68, 0f3C4CAF63; + mov.f32 %f69, 0f3B18F0FE; + fma.rn.f32 %f70, %f69, %f67, %f68; + mov.f32 %f71, 0f3DAAAABD; + fma.rn.f32 %f72, %f70, %f67, %f71; + mul.rn.f32 %f73, %f72, %f67; + mul.rn.f32 %f74, %f73, %f66; + sub.f32 %f75, %f64, %f66; + neg.f32 %f76, %f66; + add.f32 %f77, %f75, %f75; + fma.rn.f32 %f78, %f76, %f64, %f77; + mul.rn.f32 %f79, %f49, %f78; + add.f32 %f80, %f74, %f66; + sub.f32 %f81, %f66, %f80; + add.f32 %f82, %f74, %f81; + add.f32 %f83, %f79, %f82; + add.f32 %f84, %f80, %f83; + sub.f32 %f85, %f80, %f84; + add.f32 %f86, %f83, %f85; + mov.f32 %f87, 0f3F317200; + mul.rn.f32 %f88, %f63, %f87; + mov.f32 %f89, 0f35BFBE8E; + mul.rn.f32 %f90, %f63, %f89; + add.f32 %f91, %f88, %f84; + sub.f32 %f92, %f88, %f91; + add.f32 %f93, %f84, %f92; + add.f32 %f94, %f86, %f93; + add.f32 %f95, %f90, %f94; + add.f32 %f96, %f91, %f95; + sub.f32 %f97, %f91, %f96; + add.f32 %f98, %f95, %f97; abs.f32 %f21, %f2; setp.gt.f32 %p51, %f21, 0f77F684DF; - mul.f32 %f101, %f2, 0f39000000; - selp.f32 %f102, %f101, %f2, %p51; - mul.rn.f32 %f103, %f102, %f98; - neg.f32 %f104, %f103; - fma.rn.f32 %f105, %f102, %f98, %f104; - fma.rn.f32 %f106, %f102, %f100, %f105; - mov.f32 %f107, 0f00000000; - fma.rn.f32 %f108, %f107, %f98, %f106; - add.rn.f32 %f109, %f103, %f108; - neg.f32 %f110, %f109; - add.rn.f32 %f111, %f103, %f110; - add.rn.f32 %f112, %f111, %f108; - mov.b32 %r27, %f109; + mul.f32 %f99, %f2, 0f39000000; + selp.f32 %f100, %f99, %f2, %p51; + mul.rn.f32 %f101, %f100, %f96; + neg.f32 %f102, %f101; + fma.rn.f32 %f103, %f100, %f96, %f102; + fma.rn.f32 %f104, %f100, %f98, %f103; + mov.f32 %f105, 0f00000000; + fma.rn.f32 %f106, %f105, %f96, %f104; + add.rn.f32 %f107, %f101, %f106; + neg.f32 %f108, %f107; + add.rn.f32 %f109, %f101, %f108; + add.rn.f32 %f110, %f109, %f106; + mov.b32 %r27, %f107; setp.eq.s32 %p52, %r27, 1118925336; add.s32 %r28, %r27, -1; - mov.b32 %f113, %r28; - add.f32 %f114, %f112, 0f37000000; - selp.f32 %f115, %f113, %f109, %p52; - selp.f32 %f22, %f114, %f112, %p52; - mul.f32 %f116, %f115, 0f3FB8AA3B; - cvt.rzi.f32.f32 %f117, %f116; - mov.f32 %f118, 0fBF317200; - fma.rn.f32 %f119, %f117, %f118, %f115; - mov.f32 %f120, 0fB5BFBE8E; - fma.rn.f32 %f121, %f117, %f120, %f119; - mul.f32 %f52, %f121, 0f3FB8AA3B; - // inline asm - ex2.approx.ftz.f32 %f51,%f52; - // inline asm - add.f32 %f122, %f117, 0f00000000; + mov.b32 %f111, %r28; + add.f32 %f112, %f110, 0f37000000; + selp.f32 %f113, %f111, %f107, %p52; + selp.f32 %f22, %f112, %f110, %p52; + mul.f32 %f114, %f113, 0f3FB8AA3B; + cvt.rzi.f32.f32 %f115, %f114; + mov.f32 %f116, 0fBF317200; + fma.rn.f32 %f117, %f115, %f116, %f113; + mov.f32 %f118, 0fB5BFBE8E; + fma.rn.f32 %f119, %f115, %f118, %f117; + mul.f32 %f120, %f119, 0f3FB8AA3B; + ex2.approx.ftz.f32 %f121, %f120; + add.f32 %f122, %f115, 0f00000000; ex2.approx.f32 %f123, %f122; - mul.f32 %f124, %f51, %f123; - setp.lt.f32 %p53, %f115, 0fC2D20000; + mul.f32 %f124, %f121, %f123; + setp.lt.f32 %p53, %f113, 0fC2D20000; selp.f32 %f125, 0f00000000, %f124, %p53; - setp.gt.f32 %p54, %f115, 0f42D20000; + setp.gt.f32 %p54, %f113, 0f42D20000; selp.f32 %f131, 0f7F800000, %f125, %p54; setp.eq.f32 %p55, %f131, 0f7F800000; - @%p55 bra BB29_48; + @%p55 bra BB29_54; fma.rn.f32 %f131, %f131, %f22, %f131; -BB29_48: +BB29_54: setp.lt.f32 %p56, %f1, 0f00000000; setp.eq.f32 %p57, %f19, 0f3F800000; and.pred %p1, %p56, %p57; mov.b32 %r29, %f131; xor.b32 %r30, %r29, -2147483648; mov.b32 %f126, %r30; - selp.f32 %f132, %f126, %f131, %p1; + selp.f32 %f133, %f126, %f131, %p1; setp.eq.f32 %p58, %f1, 0f00000000; - @%p58 bra BB29_51; - bra.uni BB29_49; + @%p58 bra BB29_57; + bra.uni BB29_55; -BB29_51: +BB29_57: add.f32 %f128, %f1, %f1; mov.b32 %r31, %f128; selp.b32 %r32, %r31, 0, %p57; or.b32 %r33, %r32, 2139095040; setp.lt.f32 %p62, %f2, 0f00000000; selp.b32 %r34, %r33, %r32, %p62; - mov.b32 %f132, %r34; - bra.uni BB29_52; + mov.b32 %f133, %r34; + bra.uni BB29_58; -BB29_27: - setp.gt.s32 %p11, %r7, 15; - @%p11 bra BB29_31; +BB29_33: + setp.gt.s32 %p11, %r11, 15; + @%p11 bra BB29_37; - setp.eq.s32 %p15, %r7, 14; - @%p15 bra BB29_40; - bra.uni BB29_29; + setp.eq.s32 %p15, %r11, 14; + @%p15 bra BB29_46; + bra.uni BB29_35; -BB29_40: +BB29_46: cvt.rni.s64.f32 %rd10, %f1; - cvt.rni.s64.f32 %rd11, %f2; cvt.u32.u64 %r17, %rd10; + cvt.rni.s64.f32 %rd11, %f2; cvt.u32.u64 %r18, %rd11; or.b32 %r19, %r18, %r17; setp.eq.s32 %p41, %r19, 0; - selp.f32 %f133, 0f00000000, 0f3F800000, %p41; - bra.uni BB29_62; + selp.f32 %f134, 0f00000000, 0f3F800000, %p41; + bra.uni BB29_68; -BB29_7: - setp.eq.s32 %p31, %r7, 2; - @%p31 bra BB29_60; - bra.uni BB29_8; +BB29_13: + setp.eq.s32 %p31, %r11, 2; + @%p31 bra BB29_66; + bra.uni BB29_14; -BB29_60: - mul.f32 %f133, %f1, %f2; - bra.uni BB29_62; +BB29_66: + mul.f32 %f134, %f1, %f2; + bra.uni BB29_68; -BB29_23: - setp.eq.s32 %p18, %r7, 11; - @%p18 bra BB29_42; +BB29_29: + setp.eq.s32 %p18, %r11, 11; + @%p18 bra BB29_48; - setp.eq.s32 %p19, %r7, 12; - @%p19 bra BB29_41; - bra.uni BB29_25; + setp.eq.s32 %p19, %r11, 12; + @%p19 bra BB29_47; + bra.uni BB29_31; -BB29_41: - max.f32 %f133, %f1, %f2; - bra.uni BB29_62; +BB29_47: + max.f32 %f134, %f1, %f2; + bra.uni BB29_68; -BB29_14: - setp.eq.s32 %p25, %r7, 6; - @%p25 bra BB29_45; +BB29_20: + setp.eq.s32 %p25, %r11, 6; + @%p25 bra BB29_51; - setp.eq.s32 %p26, %r7, 7; - @%p26 bra BB29_44; - bra.uni BB29_16; + setp.eq.s32 %p26, %r11, 7; + @%p26 bra BB29_50; + bra.uni BB29_22; -BB29_44: +BB29_50: setp.gt.f32 %p46, %f1, %f2; - selp.f32 %f133, 0f3F800000, 0f00000000, %p46; - bra.uni BB29_62; + selp.f32 %f134, 0f3F800000, 0f00000000, %p46; + bra.uni BB29_68; -BB29_31: - setp.eq.s32 %p12, %r7, 16; - @%p12 bra BB29_39; +BB29_37: + setp.eq.s32 %p12, %r11, 16; + @%p12 bra BB29_45; - setp.eq.s32 %p13, %r7, 17; - @%p13 bra BB29_36; - bra.uni BB29_33; + setp.eq.s32 %p13, %r11, 17; + @%p13 bra BB29_42; + bra.uni BB29_39; -BB29_36: +BB29_42: setp.eq.f32 %p36, %f2, 0f00000000; setp.eq.f32 %p37, %f2, 0f80000000; or.pred %p38, %p36, %p37; - mov.f32 %f133, 0f7FC00000; - @%p38 bra BB29_62; + mov.f32 %f134, 0f7FC00000; + @%p38 bra BB29_68; - div.rn.f32 %f133, %f1, %f2; - abs.f32 %f43, %f133; + div.rn.f32 %f134, %f1, %f2; + abs.f32 %f43, %f134; setp.geu.f32 %p39, %f43, 0f7F800000; - @%p39 bra BB29_62; + @%p39 bra BB29_68; - cvt.rmi.f32.f32 %f44, %f133; + cvt.rmi.f32.f32 %f44, %f134; mul.f32 %f45, %f2, %f44; - sub.f32 %f133, %f1, %f45; - bra.uni BB29_62; + sub.f32 %f134, %f1, %f45; + bra.uni BB29_68; -BB29_5: - setp.eq.s32 %p34, %r7, 1; - @%p34 bra BB29_6; - bra.uni BB29_62; +BB29_11: + setp.eq.s32 %p34, %r11, 1; + @%p34 bra BB29_12; + bra.uni BB29_68; -BB29_6: - sub.f32 %f133, %f1, %f2; - bra.uni BB29_62; +BB29_12: + sub.f32 %f134, %f1, %f2; + bra.uni BB29_68; -BB29_21: - setp.eq.s32 %p22, %r7, 10; - @%p22 bra BB29_22; - bra.uni BB29_62; +BB29_27: + setp.eq.s32 %p22, %r11, 10; + @%p22 bra BB29_28; + bra.uni BB29_68; -BB29_22: +BB29_28: setp.neu.f32 %p43, %f1, %f2; - selp.f32 %f133, 0f3F800000, 0f00000000, %p43; - bra.uni BB29_62; + selp.f32 %f134, 0f3F800000, 0f00000000, %p43; + bra.uni BB29_68; -BB29_12: - setp.eq.s32 %p29, %r7, 5; - @%p29 bra BB29_13; - bra.uni BB29_62; +BB29_18: + setp.eq.s32 %p29, %r11, 5; + @%p29 bra BB29_19; + bra.uni BB29_68; -BB29_13: +BB29_19: setp.lt.f32 %p48, %f1, %f2; - selp.f32 %f133, 0f3F800000, 0f00000000, %p48; - bra.uni BB29_62; + selp.f32 %f134, 0f3F800000, 0f00000000, %p48; + bra.uni BB29_68; -BB29_29: - setp.eq.s32 %p16, %r7, 15; - @%p16 bra BB29_30; - bra.uni BB29_62; +BB29_35: + setp.eq.s32 %p16, %r11, 15; + @%p16 bra BB29_36; + bra.uni BB29_68; -BB29_30: +BB29_36: mul.f32 %f47, %f1, %f2; mov.f32 %f48, 0f3F800000; - sub.f32 %f133, %f48, %f47; - bra.uni BB29_62; + sub.f32 %f134, %f48, %f47; + bra.uni BB29_68; -BB29_8: - setp.eq.s32 %p32, %r7, 3; - @%p32 bra BB29_9; - bra.uni BB29_62; +BB29_14: + setp.eq.s32 %p32, %r11, 3; + @%p32 bra BB29_15; + bra.uni BB29_68; -BB29_9: - div.rn.f32 %f133, %f1, %f2; - bra.uni BB29_62; +BB29_15: + div.rn.f32 %f134, %f1, %f2; + bra.uni BB29_68; -BB29_42: - min.f32 %f133, %f1, %f2; - bra.uni BB29_62; +BB29_48: + min.f32 %f134, %f1, %f2; + bra.uni BB29_68; -BB29_25: - setp.eq.s32 %p20, %r7, 13; - @%p20 bra BB29_26; - bra.uni BB29_62; +BB29_31: + setp.eq.s32 %p20, %r11, 13; + @%p20 bra BB29_32; + bra.uni BB29_68; -BB29_26: +BB29_32: cvt.rni.s64.f32 %rd12, %f1; - cvt.rni.s64.f32 %rd13, %f2; cvt.u32.u64 %r20, %rd12; + cvt.rni.s64.f32 %rd13, %f2; cvt.u32.u64 %r21, %rd13; and.b32 %r22, %r21, %r20; setp.eq.s32 %p42, %r22, 0; - selp.f32 %f133, 0f00000000, 0f3F800000, %p42; - bra.uni BB29_62; + selp.f32 %f134, 0f00000000, 0f3F800000, %p42; + bra.uni BB29_68; -BB29_45: - setp.le.f32 %p47, %f1, %f2; - selp.f32 %f133, 0f3F800000, 0f00000000, %p47; - bra.uni BB29_62; +BB29_51: + setp.gtu.f32 %p47, %f1, %f2; + selp.f32 %f134, 0f00000000, 0f3F800000, %p47; + bra.uni BB29_68; -BB29_16: - setp.eq.s32 %p27, %r7, 8; - @%p27 bra BB29_17; - bra.uni BB29_62; +BB29_22: + setp.eq.s32 %p27, %r11, 8; + @%p27 bra BB29_23; + bra.uni BB29_68; -BB29_17: - setp.ge.f32 %p45, %f1, %f2; - selp.f32 %f133, 0f3F800000, 0f00000000, %p45; - bra.uni BB29_62; +BB29_23: + setp.ltu.f32 %p45, %f1, %f2; + selp.f32 %f134, 0f00000000, 0f3F800000, %p45; + bra.uni BB29_68; -BB29_39: +BB29_45: setp.neu.f32 %p40, %f1, 0f00000000; sub.f32 %f46, %f1, %f2; - selp.f32 %f133, %f46, 0f00000000, %p40; - bra.uni BB29_62; + selp.f32 %f134, %f46, 0f00000000, %p40; + bra.uni BB29_68; -BB29_33: - setp.ne.s32 %p14, %r7, 18; - @%p14 bra BB29_62; +BB29_39: + setp.ne.s32 %p14, %r11, 18; + @%p14 bra BB29_68; - div.rn.f32 %f133, %f1, %f2; - abs.f32 %f41, %f133; + div.rn.f32 %f134, %f1, %f2; + abs.f32 %f41, %f134; setp.geu.f32 %p35, %f41, 0f7F800000; - @%p35 bra BB29_62; + @%p35 bra BB29_68; - cvt.rmi.f32.f32 %f133, %f133; - bra.uni BB29_62; + cvt.rmi.f32.f32 %f134, %f134; + bra.uni BB29_68; -BB29_49: +BB29_55: setp.geu.f32 %p59, %f1, 0f00000000; - @%p59 bra BB29_52; + @%p59 bra BB29_58; cvt.rzi.f32.f32 %f127, %f2; setp.neu.f32 %p60, %f127, %f2; - selp.f32 %f132, 0f7FFFFFFF, %f132, %p60; + selp.f32 %f133, 0f7FFFFFFF, %f133, %p60; -BB29_52: +BB29_58: add.f32 %f129, %f20, %f21; mov.b32 %r35, %f129; setp.lt.s32 %p63, %r35, 2139095040; - @%p63 bra BB29_59; + @%p63 bra BB29_65; setp.gtu.f32 %p64, %f20, 0f7F800000; setp.gtu.f32 %p65, %f21, 0f7F800000; or.pred %p66, %p64, %p65; - @%p66 bra BB29_58; - bra.uni BB29_54; + @%p66 bra BB29_64; + bra.uni BB29_60; -BB29_58: - add.f32 %f132, %f1, %f2; - bra.uni BB29_59; +BB29_64: + add.f32 %f133, %f1, %f2; + bra.uni BB29_65; -BB29_54: +BB29_60: setp.eq.f32 %p67, %f21, 0f7F800000; - @%p67 bra BB29_57; - bra.uni BB29_55; + @%p67 bra BB29_63; + bra.uni BB29_61; -BB29_57: +BB29_63: setp.gt.f32 %p70, %f20, 0f3F800000; selp.b32 %r39, 2139095040, 0, %p70; xor.b32 %r40, %r39, 2139095040; @@ -2857,33 +2861,33 @@ BB29_57: selp.b32 %r41, %r40, %r39, %p71; mov.b32 %f130, %r41; setp.eq.f32 %p72, %f1, 0fBF800000; - selp.f32 %f132, 0f3F800000, %f130, %p72; - bra.uni BB29_59; + selp.f32 %f133, 0f3F800000, %f130, %p72; + bra.uni BB29_65; -BB29_55: +BB29_61: setp.neu.f32 %p68, %f20, 0f7F800000; - @%p68 bra BB29_59; + @%p68 bra BB29_65; - setp.ge.f32 %p69, %f2, 0f00000000; - selp.b32 %r36, 2139095040, 0, %p69; + setp.ltu.f32 %p69, %f2, 0f00000000; + selp.b32 %r36, 0, 2139095040, %p69; or.b32 %r37, %r36, -2147483648; selp.b32 %r38, %r37, %r36, %p1; - mov.b32 %f132, %r38; + mov.b32 %f133, %r38; -BB29_59: +BB29_65: setp.eq.f32 %p73, %f2, 0f00000000; setp.eq.f32 %p74, %f1, 0f3F800000; or.pred %p75, %p74, %p73; - selp.f32 %f133, 0f3F800000, %f132, %p75; + selp.f32 %f134, 0f3F800000, %f133, %p75; -BB29_62: +BB29_68: cvta.to.global.u64 %rd14, %rd3; mul.wide.s32 %rd15, %r3, 4; add.s64 %rd16, %rd14, %rd15; - st.global.f32 [%rd16], %f133; + st.global.f32 [%rd16], %f134; bar.sync 0; -BB29_63: +BB29_69: ret; } @@ -2897,9 +2901,9 @@ BB29_63: .param .u32 matrix_scalar_op_d_param_5 ) { - .reg .pred %p<141>; - .reg .b32 %r<86>; - .reg .f64 %fd<107>; + .reg .pred %p<133>; + .reg .b32 %r<88>; + .reg .f64 %fd<99>; .reg .b64 %rd<20>; @@ -2914,7 +2918,7 @@ BB29_63: mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; setp.ge.s32 %p3, %r1, %r8; - @%p3 bra BB30_130; + @%p3 bra BB30_142; cvta.to.global.u64 %rd6, %rd5; cvta.to.global.u64 %rd7, %rd4; @@ -2923,9 +2927,9 @@ BB29_63: ld.global.f64 %fd1, [%rd9]; add.s64 %rd1, %rd6, %rd8; setp.eq.s32 %p4, %r7, 0; - @%p4 bra BB30_66; + @%p4 bra BB30_72; - mov.f64 %fd98, 0d7FEFFFFFFFFFFFFF; + mov.f64 %fd94, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p5, %r6, 8; @%p5 bra BB30_19; @@ -2936,31 +2940,31 @@ BB29_63: @%p26 bra BB30_8; setp.eq.s32 %p29, %r6, 0; - @%p29 bra BB30_64; + @%p29 bra BB30_70; bra.uni BB30_6; -BB30_64: - add.f64 %fd98, %fd1, %fd68; - bra.uni BB30_65; +BB30_70: + add.f64 %fd94, %fd1, %fd68; + bra.uni BB30_71; -BB30_66: - mov.f64 %fd106, 0d7FEFFFFFFFFFFFFF; - setp.gt.s32 %p73, %r6, 8; - @%p73 bra BB30_83; +BB30_72: + mov.f64 %fd98, 0d7FEFFFFFFFFFFFFF; + setp.gt.s32 %p69, %r6, 8; + @%p69 bra BB30_89; - setp.gt.s32 %p87, %r6, 3; - @%p87 bra BB30_75; + setp.gt.s32 %p83, %r6, 3; + @%p83 bra BB30_81; - setp.gt.s32 %p94, %r6, 1; - @%p94 bra BB30_72; + setp.gt.s32 %p90, %r6, 1; + @%p90 bra BB30_78; - setp.eq.s32 %p97, %r6, 0; - @%p97 bra BB30_128; - bra.uni BB30_70; + setp.eq.s32 %p93, %r6, 0; + @%p93 bra BB30_140; + bra.uni BB30_76; -BB30_128: - add.f64 %fd106, %fd1, %fd68; - bra.uni BB30_129; +BB30_140: + add.f64 %fd98, %fd1, %fd68; + bra.uni BB30_141; BB30_19: setp.gt.s32 %p6, %r6, 13; @@ -2970,39 +2974,39 @@ BB30_19: @%p13 bra BB30_24; setp.eq.s32 %p17, %r6, 9; - @%p17 bra BB30_46; + @%p17 bra BB30_48; bra.uni BB30_22; -BB30_46: - setp.eq.f64 %p46, %fd1, %fd68; - selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p46; - bra.uni BB30_65; +BB30_48: + setp.eq.f64 %p44, %fd1, %fd68; + selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p44; + bra.uni BB30_71; -BB30_83: - setp.gt.s32 %p74, %r6, 13; - @%p74 bra BB30_92; +BB30_89: + setp.gt.s32 %p70, %r6, 13; + @%p70 bra BB30_98; - setp.gt.s32 %p81, %r6, 10; - @%p81 bra BB30_88; + setp.gt.s32 %p77, %r6, 10; + @%p77 bra BB30_94; - setp.eq.s32 %p85, %r6, 9; - @%p85 bra BB30_110; - bra.uni BB30_86; + setp.eq.s32 %p81, %r6, 9; + @%p81 bra BB30_118; + bra.uni BB30_92; -BB30_110: - setp.eq.f64 %p114, %fd1, %fd68; - selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p114; - bra.uni BB30_129; +BB30_118: + setp.eq.f64 %p108, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p108; + bra.uni BB30_141; BB30_11: setp.gt.s32 %p20, %r6, 5; @%p20 bra BB30_15; setp.eq.s32 %p24, %r6, 4; - @%p24 bra BB30_49; + @%p24 bra BB30_51; bra.uni BB30_13; -BB30_49: +BB30_51: { .reg .b32 %temp; mov.b64 {%temp, %r2}, %fd68; @@ -3015,7 +3019,7 @@ BB30_49: add.s32 %r25, %r24, -1012; mov.b64 %rd14, %fd1; shl.b64 %rd2, %rd14, %r25; - setp.eq.s64 %p51, %rd2, -9223372036854775808; + setp.eq.s64 %p49, %rd2, -9223372036854775808; abs.f64 %fd18, %fd68; // Callseq Start 1 { @@ -3032,69 +3036,68 @@ BB30_49: param0, param1 ); - ld.param.f64 %fd97, [retval0+0]; + ld.param.f64 %fd24, [retval0+0]; //{ }// Callseq End 1 - setp.lt.s32 %p52, %r2, 0; - and.pred %p1, %p52, %p51; - @!%p1 bra BB30_51; - bra.uni BB30_50; + setp.lt.s32 %p50, %r2, 0; + and.pred %p1, %p50, %p49; + @!%p1 bra BB30_53; + bra.uni BB30_52; -BB30_50: +BB30_52: { .reg .b32 %temp; - mov.b64 {%temp, %r26}, %fd97; + mov.b64 {%temp, %r26}, %fd24; } xor.b32 %r27, %r26, -2147483648; { .reg .b32 %temp; - mov.b64 {%r28, %temp}, %fd97; + mov.b64 {%r28, %temp}, %fd24; } - mov.b64 %fd97, {%r28, %r27}; + mov.b64 %fd24, {%r28, %r27}; -BB30_51: - mov.f64 %fd96, %fd97; - setp.eq.f64 %p53, %fd68, 0d0000000000000000; - @%p53 bra BB30_54; - bra.uni BB30_52; +BB30_53: + setp.eq.f64 %p51, %fd68, 0d0000000000000000; + @%p51 bra BB30_56; + bra.uni BB30_54; -BB30_54: - selp.b32 %r29, %r2, 0, %p51; +BB30_56: + selp.b32 %r29, %r2, 0, %p49; or.b32 %r30, %r29, 2146435072; - setp.lt.s32 %p57, %r3, 0; - selp.b32 %r31, %r30, %r29, %p57; + setp.lt.s32 %p55, %r3, 0; + selp.b32 %r31, %r30, %r29, %p55; mov.u32 %r32, 0; - mov.b64 %fd96, {%r32, %r31}; - bra.uni BB30_55; + mov.b64 %fd24, {%r32, %r31}; + bra.uni BB30_57; BB30_28: setp.gt.s32 %p7, %r6, 15; @%p7 bra BB30_32; setp.eq.s32 %p11, %r6, 14; - @%p11 bra BB30_43; + @%p11 bra BB30_45; bra.uni BB30_30; -BB30_43: +BB30_45: cvt.rni.s64.f64 %rd10, %fd68; - cvt.rni.s64.f64 %rd11, %fd1; cvt.u32.u64 %r18, %rd10; + cvt.rni.s64.f64 %rd11, %fd1; cvt.u32.u64 %r19, %rd11; or.b32 %r20, %r19, %r18; - setp.eq.s32 %p43, %r20, 0; - selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p43; - bra.uni BB30_65; + setp.eq.s32 %p41, %r20, 0; + selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p41; + bra.uni BB30_71; -BB30_75: - setp.gt.s32 %p88, %r6, 5; - @%p88 bra BB30_79; +BB30_81: + setp.gt.s32 %p84, %r6, 5; + @%p84 bra BB30_85; - setp.eq.s32 %p92, %r6, 4; - @%p92 bra BB30_113; - bra.uni BB30_77; + setp.eq.s32 %p88, %r6, 4; + @%p88 bra BB30_121; + bra.uni BB30_83; -BB30_113: +BB30_121: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; @@ -3103,11 +3106,11 @@ BB30_113: .reg .b32 %temp; mov.b64 {%temp, %r5}, %fd68; } - bfe.u32 %r61, %r5, 20, 11; - add.s32 %r62, %r61, -1012; + bfe.u32 %r62, %r5, 20, 11; + add.s32 %r63, %r62, -1012; mov.b64 %rd19, %fd68; - shl.b64 %rd3, %rd19, %r62; - setp.eq.s64 %p119, %rd3, -9223372036854775808; + shl.b64 %rd3, %rd19, %r63; + setp.eq.s64 %p113, %rd3, -9223372036854775808; abs.f64 %fd51, %fd1; // Callseq Start 2 { @@ -3124,614 +3127,619 @@ BB30_113: param0, param1 ); - ld.param.f64 %fd105, [retval0+0]; + ld.param.f64 %fd57, [retval0+0]; //{ }// Callseq End 2 - setp.lt.s32 %p120, %r4, 0; - and.pred %p2, %p120, %p119; - @!%p2 bra BB30_115; - bra.uni BB30_114; + setp.lt.s32 %p114, %r4, 0; + and.pred %p2, %p114, %p113; + @!%p2 bra BB30_123; + bra.uni BB30_122; -BB30_114: +BB30_122: { .reg .b32 %temp; - mov.b64 {%temp, %r63}, %fd105; + mov.b64 {%temp, %r64}, %fd57; } - xor.b32 %r64, %r63, -2147483648; + xor.b32 %r65, %r64, -2147483648; { .reg .b32 %temp; - mov.b64 {%r65, %temp}, %fd105; + mov.b64 {%r66, %temp}, %fd57; } - mov.b64 %fd105, {%r65, %r64}; + mov.b64 %fd57, {%r66, %r65}; -BB30_115: - mov.f64 %fd104, %fd105; - setp.eq.f64 %p121, %fd1, 0d0000000000000000; - @%p121 bra BB30_118; - bra.uni BB30_116; +BB30_123: + setp.eq.f64 %p115, %fd1, 0d0000000000000000; + @%p115 bra BB30_126; + bra.uni BB30_124; -BB30_118: - selp.b32 %r66, %r4, 0, %p119; - or.b32 %r67, %r66, 2146435072; - setp.lt.s32 %p125, %r5, 0; - selp.b32 %r68, %r67, %r66, %p125; - mov.u32 %r69, 0; - mov.b64 %fd104, {%r69, %r68}; - bra.uni BB30_119; +BB30_126: + selp.b32 %r67, %r4, 0, %p113; + or.b32 %r68, %r67, 2146435072; + setp.lt.s32 %p119, %r5, 0; + selp.b32 %r69, %r68, %r67, %p119; + mov.u32 %r70, 0; + mov.b64 %fd57, {%r70, %r69}; + bra.uni BB30_127; -BB30_92: - setp.gt.s32 %p75, %r6, 15; - @%p75 bra BB30_96; +BB30_98: + setp.gt.s32 %p71, %r6, 15; + @%p71 bra BB30_102; - setp.eq.s32 %p79, %r6, 14; - @%p79 bra BB30_107; - bra.uni BB30_94; + setp.eq.s32 %p75, %r6, 14; + @%p75 bra BB30_115; + bra.uni BB30_100; -BB30_107: +BB30_115: cvt.rni.s64.f64 %rd15, %fd1; + cvt.u32.u64 %r56, %rd15; cvt.rni.s64.f64 %rd16, %fd68; - cvt.u32.u64 %r55, %rd15; - cvt.u32.u64 %r56, %rd16; - or.b32 %r57, %r56, %r55; - setp.eq.s32 %p111, %r57, 0; - selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p111; - bra.uni BB30_129; + cvt.u32.u64 %r57, %rd16; + or.b32 %r58, %r57, %r56; + setp.eq.s32 %p105, %r58, 0; + selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p105; + bra.uni BB30_141; BB30_8: setp.eq.s32 %p27, %r6, 2; - @%p27 bra BB30_63; + @%p27 bra BB30_69; bra.uni BB30_9; -BB30_63: - mul.f64 %fd98, %fd1, %fd68; - bra.uni BB30_65; +BB30_69: + mul.f64 %fd94, %fd1, %fd68; + bra.uni BB30_71; BB30_24: setp.eq.s32 %p14, %r6, 11; - @%p14 bra BB30_45; + @%p14 bra BB30_47; setp.eq.s32 %p15, %r6, 12; - @%p15 bra BB30_44; + @%p15 bra BB30_46; bra.uni BB30_26; -BB30_44: - max.f64 %fd98, %fd68, %fd1; - bra.uni BB30_65; +BB30_46: + max.f64 %fd94, %fd68, %fd1; + bra.uni BB30_71; BB30_15: setp.eq.s32 %p21, %r6, 6; - @%p21 bra BB30_48; + @%p21 bra BB30_50; setp.eq.s32 %p22, %r6, 7; - @%p22 bra BB30_47; + @%p22 bra BB30_49; bra.uni BB30_17; -BB30_47: - setp.lt.f64 %p48, %fd1, %fd68; - selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p48; - bra.uni BB30_65; +BB30_49: + setp.lt.f64 %p46, %fd1, %fd68; + selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p46; + bra.uni BB30_71; BB30_32: setp.eq.s32 %p8, %r6, 16; - @%p8 bra BB30_42; + @%p8 bra BB30_44; setp.eq.s32 %p9, %r6, 17; - @%p9 bra BB30_38; + @%p9 bra BB30_39; bra.uni BB30_34; -BB30_38: - setp.eq.f64 %p35, %fd1, 0d0000000000000000; - setp.eq.f64 %p36, %fd1, 0d8000000000000000; - or.pred %p37, %p35, %p36; - mov.f64 %fd98, 0d7FF8000000000000; - @%p37 bra BB30_65; +BB30_39: + setp.eq.f64 %p34, %fd1, 0d0000000000000000; + setp.eq.f64 %p35, %fd1, 0d8000000000000000; + or.pred %p36, %p34, %p35; + mov.f64 %fd94, 0d7FF8000000000000; + @%p36 bra BB30_71; - div.rn.f64 %fd98, %fd68, %fd1; - abs.f64 %fd72, %fd98; - setp.gtu.f64 %p38, %fd72, 0d7FF0000000000000; - @%p38 bra BB30_65; + div.rn.f64 %fd94, %fd68, %fd1; + abs.f64 %fd72, %fd94; + setp.gtu.f64 %p37, %fd72, 0d7FF0000000000000; + @%p37 bra BB30_71; { .reg .b32 %temp; - mov.b64 {%r15, %temp}, %fd98; + mov.b64 {%temp, %r15}, %fd94; } + and.b32 %r16, %r15, 2147483647; + setp.ne.s32 %p38, %r16, 2146435072; + @%p38 bra BB30_43; + { .reg .b32 %temp; - mov.b64 {%temp, %r16}, %fd98; + mov.b64 {%r17, %temp}, %fd94; } - and.b32 %r17, %r16, 2147483647; - setp.ne.s32 %p39, %r17, 2146435072; - setp.ne.s32 %p40, %r15, 0; - or.pred %p41, %p39, %p40; - @!%p41 bra BB30_65; - bra.uni BB30_41; + setp.eq.s32 %p39, %r17, 0; + @%p39 bra BB30_71; -BB30_41: - cvt.rmi.f64.f64 %fd73, %fd98; +BB30_43: + cvt.rmi.f64.f64 %fd73, %fd94; mul.f64 %fd74, %fd1, %fd73; - sub.f64 %fd98, %fd68, %fd74; - bra.uni BB30_65; + sub.f64 %fd94, %fd68, %fd74; + bra.uni BB30_71; -BB30_72: - setp.eq.s32 %p95, %r6, 2; - @%p95 bra BB30_127; - bra.uni BB30_73; +BB30_78: + setp.eq.s32 %p91, %r6, 2; + @%p91 bra BB30_139; + bra.uni BB30_79; -BB30_127: - mul.f64 %fd106, %fd1, %fd68; - bra.uni BB30_129; +BB30_139: + mul.f64 %fd98, %fd1, %fd68; + bra.uni BB30_141; -BB30_88: - setp.eq.s32 %p82, %r6, 11; - @%p82 bra BB30_109; +BB30_94: + setp.eq.s32 %p78, %r6, 11; + @%p78 bra BB30_117; - setp.eq.s32 %p83, %r6, 12; - @%p83 bra BB30_108; - bra.uni BB30_90; + setp.eq.s32 %p79, %r6, 12; + @%p79 bra BB30_116; + bra.uni BB30_96; -BB30_108: - max.f64 %fd106, %fd1, %fd68; - bra.uni BB30_129; +BB30_116: + max.f64 %fd98, %fd1, %fd68; + bra.uni BB30_141; -BB30_79: - setp.eq.s32 %p89, %r6, 6; - @%p89 bra BB30_112; +BB30_85: + setp.eq.s32 %p85, %r6, 6; + @%p85 bra BB30_120; - setp.eq.s32 %p90, %r6, 7; - @%p90 bra BB30_111; - bra.uni BB30_81; + setp.eq.s32 %p86, %r6, 7; + @%p86 bra BB30_119; + bra.uni BB30_87; -BB30_111: - setp.gt.f64 %p116, %fd1, %fd68; - selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p116; - bra.uni BB30_129; +BB30_119: + setp.gt.f64 %p110, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p110; + bra.uni BB30_141; -BB30_96: - setp.eq.s32 %p76, %r6, 16; - @%p76 bra BB30_106; +BB30_102: + setp.eq.s32 %p72, %r6, 16; + @%p72 bra BB30_114; - setp.eq.s32 %p77, %r6, 17; - @%p77 bra BB30_102; - bra.uni BB30_98; + setp.eq.s32 %p73, %r6, 17; + @%p73 bra BB30_109; + bra.uni BB30_104; -BB30_102: - setp.eq.f64 %p103, %fd68, 0d0000000000000000; - setp.eq.f64 %p104, %fd68, 0d8000000000000000; - or.pred %p105, %p103, %p104; - mov.f64 %fd106, 0d7FF8000000000000; - @%p105 bra BB30_129; +BB30_109: + setp.eq.f64 %p98, %fd68, 0d0000000000000000; + setp.eq.f64 %p99, %fd68, 0d8000000000000000; + or.pred %p100, %p98, %p99; + mov.f64 %fd98, 0d7FF8000000000000; + @%p100 bra BB30_141; - div.rn.f64 %fd106, %fd1, %fd68; - abs.f64 %fd83, %fd106; - setp.gtu.f64 %p106, %fd83, 0d7FF0000000000000; - @%p106 bra BB30_129; + div.rn.f64 %fd98, %fd1, %fd68; + abs.f64 %fd83, %fd98; + setp.gtu.f64 %p101, %fd83, 0d7FF0000000000000; + @%p101 bra BB30_141; { .reg .b32 %temp; - mov.b64 {%r52, %temp}, %fd106; + mov.b64 {%temp, %r53}, %fd98; } + and.b32 %r54, %r53, 2147483647; + setp.ne.s32 %p102, %r54, 2146435072; + @%p102 bra BB30_113; + { .reg .b32 %temp; - mov.b64 {%temp, %r53}, %fd106; + mov.b64 {%r55, %temp}, %fd98; } - and.b32 %r54, %r53, 2147483647; - setp.ne.s32 %p107, %r54, 2146435072; - setp.ne.s32 %p108, %r52, 0; - or.pred %p109, %p107, %p108; - @!%p109 bra BB30_129; - bra.uni BB30_105; - -BB30_105: - cvt.rmi.f64.f64 %fd84, %fd106; + setp.eq.s32 %p103, %r55, 0; + @%p103 bra BB30_141; + +BB30_113: + cvt.rmi.f64.f64 %fd84, %fd98; mul.f64 %fd85, %fd84, %fd68; - sub.f64 %fd106, %fd1, %fd85; - bra.uni BB30_129; + sub.f64 %fd98, %fd1, %fd85; + bra.uni BB30_141; BB30_6: setp.eq.s32 %p30, %r6, 1; @%p30 bra BB30_7; - bra.uni BB30_65; + bra.uni BB30_71; BB30_7: - sub.f64 %fd98, %fd68, %fd1; - bra.uni BB30_65; + sub.f64 %fd94, %fd68, %fd1; + bra.uni BB30_71; BB30_22: setp.eq.s32 %p18, %r6, 10; @%p18 bra BB30_23; - bra.uni BB30_65; + bra.uni BB30_71; BB30_23: - setp.neu.f64 %p45, %fd1, %fd68; - selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p45; - bra.uni BB30_65; + setp.neu.f64 %p43, %fd1, %fd68; + selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p43; + bra.uni BB30_71; BB30_13: setp.eq.s32 %p25, %r6, 5; @%p25 bra BB30_14; - bra.uni BB30_65; + bra.uni BB30_71; BB30_14: - setp.gt.f64 %p50, %fd1, %fd68; - selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p50; - bra.uni BB30_65; + setp.gt.f64 %p48, %fd1, %fd68; + selp.f64 %fd94, 0d3FF0000000000000, 0d0000000000000000, %p48; + bra.uni BB30_71; BB30_30: setp.eq.s32 %p12, %r6, 15; @%p12 bra BB30_31; - bra.uni BB30_65; + bra.uni BB30_71; BB30_31: mul.f64 %fd76, %fd1, %fd68; mov.f64 %fd77, 0d3FF0000000000000; - sub.f64 %fd98, %fd77, %fd76; - bra.uni BB30_65; + sub.f64 %fd94, %fd77, %fd76; + bra.uni BB30_71; BB30_9: setp.eq.s32 %p28, %r6, 3; @%p28 bra BB30_10; - bra.uni BB30_65; + bra.uni BB30_71; BB30_10: - div.rn.f64 %fd98, %fd68, %fd1; - bra.uni BB30_65; + div.rn.f64 %fd94, %fd68, %fd1; + bra.uni BB30_71; -BB30_45: - min.f64 %fd98, %fd68, %fd1; - bra.uni BB30_65; +BB30_47: + min.f64 %fd94, %fd68, %fd1; + bra.uni BB30_71; BB30_26: setp.eq.s32 %p16, %r6, 13; @%p16 bra BB30_27; - bra.uni BB30_65; + bra.uni BB30_71; BB30_27: cvt.rni.s64.f64 %rd12, %fd68; - cvt.rni.s64.f64 %rd13, %fd1; cvt.u32.u64 %r21, %rd12; + cvt.rni.s64.f64 %rd13, %fd1; cvt.u32.u64 %r22, %rd13; and.b32 %r23, %r22, %r21; - setp.eq.s32 %p44, %r23, 0; - selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p44; - bra.uni BB30_65; + setp.eq.s32 %p42, %r23, 0; + selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p42; + bra.uni BB30_71; -BB30_48: - setp.ge.f64 %p49, %fd1, %fd68; - selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p49; - bra.uni BB30_65; +BB30_50: + setp.ltu.f64 %p47, %fd1, %fd68; + selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p47; + bra.uni BB30_71; BB30_17: setp.eq.s32 %p23, %r6, 8; @%p23 bra BB30_18; - bra.uni BB30_65; + bra.uni BB30_71; BB30_18: - setp.le.f64 %p47, %fd1, %fd68; - selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p47; - bra.uni BB30_65; + setp.gtu.f64 %p45, %fd1, %fd68; + selp.f64 %fd94, 0d0000000000000000, 0d3FF0000000000000, %p45; + bra.uni BB30_71; -BB30_42: - setp.neu.f64 %p42, %fd68, 0d0000000000000000; +BB30_44: + setp.neu.f64 %p40, %fd68, 0d0000000000000000; sub.f64 %fd75, %fd68, %fd1; - selp.f64 %fd98, %fd75, 0d0000000000000000, %p42; - bra.uni BB30_65; + selp.f64 %fd94, %fd75, 0d0000000000000000, %p40; + bra.uni BB30_71; BB30_34: setp.ne.s32 %p10, %r6, 18; - @%p10 bra BB30_65; + @%p10 bra BB30_71; - div.rn.f64 %fd98, %fd68, %fd1; - abs.f64 %fd70, %fd98; + div.rn.f64 %fd94, %fd68, %fd1; + abs.f64 %fd70, %fd94; setp.gtu.f64 %p31, %fd70, 0d7FF0000000000000; - @%p31 bra BB30_65; + @%p31 bra BB30_71; { .reg .b32 %temp; - mov.b64 {%r12, %temp}, %fd98; + mov.b64 {%temp, %r12}, %fd94; } + and.b32 %r13, %r12, 2147483647; + setp.ne.s32 %p32, %r13, 2146435072; + @%p32 bra BB30_38; + { .reg .b32 %temp; - mov.b64 {%temp, %r13}, %fd98; + mov.b64 {%r14, %temp}, %fd94; } - and.b32 %r14, %r13, 2147483647; - setp.ne.s32 %p32, %r14, 2146435072; - setp.ne.s32 %p33, %r12, 0; - or.pred %p34, %p32, %p33; - @!%p34 bra BB30_65; - bra.uni BB30_37; + setp.eq.s32 %p33, %r14, 0; + @%p33 bra BB30_71; -BB30_37: - cvt.rmi.f64.f64 %fd98, %fd98; - bra.uni BB30_65; +BB30_38: + cvt.rmi.f64.f64 %fd94, %fd94; + bra.uni BB30_71; -BB30_70: - setp.eq.s32 %p98, %r6, 1; - @%p98 bra BB30_71; - bra.uni BB30_129; +BB30_76: + setp.eq.s32 %p94, %r6, 1; + @%p94 bra BB30_77; + bra.uni BB30_141; -BB30_71: - sub.f64 %fd106, %fd1, %fd68; - bra.uni BB30_129; +BB30_77: + sub.f64 %fd98, %fd1, %fd68; + bra.uni BB30_141; -BB30_86: - setp.eq.s32 %p86, %r6, 10; - @%p86 bra BB30_87; - bra.uni BB30_129; +BB30_92: + setp.eq.s32 %p82, %r6, 10; + @%p82 bra BB30_93; + bra.uni BB30_141; -BB30_87: - setp.neu.f64 %p113, %fd1, %fd68; - selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p113; - bra.uni BB30_129; +BB30_93: + setp.neu.f64 %p107, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p107; + bra.uni BB30_141; -BB30_77: - setp.eq.s32 %p93, %r6, 5; - @%p93 bra BB30_78; - bra.uni BB30_129; +BB30_83: + setp.eq.s32 %p89, %r6, 5; + @%p89 bra BB30_84; + bra.uni BB30_141; -BB30_78: - setp.lt.f64 %p118, %fd1, %fd68; - selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p118; - bra.uni BB30_129; +BB30_84: + setp.lt.f64 %p112, %fd1, %fd68; + selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p112; + bra.uni BB30_141; -BB30_94: - setp.eq.s32 %p80, %r6, 15; - @%p80 bra BB30_95; - bra.uni BB30_129; +BB30_100: + setp.eq.s32 %p76, %r6, 15; + @%p76 bra BB30_101; + bra.uni BB30_141; -BB30_95: +BB30_101: mul.f64 %fd87, %fd1, %fd68; mov.f64 %fd88, 0d3FF0000000000000; - sub.f64 %fd106, %fd88, %fd87; - bra.uni BB30_129; + sub.f64 %fd98, %fd88, %fd87; + bra.uni BB30_141; -BB30_73: - setp.eq.s32 %p96, %r6, 3; - @%p96 bra BB30_74; - bra.uni BB30_129; +BB30_79: + setp.eq.s32 %p92, %r6, 3; + @%p92 bra BB30_80; + bra.uni BB30_141; -BB30_74: - div.rn.f64 %fd106, %fd1, %fd68; - bra.uni BB30_129; +BB30_80: + div.rn.f64 %fd98, %fd1, %fd68; + bra.uni BB30_141; -BB30_109: - min.f64 %fd106, %fd1, %fd68; - bra.uni BB30_129; +BB30_117: + min.f64 %fd98, %fd1, %fd68; + bra.uni BB30_141; -BB30_90: - setp.eq.s32 %p84, %r6, 13; - @%p84 bra BB30_91; - bra.uni BB30_129; +BB30_96: + setp.eq.s32 %p80, %r6, 13; + @%p80 bra BB30_97; + bra.uni BB30_141; -BB30_91: +BB30_97: cvt.rni.s64.f64 %rd17, %fd1; + cvt.u32.u64 %r59, %rd17; cvt.rni.s64.f64 %rd18, %fd68; - cvt.u32.u64 %r58, %rd17; - cvt.u32.u64 %r59, %rd18; - and.b32 %r60, %r59, %r58; - setp.eq.s32 %p112, %r60, 0; - selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p112; - bra.uni BB30_129; - -BB30_112: - setp.le.f64 %p117, %fd1, %fd68; - selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p117; - bra.uni BB30_129; + cvt.u32.u64 %r60, %rd18; + and.b32 %r61, %r60, %r59; + setp.eq.s32 %p106, %r61, 0; + selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p106; + bra.uni BB30_141; -BB30_81: - setp.eq.s32 %p91, %r6, 8; - @%p91 bra BB30_82; - bra.uni BB30_129; +BB30_120: + setp.gtu.f64 %p111, %fd1, %fd68; + selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p111; + bra.uni BB30_141; + +BB30_87: + setp.eq.s32 %p87, %r6, 8; + @%p87 bra BB30_88; + bra.uni BB30_141; -BB30_82: - setp.ge.f64 %p115, %fd1, %fd68; - selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p115; - bra.uni BB30_129; +BB30_88: + setp.ltu.f64 %p109, %fd1, %fd68; + selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p109; + bra.uni BB30_141; -BB30_106: - setp.neu.f64 %p110, %fd1, 0d0000000000000000; +BB30_114: + setp.neu.f64 %p104, %fd1, 0d0000000000000000; sub.f64 %fd86, %fd1, %fd68; - selp.f64 %fd106, %fd86, 0d0000000000000000, %p110; - bra.uni BB30_129; + selp.f64 %fd98, %fd86, 0d0000000000000000, %p104; + bra.uni BB30_141; -BB30_98: - setp.ne.s32 %p78, %r6, 18; - @%p78 bra BB30_129; +BB30_104: + setp.ne.s32 %p74, %r6, 18; + @%p74 bra BB30_141; - div.rn.f64 %fd106, %fd1, %fd68; - abs.f64 %fd81, %fd106; - setp.gtu.f64 %p99, %fd81, 0d7FF0000000000000; - @%p99 bra BB30_129; + div.rn.f64 %fd98, %fd1, %fd68; + abs.f64 %fd81, %fd98; + setp.gtu.f64 %p95, %fd81, 0d7FF0000000000000; + @%p95 bra BB30_141; { .reg .b32 %temp; - mov.b64 {%r49, %temp}, %fd106; + mov.b64 {%temp, %r50}, %fd98; } + and.b32 %r51, %r50, 2147483647; + setp.ne.s32 %p96, %r51, 2146435072; + @%p96 bra BB30_108; + { .reg .b32 %temp; - mov.b64 {%temp, %r50}, %fd106; + mov.b64 {%r52, %temp}, %fd98; } - and.b32 %r51, %r50, 2147483647; - setp.ne.s32 %p100, %r51, 2146435072; - setp.ne.s32 %p101, %r49, 0; - or.pred %p102, %p100, %p101; - @!%p102 bra BB30_129; - bra.uni BB30_101; + setp.eq.s32 %p97, %r52, 0; + @%p97 bra BB30_141; -BB30_101: - cvt.rmi.f64.f64 %fd106, %fd106; - bra.uni BB30_129; +BB30_108: + cvt.rmi.f64.f64 %fd98, %fd98; + bra.uni BB30_141; -BB30_52: - setp.gt.s32 %p54, %r2, -1; - @%p54 bra BB30_55; +BB30_54: + setp.gt.s32 %p52, %r2, -1; + @%p52 bra BB30_57; cvt.rzi.f64.f64 %fd78, %fd1; - setp.neu.f64 %p55, %fd78, %fd1; - selp.f64 %fd96, 0dFFF8000000000000, %fd96, %p55; + setp.neu.f64 %p53, %fd78, %fd1; + selp.f64 %fd24, 0dFFF8000000000000, %fd24, %p53; -BB30_55: - mov.f64 %fd24, %fd96; - add.f64 %fd25, %fd1, %fd68; +BB30_57: + add.f64 %fd93, %fd1, %fd68; { .reg .b32 %temp; - mov.b64 {%temp, %r33}, %fd25; + mov.b64 {%temp, %r33}, %fd93; } and.b32 %r34, %r33, 2146435072; - setp.ne.s32 %p58, %r34, 2146435072; - mov.f64 %fd95, %fd24; - @%p58 bra BB30_62; + setp.ne.s32 %p56, %r34, 2146435072; + @%p56 bra BB30_58; - setp.gtu.f64 %p59, %fd18, 0d7FF0000000000000; - mov.f64 %fd95, %fd25; - @%p59 bra BB30_62; + setp.gtu.f64 %p57, %fd18, 0d7FF0000000000000; + @%p57 bra BB30_68; abs.f64 %fd79, %fd1; - setp.gtu.f64 %p60, %fd79, 0d7FF0000000000000; - mov.f64 %fd94, %fd25; - mov.f64 %fd95, %fd94; - @%p60 bra BB30_62; + setp.gtu.f64 %p58, %fd79, 0d7FF0000000000000; + @%p58 bra BB30_68; + + and.b32 %r35, %r3, 2147483647; + setp.ne.s32 %p59, %r35, 2146435072; + @%p59 bra BB30_63; { .reg .b32 %temp; - mov.b64 {%r35, %temp}, %fd1; - } - and.b32 %r36, %r3, 2147483647; - setp.eq.s32 %p61, %r36, 2146435072; - setp.eq.s32 %p62, %r35, 0; - and.pred %p63, %p61, %p62; - @%p63 bra BB30_61; - bra.uni BB30_59; - -BB30_61: - setp.gt.f64 %p67, %fd18, 0d3FF0000000000000; - selp.b32 %r44, 2146435072, 0, %p67; - xor.b32 %r45, %r44, 2146435072; - setp.lt.s32 %p68, %r3, 0; - selp.b32 %r46, %r45, %r44, %p68; - setp.eq.f64 %p69, %fd68, 0dBFF0000000000000; - selp.b32 %r47, 1072693248, %r46, %p69; - mov.u32 %r48, 0; - mov.b64 %fd95, {%r48, %r47}; - bra.uni BB30_62; - -BB30_116: - setp.gt.s32 %p122, %r4, -1; - @%p122 bra BB30_119; + mov.b64 {%r36, %temp}, %fd1; + } + setp.eq.s32 %p60, %r36, 0; + @%p60 bra BB30_67; - cvt.rzi.f64.f64 %fd89, %fd68; - setp.neu.f64 %p123, %fd89, %fd68; - selp.f64 %fd104, 0dFFF8000000000000, %fd104, %p123; +BB30_63: + and.b32 %r37, %r2, 2147483647; + setp.ne.s32 %p61, %r37, 2146435072; + @%p61 bra BB30_64; -BB30_119: - mov.f64 %fd57, %fd104; - add.f64 %fd58, %fd1, %fd68; { .reg .b32 %temp; - mov.b64 {%temp, %r70}, %fd58; + mov.b64 {%r38, %temp}, %fd68; } - and.b32 %r71, %r70, 2146435072; - setp.ne.s32 %p126, %r71, 2146435072; - mov.f64 %fd103, %fd57; - @%p126 bra BB30_126; + setp.ne.s32 %p62, %r38, 0; + mov.f64 %fd93, %fd24; + @%p62 bra BB30_68; - setp.gtu.f64 %p127, %fd51, 0d7FF0000000000000; - mov.f64 %fd103, %fd58; - @%p127 bra BB30_126; + shr.s32 %r39, %r3, 31; + and.b32 %r40, %r39, -2146435072; + add.s32 %r41, %r40, 2146435072; + or.b32 %r42, %r41, -2147483648; + selp.b32 %r43, %r42, %r41, %p1; + mov.u32 %r44, 0; + mov.b64 %fd93, {%r44, %r43}; + bra.uni BB30_68; - abs.f64 %fd90, %fd68; - setp.gtu.f64 %p128, %fd90, 0d7FF0000000000000; - mov.f64 %fd102, %fd58; - mov.f64 %fd103, %fd102; - @%p128 bra BB30_126; +BB30_58: + mov.f64 %fd93, %fd24; +BB30_68: + setp.eq.f64 %p66, %fd1, 0d0000000000000000; + setp.eq.f64 %p67, %fd68, 0d3FF0000000000000; + or.pred %p68, %p67, %p66; + selp.f64 %fd94, 0d3FF0000000000000, %fd93, %p68; + +BB30_71: + st.global.f64 [%rd1], %fd94; + bra.uni BB30_142; + +BB30_124: + setp.gt.s32 %p116, %r4, -1; + @%p116 bra BB30_127; + + cvt.rzi.f64.f64 %fd89, %fd68; + setp.neu.f64 %p117, %fd89, %fd68; + selp.f64 %fd57, 0dFFF8000000000000, %fd57, %p117; + +BB30_127: + add.f64 %fd97, %fd1, %fd68; { .reg .b32 %temp; - mov.b64 {%r72, %temp}, %fd68; + mov.b64 {%temp, %r71}, %fd97; } + and.b32 %r72, %r71, 2146435072; + setp.ne.s32 %p120, %r72, 2146435072; + @%p120 bra BB30_128; + + setp.gtu.f64 %p121, %fd51, 0d7FF0000000000000; + @%p121 bra BB30_138; + + abs.f64 %fd90, %fd68; + setp.gtu.f64 %p122, %fd90, 0d7FF0000000000000; + @%p122 bra BB30_138; + and.b32 %r73, %r5, 2147483647; - setp.eq.s32 %p129, %r73, 2146435072; - setp.eq.s32 %p130, %r72, 0; - and.pred %p131, %p129, %p130; - @%p131 bra BB30_125; - bra.uni BB30_123; - -BB30_125: - setp.gt.f64 %p135, %fd51, 0d3FF0000000000000; - selp.b32 %r81, 2146435072, 0, %p135; - xor.b32 %r82, %r81, 2146435072; - setp.lt.s32 %p136, %r5, 0; - selp.b32 %r83, %r82, %r81, %p136; - setp.eq.f64 %p137, %fd1, 0dBFF0000000000000; - selp.b32 %r84, 1072693248, %r83, %p137; - mov.u32 %r85, 0; - mov.b64 %fd103, {%r85, %r84}; - bra.uni BB30_126; - -BB30_59: + setp.ne.s32 %p123, %r73, 2146435072; + @%p123 bra BB30_133; + { .reg .b32 %temp; - mov.b64 {%r37, %temp}, %fd68; + mov.b64 {%r74, %temp}, %fd68; } - and.b32 %r38, %r2, 2147483647; - setp.eq.s32 %p64, %r38, 2146435072; - setp.eq.s32 %p65, %r37, 0; - and.pred %p66, %p64, %p65; - mov.f64 %fd95, %fd24; - @!%p66 bra BB30_62; - bra.uni BB30_60; - -BB30_60: - shr.s32 %r39, %r3, 31; - and.b32 %r40, %r39, -2146435072; - selp.b32 %r41, -1048576, 2146435072, %p1; - add.s32 %r42, %r41, %r40; - mov.u32 %r43, 0; - mov.b64 %fd95, {%r43, %r42}; - -BB30_62: - setp.eq.f64 %p70, %fd1, 0d0000000000000000; - setp.eq.f64 %p71, %fd68, 0d3FF0000000000000; - or.pred %p72, %p71, %p70; - selp.f64 %fd98, 0d3FF0000000000000, %fd95, %p72; + setp.eq.s32 %p124, %r74, 0; + @%p124 bra BB30_137; -BB30_65: - st.global.f64 [%rd1], %fd98; - bra.uni BB30_130; +BB30_133: + and.b32 %r75, %r4, 2147483647; + setp.ne.s32 %p125, %r75, 2146435072; + @%p125 bra BB30_134; -BB30_123: { .reg .b32 %temp; - mov.b64 {%r74, %temp}, %fd1; + mov.b64 {%r76, %temp}, %fd1; } - and.b32 %r75, %r4, 2147483647; - setp.eq.s32 %p132, %r75, 2146435072; - setp.eq.s32 %p133, %r74, 0; - and.pred %p134, %p132, %p133; - mov.f64 %fd103, %fd57; - @!%p134 bra BB30_126; - bra.uni BB30_124; + setp.ne.s32 %p126, %r76, 0; + mov.f64 %fd97, %fd57; + @%p126 bra BB30_138; -BB30_124: - shr.s32 %r76, %r5, 31; - and.b32 %r77, %r76, -2146435072; - selp.b32 %r78, -1048576, 2146435072, %p2; - add.s32 %r79, %r78, %r77; - mov.u32 %r80, 0; - mov.b64 %fd103, {%r80, %r79}; + shr.s32 %r77, %r5, 31; + and.b32 %r78, %r77, -2146435072; + add.s32 %r79, %r78, 2146435072; + or.b32 %r80, %r79, -2147483648; + selp.b32 %r81, %r80, %r79, %p2; + mov.u32 %r82, 0; + mov.b64 %fd97, {%r82, %r81}; + bra.uni BB30_138; -BB30_126: - setp.eq.f64 %p138, %fd68, 0d0000000000000000; - setp.eq.f64 %p139, %fd1, 0d3FF0000000000000; - or.pred %p140, %p139, %p138; - selp.f64 %fd106, 0d3FF0000000000000, %fd103, %p140; +BB30_128: + mov.f64 %fd97, %fd57; + +BB30_138: + setp.eq.f64 %p130, %fd68, 0d0000000000000000; + setp.eq.f64 %p131, %fd1, 0d3FF0000000000000; + or.pred %p132, %p131, %p130; + selp.f64 %fd98, 0d3FF0000000000000, %fd97, %p132; -BB30_129:
<TRUNCATED>
