http://git-wip-us.apache.org/repos/asf/systemml/blob/8e3c6f8b/src/main/cpp/kernels/SystemML.ptx ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx index c990f27..5f72887 100644 --- a/src/main/cpp/kernels/SystemML.ptx +++ b/src/main/cpp/kernels/SystemML.ptx @@ -103,6 +103,694 @@ BB1_2: ret; } + /* sparse_dense_im2col_d: each thread handles one index i below param_4. It loads the f64 at param_0[i], scans the 32-bit array param_1 for the first r whose next entry param_1[r+1] is greater than i, reads the 32-bit value param_2[i], and stores the f64 to zero or more computed element offsets of param_3. param_5 and param_6 are declared but never loaded. NOTE(review): the access pattern looks like CSR values, row pointers and column indices feeding an im2col layout, with the u32 params carrying shape, stride and padding - confirm against the higher-level kernel source. */ // .globl sparse_dense_im2col_d +.visible .entry sparse_dense_im2col_d( + .param .u64 sparse_dense_im2col_d_param_0, + .param .u64 sparse_dense_im2col_d_param_1, + .param .u64 sparse_dense_im2col_d_param_2, + .param .u64 sparse_dense_im2col_d_param_3, + .param .u32 sparse_dense_im2col_d_param_4, + .param .u32 sparse_dense_im2col_d_param_5, + .param .u32 sparse_dense_im2col_d_param_6, + .param .u32 sparse_dense_im2col_d_param_7, + .param .u32 sparse_dense_im2col_d_param_8, + .param .u32 sparse_dense_im2col_d_param_9, + .param .u32 sparse_dense_im2col_d_param_10, + .param .u32 sparse_dense_im2col_d_param_11, + .param .u32 sparse_dense_im2col_d_param_12, + .param .u32 sparse_dense_im2col_d_param_13, + .param .u32 sparse_dense_im2col_d_param_14, + .param .u32 sparse_dense_im2col_d_param_15, + .param .u32 sparse_dense_im2col_d_param_16, + .param .u32 sparse_dense_im2col_d_param_17, + .param .u32 sparse_dense_im2col_d_param_18, + .param .u32 sparse_dense_im2col_d_param_19 +) +{ + .reg .pred %p<13>; + .reg .b32 %r<74>; + .reg .f64 %fd<2>; + .reg .b64 %rd<18>; + + + ld.param.u64 %rd4, [sparse_dense_im2col_d_param_0]; + ld.param.u64 %rd5, [sparse_dense_im2col_d_param_1]; + ld.param.u64 %rd6, [sparse_dense_im2col_d_param_2]; + ld.param.u64 %rd7, [sparse_dense_im2col_d_param_3]; + ld.param.u32 %r35, [sparse_dense_im2col_d_param_4]; + ld.param.u32 %r22, [sparse_dense_im2col_d_param_7]; + ld.param.u32 %r23, [sparse_dense_im2col_d_param_8]; + ld.param.u32 %r24, [sparse_dense_im2col_d_param_9]; + ld.param.u32 %r25, [sparse_dense_im2col_d_param_10]; + ld.param.u32 %r26, 
[sparse_dense_im2col_d_param_11]; + ld.param.u32 %r27, [sparse_dense_im2col_d_param_12]; + ld.param.u32 %r28, [sparse_dense_im2col_d_param_13]; + ld.param.u32 %r29, [sparse_dense_im2col_d_param_14]; + ld.param.u32 %r30, [sparse_dense_im2col_d_param_15]; + ld.param.u32 %r31, [sparse_dense_im2col_d_param_16]; + ld.param.u32 %r32, [sparse_dense_im2col_d_param_17]; + ld.param.u32 %r33, [sparse_dense_im2col_d_param_18]; + ld.param.u32 %r34, [sparse_dense_im2col_d_param_19]; + /* r1 = ntid.x * ctaid.x + tid.x, the global thread index. Threads with r1 >= param_4 branch straight to the return. */ mov.u32 %r36, %ntid.x; + mov.u32 %r37, %ctaid.x; + mov.u32 %r38, %tid.x; + mad.lo.s32 %r1, %r36, %r37, %r38; + setp.ge.s32 %p1, %r1, %r35; + @%p1 bra BB2_11; + + cvta.to.global.u64 %rd1, %rd6; + cvta.to.global.u64 %rd2, %rd5; + cvta.to.global.u64 %rd8, %rd4; + cvt.s64.s32 %rd3, %r1; + mul.wide.s32 %rd9, %r1, 8; + add.s64 %rd10, %rd8, %rd9; + ld.global.f64 %fd1, [%rd10]; + mov.u32 %r67, 0; + /* Linear scan from 0: on exit r2 is the first index whose successor entry in param_1 is greater than r1. */ +BB2_2: + mov.u32 %r2, %r67; + add.s32 %r67, %r2, 1; + mul.wide.s32 %rd11, %r67, 4; + add.s64 %rd12, %rd2, %rd11; + ld.global.u32 %r40, [%rd12]; + setp.le.s32 %p2, %r40, %r1; + @%p2 bra BB2_2; + + /* r41 = param_2[r1]. It is split by div and rem with param_7 and then param_8 into r4, r43 and r44. */ shl.b64 %rd13, %rd3, 2; + add.s64 %rd14, %rd1, %rd13; + ld.global.u32 %r41, [%rd14]; + div.s32 %r4, %r41, %r22; + rem.s32 %r42, %r41, %r22; + div.s32 %r43, %r42, %r23; + rem.s32 %r44, %r42, %r23; + /* r5 = r43 + param_18 with window r70 = max(0, r5 + 1 - param_16 * param_11) up to r7 = min(param_9 - 1, r5). r8, r73 and r10 are built the same way from r44, param_19, param_17, param_12 and param_10. */ add.s32 %r5, %r43, %r33; + mul.lo.s32 %r45, %r31, %r26; + mov.u32 %r46, 1; + sub.s32 %r47, %r46, %r45; + add.s32 %r48, %r47, %r5; + mov.u32 %r49, 0; + max.s32 %r70, %r49, %r48; + add.s32 %r50, %r24, -1; + min.s32 %r7, %r50, %r5; + add.s32 %r8, %r44, %r34; + mul.lo.s32 %r51, %r32, %r27; + sub.s32 %r52, %r46, %r51; + add.s32 %r53, %r52, %r8; + max.s32 %r73, %r49, %r53; + add.s32 %r54, %r25, -1; + min.s32 %r10, %r54, %r8; + /* Step r69 up by 1 until (r5 - r69) is a multiple of param_16 or r69 exceeds r7. */ +BB2_4: + mov.u32 %r69, %r70; + sub.s32 %r55, %r5, %r69; + rem.s32 %r56, %r55, %r31; + setp.ne.s32 %p3, %r56, 0; + setp.le.s32 %p4, %r69, %r7; + and.pred %p5, %p3, %p4; + add.s32 %r70, %r69, 1; + @%p5 bra BB2_4; + /* Same alignment step for r13, using param_17 and the upper bound r10. */ +BB2_5: + mov.u32 %r13, %r73; + sub.s32 %r57, %r8, %r13; + rem.s32 %r58, %r57, %r32; + 
setp.ne.s32 %p6, %r58, 0; + setp.le.s32 %p7, %r13, %r10; + and.pred %p8, %p6, %p7; + add.s32 %r73, %r13, 1; + @%p8 bra BB2_5; + + /* Nothing to write when the first window is empty. */ setp.gt.s32 %p9, %r69, %r7; + @%p9 bra BB2_11; + + mul.lo.s32 %r15, %r2, %r28; + mul.lo.s32 %r16, %r4, %r29; + cvta.to.global.u64 %rd15, %rd7; + /* Outer loop advances r69 by param_16 up to r7, inner loop advances r19 by param_17 up to r10. Each inner iteration stores fd1 at element (r69 * param_10 + r16 + r19) * param_15 + r18 + (r8 - r19) / param_17 of param_3, where r18 = ((r5 - r69) / param_16) * param_12 + r15. */ +BB2_8: + sub.s32 %r59, %r5, %r69; + div.s32 %r60, %r59, %r31; + mad.lo.s32 %r18, %r60, %r27, %r15; + setp.gt.s32 %p10, %r13, %r10; + mov.u32 %r72, %r13; + @%p10 bra BB2_10; + +BB2_9: + mov.u32 %r19, %r72; + sub.s32 %r61, %r8, %r19; + div.s32 %r62, %r61, %r32; + mad.lo.s32 %r63, %r69, %r25, %r16; + add.s32 %r64, %r63, %r19; + mad.lo.s32 %r65, %r64, %r30, %r18; + add.s32 %r66, %r65, %r62; + mul.wide.s32 %rd16, %r66, 8; + add.s64 %rd17, %rd15, %rd16; + st.global.f64 [%rd17], %fd1; + add.s32 %r20, %r19, %r32; + setp.le.s32 %p11, %r20, %r10; + mov.u32 %r72, %r20; + @%p11 bra BB2_9; + +BB2_10: + add.s32 %r69, %r69, %r31; + setp.le.s32 %p12, %r69, %r7; + @%p12 bra BB2_8; + +BB2_11: + ret; +} + + /* sparse_dense_im2col_f: single-precision twin of sparse_dense_im2col_d. The index arithmetic and loops are the same. The value is loaded and stored as f32 and element offsets are scaled by 4 bytes instead of 8. */ // .globl sparse_dense_im2col_f +.visible .entry sparse_dense_im2col_f( + .param .u64 sparse_dense_im2col_f_param_0, + .param .u64 sparse_dense_im2col_f_param_1, + .param .u64 sparse_dense_im2col_f_param_2, + .param .u64 sparse_dense_im2col_f_param_3, + .param .u32 sparse_dense_im2col_f_param_4, + .param .u32 sparse_dense_im2col_f_param_5, + .param .u32 sparse_dense_im2col_f_param_6, + .param .u32 sparse_dense_im2col_f_param_7, + .param .u32 sparse_dense_im2col_f_param_8, + .param .u32 sparse_dense_im2col_f_param_9, + .param .u32 sparse_dense_im2col_f_param_10, + .param .u32 sparse_dense_im2col_f_param_11, + .param .u32 sparse_dense_im2col_f_param_12, + .param .u32 sparse_dense_im2col_f_param_13, + .param .u32 sparse_dense_im2col_f_param_14, + .param .u32 sparse_dense_im2col_f_param_15, + .param .u32 sparse_dense_im2col_f_param_16, + .param .u32 sparse_dense_im2col_f_param_17, + .param .u32 sparse_dense_im2col_f_param_18, + .param .u32 sparse_dense_im2col_f_param_19 +) +{ + .reg .pred %p<13>; + .reg .f32 
%f<2>; + .reg .b32 %r<74>; + .reg .b64 %rd<18>; + + + ld.param.u64 %rd4, [sparse_dense_im2col_f_param_0]; + ld.param.u64 %rd5, [sparse_dense_im2col_f_param_1]; + ld.param.u64 %rd6, [sparse_dense_im2col_f_param_2]; + ld.param.u64 %rd7, [sparse_dense_im2col_f_param_3]; + ld.param.u32 %r35, [sparse_dense_im2col_f_param_4]; + ld.param.u32 %r22, [sparse_dense_im2col_f_param_7]; + ld.param.u32 %r23, [sparse_dense_im2col_f_param_8]; + ld.param.u32 %r24, [sparse_dense_im2col_f_param_9]; + ld.param.u32 %r25, [sparse_dense_im2col_f_param_10]; + ld.param.u32 %r26, [sparse_dense_im2col_f_param_11]; + ld.param.u32 %r27, [sparse_dense_im2col_f_param_12]; + ld.param.u32 %r28, [sparse_dense_im2col_f_param_13]; + ld.param.u32 %r29, [sparse_dense_im2col_f_param_14]; + ld.param.u32 %r30, [sparse_dense_im2col_f_param_15]; + ld.param.u32 %r31, [sparse_dense_im2col_f_param_16]; + ld.param.u32 %r32, [sparse_dense_im2col_f_param_17]; + ld.param.u32 %r33, [sparse_dense_im2col_f_param_18]; + ld.param.u32 %r34, [sparse_dense_im2col_f_param_19]; + mov.u32 %r36, %ntid.x; + mov.u32 %r37, %ctaid.x; + mov.u32 %r38, %tid.x; + mad.lo.s32 %r1, %r36, %r37, %r38; + setp.ge.s32 %p1, %r1, %r35; + @%p1 bra BB3_11; + + cvta.to.global.u64 %rd1, %rd6; + cvta.to.global.u64 %rd2, %rd5; + cvta.to.global.u64 %rd8, %rd4; + cvt.s64.s32 %rd3, %r1; + mul.wide.s32 %rd9, %r1, 4; + add.s64 %rd10, %rd8, %rd9; + ld.global.f32 %f1, [%rd10]; + mov.u32 %r67, 0; + +BB3_2: + mov.u32 %r2, %r67; + add.s32 %r67, %r2, 1; + mul.wide.s32 %rd11, %r67, 4; + add.s64 %rd12, %rd2, %rd11; + ld.global.u32 %r40, [%rd12]; + setp.le.s32 %p2, %r40, %r1; + @%p2 bra BB3_2; + + shl.b64 %rd13, %rd3, 2; + add.s64 %rd14, %rd1, %rd13; + ld.global.u32 %r41, [%rd14]; + div.s32 %r4, %r41, %r22; + rem.s32 %r42, %r41, %r22; + div.s32 %r43, %r42, %r23; + rem.s32 %r44, %r42, %r23; + add.s32 %r5, %r43, %r33; + mul.lo.s32 %r45, %r31, %r26; + mov.u32 %r46, 1; + sub.s32 %r47, %r46, %r45; + add.s32 %r48, %r47, %r5; + mov.u32 %r49, 0; + max.s32 %r70, %r49, 
%r48; + add.s32 %r50, %r24, -1; + min.s32 %r7, %r50, %r5; + add.s32 %r8, %r44, %r34; + mul.lo.s32 %r51, %r32, %r27; + sub.s32 %r52, %r46, %r51; + add.s32 %r53, %r52, %r8; + max.s32 %r73, %r49, %r53; + add.s32 %r54, %r25, -1; + min.s32 %r10, %r54, %r8; + +BB3_4: + mov.u32 %r69, %r70; + sub.s32 %r55, %r5, %r69; + rem.s32 %r56, %r55, %r31; + setp.ne.s32 %p3, %r56, 0; + setp.le.s32 %p4, %r69, %r7; + and.pred %p5, %p3, %p4; + add.s32 %r70, %r69, 1; + @%p5 bra BB3_4; + +BB3_5: + mov.u32 %r13, %r73; + sub.s32 %r57, %r8, %r13; + rem.s32 %r58, %r57, %r32; + setp.ne.s32 %p6, %r58, 0; + setp.le.s32 %p7, %r13, %r10; + and.pred %p8, %p6, %p7; + add.s32 %r73, %r13, 1; + @%p8 bra BB3_5; + + setp.gt.s32 %p9, %r69, %r7; + @%p9 bra BB3_11; + + mul.lo.s32 %r15, %r2, %r28; + mul.lo.s32 %r16, %r4, %r29; + cvta.to.global.u64 %rd15, %rd7; + +BB3_8: + sub.s32 %r59, %r5, %r69; + div.s32 %r60, %r59, %r31; + mad.lo.s32 %r18, %r60, %r27, %r15; + setp.gt.s32 %p10, %r13, %r10; + mov.u32 %r72, %r13; + @%p10 bra BB3_10; + +BB3_9: + mov.u32 %r19, %r72; + sub.s32 %r61, %r8, %r19; + div.s32 %r62, %r61, %r32; + mad.lo.s32 %r63, %r69, %r25, %r16; + add.s32 %r64, %r63, %r19; + mad.lo.s32 %r65, %r64, %r30, %r18; + add.s32 %r66, %r65, %r62; + mul.wide.s32 %rd16, %r66, 4; + add.s64 %rd17, %rd15, %rd16; + st.global.f32 [%rd17], %f1; + add.s32 %r20, %r19, %r32; + setp.le.s32 %p11, %r20, %r10; + mov.u32 %r72, %r20; + @%p11 bra BB3_9; + +BB3_10: + add.s32 %r69, %r69, %r31; + setp.le.s32 %p12, %r69, %r7; + @%p12 bra BB3_8; + +BB3_11: + ret; +} + + /* dense_dense_im2col_d: each thread handles one index i below param_2. It loads the f64 at param_0[i], decomposes i by successive div and rem with param_3, param_4 and param_5, and stores the value to zero or more computed element offsets of param_1 through a two-level strided loop. NOTE(review): the u32 params presumably carry im2col shape, stride and padding values - confirm against the higher-level kernel source. */ // .globl dense_dense_im2col_d +.visible .entry dense_dense_im2col_d( + .param .u64 dense_dense_im2col_d_param_0, + .param .u64 dense_dense_im2col_d_param_1, + .param .u32 dense_dense_im2col_d_param_2, + .param .u32 dense_dense_im2col_d_param_3, + .param .u32 dense_dense_im2col_d_param_4, + .param .u32 dense_dense_im2col_d_param_5, + .param .u32 dense_dense_im2col_d_param_6, + .param .u32 dense_dense_im2col_d_param_7, + .param .u32 dense_dense_im2col_d_param_8, + 
.param .u32 dense_dense_im2col_d_param_9, + .param .u32 dense_dense_im2col_d_param_10, + .param .u32 dense_dense_im2col_d_param_11, + .param .u32 dense_dense_im2col_d_param_12, + .param .u32 dense_dense_im2col_d_param_13, + .param .u32 dense_dense_im2col_d_param_14, + .param .u32 dense_dense_im2col_d_param_15, + .param .u32 dense_dense_im2col_d_param_16 +) +{ + .reg .pred %p<12>; + .reg .b32 %r<71>; + .reg .f64 %fd<2>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd1, [dense_dense_im2col_d_param_0]; + ld.param.u64 %rd2, [dense_dense_im2col_d_param_1]; + ld.param.u32 %r35, [dense_dense_im2col_d_param_2]; + ld.param.u32 %r21, [dense_dense_im2col_d_param_3]; + ld.param.u32 %r22, [dense_dense_im2col_d_param_4]; + ld.param.u32 %r23, [dense_dense_im2col_d_param_5]; + ld.param.u32 %r24, [dense_dense_im2col_d_param_6]; + ld.param.u32 %r25, [dense_dense_im2col_d_param_7]; + ld.param.u32 %r26, [dense_dense_im2col_d_param_8]; + ld.param.u32 %r27, [dense_dense_im2col_d_param_9]; + ld.param.u32 %r28, [dense_dense_im2col_d_param_10]; + ld.param.u32 %r29, [dense_dense_im2col_d_param_11]; + ld.param.u32 %r30, [dense_dense_im2col_d_param_12]; + ld.param.u32 %r31, [dense_dense_im2col_d_param_13]; + ld.param.u32 %r32, [dense_dense_im2col_d_param_14]; + ld.param.u32 %r33, [dense_dense_im2col_d_param_15]; + ld.param.u32 %r34, [dense_dense_im2col_d_param_16]; + /* r1 = ntid.x * ctaid.x + tid.x, the global thread index. Threads with r1 >= param_2 branch straight to the return. */ mov.u32 %r36, %ctaid.x; + mov.u32 %r37, %ntid.x; + mov.u32 %r38, %tid.x; + mad.lo.s32 %r1, %r37, %r36, %r38; + setp.ge.s32 %p1, %r1, %r35; + @%p1 bra BB4_9; + + cvta.to.global.u64 %rd3, %rd1; + mul.wide.s32 %rd4, %r1, 8; + add.s64 %rd5, %rd3, %rd4; + ld.global.f64 %fd1, [%rd5]; + /* Split r1 into r2 = r1 / param_3, r3 = (r1 rem param_3) / param_4, then r41 and r42 as quotient and remainder of the rest by param_5. */ div.s32 %r2, %r1, %r21; + rem.s32 %r39, %r1, %r21; + div.s32 %r3, %r39, %r22; + rem.s32 %r40, %r39, %r22; + div.s32 %r41, %r40, %r23; + rem.s32 %r42, %r40, %r23; + /* r4 = r41 + param_15 with window r67 = max(0, r4 + 1 - param_13 * param_8) up to r6 = min(param_6 - 1, r4). r7, r70 and r9 are built the same way from r42, param_16, param_14, param_9 and param_7. */ add.s32 %r4, %r41, %r33; + mul.lo.s32 %r43, %r31, %r26; + mov.u32 %r44, 1; + sub.s32 %r45, %r44, %r43; + add.s32 %r46, %r45, %r4; + mov.u32 %r47, 0; + max.s32 %r67, %r47, %r46; + add.s32 %r48, 
%r24, -1; + min.s32 %r6, %r48, %r4; + add.s32 %r7, %r42, %r34; + mul.lo.s32 %r49, %r32, %r27; + sub.s32 %r50, %r44, %r49; + add.s32 %r51, %r50, %r7; + max.s32 %r70, %r47, %r51; + add.s32 %r52, %r25, -1; + min.s32 %r9, %r52, %r7; + /* Step r66 up by 1 until (r4 - r66) is a multiple of param_13 or r66 exceeds r6. The next loop does the same for r12 with param_14 and r9. */ +BB4_2: + mov.u32 %r66, %r67; + sub.s32 %r53, %r4, %r66; + rem.s32 %r54, %r53, %r31; + setp.ne.s32 %p2, %r54, 0; + setp.le.s32 %p3, %r66, %r6; + and.pred %p4, %p2, %p3; + add.s32 %r67, %r66, 1; + @%p4 bra BB4_2; + +BB4_3: + mov.u32 %r12, %r70; + sub.s32 %r55, %r7, %r12; + rem.s32 %r56, %r55, %r32; + setp.ne.s32 %p5, %r56, 0; + setp.le.s32 %p6, %r12, %r9; + and.pred %p7, %p5, %p6; + add.s32 %r70, %r12, 1; + @%p7 bra BB4_3; + + setp.gt.s32 %p8, %r66, %r6; + @%p8 bra BB4_9; + + mul.lo.s32 %r14, %r2, %r28; + mul.lo.s32 %r15, %r3, %r29; + cvta.to.global.u64 %rd6, %rd2; + /* Outer loop advances r66 by param_13 up to r6, inner loop advances r18 by param_14 up to r9. Each inner iteration stores fd1 at element (r66 * param_7 + r15 + r18) * param_12 + r17 + (r7 - r18) / param_14 of param_1, where r17 = ((r4 - r66) / param_13) * param_9 + r14. */ +BB4_6: + sub.s32 %r57, %r4, %r66; + div.s32 %r58, %r57, %r31; + mad.lo.s32 %r17, %r58, %r27, %r14; + setp.gt.s32 %p9, %r12, %r9; + mov.u32 %r69, %r12; + @%p9 bra BB4_8; + +BB4_7: + mov.u32 %r18, %r69; + sub.s32 %r59, %r7, %r18; + div.s32 %r60, %r59, %r32; + mad.lo.s32 %r61, %r66, %r25, %r15; + add.s32 %r62, %r61, %r18; + mad.lo.s32 %r63, %r62, %r30, %r17; + add.s32 %r64, %r63, %r60; + mul.wide.s32 %rd7, %r64, 8; + add.s64 %rd8, %rd6, %rd7; + st.global.f64 [%rd8], %fd1; + add.s32 %r19, %r18, %r32; + setp.le.s32 %p10, %r19, %r9; + mov.u32 %r69, %r19; + @%p10 bra BB4_7; + +BB4_8: + add.s32 %r66, %r66, %r31; + setp.le.s32 %p11, %r66, %r6; + @%p11 bra BB4_6; + +BB4_9: + ret; +} + + /* dense_dense_im2col_f: single-precision twin of dense_dense_im2col_d. The index arithmetic and loops are the same. The value is loaded and stored as f32 and element offsets are scaled by 4 bytes instead of 8. */ // .globl dense_dense_im2col_f +.visible .entry dense_dense_im2col_f( + .param .u64 dense_dense_im2col_f_param_0, + .param .u64 dense_dense_im2col_f_param_1, + .param .u32 dense_dense_im2col_f_param_2, + .param .u32 dense_dense_im2col_f_param_3, + .param .u32 dense_dense_im2col_f_param_4, + .param .u32 dense_dense_im2col_f_param_5, + .param .u32 dense_dense_im2col_f_param_6, + .param .u32 dense_dense_im2col_f_param_7, + .param .u32 dense_dense_im2col_f_param_8, + .param .u32 
dense_dense_im2col_f_param_9, + .param .u32 dense_dense_im2col_f_param_10, + .param .u32 dense_dense_im2col_f_param_11, + .param .u32 dense_dense_im2col_f_param_12, + .param .u32 dense_dense_im2col_f_param_13, + .param .u32 dense_dense_im2col_f_param_14, + .param .u32 dense_dense_im2col_f_param_15, + .param .u32 dense_dense_im2col_f_param_16 +) +{ + .reg .pred %p<12>; + .reg .f32 %f<2>; + .reg .b32 %r<71>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd1, [dense_dense_im2col_f_param_0]; + ld.param.u64 %rd2, [dense_dense_im2col_f_param_1]; + ld.param.u32 %r35, [dense_dense_im2col_f_param_2]; + ld.param.u32 %r21, [dense_dense_im2col_f_param_3]; + ld.param.u32 %r22, [dense_dense_im2col_f_param_4]; + ld.param.u32 %r23, [dense_dense_im2col_f_param_5]; + ld.param.u32 %r24, [dense_dense_im2col_f_param_6]; + ld.param.u32 %r25, [dense_dense_im2col_f_param_7]; + ld.param.u32 %r26, [dense_dense_im2col_f_param_8]; + ld.param.u32 %r27, [dense_dense_im2col_f_param_9]; + ld.param.u32 %r28, [dense_dense_im2col_f_param_10]; + ld.param.u32 %r29, [dense_dense_im2col_f_param_11]; + ld.param.u32 %r30, [dense_dense_im2col_f_param_12]; + ld.param.u32 %r31, [dense_dense_im2col_f_param_13]; + ld.param.u32 %r32, [dense_dense_im2col_f_param_14]; + ld.param.u32 %r33, [dense_dense_im2col_f_param_15]; + ld.param.u32 %r34, [dense_dense_im2col_f_param_16]; + mov.u32 %r36, %ctaid.x; + mov.u32 %r37, %ntid.x; + mov.u32 %r38, %tid.x; + mad.lo.s32 %r1, %r37, %r36, %r38; + setp.ge.s32 %p1, %r1, %r35; + @%p1 bra BB5_9; + + cvta.to.global.u64 %rd3, %rd1; + mul.wide.s32 %rd4, %r1, 4; + add.s64 %rd5, %rd3, %rd4; + ld.global.f32 %f1, [%rd5]; + div.s32 %r2, %r1, %r21; + rem.s32 %r39, %r1, %r21; + div.s32 %r3, %r39, %r22; + rem.s32 %r40, %r39, %r22; + div.s32 %r41, %r40, %r23; + rem.s32 %r42, %r40, %r23; + add.s32 %r4, %r41, %r33; + mul.lo.s32 %r43, %r31, %r26; + mov.u32 %r44, 1; + sub.s32 %r45, %r44, %r43; + add.s32 %r46, %r45, %r4; + mov.u32 %r47, 0; + max.s32 %r67, %r47, %r46; + add.s32 %r48, %r24, -1; + 
min.s32 %r6, %r48, %r4; + add.s32 %r7, %r42, %r34; + mul.lo.s32 %r49, %r32, %r27; + sub.s32 %r50, %r44, %r49; + add.s32 %r51, %r50, %r7; + max.s32 %r70, %r47, %r51; + add.s32 %r52, %r25, -1; + min.s32 %r9, %r52, %r7; + +BB5_2: + mov.u32 %r66, %r67; + sub.s32 %r53, %r4, %r66; + rem.s32 %r54, %r53, %r31; + setp.ne.s32 %p2, %r54, 0; + setp.le.s32 %p3, %r66, %r6; + and.pred %p4, %p2, %p3; + add.s32 %r67, %r66, 1; + @%p4 bra BB5_2; + +BB5_3: + mov.u32 %r12, %r70; + sub.s32 %r55, %r7, %r12; + rem.s32 %r56, %r55, %r32; + setp.ne.s32 %p5, %r56, 0; + setp.le.s32 %p6, %r12, %r9; + and.pred %p7, %p5, %p6; + add.s32 %r70, %r12, 1; + @%p7 bra BB5_3; + + setp.gt.s32 %p8, %r66, %r6; + @%p8 bra BB5_9; + + mul.lo.s32 %r14, %r2, %r28; + mul.lo.s32 %r15, %r3, %r29; + cvta.to.global.u64 %rd6, %rd2; + +BB5_6: + sub.s32 %r57, %r4, %r66; + div.s32 %r58, %r57, %r31; + mad.lo.s32 %r17, %r58, %r27, %r14; + setp.gt.s32 %p9, %r12, %r9; + mov.u32 %r69, %r12; + @%p9 bra BB5_8; + +BB5_7: + mov.u32 %r18, %r69; + sub.s32 %r59, %r7, %r18; + div.s32 %r60, %r59, %r32; + mad.lo.s32 %r61, %r66, %r25, %r15; + add.s32 %r62, %r61, %r18; + mad.lo.s32 %r63, %r62, %r30, %r17; + add.s32 %r64, %r63, %r60; + mul.wide.s32 %rd7, %r64, 4; + add.s64 %rd8, %rd6, %rd7; + st.global.f32 [%rd8], %f1; + add.s32 %r19, %r18, %r32; + setp.le.s32 %p10, %r19, %r9; + mov.u32 %r69, %r19; + @%p10 bra BB5_7; + +BB5_8: + add.s32 %r66, %r66, %r31; + setp.le.s32 %p11, %r66, %r6; + @%p11 bra BB5_6; + +BB5_9: + ret; +} + + /* reorg_knpq_d: each thread handles one index i below param_2 and copies the f64 at param_0[i] to element ((i rem param_3) / param_5) * param_4 + (i / param_3) * param_5 + ((i rem param_3) rem param_5) of param_1. NOTE(review): the name suggests a K-N-PQ layout reorder, but the meaning of each u32 param is not visible here - confirm against the higher-level kernel source. */ // .globl reorg_knpq_d +.visible .entry reorg_knpq_d( + .param .u64 reorg_knpq_d_param_0, + .param .u64 reorg_knpq_d_param_1, + .param .u32 reorg_knpq_d_param_2, + .param .u32 reorg_knpq_d_param_3, + .param .u32 reorg_knpq_d_param_4, + .param .u32 reorg_knpq_d_param_5 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<16>; + .reg .f64 %fd<2>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd1, [reorg_knpq_d_param_0]; + ld.param.u64 %rd2, [reorg_knpq_d_param_1]; + ld.param.u32 %r5, [reorg_knpq_d_param_2]; + ld.param.u32 %r2, 
[reorg_knpq_d_param_3]; + ld.param.u32 %r3, [reorg_knpq_d_param_4]; + ld.param.u32 %r4, [reorg_knpq_d_param_5]; + /* r1 = ntid.x * ctaid.x + tid.x, the global thread index. Threads with r1 >= param_2 skip the copy. */ mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %ntid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r7, %r6, %r8; + setp.ge.s32 %p1, %r1, %r5; + @%p1 bra BB6_2; + + cvta.to.global.u64 %rd3, %rd1; + rem.s32 %r9, %r1, %r2; + div.s32 %r10, %r9, %r4; + rem.s32 %r11, %r9, %r4; + mul.wide.s32 %rd4, %r1, 8; + add.s64 %rd5, %rd3, %rd4; + ld.global.f64 %fd1, [%rd5]; + div.s32 %r12, %r1, %r2; + mul.lo.s32 %r13, %r12, %r4; + mad.lo.s32 %r14, %r10, %r3, %r13; + add.s32 %r15, %r14, %r11; + cvta.to.global.u64 %rd6, %rd2; + mul.wide.s32 %rd7, %r15, 8; + add.s64 %rd8, %rd6, %rd7; + st.global.f64 [%rd8], %fd1; + +BB6_2: + ret; +} + + /* reorg_knpq_f: single-precision twin of reorg_knpq_d. The destination index is computed the same way. The value is copied as f32 and element offsets are scaled by 4 bytes instead of 8. */ // .globl reorg_knpq_f +.visible .entry reorg_knpq_f( + .param .u64 reorg_knpq_f_param_0, + .param .u64 reorg_knpq_f_param_1, + .param .u32 reorg_knpq_f_param_2, + .param .u32 reorg_knpq_f_param_3, + .param .u32 reorg_knpq_f_param_4, + .param .u32 reorg_knpq_f_param_5 +) +{ + .reg .pred %p<2>; + .reg .f32 %f<2>; + .reg .b32 %r<16>; + .reg .b64 %rd<9>; + + + ld.param.u64 %rd1, [reorg_knpq_f_param_0]; + ld.param.u64 %rd2, [reorg_knpq_f_param_1]; + ld.param.u32 %r5, [reorg_knpq_f_param_2]; + ld.param.u32 %r2, [reorg_knpq_f_param_3]; + ld.param.u32 %r3, [reorg_knpq_f_param_4]; + ld.param.u32 %r4, [reorg_knpq_f_param_5]; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %ntid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r7, %r6, %r8; + setp.ge.s32 %p1, %r1, %r5; + @%p1 bra BB7_2; + + cvta.to.global.u64 %rd3, %rd1; + rem.s32 %r9, %r1, %r2; + div.s32 %r10, %r9, %r4; + rem.s32 %r11, %r9, %r4; + mul.wide.s32 %rd4, %r1, 4; + add.s64 %rd5, %rd3, %rd4; + ld.global.f32 %f1, [%rd5]; + div.s32 %r12, %r1, %r2; + mul.lo.s32 %r13, %r12, %r4; + mad.lo.s32 %r14, %r10, %r3, %r13; + add.s32 %r15, %r14, %r11; + cvta.to.global.u64 %rd6, %rd2; + mul.wide.s32 %rd7, %r15, 4; + add.s64 %rd8, %rd6, %rd7; + st.global.f32 [%rd8], %f1; + +BB7_2: + ret; +} + // .globl slice_sparse_dense_row_d 
.visible .entry slice_sparse_dense_row_d( .param .u64 slice_sparse_dense_row_d_param_0, @@ -137,7 +825,7 @@ BB1_2: mad.lo.s32 %r1, %r17, %r18, %r19; add.s32 %r2, %r1, %r15; setp.gt.s32 %p1, %r2, %r16; - @%p1 bra BB2_6; + @%p1 bra BB8_6; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r2, 4; @@ -145,7 +833,7 @@ BB1_2: ld.global.u32 %r23, [%rd1]; ld.global.u32 %r22, [%rd1+4]; setp.ge.s32 %p2, %r23, %r22; - @%p2 bra BB2_6; + @%p2 bra BB8_6; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd15, %rd9; @@ -157,12 +845,12 @@ BB1_2: mul.wide.s32 %rd18, %r23, 4; add.s64 %rd21, %rd16, %rd18; -BB2_3: +BB8_3: ld.global.u32 %r8, [%rd21]; setp.lt.s32 %p3, %r8, %r12; setp.gt.s32 %p4, %r8, %r13; or.pred %p5, %p3, %p4; - @%p5 bra BB2_5; + @%p5 bra BB8_5; ld.global.f64 %fd1, [%rd22]; add.s32 %r21, %r5, %r8; @@ -171,14 +859,14 @@ BB2_3: st.global.f64 [%rd20], %fd1; ld.global.u32 %r22, [%rd1+4]; -BB2_5: +BB8_5: add.s64 %rd22, %rd22, 8; add.s64 %rd21, %rd21, 4; add.s32 %r23, %r23, 1; setp.lt.s32 %p6, %r23, %r22; - @%p6 bra BB2_3; + @%p6 bra BB8_3; -BB2_6: +BB8_6: ret; } @@ -216,7 +904,7 @@ BB2_6: mad.lo.s32 %r1, %r17, %r18, %r19; add.s32 %r2, %r1, %r15; setp.gt.s32 %p1, %r2, %r16; - @%p1 bra BB3_6; + @%p1 bra BB9_6; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r2, 4; @@ -224,7 +912,7 @@ BB2_6: ld.global.u32 %r23, [%rd1]; ld.global.u32 %r22, [%rd1+4]; setp.ge.s32 %p2, %r23, %r22; - @%p2 bra BB3_6; + @%p2 bra BB9_6; cvta.to.global.u64 %rd2, %rd12; cvta.to.global.u64 %rd15, %rd9; @@ -235,12 +923,12 @@ BB2_6: add.s64 %rd21, %rd15, %rd17; add.s64 %rd20, %rd16, %rd17; -BB3_3: +BB9_3: ld.global.u32 %r8, [%rd20]; setp.lt.s32 %p3, %r8, %r12; setp.gt.s32 %p4, %r8, %r13; or.pred %p5, %p3, %p4; - @%p5 bra BB3_5; + @%p5 bra BB9_5; ld.global.f32 %f1, [%rd21]; add.s32 %r21, %r5, %r8; @@ -249,14 +937,14 @@ BB3_3: st.global.f32 [%rd19], %f1; ld.global.u32 %r22, [%rd1+4]; -BB3_5: +BB9_5: add.s64 %rd21, %rd21, 4; add.s64 %rd20, %rd20, 4; add.s32 %r23, %r23, 1; setp.lt.s32 %p6, %r23, 
%r22; - @%p6 bra BB3_3; + @%p6 bra BB9_3; -BB3_6: +BB9_6: ret; } @@ -301,7 +989,7 @@ BB3_6: add.s64 %rd12, %rd1, %rd11; ld.global.u32 %r15, [%rd12+4]; setp.ge.s32 %p1, %r1, %r15; - @%p1 bra BB4_5; + @%p1 bra BB10_5; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd5; @@ -313,11 +1001,11 @@ BB3_6: setp.lt.s32 %p2, %r2, %r6; setp.gt.s32 %p3, %r2, %r7; or.pred %p4, %p2, %p3; - @%p4 bra BB4_5; + @%p4 bra BB10_5; mov.u32 %r21, %r5; -BB4_3: +BB10_3: mov.u32 %r3, %r21; add.s32 %r4, %r3, 1; mul.wide.s32 %rd16, %r4, 4; @@ -325,7 +1013,7 @@ BB4_3: ld.global.u32 %r16, [%rd17]; setp.le.s32 %p5, %r16, %r1; mov.u32 %r21, %r4; - @%p5 bra BB4_3; + @%p5 bra BB10_3; shl.b64 %rd18, %rd4, 3; add.s64 %rd19, %rd3, %rd18; @@ -338,7 +1026,7 @@ BB4_3: add.s64 %rd21, %rd2, %rd20; st.global.f64 [%rd21], %fd1; -BB4_5: +BB10_5: ret; } @@ -383,7 +1071,7 @@ BB4_5: add.s64 %rd12, %rd1, %rd11; ld.global.u32 %r15, [%rd12+4]; setp.ge.s32 %p1, %r1, %r15; - @%p1 bra BB5_5; + @%p1 bra BB11_5; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd5; @@ -395,11 +1083,11 @@ BB4_5: setp.lt.s32 %p2, %r2, %r6; setp.gt.s32 %p3, %r2, %r7; or.pred %p4, %p2, %p3; - @%p4 bra BB5_5; + @%p4 bra BB11_5; mov.u32 %r21, %r5; -BB5_3: +BB11_3: mov.u32 %r3, %r21; add.s32 %r4, %r3, 1; mul.wide.s32 %rd16, %r4, 4; @@ -407,7 +1095,7 @@ BB5_3: ld.global.u32 %r16, [%rd17]; setp.le.s32 %p5, %r16, %r1; mov.u32 %r21, %r4; - @%p5 bra BB5_3; + @%p5 bra BB11_3; shl.b64 %rd18, %rd4, 2; add.s64 %rd19, %rd3, %rd18; @@ -420,7 +1108,7 @@ BB5_3: add.s64 %rd21, %rd2, %rd20; st.global.f32 [%rd21], %f1; -BB5_5: +BB11_5: ret; } @@ -458,10 +1146,10 @@ BB5_5: setp.lt.s32 %p1, %r2, %r7; setp.gt.s32 %p2, %r6, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB6_2; - bra.uni BB6_1; + @!%p3 bra BB12_2; + bra.uni BB12_1; -BB6_1: +BB12_1: rem.s32 %r11, %r1, %r6; cvta.to.global.u64 %rd3, %rd1; add.s32 %r12, %r2, %r3; @@ -475,7 +1163,7 @@ BB6_1: add.s64 %rd8, %rd6, %rd7; st.global.f64 [%rd8], %fd1; -BB6_2: +BB12_2: ret; } @@ -513,10 +1201,10 @@ 
BB6_2: setp.lt.s32 %p1, %r2, %r7; setp.gt.s32 %p2, %r6, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB7_2; - bra.uni BB7_1; + @!%p3 bra BB13_2; + bra.uni BB13_1; -BB7_1: +BB13_1: rem.s32 %r11, %r1, %r6; cvta.to.global.u64 %rd3, %rd1; add.s32 %r12, %r2, %r3; @@ -530,7 +1218,7 @@ BB7_1: add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; -BB7_2: +BB13_2: ret; } @@ -560,10 +1248,10 @@ BB7_2: setp.gt.s32 %p1, %r9, %r8; setp.lt.s32 %p2, %r2, %r4; and.pred %p3, %p1, %p2; - @!%p3 bra BB8_2; - bra.uni BB8_1; + @!%p3 bra BB14_2; + bra.uni BB14_1; -BB8_1: +BB14_1: cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 8; add.s64 %rd4, %rd2, %rd3; @@ -572,7 +1260,7 @@ BB8_1: add.s64 %rd6, %rd2, %rd5; st.global.f64 [%rd6], %fd1; -BB8_2: +BB14_2: ret; } @@ -602,10 +1290,10 @@ BB8_2: setp.gt.s32 %p1, %r9, %r8; setp.lt.s32 %p2, %r2, %r4; and.pred %p3, %p1, %p2; - @!%p3 bra BB9_2; - bra.uni BB9_1; + @!%p3 bra BB15_2; + bra.uni BB15_1; -BB9_1: +BB15_1: cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 4; add.s64 %rd4, %rd2, %rd3; @@ -614,7 +1302,7 @@ BB9_1: add.s64 %rd6, %rd2, %rd5; st.global.f32 [%rd6], %f1; -BB9_2: +BB15_2: ret; } @@ -644,10 +1332,10 @@ BB9_2: setp.lt.s32 %p1, %r7, %r2; setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB10_2; - bra.uni BB10_1; + @!%p3 bra BB16_2; + bra.uni BB16_1; -BB10_1: +BB16_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 8; add.s64 %rd5, %rd3, %rd4; @@ -658,7 +1346,7 @@ BB10_1: add.s64 %rd7, %rd6, %rd4; st.global.f64 [%rd7], %fd3; -BB10_2: +BB16_2: ret; } @@ -689,10 +1377,10 @@ BB10_2: setp.lt.s32 %p1, %r7, %r2; setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB11_2; - bra.uni BB11_1; + @!%p3 bra BB17_2; + bra.uni BB17_1; -BB11_1: +BB17_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 4; add.s64 %rd5, %rd3, %rd4; @@ -705,7 +1393,7 @@ BB11_1: add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f2; -BB11_2: +BB17_2: ret; } @@ -737,10 +1425,10 @@ BB11_2: setp.lt.s32 %p1, %r7, %r2; setp.gt.s32 %p2, 
%r3, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB12_4; - bra.uni BB12_1; + @!%p3 bra BB18_4; + bra.uni BB18_1; -BB12_1: +BB18_1: cvta.to.global.u64 %rd5, %rd2; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd6, %r1, 8; @@ -748,20 +1436,20 @@ BB12_1: ld.global.f64 %fd4, [%rd7]; mov.f64 %fd5, 0d0000000000000000; setp.leu.f64 %p4, %fd4, 0d0000000000000000; - @%p4 bra BB12_3; + @%p4 bra BB18_3; cvta.to.global.u64 %rd8, %rd3; shl.b64 %rd9, %rd1, 3; add.s64 %rd10, %rd8, %rd9; ld.global.f64 %fd5, [%rd10]; -BB12_3: +BB18_3: cvta.to.global.u64 %rd11, %rd4; shl.b64 %rd12, %rd1, 3; add.s64 %rd13, %rd11, %rd12; st.global.f64 [%rd13], %fd5; -BB12_4: +BB18_4: ret; } @@ -793,10 +1481,10 @@ BB12_4: setp.lt.s32 %p1, %r7, %r2; setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB13_4; - bra.uni BB13_1; + @!%p3 bra BB19_4; + bra.uni BB19_1; -BB13_1: +BB19_1: cvta.to.global.u64 %rd5, %rd2; cvt.s64.s32 %rd1, %r1; mul.wide.s32 %rd6, %r1, 4; @@ -804,20 +1492,20 @@ BB13_1: ld.global.f32 %f4, [%rd7]; mov.f32 %f5, 0f00000000; setp.leu.f32 %p4, %f4, 0f00000000; - @%p4 bra BB13_3; + @%p4 bra BB19_3; cvta.to.global.u64 %rd8, %rd3; shl.b64 %rd9, %rd1, 2; add.s64 %rd10, %rd8, %rd9; ld.global.f32 %f5, [%rd10]; -BB13_3: +BB19_3: cvta.to.global.u64 %rd11, %rd4; shl.b64 %rd12, %rd1, 2; add.s64 %rd13, %rd11, %rd12; st.global.f32 [%rd13], %f5; -BB13_4: +BB19_4: ret; } @@ -847,10 +1535,10 @@ BB13_4: setp.lt.s32 %p1, %r7, %r2; setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB14_2; - bra.uni BB14_1; + @!%p3 bra BB20_2; + bra.uni BB20_1; -BB14_1: +BB20_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 8; add.s64 %rd5, %rd3, %rd4; @@ -861,7 +1549,7 @@ BB14_1: add.f64 %fd3, %fd2, %fd1; st.global.f64 [%rd7], %fd3; -BB14_2: +BB20_2: ret; } @@ -891,10 +1579,10 @@ BB14_2: setp.lt.s32 %p1, %r7, %r2; setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB15_2; - bra.uni BB15_1; + @!%p3 bra BB21_2; + bra.uni BB21_1; -BB15_1: +BB21_1: cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, 
%r1, 4; add.s64 %rd5, %rd3, %rd4; @@ -905,7 +1593,7 @@ BB15_1: add.f32 %f3, %f2, %f1; st.global.f32 [%rd7], %f3; -BB15_2: +BB21_2: ret; } @@ -939,10 +1627,10 @@ BB15_2: setp.lt.s32 %p1, %r8, %r4; setp.gt.s32 %p2, %r2, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB16_2; - bra.uni BB16_1; + @!%p3 bra BB22_2; + bra.uni BB22_1; -BB16_1: +BB22_1: rem.s32 %r9, %r1, %r2; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r1, 8; @@ -958,7 +1646,7 @@ BB16_1: add.s64 %rd11, %rd10, %rd5; st.global.f64 [%rd11], %fd3; -BB16_2: +BB22_2: ret; } @@ -992,10 +1680,10 @@ BB16_2: setp.lt.s32 %p1, %r8, %r4; setp.gt.s32 %p2, %r2, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB17_2; - bra.uni BB17_1; + @!%p3 bra BB23_2; + bra.uni BB23_1; -BB17_1: +BB23_1: rem.s32 %r9, %r1, %r2; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r1, 4; @@ -1011,7 +1699,7 @@ BB17_1: add.s64 %rd11, %rd10, %rd5; st.global.f32 [%rd11], %f3; -BB17_2: +BB23_2: ret; } @@ -1050,10 +1738,10 @@ BB17_2: setp.lt.s32 %p1, %r1, %r5; setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB18_4; - bra.uni BB18_1; + @!%p3 bra BB24_4; + bra.uni BB24_1; -BB18_1: +BB24_1: cvta.to.global.u64 %rd6, %rd4; mad.lo.s32 %r10, %r1, %r3, %r2; cvta.to.global.u64 %rd7, %rd3; @@ -1062,25 +1750,25 @@ BB18_1: ld.global.f64 %fd1, [%rd9]; add.s64 %rd2, %rd6, %rd8; setp.eq.s32 %p4, %r4, 1; - @%p4 bra BB18_3; - bra.uni BB18_2; + @%p4 bra BB24_3; + bra.uni BB24_2; -BB18_3: +BB24_3: mul.wide.s32 %rd12, %r2, 8; add.s64 %rd13, %rd1, %rd12; ld.global.f64 %fd5, [%rd13]; fma.rn.f64 %fd6, %fd5, %fd2, %fd1; st.global.f64 [%rd2], %fd6; - bra.uni BB18_4; + bra.uni BB24_4; -BB18_2: +BB24_2: mul.wide.s32 %rd10, %r1, 8; add.s64 %rd11, %rd1, %rd10; ld.global.f64 %fd3, [%rd11]; fma.rn.f64 %fd4, %fd3, %fd2, %fd1; st.global.f64 [%rd2], %fd4; -BB18_4: +BB24_4: ret; } @@ -1120,10 +1808,10 @@ BB18_4: setp.lt.s32 %p1, %r1, %r5; setp.gt.s32 %p2, %r3, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB19_4; - bra.uni BB19_1; + @!%p3 bra BB25_4; + bra.uni BB25_1; -BB19_1: 
+BB25_1: cvta.to.global.u64 %rd6, %rd4; mad.lo.s32 %r10, %r1, %r3, %r2; cvta.to.global.u64 %rd7, %rd3; @@ -1133,10 +1821,10 @@ BB19_1: cvt.f64.f32 %fd1, %f1; add.s64 %rd2, %rd6, %rd8; setp.eq.s32 %p4, %r4, 1; - @%p4 bra BB19_3; - bra.uni BB19_2; + @%p4 bra BB25_3; + bra.uni BB25_2; -BB19_3: +BB25_3: mul.wide.s32 %rd12, %r2, 4; add.s64 %rd13, %rd1, %rd12; ld.global.f32 %f4, [%rd13]; @@ -1144,9 +1832,9 @@ BB19_3: fma.rn.f64 %fd6, %fd5, %fd2, %fd1; cvt.rn.f32.f64 %f5, %fd6; st.global.f32 [%rd2], %f5; - bra.uni BB19_4; + bra.uni BB25_4; -BB19_2: +BB25_2: mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd1, %rd10; ld.global.f32 %f2, [%rd11]; @@ -1155,7 +1843,7 @@ BB19_2: cvt.rn.f32.f64 %f3, %fd4; st.global.f32 [%rd2], %f3; -BB19_4: +BB25_4: ret; } @@ -1189,10 +1877,10 @@ BB19_4: setp.lt.s32 %p1, %r8, %r4; setp.gt.s32 %p2, %r2, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB20_2; - bra.uni BB20_1; + @!%p3 bra BB26_2; + bra.uni BB26_1; -BB20_1: +BB26_1: rem.s32 %r9, %r1, %r2; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r1, 8; @@ -1208,7 +1896,7 @@ BB20_1: add.s64 %rd11, %rd10, %rd5; st.global.f64 [%rd11], %fd3; -BB20_2: +BB26_2: ret; } @@ -1242,10 +1930,10 @@ BB20_2: setp.lt.s32 %p1, %r8, %r4; setp.gt.s32 %p2, %r2, -1; and.pred %p3, %p1, %p2; - @!%p3 bra BB21_2; - bra.uni BB21_1; + @!%p3 bra BB27_2; + bra.uni BB27_1; -BB21_1: +BB27_1: rem.s32 %r9, %r1, %r2; cvta.to.global.u64 %rd4, %rd1; mul.wide.s32 %rd5, %r1, 4; @@ -1261,7 +1949,7 @@ BB21_1: add.s64 %rd11, %rd10, %rd5; st.global.f32 [%rd11], %f3; -BB21_2: +BB27_2: ret; } @@ -1299,10 +1987,10 @@ BB21_2: setp.lt.s32 %p2, %r2, %r10; setp.gt.s32 %p3, %r6, -1; and.pred %p4, %p2, %p3; - @!%p4 bra BB22_65; - bra.uni BB22_1; + @!%p4 bra BB28_65; + bra.uni BB28_1; -BB22_1: +BB28_1: rem.s32 %r14, %r1, %r6; cvta.to.global.u64 %rd5, %rd2; mad.lo.s32 %r3, %r2, %r6, %r14; @@ -1323,47 +2011,47 @@ BB22_1: ld.global.f64 %fd2, [%rd10]; mov.f64 %fd54, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p9, %r9, 8; - @%p9 bra BB22_18; + @%p9 bra BB28_18; 
setp.gt.s32 %p23, %r9, 3; - @%p23 bra BB22_10; + @%p23 bra BB28_10; setp.gt.s32 %p30, %r9, 1; - @%p30 bra BB22_7; + @%p30 bra BB28_7; setp.eq.s32 %p33, %r9, 0; - @%p33 bra BB22_63; - bra.uni BB22_5; + @%p33 bra BB28_63; + bra.uni BB28_5; -BB22_63: +BB28_63: add.f64 %fd54, %fd1, %fd2; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_18: +BB28_18: setp.gt.s32 %p10, %r9, 13; - @%p10 bra BB22_27; + @%p10 bra BB28_27; setp.gt.s32 %p17, %r9, 10; - @%p17 bra BB22_23; + @%p17 bra BB28_23; setp.eq.s32 %p21, %r9, 9; - @%p21 bra BB22_45; - bra.uni BB22_21; + @%p21 bra BB28_45; + bra.uni BB28_21; -BB22_45: +BB28_45: setp.eq.f64 %p50, %fd1, %fd2; selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p50; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_10: +BB28_10: setp.gt.s32 %p24, %r9, 5; - @%p24 bra BB22_14; + @%p24 bra BB28_14; setp.eq.s32 %p28, %r9, 4; - @%p28 bra BB22_48; - bra.uni BB22_12; + @%p28 bra BB28_48; + bra.uni BB28_12; -BB22_48: +BB28_48: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; @@ -1399,10 +2087,10 @@ BB22_48: }// Callseq End 0 setp.lt.s32 %p56, %r4, 0; and.pred %p1, %p56, %p55; - @!%p1 bra BB22_50; - bra.uni BB22_49; + @!%p1 bra BB28_50; + bra.uni BB28_49; -BB22_49: +BB28_49: { .reg .b32 %temp; mov.b64 {%temp, %r33}, %fd53; @@ -1414,30 +2102,30 @@ BB22_49: } mov.b64 %fd53, {%r35, %r34}; -BB22_50: +BB28_50: mov.f64 %fd52, %fd53; setp.eq.f64 %p57, %fd1, 0d0000000000000000; - @%p57 bra BB22_53; - bra.uni BB22_51; + @%p57 bra BB28_53; + bra.uni BB28_51; -BB22_53: +BB28_53: selp.b32 %r36, %r4, 0, %p55; or.b32 %r37, %r36, 2146435072; setp.lt.s32 %p61, %r5, 0; selp.b32 %r38, %r37, %r36, %p61; mov.u32 %r39, 0; mov.b64 %fd52, {%r39, %r38}; - bra.uni BB22_54; + bra.uni BB28_54; -BB22_27: +BB28_27: setp.gt.s32 %p11, %r9, 15; - @%p11 bra BB22_31; + @%p11 bra BB28_31; setp.eq.s32 %p15, %r9, 14; - @%p15 bra BB22_42; - bra.uni BB22_29; + @%p15 bra BB28_42; + bra.uni BB28_29; -BB22_42: +BB28_42: cvt.rni.s64.f64 %rd11, %fd1; cvt.rni.s64.f64 %rd12, %fd2; cvt.u32.u64 %r25, 
%rd11; @@ -1445,61 +2133,61 @@ BB22_42: or.b32 %r27, %r26, %r25; setp.eq.s32 %p47, %r27, 0; selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p47; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_7: +BB28_7: setp.eq.s32 %p31, %r9, 2; - @%p31 bra BB22_62; - bra.uni BB22_8; + @%p31 bra BB28_62; + bra.uni BB28_8; -BB22_62: +BB28_62: mul.f64 %fd54, %fd1, %fd2; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_23: +BB28_23: setp.eq.s32 %p18, %r9, 11; - @%p18 bra BB22_44; + @%p18 bra BB28_44; setp.eq.s32 %p19, %r9, 12; - @%p19 bra BB22_43; - bra.uni BB22_25; + @%p19 bra BB28_43; + bra.uni BB28_25; -BB22_43: +BB28_43: max.f64 %fd54, %fd1, %fd2; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_14: +BB28_14: setp.eq.s32 %p25, %r9, 6; - @%p25 bra BB22_47; + @%p25 bra BB28_47; setp.eq.s32 %p26, %r9, 7; - @%p26 bra BB22_46; - bra.uni BB22_16; + @%p26 bra BB28_46; + bra.uni BB28_16; -BB22_46: +BB28_46: setp.gt.f64 %p52, %fd1, %fd2; selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p52; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_31: +BB28_31: setp.eq.s32 %p12, %r9, 16; - @%p12 bra BB22_41; + @%p12 bra BB28_41; setp.eq.s32 %p13, %r9, 17; - @%p13 bra BB22_37; - bra.uni BB22_33; + @%p13 bra BB28_37; + bra.uni BB28_33; -BB22_37: +BB28_37: setp.eq.f64 %p39, %fd2, 0d0000000000000000; setp.eq.f64 %p40, %fd2, 0d8000000000000000; or.pred %p41, %p39, %p40; mov.f64 %fd54, 0d7FF8000000000000; - @%p41 bra BB22_64; + @%p41 bra BB28_64; div.rn.f64 %fd54, %fd1, %fd2; abs.f64 %fd39, %fd54; setp.gtu.f64 %p42, %fd39, 0d7FF0000000000000; - @%p42 bra BB22_64; + @%p42 bra BB28_64; { .reg .b32 %temp; @@ -1513,74 +2201,74 @@ BB22_37: setp.ne.s32 %p43, %r24, 2146435072; setp.ne.s32 %p44, %r22, 0; or.pred %p45, %p43, %p44; - @!%p45 bra BB22_64; - bra.uni BB22_40; + @!%p45 bra BB28_64; + bra.uni BB28_40; -BB22_40: +BB28_40: cvt.rmi.f64.f64 %fd40, %fd54; mul.f64 %fd41, %fd2, %fd40; sub.f64 %fd54, %fd1, %fd41; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_5: +BB28_5: setp.eq.s32 %p34, %r9, 1; - @%p34 bra 
BB22_6; - bra.uni BB22_64; + @%p34 bra BB28_6; + bra.uni BB28_64; -BB22_6: +BB28_6: sub.f64 %fd54, %fd1, %fd2; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_21: +BB28_21: setp.eq.s32 %p22, %r9, 10; - @%p22 bra BB22_22; - bra.uni BB22_64; + @%p22 bra BB28_22; + bra.uni BB28_64; -BB22_22: +BB28_22: setp.neu.f64 %p49, %fd1, %fd2; selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p49; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_12: +BB28_12: setp.eq.s32 %p29, %r9, 5; - @%p29 bra BB22_13; - bra.uni BB22_64; + @%p29 bra BB28_13; + bra.uni BB28_64; -BB22_13: +BB28_13: setp.lt.f64 %p54, %fd1, %fd2; selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p54; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_29: +BB28_29: setp.eq.s32 %p16, %r9, 15; - @%p16 bra BB22_30; - bra.uni BB22_64; + @%p16 bra BB28_30; + bra.uni BB28_64; -BB22_30: +BB28_30: mul.f64 %fd43, %fd1, %fd2; mov.f64 %fd44, 0d3FF0000000000000; sub.f64 %fd54, %fd44, %fd43; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_8: +BB28_8: setp.eq.s32 %p32, %r9, 3; - @%p32 bra BB22_9; - bra.uni BB22_64; + @%p32 bra BB28_9; + bra.uni BB28_64; -BB22_9: +BB28_9: div.rn.f64 %fd54, %fd1, %fd2; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_44: +BB28_44: min.f64 %fd54, %fd1, %fd2; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_25: +BB28_25: setp.eq.s32 %p20, %r9, 13; - @%p20 bra BB22_26; - bra.uni BB22_64; + @%p20 bra BB28_26; + bra.uni BB28_64; -BB22_26: +BB28_26: cvt.rni.s64.f64 %rd13, %fd1; cvt.rni.s64.f64 %rd14, %fd2; cvt.u32.u64 %r28, %rd13; @@ -1588,37 +2276,37 @@ BB22_26: and.b32 %r30, %r29, %r28; setp.eq.s32 %p48, %r30, 0; selp.f64 %fd54, 0d0000000000000000, 0d3FF0000000000000, %p48; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_47: +BB28_47: setp.le.f64 %p53, %fd1, %fd2; selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p53; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_16: +BB28_16: setp.eq.s32 %p27, %r9, 8; - @%p27 bra BB22_17; - bra.uni BB22_64; + @%p27 bra BB28_17; + bra.uni BB28_64; -BB22_17: +BB28_17: setp.ge.f64 
%p51, %fd1, %fd2; selp.f64 %fd54, 0d3FF0000000000000, 0d0000000000000000, %p51; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_41: +BB28_41: setp.neu.f64 %p46, %fd1, 0d0000000000000000; sub.f64 %fd42, %fd1, %fd2; selp.f64 %fd54, %fd42, 0d0000000000000000, %p46; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_33: +BB28_33: setp.ne.s32 %p14, %r9, 18; - @%p14 bra BB22_64; + @%p14 bra BB28_64; div.rn.f64 %fd54, %fd1, %fd2; abs.f64 %fd37, %fd54; setp.gtu.f64 %p35, %fd37, 0d7FF0000000000000; - @%p35 bra BB22_64; + @%p35 bra BB28_64; { .reg .b32 %temp; @@ -1632,22 +2320,22 @@ BB22_33: setp.ne.s32 %p36, %r21, 2146435072; setp.ne.s32 %p37, %r19, 0; or.pred %p38, %p36, %p37; - @!%p38 bra BB22_64; - bra.uni BB22_36; + @!%p38 bra BB28_64; + bra.uni BB28_36; -BB22_36: +BB28_36: cvt.rmi.f64.f64 %fd54, %fd54; - bra.uni BB22_64; + bra.uni BB28_64; -BB22_51: +BB28_51: setp.gt.s32 %p58, %r4, -1; - @%p58 bra BB22_54; + @%p58 bra BB28_54; cvt.rzi.f64.f64 %fd45, %fd2; setp.neu.f64 %p59, %fd45, %fd2; selp.f64 %fd52, 0dFFF8000000000000, %fd52, %p59; -BB22_54: +BB28_54: mov.f64 %fd25, %fd52; add.f64 %fd26, %fd1, %fd2; { @@ -1657,17 +2345,17 @@ BB22_54: and.b32 %r41, %r40, 2146435072; setp.ne.s32 %p62, %r41, 2146435072; mov.f64 %fd51, %fd25; - @%p62 bra BB22_61; + @%p62 bra BB28_61; setp.gtu.f64 %p63, %fd19, 0d7FF0000000000000; mov.f64 %fd51, %fd26; - @%p63 bra BB22_61; + @%p63 bra BB28_61; abs.f64 %fd46, %fd2; setp.gtu.f64 %p64, %fd46, 0d7FF0000000000000; mov.f64 %fd50, %fd26; mov.f64 %fd51, %fd50; - @%p64 bra BB22_61; + @%p64 bra BB28_61; { .reg .b32 %temp; @@ -1677,10 +2365,10 @@ BB22_54: setp.eq.s32 %p65, %r43, 2146435072; setp.eq.s32 %p66, %r42, 0; and.pred %p67, %p65, %p66; - @%p67 bra BB22_60; - bra.uni BB22_58; + @%p67 bra BB28_60; + bra.uni BB28_58; -BB22_60: +BB28_60: setp.gt.f64 %p71, %fd19, 0d3FF0000000000000; selp.b32 %r51, 2146435072, 0, %p71; xor.b32 %r52, %r51, 2146435072; @@ -1690,9 +2378,9 @@ BB22_60: selp.b32 %r54, 1072693248, %r53, %p73; mov.u32 %r55, 0; mov.b64 %fd51, {%r55, 
%r54}; - bra.uni BB22_61; + bra.uni BB28_61; -BB22_58: +BB28_58: { .reg .b32 %temp; mov.b64 {%r44, %temp}, %fd1; @@ -1702,10 +2390,10 @@ BB22_58: setp.eq.s32 %p69, %r44, 0; and.pred %p70, %p68, %p69; mov.f64 %fd51, %fd25; - @!%p70 bra BB22_61; - bra.uni BB22_59; + @!%p70 bra BB28_61; + bra.uni BB28_59; -BB22_59: +BB28_59: shr.s32 %r46, %r5, 31; and.b32 %r47, %r46, -2146435072; selp.b32 %r48, -1048576, 2146435072, %p1; @@ -1713,20 +2401,20 @@ BB22_59: mov.u32 %r50, 0; mov.b64 %fd51, {%r50, %r49}; -BB22_61: +BB28_61: setp.eq.f64 %p74, %fd2, 0d0000000000000000; setp.eq.f64 %p75, %fd1, 0d3FF0000000000000; or.pred %p76, %p75, %p74; selp.f64 %fd54, 0d3FF0000000000000, %fd51, %p76; -BB22_64: +BB28_64: cvta.to.global.u64 %rd16, %rd4; mul.wide.s32 %rd17, %r3, 8; add.s64 %rd18, %rd16, %rd17; st.global.f64 [%rd18], %fd54; bar.sync 0; -BB22_65: +BB28_65: ret; } @@ -1764,10 +2452,10 @@ BB22_65: setp.lt.s32 %p2, %r2, %r8; setp.gt.s32 %p3, %r4, -1; and.pred %p4, %p2, %p3; - @!%p4 bra BB23_63; - bra.uni BB23_1; + @!%p4 bra BB29_63; + bra.uni BB29_1; -BB23_1: +BB29_1: rem.s32 %r12, %r1, %r4; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r3, %r2, %r4, %r12; @@ -1788,47 +2476,47 @@ BB23_1: ld.global.f32 %f2, [%rd9]; mov.f32 %f133, 0f7F7FFFFF; setp.gt.s32 %p9, %r7, 8; - @%p9 bra BB23_18; + @%p9 bra BB29_18; setp.gt.s32 %p23, %r7, 3; - @%p23 bra BB23_10; + @%p23 bra BB29_10; setp.gt.s32 %p30, %r7, 1; - @%p30 bra BB23_7; + @%p30 bra BB29_7; setp.eq.s32 %p33, %r7, 0; - @%p33 bra BB23_61; - bra.uni BB23_5; + @%p33 bra BB29_61; + bra.uni BB29_5; -BB23_61: +BB29_61: add.f32 %f133, %f1, %f2; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_18: +BB29_18: setp.gt.s32 %p10, %r7, 13; - @%p10 bra BB23_27; + @%p10 bra BB29_27; setp.gt.s32 %p17, %r7, 10; - @%p17 bra BB23_23; + @%p17 bra BB29_23; setp.eq.s32 %p21, %r7, 9; - @%p21 bra BB23_43; - bra.uni BB23_21; + @%p21 bra BB29_43; + bra.uni BB29_21; -BB23_43: +BB29_43: setp.eq.f32 %p44, %f1, %f2; selp.f32 %f133, 0f3F800000, 0f00000000, %p44; - bra.uni 
BB23_62; + bra.uni BB29_62; -BB23_10: +BB29_10: setp.gt.s32 %p24, %r7, 5; - @%p24 bra BB23_14; + @%p24 bra BB29_14; setp.eq.s32 %p28, %r7, 4; - @%p28 bra BB23_46; - bra.uni BB23_12; + @%p28 bra BB29_46; + bra.uni BB29_12; -BB23_46: +BB29_46: mul.f32 %f53, %f2, 0f3F000000; cvt.rzi.f32.f32 %f54, %f53; fma.rn.f32 %f55, %f54, 0fC0000000, %f2; @@ -1928,11 +2616,11 @@ BB23_46: setp.gt.f32 %p54, %f115, 0f42D20000; selp.f32 %f131, 0f7F800000, %f125, %p54; setp.eq.f32 %p55, %f131, 0f7F800000; - @%p55 bra BB23_48; + @%p55 bra BB29_48; fma.rn.f32 %f131, %f131, %f22, %f131; -BB23_48: +BB29_48: setp.lt.f32 %p56, %f1, 0f00000000; setp.eq.f32 %p57, %f19, 0f3F800000; and.pred %p1, %p56, %p57; @@ -1941,10 +2629,10 @@ BB23_48: mov.b32 %f126, %r30; selp.f32 %f132, %f126, %f131, %p1; setp.eq.f32 %p58, %f1, 0f00000000; - @%p58 bra BB23_51; - bra.uni BB23_49; + @%p58 bra BB29_51; + bra.uni BB29_49; -BB23_51: +BB29_51: add.f32 %f128, %f1, %f1; mov.b32 %r31, %f128; selp.b32 %r32, %r31, 0, %p57; @@ -1952,17 +2640,17 @@ BB23_51: setp.lt.f32 %p62, %f2, 0f00000000; selp.b32 %r34, %r33, %r32, %p62; mov.b32 %f132, %r34; - bra.uni BB23_52; + bra.uni BB29_52; -BB23_27: +BB29_27: setp.gt.s32 %p11, %r7, 15; - @%p11 bra BB23_31; + @%p11 bra BB29_31; setp.eq.s32 %p15, %r7, 14; - @%p15 bra BB23_40; - bra.uni BB23_29; + @%p15 bra BB29_40; + bra.uni BB29_29; -BB23_40: +BB29_40: cvt.rni.s64.f32 %rd10, %f1; cvt.rni.s64.f32 %rd11, %f2; cvt.u32.u64 %r17, %rd10; @@ -1970,126 +2658,126 @@ BB23_40: or.b32 %r19, %r18, %r17; setp.eq.s32 %p41, %r19, 0; selp.f32 %f133, 0f00000000, 0f3F800000, %p41; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_7: +BB29_7: setp.eq.s32 %p31, %r7, 2; - @%p31 bra BB23_60; - bra.uni BB23_8; + @%p31 bra BB29_60; + bra.uni BB29_8; -BB23_60: +BB29_60: mul.f32 %f133, %f1, %f2; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_23: +BB29_23: setp.eq.s32 %p18, %r7, 11; - @%p18 bra BB23_42; + @%p18 bra BB29_42; setp.eq.s32 %p19, %r7, 12; - @%p19 bra BB23_41; - bra.uni BB23_25; + @%p19 bra BB29_41; + 
bra.uni BB29_25; -BB23_41: +BB29_41: max.f32 %f133, %f1, %f2; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_14: +BB29_14: setp.eq.s32 %p25, %r7, 6; - @%p25 bra BB23_45; + @%p25 bra BB29_45; setp.eq.s32 %p26, %r7, 7; - @%p26 bra BB23_44; - bra.uni BB23_16; + @%p26 bra BB29_44; + bra.uni BB29_16; -BB23_44: +BB29_44: setp.gt.f32 %p46, %f1, %f2; selp.f32 %f133, 0f3F800000, 0f00000000, %p46; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_31: +BB29_31: setp.eq.s32 %p12, %r7, 16; - @%p12 bra BB23_39; + @%p12 bra BB29_39; setp.eq.s32 %p13, %r7, 17; - @%p13 bra BB23_36; - bra.uni BB23_33; + @%p13 bra BB29_36; + bra.uni BB29_33; -BB23_36: +BB29_36: setp.eq.f32 %p36, %f2, 0f00000000; setp.eq.f32 %p37, %f2, 0f80000000; or.pred %p38, %p36, %p37; mov.f32 %f133, 0f7FC00000; - @%p38 bra BB23_62; + @%p38 bra BB29_62; div.rn.f32 %f133, %f1, %f2; abs.f32 %f43, %f133; setp.geu.f32 %p39, %f43, 0f7F800000; - @%p39 bra BB23_62; + @%p39 bra BB29_62; cvt.rmi.f32.f32 %f44, %f133; mul.f32 %f45, %f2, %f44; sub.f32 %f133, %f1, %f45; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_5: +BB29_5: setp.eq.s32 %p34, %r7, 1; - @%p34 bra BB23_6; - bra.uni BB23_62; + @%p34 bra BB29_6; + bra.uni BB29_62; -BB23_6: +BB29_6: sub.f32 %f133, %f1, %f2; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_21: +BB29_21: setp.eq.s32 %p22, %r7, 10; - @%p22 bra BB23_22; - bra.uni BB23_62; + @%p22 bra BB29_22; + bra.uni BB29_62; -BB23_22: +BB29_22: setp.neu.f32 %p43, %f1, %f2; selp.f32 %f133, 0f3F800000, 0f00000000, %p43; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_12: +BB29_12: setp.eq.s32 %p29, %r7, 5; - @%p29 bra BB23_13; - bra.uni BB23_62; + @%p29 bra BB29_13; + bra.uni BB29_62; -BB23_13: +BB29_13: setp.lt.f32 %p48, %f1, %f2; selp.f32 %f133, 0f3F800000, 0f00000000, %p48; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_29: +BB29_29: setp.eq.s32 %p16, %r7, 15; - @%p16 bra BB23_30; - bra.uni BB23_62; + @%p16 bra BB29_30; + bra.uni BB29_62; -BB23_30: +BB29_30: mul.f32 %f47, %f1, %f2; mov.f32 %f48, 0f3F800000; sub.f32 %f133, %f48, %f47; - 
bra.uni BB23_62; + bra.uni BB29_62; -BB23_8: +BB29_8: setp.eq.s32 %p32, %r7, 3; - @%p32 bra BB23_9; - bra.uni BB23_62; + @%p32 bra BB29_9; + bra.uni BB29_62; -BB23_9: +BB29_9: div.rn.f32 %f133, %f1, %f2; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_42: +BB29_42: min.f32 %f133, %f1, %f2; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_25: +BB29_25: setp.eq.s32 %p20, %r7, 13; - @%p20 bra BB23_26; - bra.uni BB23_62; + @%p20 bra BB29_26; + bra.uni BB29_62; -BB23_26: +BB29_26: cvt.rni.s64.f32 %rd12, %f1; cvt.rni.s64.f32 %rd13, %f2; cvt.u32.u64 %r20, %rd12; @@ -2097,71 +2785,71 @@ BB23_26: and.b32 %r22, %r21, %r20; setp.eq.s32 %p42, %r22, 0; selp.f32 %f133, 0f00000000, 0f3F800000, %p42; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_45: +BB29_45: setp.le.f32 %p47, %f1, %f2; selp.f32 %f133, 0f3F800000, 0f00000000, %p47; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_16: +BB29_16: setp.eq.s32 %p27, %r7, 8; - @%p27 bra BB23_17; - bra.uni BB23_62; + @%p27 bra BB29_17; + bra.uni BB29_62; -BB23_17: +BB29_17: setp.ge.f32 %p45, %f1, %f2; selp.f32 %f133, 0f3F800000, 0f00000000, %p45; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_39: +BB29_39: setp.neu.f32 %p40, %f1, 0f00000000; sub.f32 %f46, %f1, %f2; selp.f32 %f133, %f46, 0f00000000, %p40; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_33: +BB29_33: setp.ne.s32 %p14, %r7, 18; - @%p14 bra BB23_62; + @%p14 bra BB29_62; div.rn.f32 %f133, %f1, %f2; abs.f32 %f41, %f133; setp.geu.f32 %p35, %f41, 0f7F800000; - @%p35 bra BB23_62; + @%p35 bra BB29_62; cvt.rmi.f32.f32 %f133, %f133; - bra.uni BB23_62; + bra.uni BB29_62; -BB23_49: +BB29_49: setp.geu.f32 %p59, %f1, 0f00000000; - @%p59 bra BB23_52; + @%p59 bra BB29_52; cvt.rzi.f32.f32 %f127, %f2; setp.neu.f32 %p60, %f127, %f2; selp.f32 %f132, 0f7FFFFFFF, %f132, %p60; -BB23_52: +BB29_52: add.f32 %f129, %f20, %f21; mov.b32 %r35, %f129; setp.lt.s32 %p63, %r35, 2139095040; - @%p63 bra BB23_59; + @%p63 bra BB29_59; setp.gtu.f32 %p64, %f20, 0f7F800000; setp.gtu.f32 %p65, %f21, 0f7F800000; or.pred %p66, %p64, %p65; 
- @%p66 bra BB23_58; - bra.uni BB23_54; + @%p66 bra BB29_58; + bra.uni BB29_54; -BB23_58: +BB29_58: add.f32 %f132, %f1, %f2; - bra.uni BB23_59; + bra.uni BB29_59; -BB23_54: +BB29_54: setp.eq.f32 %p67, %f21, 0f7F800000; - @%p67 bra BB23_57; - bra.uni BB23_55; + @%p67 bra BB29_57; + bra.uni BB29_55; -BB23_57: +BB29_57: setp.gt.f32 %p70, %f20, 0f3F800000; selp.b32 %r39, 2139095040, 0, %p70; xor.b32 %r40, %r39, 2139095040; @@ -2170,11 +2858,11 @@ BB23_57: mov.b32 %f130, %r41; setp.eq.f32 %p72, %f1, 0fBF800000; selp.f32 %f132, 0f3F800000, %f130, %p72; - bra.uni BB23_59; + bra.uni BB29_59; -BB23_55: +BB29_55: setp.neu.f32 %p68, %f20, 0f7F800000; - @%p68 bra BB23_59; + @%p68 bra BB29_59; setp.ge.f32 %p69, %f2, 0f00000000; selp.b32 %r36, 2139095040, 0, %p69; @@ -2182,20 +2870,20 @@ BB23_55: selp.b32 %r38, %r37, %r36, %p1; mov.b32 %f132, %r38; -BB23_59: +BB29_59: setp.eq.f32 %p73, %f2, 0f00000000; setp.eq.f32 %p74, %f1, 0f3F800000; or.pred %p75, %p74, %p73; selp.f32 %f133, 0f3F800000, %f132, %p75; -BB23_62: +BB29_62: cvta.to.global.u64 %rd14, %rd3; mul.wide.s32 %rd15, %r3, 4; add.s64 %rd16, %rd14, %rd15; st.global.f32 [%rd16], %f133; bar.sync 0; -BB23_63: +BB29_63: ret; } @@ -2226,7 +2914,7 @@ BB23_63: mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; setp.ge.s32 %p3, %r1, %r8; - @%p3 bra BB24_130; + @%p3 bra BB30_130; cvta.to.global.u64 %rd6, %rd5; cvta.to.global.u64 %rd7, %rd4; @@ -2235,86 +2923,86 @@ BB23_63: ld.global.f64 %fd1, [%rd9]; add.s64 %rd1, %rd6, %rd8; setp.eq.s32 %p4, %r7, 0; - @%p4 bra BB24_66; + @%p4 bra BB30_66; mov.f64 %fd98, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p5, %r6, 8; - @%p5 bra BB24_19; + @%p5 bra BB30_19; setp.gt.s32 %p19, %r6, 3; - @%p19 bra BB24_11; + @%p19 bra BB30_11; setp.gt.s32 %p26, %r6, 1; - @%p26 bra BB24_8; + @%p26 bra BB30_8; setp.eq.s32 %p29, %r6, 0; - @%p29 bra BB24_64; - bra.uni BB24_6; + @%p29 bra BB30_64; + bra.uni BB30_6; -BB24_64: +BB30_64: add.f64 %fd98, %fd1, %fd68; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_66: +BB30_66: 
mov.f64 %fd106, 0d7FEFFFFFFFFFFFFF; setp.gt.s32 %p73, %r6, 8; - @%p73 bra BB24_83; + @%p73 bra BB30_83; setp.gt.s32 %p87, %r6, 3; - @%p87 bra BB24_75; + @%p87 bra BB30_75; setp.gt.s32 %p94, %r6, 1; - @%p94 bra BB24_72; + @%p94 bra BB30_72; setp.eq.s32 %p97, %r6, 0; - @%p97 bra BB24_128; - bra.uni BB24_70; + @%p97 bra BB30_128; + bra.uni BB30_70; -BB24_128: +BB30_128: add.f64 %fd106, %fd1, %fd68; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_19: +BB30_19: setp.gt.s32 %p6, %r6, 13; - @%p6 bra BB24_28; + @%p6 bra BB30_28; setp.gt.s32 %p13, %r6, 10; - @%p13 bra BB24_24; + @%p13 bra BB30_24; setp.eq.s32 %p17, %r6, 9; - @%p17 bra BB24_46; - bra.uni BB24_22; + @%p17 bra BB30_46; + bra.uni BB30_22; -BB24_46: +BB30_46: setp.eq.f64 %p46, %fd1, %fd68; selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p46; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_83: +BB30_83: setp.gt.s32 %p74, %r6, 13; - @%p74 bra BB24_92; + @%p74 bra BB30_92; setp.gt.s32 %p81, %r6, 10; - @%p81 bra BB24_88; + @%p81 bra BB30_88; setp.eq.s32 %p85, %r6, 9; - @%p85 bra BB24_110; - bra.uni BB24_86; + @%p85 bra BB30_110; + bra.uni BB30_86; -BB24_110: +BB30_110: setp.eq.f64 %p114, %fd1, %fd68; selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p114; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_11: +BB30_11: setp.gt.s32 %p20, %r6, 5; - @%p20 bra BB24_15; + @%p20 bra BB30_15; setp.eq.s32 %p24, %r6, 4; - @%p24 bra BB24_49; - bra.uni BB24_13; + @%p24 bra BB30_49; + bra.uni BB30_13; -BB24_49: +BB30_49: { .reg .b32 %temp; mov.b64 {%temp, %r2}, %fd68; @@ -2350,10 +3038,10 @@ BB24_49: }// Callseq End 1 setp.lt.s32 %p52, %r2, 0; and.pred %p1, %p52, %p51; - @!%p1 bra BB24_51; - bra.uni BB24_50; + @!%p1 bra BB30_51; + bra.uni BB30_50; -BB24_50: +BB30_50: { .reg .b32 %temp; mov.b64 {%temp, %r26}, %fd97; @@ -2365,30 +3053,30 @@ BB24_50: } mov.b64 %fd97, {%r28, %r27}; -BB24_51: +BB30_51: mov.f64 %fd96, %fd97; setp.eq.f64 %p53, %fd68, 0d0000000000000000; - @%p53 bra BB24_54; - bra.uni BB24_52; + @%p53 bra 
BB30_54; + bra.uni BB30_52; -BB24_54: +BB30_54: selp.b32 %r29, %r2, 0, %p51; or.b32 %r30, %r29, 2146435072; setp.lt.s32 %p57, %r3, 0; selp.b32 %r31, %r30, %r29, %p57; mov.u32 %r32, 0; mov.b64 %fd96, {%r32, %r31}; - bra.uni BB24_55; + bra.uni BB30_55; -BB24_28: +BB30_28: setp.gt.s32 %p7, %r6, 15; - @%p7 bra BB24_32; + @%p7 bra BB30_32; setp.eq.s32 %p11, %r6, 14; - @%p11 bra BB24_43; - bra.uni BB24_30; + @%p11 bra BB30_43; + bra.uni BB30_30; -BB24_43: +BB30_43: cvt.rni.s64.f64 %rd10, %fd68; cvt.rni.s64.f64 %rd11, %fd1; cvt.u32.u64 %r18, %rd10; @@ -2396,17 +3084,17 @@ BB24_43: or.b32 %r20, %r19, %r18; setp.eq.s32 %p43, %r20, 0; selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p43; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_75: +BB30_75: setp.gt.s32 %p88, %r6, 5; - @%p88 bra BB24_79; + @%p88 bra BB30_79; setp.eq.s32 %p92, %r6, 4; - @%p92 bra BB24_113; - bra.uni BB24_77; + @%p92 bra BB30_113; + bra.uni BB30_77; -BB24_113: +BB30_113: { .reg .b32 %temp; mov.b64 {%temp, %r4}, %fd1; @@ -2442,10 +3130,10 @@ BB24_113: }// Callseq End 2 setp.lt.s32 %p120, %r4, 0; and.pred %p2, %p120, %p119; - @!%p2 bra BB24_115; - bra.uni BB24_114; + @!%p2 bra BB30_115; + bra.uni BB30_114; -BB24_114: +BB30_114: { .reg .b32 %temp; mov.b64 {%temp, %r63}, %fd105; @@ -2457,30 +3145,30 @@ BB24_114: } mov.b64 %fd105, {%r65, %r64}; -BB24_115: +BB30_115: mov.f64 %fd104, %fd105; setp.eq.f64 %p121, %fd1, 0d0000000000000000; - @%p121 bra BB24_118; - bra.uni BB24_116; + @%p121 bra BB30_118; + bra.uni BB30_116; -BB24_118: +BB30_118: selp.b32 %r66, %r4, 0, %p119; or.b32 %r67, %r66, 2146435072; setp.lt.s32 %p125, %r5, 0; selp.b32 %r68, %r67, %r66, %p125; mov.u32 %r69, 0; mov.b64 %fd104, {%r69, %r68}; - bra.uni BB24_119; + bra.uni BB30_119; -BB24_92: +BB30_92: setp.gt.s32 %p75, %r6, 15; - @%p75 bra BB24_96; + @%p75 bra BB30_96; setp.eq.s32 %p79, %r6, 14; - @%p79 bra BB24_107; - bra.uni BB24_94; + @%p79 bra BB30_107; + bra.uni BB30_94; -BB24_107: +BB30_107: cvt.rni.s64.f64 %rd15, %fd1; cvt.rni.s64.f64 
%rd16, %fd68; cvt.u32.u64 %r55, %rd15; @@ -2488,61 +3176,61 @@ BB24_107: or.b32 %r57, %r56, %r55; setp.eq.s32 %p111, %r57, 0; selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p111; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_8: +BB30_8: setp.eq.s32 %p27, %r6, 2; - @%p27 bra BB24_63; - bra.uni BB24_9; + @%p27 bra BB30_63; + bra.uni BB30_9; -BB24_63: +BB30_63: mul.f64 %fd98, %fd1, %fd68; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_24: +BB30_24: setp.eq.s32 %p14, %r6, 11; - @%p14 bra BB24_45; + @%p14 bra BB30_45; setp.eq.s32 %p15, %r6, 12; - @%p15 bra BB24_44; - bra.uni BB24_26; + @%p15 bra BB30_44; + bra.uni BB30_26; -BB24_44: +BB30_44: max.f64 %fd98, %fd68, %fd1; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_15: +BB30_15: setp.eq.s32 %p21, %r6, 6; - @%p21 bra BB24_48; + @%p21 bra BB30_48; setp.eq.s32 %p22, %r6, 7; - @%p22 bra BB24_47; - bra.uni BB24_17; + @%p22 bra BB30_47; + bra.uni BB30_17; -BB24_47: +BB30_47: setp.lt.f64 %p48, %fd1, %fd68; selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p48; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_32: +BB30_32: setp.eq.s32 %p8, %r6, 16; - @%p8 bra BB24_42; + @%p8 bra BB30_42; setp.eq.s32 %p9, %r6, 17; - @%p9 bra BB24_38; - bra.uni BB24_34; + @%p9 bra BB30_38; + bra.uni BB30_34; -BB24_38: +BB30_38: setp.eq.f64 %p35, %fd1, 0d0000000000000000; setp.eq.f64 %p36, %fd1, 0d8000000000000000; or.pred %p37, %p35, %p36; mov.f64 %fd98, 0d7FF8000000000000; - @%p37 bra BB24_65; + @%p37 bra BB30_65; div.rn.f64 %fd98, %fd68, %fd1; abs.f64 %fd72, %fd98; setp.gtu.f64 %p38, %fd72, 0d7FF0000000000000; - @%p38 bra BB24_65; + @%p38 bra BB30_65; { .reg .b32 %temp; @@ -2556,68 +3244,68 @@ BB24_38: setp.ne.s32 %p39, %r17, 2146435072; setp.ne.s32 %p40, %r15, 0; or.pred %p41, %p39, %p40; - @!%p41 bra BB24_65; - bra.uni BB24_41; + @!%p41 bra BB30_65; + bra.uni BB30_41; -BB24_41: +BB30_41: cvt.rmi.f64.f64 %fd73, %fd98; mul.f64 %fd74, %fd1, %fd73; sub.f64 %fd98, %fd68, %fd74; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_72: +BB30_72: 
setp.eq.s32 %p95, %r6, 2; - @%p95 bra BB24_127; - bra.uni BB24_73; + @%p95 bra BB30_127; + bra.uni BB30_73; -BB24_127: +BB30_127: mul.f64 %fd106, %fd1, %fd68; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_88: +BB30_88: setp.eq.s32 %p82, %r6, 11; - @%p82 bra BB24_109; + @%p82 bra BB30_109; setp.eq.s32 %p83, %r6, 12; - @%p83 bra BB24_108; - bra.uni BB24_90; + @%p83 bra BB30_108; + bra.uni BB30_90; -BB24_108: +BB30_108: max.f64 %fd106, %fd1, %fd68; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_79: +BB30_79: setp.eq.s32 %p89, %r6, 6; - @%p89 bra BB24_112; + @%p89 bra BB30_112; setp.eq.s32 %p90, %r6, 7; - @%p90 bra BB24_111; - bra.uni BB24_81; + @%p90 bra BB30_111; + bra.uni BB30_81; -BB24_111: +BB30_111: setp.gt.f64 %p116, %fd1, %fd68; selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p116; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_96: +BB30_96: setp.eq.s32 %p76, %r6, 16; - @%p76 bra BB24_106; + @%p76 bra BB30_106; setp.eq.s32 %p77, %r6, 17; - @%p77 bra BB24_102; - bra.uni BB24_98; + @%p77 bra BB30_102; + bra.uni BB30_98; -BB24_102: +BB30_102: setp.eq.f64 %p103, %fd68, 0d0000000000000000; setp.eq.f64 %p104, %fd68, 0d8000000000000000; or.pred %p105, %p103, %p104; mov.f64 %fd106, 0d7FF8000000000000; - @%p105 bra BB24_129; + @%p105 bra BB30_129; div.rn.f64 %fd106, %fd1, %fd68; abs.f64 %fd83, %fd106; setp.gtu.f64 %p106, %fd83, 0d7FF0000000000000; - @%p106 bra BB24_129; + @%p106 bra BB30_129; { .reg .b32 %temp; @@ -2631,74 +3319,74 @@ BB24_102: setp.ne.s32 %p107, %r54, 2146435072; setp.ne.s32 %p108, %r52, 0; or.pred %p109, %p107, %p108; - @!%p109 bra BB24_129; - bra.uni BB24_105; + @!%p109 bra BB30_129; + bra.uni BB30_105; -BB24_105: +BB30_105: cvt.rmi.f64.f64 %fd84, %fd106; mul.f64 %fd85, %fd84, %fd68; sub.f64 %fd106, %fd1, %fd85; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_6: +BB30_6: setp.eq.s32 %p30, %r6, 1; - @%p30 bra BB24_7; - bra.uni BB24_65; + @%p30 bra BB30_7; + bra.uni BB30_65; -BB24_7: +BB30_7: sub.f64 %fd98, %fd68, %fd1; - bra.uni BB24_65; + 
bra.uni BB30_65; -BB24_22: +BB30_22: setp.eq.s32 %p18, %r6, 10; - @%p18 bra BB24_23; - bra.uni BB24_65; + @%p18 bra BB30_23; + bra.uni BB30_65; -BB24_23: +BB30_23: setp.neu.f64 %p45, %fd1, %fd68; selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p45; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_13: +BB30_13: setp.eq.s32 %p25, %r6, 5; - @%p25 bra BB24_14; - bra.uni BB24_65; + @%p25 bra BB30_14; + bra.uni BB30_65; -BB24_14: +BB30_14: setp.gt.f64 %p50, %fd1, %fd68; selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p50; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_30: +BB30_30: setp.eq.s32 %p12, %r6, 15; - @%p12 bra BB24_31; - bra.uni BB24_65; + @%p12 bra BB30_31; + bra.uni BB30_65; -BB24_31: +BB30_31: mul.f64 %fd76, %fd1, %fd68; mov.f64 %fd77, 0d3FF0000000000000; sub.f64 %fd98, %fd77, %fd76; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_9: +BB30_9: setp.eq.s32 %p28, %r6, 3; - @%p28 bra BB24_10; - bra.uni BB24_65; + @%p28 bra BB30_10; + bra.uni BB30_65; -BB24_10: +BB30_10: div.rn.f64 %fd98, %fd68, %fd1; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_45: +BB30_45: min.f64 %fd98, %fd68, %fd1; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_26: +BB30_26: setp.eq.s32 %p16, %r6, 13; - @%p16 bra BB24_27; - bra.uni BB24_65; + @%p16 bra BB30_27; + bra.uni BB30_65; -BB24_27: +BB30_27: cvt.rni.s64.f64 %rd12, %fd68; cvt.rni.s64.f64 %rd13, %fd1; cvt.u32.u64 %r21, %rd12; @@ -2706,37 +3394,37 @@ BB24_27: and.b32 %r23, %r22, %r21; setp.eq.s32 %p44, %r23, 0; selp.f64 %fd98, 0d0000000000000000, 0d3FF0000000000000, %p44; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_48: +BB30_48: setp.ge.f64 %p49, %fd1, %fd68; selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p49; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_17: +BB30_17: setp.eq.s32 %p23, %r6, 8; - @%p23 bra BB24_18; - bra.uni BB24_65; + @%p23 bra BB30_18; + bra.uni BB30_65; -BB24_18: +BB30_18: setp.le.f64 %p47, %fd1, %fd68; selp.f64 %fd98, 0d3FF0000000000000, 0d0000000000000000, %p47; - bra.uni BB24_65; + bra.uni BB30_65; 
-BB24_42: +BB30_42: setp.neu.f64 %p42, %fd68, 0d0000000000000000; sub.f64 %fd75, %fd68, %fd1; selp.f64 %fd98, %fd75, 0d0000000000000000, %p42; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_34: +BB30_34: setp.ne.s32 %p10, %r6, 18; - @%p10 bra BB24_65; + @%p10 bra BB30_65; div.rn.f64 %fd98, %fd68, %fd1; abs.f64 %fd70, %fd98; setp.gtu.f64 %p31, %fd70, 0d7FF0000000000000; - @%p31 bra BB24_65; + @%p31 bra BB30_65; { .reg .b32 %temp; @@ -2750,72 +3438,72 @@ BB24_34: setp.ne.s32 %p32, %r14, 2146435072; setp.ne.s32 %p33, %r12, 0; or.pred %p34, %p32, %p33; - @!%p34 bra BB24_65; - bra.uni BB24_37; + @!%p34 bra BB30_65; + bra.uni BB30_37; -BB24_37: +BB30_37: cvt.rmi.f64.f64 %fd98, %fd98; - bra.uni BB24_65; + bra.uni BB30_65; -BB24_70: +BB30_70: setp.eq.s32 %p98, %r6, 1; - @%p98 bra BB24_71; - bra.uni BB24_129; + @%p98 bra BB30_71; + bra.uni BB30_129; -BB24_71: +BB30_71: sub.f64 %fd106, %fd1, %fd68; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_86: +BB30_86: setp.eq.s32 %p86, %r6, 10; - @%p86 bra BB24_87; - bra.uni BB24_129; + @%p86 bra BB30_87; + bra.uni BB30_129; -BB24_87: +BB30_87: setp.neu.f64 %p113, %fd1, %fd68; selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p113; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_77: +BB30_77: setp.eq.s32 %p93, %r6, 5; - @%p93 bra BB24_78; - bra.uni BB24_129; + @%p93 bra BB30_78; + bra.uni BB30_129; -BB24_78: +BB30_78: setp.lt.f64 %p118, %fd1, %fd68; selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p118; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_94: +BB30_94: setp.eq.s32 %p80, %r6, 15; - @%p80 bra BB24_95; - bra.uni BB24_129; + @%p80 bra BB30_95; + bra.uni BB30_129; -BB24_95: +BB30_95: mul.f64 %fd87, %fd1, %fd68; mov.f64 %fd88, 0d3FF0000000000000; sub.f64 %fd106, %fd88, %fd87; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_73: +BB30_73: setp.eq.s32 %p96, %r6, 3; - @%p96 bra BB24_74; - bra.uni BB24_129; + @%p96 bra BB30_74; + bra.uni BB30_129; -BB24_74: +BB30_74: div.rn.f64 %fd106, %fd1, %fd68; - bra.uni BB24_129; + 
bra.uni BB30_129; -BB24_109: +BB30_109: min.f64 %fd106, %fd1, %fd68; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_90: +BB30_90: setp.eq.s32 %p84, %r6, 13; - @%p84 bra BB24_91; - bra.uni BB24_129; + @%p84 bra BB30_91; + bra.uni BB30_129; -BB24_91: +BB30_91: cvt.rni.s64.f64 %rd17, %fd1; cvt.rni.s64.f64 %rd18, %fd68; cvt.u32.u64 %r58, %rd17; @@ -2823,37 +3511,37 @@ BB24_91: and.b32 %r60, %r59, %r58; setp.eq.s32 %p112, %r60, 0; selp.f64 %fd106, 0d0000000000000000, 0d3FF0000000000000, %p112; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_112: +BB30_112: setp.le.f64 %p117, %fd1, %fd68; selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p117; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_81: +BB30_81: setp.eq.s32 %p91, %r6, 8; - @%p91 bra BB24_82; - bra.uni BB24_129; + @%p91 bra BB30_82; + bra.uni BB30_129; -BB24_82: +BB30_82: setp.ge.f64 %p115, %fd1, %fd68; selp.f64 %fd106, 0d3FF0000000000000, 0d0000000000000000, %p115; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_106: +BB30_106: setp.neu.f64 %p110, %fd1, 0d0000000000000000; sub.f64 %fd86, %fd1, %fd68; selp.f64 %fd106, %fd86, 0d0000000000000000, %p110; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_98: +BB30_98: setp.ne.s32 %p78, %r6, 18; - @%p78 bra BB24_129; + @%p78 bra BB30_129; div.rn.f64 %fd106, %fd1, %fd68; abs.f64 %fd81, %fd106; setp.gtu.f64 %p99, %fd81, 0d7FF0000000000000; - @%p99 bra BB24_129; + @%p99 bra BB30_129; { .reg .b32 %temp; @@ -2867,22 +3555,22 @@ BB24_98: setp.ne.s32 %p100, %r51, 2146435072; setp.ne.s32 %p101, %r49, 0; or.pred %p102, %p100, %p101; - @!%p102 bra BB24_129; - bra.uni BB24_101; + @!%p102 bra BB30_129; + bra.uni BB30_101; -BB24_101: +BB30_101: cvt.rmi.f64.f64 %fd106, %fd106; - bra.uni BB24_129; + bra.uni BB30_129; -BB24_52: +BB30_52: setp.gt.s32 %p54, %r2, -1; - @%p54 bra BB24_55; + @%p54 bra BB30_55; cvt.rzi.f64.f64 %fd78, %fd1; setp.neu.f64 %p55, %fd78, %fd1; selp.f64 %fd96, 0dFFF8000000000000, %fd96, %p55; -BB24_55: +BB30_55: mov.f64 %fd24, %fd96; add.f64 %fd25, %fd1, %fd68; { @@ 
-2892,17 +3580,17 @@ BB24_55: and.b32 %r34, %r33, 2146435072; setp.ne.s32 %p58, %r34, 2146435072; mov.f64 %fd95, %fd24; - @%p58 bra BB24_62; + @%p58 bra BB30_62; setp.gtu.f64 %p59, %fd18, 0d7FF0000000000000; mov.f64 %fd95, %fd25; - @%p59 bra BB24_62; + @%p59 bra BB30_62; abs.f64 %fd79, %fd1; setp.gtu.f64 %p60, %fd79, 0d7FF0000000000000; mov.f64 %fd94, %fd25; mov.f64 %fd95, %fd94; - @%p60 bra BB24_62; + @%p60 bra BB30_62; { .reg .b32 %temp; @@ -2912,10 +3600,10 @@ BB24_55: setp.eq.s32 %p61, %r36, 2146435072; setp.eq.s32 %p62, %r35, 0; and.pred %p63, %p61, %p62; - @%p63 bra BB24_61; - bra.uni BB24_59; + @%p63 bra BB30_61; + bra.uni BB30_59; -BB24_61: +BB30_61: setp.gt.f64 %p67, %fd18, 0d3FF0000000000000; selp.b32 %r44, 2146435072, 0, %p67; xor.b32 %r45, %r44, 2146435072; @@ -2925,17 +3613,17 @@ BB24_61: selp.b32 %r47, 1072693248, %r46, %p69; mov.u32 %r48, 0; mov.b64 %fd95, {%r48, %r47}; - bra.uni BB24_62; + bra.uni BB30_62; -BB24_116: +BB30_116: setp.gt.s32 %p122, %r4, -1; - @%p122 bra BB24_119; + @%p122 bra BB30_119; cvt.rzi.f64.f64 %fd89, %fd68; setp.neu.f64 %p123, %fd89, %fd68; selp.f64 %fd104, 0dFFF8000000000000, %fd104, %p123; -BB24_119: +BB30_119: mov.f64 %fd57, %fd104; add.f64 %fd58, %fd1, %fd68; { @@ -2945,17 +3633,17 @@ BB24_119: and.b32 %r71, %r70, 2146435072; setp.ne.s32 %p126, %r71, 2146435072; mov.f64 %fd103, %fd57; - @%p126 bra BB24_126; + @%p126 bra BB30_126; setp.gtu.f64 %p127, %fd51, 0d7FF0000000000000; mov.f64 %fd103, %fd58; - @%p127 bra BB24_126; + @%p127 bra BB30_126; abs.f64 %fd90, %fd68; setp.gtu.f64 %p128, %fd90, 0d7FF0000000000000; mov.f64 %fd102, %fd58; mov.f64 %fd103, %fd102; - @%p128 bra BB24_126; + @%p128 bra BB30_126; { .reg .b32 %temp; @@ -2965,10 +3653,10 @@ BB24_119: setp.eq.s32 %p129, %r73, 2146435072; setp.eq.s32 %p130, %r72, 0; and.pred %p131, %p129, %p130; - @%p131 bra BB24_125; - bra.uni BB24_123; + @%p131 bra BB30_125; + bra.uni BB30_123; -BB24_125: +BB30_125: setp.gt.f64 %p135, %fd51, 0d3FF0000000000000; selp.b32 %r81, 
2146435072, 0, %p135; xor.b32 %r82, %r81, 2146435072; @@ -2978,9 +3666,9 @@ BB24_125: selp.b32 %r84, 1072693248, %r83, %p137; mov.u32 %r85, 0; mov.b64 %fd103, {%r85, %r84}; - bra.uni BB24_126; + bra.uni BB30_126; -BB24_59: +BB30_59: { .reg .b32 %temp; mov.b64 {%r37, %temp}, %fd68; @@ -2990,10 +3678,10 @@ BB24_59: setp.eq.s32 %p65, %r37, 0; and.pred %p66, %p64, %p65; mov.f64 %fd95, %fd24; - @!%p66 bra BB24_62; - bra.uni BB24_60; + @!%p66 bra BB30_62; + bra.uni BB30_60; -BB24_60: +BB30_60: shr.s32 %r39, %r3, 31; and.b32 %r40, %r39, -2146435072; selp.b32 %r41, -1048576, 2146435072, %p1; @@ -3001,17 +3689,17 @@ BB24_60: mov.u32 %r43, 0; mov.b64 %fd95, {%r43, %r42}; -BB24_62: +BB30_62: setp.eq.f64 %p70, %fd1, 0d0000000000000000; setp.eq.f64 %p71, %fd68, 0d3FF0000000000000; or.pred %p72, %p71, %p70; selp.f64 %fd98, 0d3FF0000000000000, %fd95, %p72; -BB24_65: +BB30_65: st.global.f64 [%rd1], %fd98; - bra.uni BB24_130; + bra.uni BB30_130; -BB24_123: +BB30_123: { .reg .b32 %temp; mov.b64 {%r74, %temp}, %fd1; @@ -3021,10 +3709,10 @@ BB24_123: setp.eq.s32 %p133, %r74, 0; and.pred %p134, %p132, %p133; mov.f64 %fd103, %fd57; - @!%p134 bra BB24_126; - bra.uni BB24_124; + @!%p134 bra BB30_126; + bra.uni BB30_124; -BB24_124: +BB30_124: shr.s32 %r76, %r5, 31; and.b32 %r77, %r76, -2146435072; selp.b32 %r78, -1048576, 2146435072, %p2; @@ -3032,16 +3720,16 @@ BB24_124: mov.u32 %r80, 0; mov.b64 %fd103, {%r80, %r79}; -BB24_126: +BB30_126: setp.eq.f64 %p138, %fd68, 0d0000000000000000; setp.eq.f64 %p139, %fd1, 0d3FF0000000000000; or.pred %p140, %p139, %p138; selp.f64 %fd106, 0d3FF0000000000000, %fd103, %p140; -BB24_129: +BB30_129: st.global.f64 [%rd1], %fd106; -BB24_130: +BB30_130: bar.sync 0; ret; } @@ -3075,7 +3763,7 @@ BB24_130: mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; setp.ge.s32 %p3, %r1, %r4; - @%p3 bra BB25_126; + @%p3 bra BB31_126; cvta.to.global.u64 %rd4, %rd3; cvta.to.global.u64 %rd5, %rd2; @@ -3084,86 +3772,86 @@ BB24_130: ld.global.f32 %f2, [%rd7]; add.s64 %rd1, %rd4, 
%rd6; setp.eq.s32 %p4, %r3, 0; - @%p4 bra BB25_64; + @%p4 bra BB31_64; mov.f32 %f261, 0f7F7FFFFF; setp.gt.s32 %p5, %r2, 8; - @%p5 bra BB25_19; + @%p5 bra BB31_19; setp.gt.s32 %p19, %r2, 3; - @%p19 bra BB25_11; + @%p19 bra BB31_11; setp.gt.s32 %p26, %r2, 1; - @%p26 bra BB25_8; + @%p26 bra BB31_8; setp.eq.s32 %p29, %r2, 0; - @%p29 bra BB25_62; - bra.uni BB25_6; + @%p29 bra BB31_62; + bra.uni BB31_6; -BB25_62: +BB31_62: add.f32 %f261, %f1, %f2; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_64: +BB31_64: mov.f32 %f264, 0f7F7FFFFF; setp.gt.s32 %p72, %r2, 8; - @%p72 bra BB25_81; + @%p72 bra BB31_81; setp.gt.s32 %p86, %r2, 3; - @%p86 bra BB25_73; + @%p86 bra BB31_73; setp.gt.s32 %p93, %r2, 1; - @%p93 bra BB25_70; + @%p93 bra BB31_70; setp.eq.s32 %p96, %r2, 0; - @%p96 bra BB25_124; - bra.uni BB25_68; + @%p96 bra BB31_124; + bra.uni BB31_68; -BB25_124: +BB31_124: add.f32 %f264, %f1, %f2; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_19: +BB31_19: setp.gt.s32 %p6, %r2, 13; - @%p6 bra BB25_28; + @%p6 bra BB31_28; setp.gt.s32 %p13, %r2, 10; - @%p13 bra BB25_24; + @%p13 bra BB31_24; setp.eq.s32 %p17, %r2, 9; - @%p17 bra BB25_44; - bra.uni BB25_22; + @%p17 bra BB31_44; + bra.uni BB31_22; -BB25_44: +BB31_44: setp.eq.f32 %p40, %f1, %f2; selp.f32 %f261, 0f3F800000, 0f00000000, %p40; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_81: +BB31_81: setp.gt.s32 %p73, %r2, 13; - @%p73 bra BB25_90; + @%p73 bra BB31_90; setp.gt.s32 %p80, %r2, 10; - @%p80 bra BB25_86; + @%p80 bra BB31_86; setp.eq.s32 %p84, %r2, 9; - @%p84 bra BB25_106; - bra.uni BB25_84; + @%p84 bra BB31_106; + bra.uni BB31_84; -BB25_106: +BB31_106: setp.eq.f32 %p107, %f2, %f1; selp.f32 %f264, 0f3F800000, 0f00000000, %p107; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_11: +BB31_11: setp.gt.s32 %p20, %r2, 5; - @%p20 bra BB25_15; + @%p20 bra BB31_15; setp.eq.s32 %p24, %r2, 4; - @%p24 bra BB25_47; - bra.uni BB25_13; + @%p24 bra BB31_47; + bra.uni BB31_13; -BB25_47: +BB31_47: mul.f32 %f90, %f2, 0f3F000000; cvt.rzi.f32.f32 %f91, %f90; 
fma.rn.f32 %f92, %f91, 0fC0000000, %f2; @@ -3263,11 +3951,11 @@ BB25_47: setp.gt.f32 %p50, %f152, 0f42D20000; selp.f32 %f259, 0f7F800000, %f162, %p50; setp.eq.f32 %p51, %f259, 0f7F800000; - @%p51 bra BB25_49; + @%p51 bra BB31_49; fma.rn.f32 %f259, %f259, %f22, %f259; -BB25_49: +BB31_49: setp.lt.f32 %p52, %f1, 0f00000000; setp.eq.f32 %p53, %f19, 0f3F800000; and.pred %p1, %p52, %p53; @@ -3276,10 +3964,10 @@ BB25_49: mov.b32 %f163, %r21; selp.f32 %f260, %f163, %f259, %p1; setp.eq.f32 %p54, %f1, 0f00000000; - @%p54 bra BB25_52; - bra.uni BB25_50; + @%p54 bra BB31_52; + bra.uni BB31_50; -BB25_52: +BB31_52: add.f32 %f165, %f1, %f1; mov.b32 %r22, %f165; selp.b32 %r23, %r22, 0, %p53; @@ -3287,17 +3975,17 @@ BB25_52: setp.lt.f32 %p58, %f2, 0f00000000; selp.b32 %r25, %r24, %r23, %p58; mov.b32 %f260, %r25; - bra.uni BB25_53; + bra.uni BB31_53; -BB25_28: +BB31_28: setp.gt.s32 %p7, %r2, 15; - @%p7 bra BB25_32; + @%p7 bra BB31_32; setp.eq.s32 %p11, %r2, 14; - @%p11 bra BB25_41; - bra.uni BB25_30; + @%p11 bra BB31_41; + bra.uni BB31_30; -BB25_41: +BB31_41: cvt.rni.s64.f32 %rd8, %f1; cvt.rni.s64.f32 %rd9, %f2; cvt.u32.u64 %r8, %rd8; @@ -3305,17 +3993,17 @@ BB25_41: or.b32 %r10, %r9, %r8; setp.eq.s32 %p37, %r10, 0; selp.f32 %f261, 0f00000000, 0f3F800000, %p37; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_73: +BB31_73: setp.gt.s32 %p87, %r2, 5; - @%p87 bra BB25_77; + @%p87 bra BB31_77; setp.eq.s32 %p91, %r2, 4; - @%p91 bra BB25_109; - bra.uni BB25_75; + @%p91 bra BB31_109; + bra.uni BB31_75; -BB25_109: +BB31_109: mul.f32 %f181, %f1, 0f3F000000; cvt.rzi.f32.f32 %f182, %f181; fma.rn.f32 %f183, %f182, 0fC0000000, %f1; @@ -3415,11 +4103,11 @@ BB25_109: setp.gt.f32 %p117, %f243, 0f42D20000; selp.f32 %f262, 0f7F800000, %f253, %p117; setp.eq.f32 %p118, %f262, 0f7F800000; - @%p118 bra BB25_111; + @%p118 bra BB31_111; fma.rn.f32 %f262, %f262, %f59, %f262; -BB25_111: +BB31_111: setp.lt.f32 %p119, %f2, 0f00000000; setp.eq.f32 %p120, %f56, 0f3F800000; and.pred %p2, %p119, %p120; @@ -3428,10 
+4116,10 @@ BB25_111: mov.b32 %f254, %r46; selp.f32 %f263, %f254, %f262, %p2; setp.eq.f32 %p121, %f2, 0f00000000; - @%p121 bra BB25_114; - bra.uni BB25_112; + @%p121 bra BB31_114; + bra.uni BB31_112; -BB25_114: +BB31_114: add.f32 %f256, %f2, %f2; mov.b32 %r47, %f256; selp.b32 %r48, %r47, 0, %p120; @@ -3439,17 +4127,17 @@ BB25_114: setp.lt.f32 %p125, %f1, 0f00000000; selp.b32 %r50, %r49, %r48, %p125; mov.b32 %f263, %r50; - bra.uni BB25_115; + bra.uni BB31_115; -BB25_90: +BB31_90: setp.gt.s32 %p74, %r2, 15; - @%p74 bra BB25_94; + @%p74 bra BB31_94; setp.eq.s32 %p78, %r2, 14; - @%p78 bra BB25_103; - bra.uni BB25_92; + @%p78 bra BB31_103; + bra.uni BB31_92; -BB25_103: +BB31_103: cvt.rni.s64.f32 %rd12, %f2; cvt.rni.s64.f32 %rd13, %f1; cvt.u32.u64 %r33, %rd12; @@ -3457,185 +4145,185 @@ BB25_103: or.b32 %r35, %r34, %r33; setp.eq.s32 %p104, %r35, 0; selp.f32 %f264, 0f00000000, 0f3F800000, %p104; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_8: +BB31_8: setp.eq.s32 %p27, %r2, 2; - @%p27 bra BB25_61; - bra.uni BB25_9; + @%p27 bra BB31_61; + bra.uni BB31_9; -BB25_61: +BB31_61: mul.f32 %f261, %f1, %f2; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_24: +BB31_24: setp.eq.s32 %p14, %r2, 11; - @%p14 bra BB25_43; + @%p14 bra BB31_43; setp.eq.s32 %p15, %r2, 12; - @%p15 bra BB25_42; - bra.uni BB25_26; + @%p15 bra BB31_42; + bra.uni BB31_26; -BB25_42: +BB31_42: max.f32 %f261, %f1, %f2; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_15: +BB31_15: setp.eq.s32 %p21, %r2, 6; - @%p21 bra BB25_46; + @%p21 bra BB31_46; setp.eq.s32 %p22, %r2, 7; - @%p22 bra BB25_45; - bra.uni BB25_17; + @%p22 bra BB31_45; + bra.uni BB31_17; -BB25_45: +BB31_45: setp.gt.f32 %p42, %f1, %f2; selp.f32 %f261, 0f3F800000, 0f00000000, %p42; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_32: +BB31_32: setp.eq.s32 %p8, %r2, 16; - @%p8 bra BB25_40; + @%p8 bra BB31_40; setp.eq.s32 %p9, %r2, 17; - @%p9 bra BB25_37; - bra.uni BB25_34; + @%p9 bra BB31_37; + bra.uni BB31_34; -BB25_37: +BB31_37: setp.eq.f32 %p32, %f2, 0f00000000; 
setp.eq.f32 %p33, %f2, 0f80000000; or.pred %p34, %p32, %p33; mov.f32 %f261, 0f7FC00000; - @%p34 bra BB25_63; + @%p34 bra BB31_63; div.rn.f32 %f261, %f1, %f2; abs.f32 %f80, %f261; setp.geu.f32 %p35, %f80, 0f7F800000; - @%p35 bra BB25_63; + @%p35 bra BB31_63; cvt.rmi.f32.f32 %f81, %f261; mul.f32 %f82, %f2, %f81; sub.f32 %f261, %f1, %f82; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_70: +BB31_70: setp.eq.s32 %p94, %r2, 2; - @%p94 bra BB25_123; - bra.uni BB25_71; + @%p94 bra BB31_123; + bra.uni BB31_71; -BB25_123: +BB31_123: mul.f32 %f264, %f1, %f2; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_86: +BB31_86: setp.eq.s32 %p81, %r2, 11; - @%p81 bra BB25_105; + @%p81 bra BB31_105; setp.eq.s32 %p82, %r2, 12; - @%p82 bra BB25_104; - bra.uni BB25_88; + @%p82 bra BB31_104; + bra.uni BB31_88; -BB25_104: +BB31_104: max.f32 %f264, %f2, %f1; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_77: +BB31_77: setp.eq.s32 %p88, %r2, 6; - @%p88 bra BB25_108; + @%p88 bra BB31_108; setp.eq.s32 %p89, %r2, 7; - @%p89 bra BB25_107; - bra.uni BB25_79; + @%p89 bra BB31_107; + bra.uni BB31_79; -BB25_107: +BB31_107: setp.gt.f32 %p109, %f2, %f1; selp.f32 %f264, 0f3F800000, 0f00000000, %p109; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_94: +BB31_94: setp.eq.s32 %p75, %r2, 16; - @%p75 bra BB25_102; + @%p75 bra BB31_102; setp.eq.s32 %p76, %r2, 17; - @%p76 bra BB25_99; - bra.uni BB25_96; + @%p76 bra BB31_99; + bra.uni BB31_96; -BB25_99: +BB31_99: setp.eq.f32 %p99, %f1, 0f00000000; setp.eq.f32 %p100, %f1, 0f80000000; or.pred %p101, %p99, %p100; mov.f32 %f264, 0f7FC00000; - @%p101 bra BB25_125; + @%p101 bra BB31_125; div.rn.f32 %f264, %f2, %f1; abs.f32 %f171, %f264; setp.geu.f32 %p102, %f171, 0f7F800000; - @%p102 bra BB25_125; + @%p102 bra BB31_125; cvt.rmi.f32.f32 %f172, %f264; mul.f32 %f173, %f1, %f172; sub.f32 %f264, %f2, %f173; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_6: +BB31_6: setp.eq.s32 %p30, %r2, 1; - @%p30 bra BB25_7; - bra.uni BB25_63; + @%p30 bra BB31_7; + bra.uni BB31_63; -BB25_7: 
+BB31_7: sub.f32 %f261, %f1, %f2; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_22: +BB31_22: setp.eq.s32 %p18, %r2, 10; - @%p18 bra BB25_23; - bra.uni BB25_63; + @%p18 bra BB31_23; + bra.uni BB31_63; -BB25_23: +BB31_23: setp.neu.f32 %p39, %f1, %f2; selp.f32 %f261, 0f3F800000, 0f00000000, %p39; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_13: +BB31_13: setp.eq.s32 %p25, %r2, 5; - @%p25 bra BB25_14; - bra.uni BB25_63; + @%p25 bra BB31_14; + bra.uni BB31_63; -BB25_14: +BB31_14: setp.lt.f32 %p44, %f1, %f2; selp.f32 %f261, 0f3F800000, 0f00000000, %p44; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_30: +BB31_30: setp.eq.s32 %p12, %r2, 15; - @%p12 bra BB25_31; - bra.uni BB25_63; + @%p12 bra BB31_31; + bra.uni BB31_63; -BB25_31: +BB31_31: mul.f32 %f84, %f1, %f2; mov.f32 %f85, 0f3F800000; sub.f32 %f261, %f85, %f84; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_9: +BB31_9: setp.eq.s32 %p28, %r2, 3; - @%p28 bra BB25_10; - bra.uni BB25_63; + @%p28 bra BB31_10; + bra.uni BB31_63; -BB25_10: +BB31_10: div.rn.f32 %f261, %f1, %f2; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_43: +BB31_43: min.f32 %f261, %f1, %f2; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_26: +BB31_26: setp.eq.s32 %p16, %r2, 13; - @%p16 bra BB25_27; - bra.uni BB25_63; + @%p16 bra BB31_27; + bra.uni BB31_63; -BB25_27: +BB31_27: cvt.rni.s64.f32 %rd10, %f1; cvt.rni.s64.f32 %rd11, %f2; cvt.u32.u64 %r11, %rd10; @@ -3643,100 +4331,100 @@ BB25_27: and.b32 %r13, %r12, %r11; setp.eq.s32 %p38, %r13, 0; selp.f32 %f261, 0f00000000, 0f3F800000, %p38; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_46: +BB31_46: setp.le.f32 %p43, %f1, %f2; selp.f32 %f261, 0f3F800000, 0f00000000, %p43; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_17: +BB31_17: setp.eq.s32 %p23, %r2, 8; - @%p23 bra BB25_18; - bra.uni BB25_63; + @%p23 bra BB31_18; + bra.uni BB31_63; -BB25_18: +BB31_18: setp.ge.f32 %p41, %f1, %f2; selp.f32 %f261, 0f3F800000, 0f00000000, %p41; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_40: +BB31_40: setp.neu.f32 %p36, %f1, 0f00000000; sub.f32 
%f83, %f1, %f2; selp.f32 %f261, %f83, 0f00000000, %p36; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_34: +BB31_34: setp.ne.s32 %p10, %r2, 18; - @%p10 bra BB25_63; + @%p10 bra BB31_63; div.rn.f32 %f261, %f1, %f2; abs.f32 %f78, %f261; setp.geu.f32 %p31, %f78, 0f7F800000; - @%p31 bra BB25_63; + @%p31 bra BB31_63; cvt.rmi.f32.f32 %f261, %f261; - bra.uni BB25_63; + bra.uni BB31_63; -BB25_68: +BB31_68: setp.eq.s32 %p97, %r2, 1; - @%p97 bra BB25_69; - bra.uni BB25_125; + @%p97 bra BB31_69; + bra.uni BB31_125; -BB25_69: +BB31_69: sub.f32 %f264, %f2, %f1; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_84: +BB31_84: setp.eq.s32 %p85, %r2, 10; - @%p85 bra BB25_85; - bra.uni BB25_125; + @%p85 bra BB31_85; + bra.uni BB31_125; -BB25_85: +BB31_85: setp.neu.f32 %p106, %f2, %f1; selp.f32 %f264, 0f3F800000, 0f00000000, %p106; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_75: +BB31_75: setp.eq.s32 %p92, %r2, 5; - @%p92 bra BB25_76; - bra.uni BB25_125; + @%p92 bra BB31_76; + bra.uni BB31_125; -BB25_76: +BB31_76: setp.lt.f32 %p111, %f2, %f1; selp.f32 %f264, 0f3F800000, 0f00000000, %p111; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_92: +BB31_92: setp.eq.s32 %p79, %r2, 15; - @%p79 bra BB25_93; - bra.uni BB25_125; + @%p79 bra BB31_93; + bra.uni BB31_125; -BB25_93: +BB31_93: mul.f32 %f175, %f1, %f2; mov.f32 %f176, 0f3F800000; sub.f32 %f264, %f176, %f175; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_71: +BB31_71: setp.eq.s32 %p95, %r2, 3; - @%p95 bra BB25_72; - bra.uni BB25_125; + @%p95 bra BB31_72; + bra.uni BB31_125; -BB25_72: +BB31_72: div.rn.f32 %f264, %f2, %f1; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_105: +BB31_105: min.f32 %f264, %f2, %f1; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_88: +BB31_88: setp.eq.s32 %p83, %r2, 13; - @%p83 bra BB25_89; - bra.uni BB25_125; + @%p83 bra BB31_89; + bra.uni BB31_125; -BB25_89: +BB31_89: cvt.rni.s64.f32 %rd14, %f2; cvt.rni.s64.f32 %rd15, %f1; cvt.u32.u64 %r36, %rd14; @@ -3744,71 +4432,71 @@ BB25_89: and.b32 %r38, %r37, %r36; setp.eq.s32 
%p105, %r38, 0; selp.f32 %f264, 0f00000000, 0f3F800000, %p105; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_108: +BB31_108: setp.le.f32 %p110, %f2, %f1; selp.f32 %f264, 0f3F800000, 0f00000000, %p110; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_79: +BB31_79: setp.eq.s32 %p90, %r2, 8; - @%p90 bra BB25_80; - bra.uni BB25_125; + @%p90 bra BB31_80; + bra.uni BB31_125; -BB25_80: +BB31_80: setp.ge.f32 %p108, %f2, %f1; selp.f32 %f264, 0f3F800000, 0f00000000, %p108; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_102: +BB31_102: setp.neu.f32 %p103, %f2, 0f00000000; sub.f32 %f174, %f2, %f1; selp.f32 %f264, %f174, 0f00000000, %p103; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_96: +BB31_96: setp.ne.s32 %p77, %r2, 18; - @%p77 bra BB25_125; + @%p77 bra BB31_125; div.rn.f32 %f264, %f2, %f1; abs.f32 %f169, %f264; setp.geu.f32 %p98, %f169, 0f7F800000; - @%p98 bra BB25_125; + @%p98 bra BB31_125; cvt.rmi.f32.f32 %f264, %f264; - bra.uni BB25_125; + bra.uni BB31_125; -BB25_50: +BB31_50: setp.geu.f32 %p55, %f1, 0f00000000; - @%p55 bra BB25_53; + @%p55 bra BB31_53; cvt.rzi.f32.f32 %f164, %f2; setp.neu.f32 %p56, %f164, %f2; selp.f32 %f260, 0f7FFFFFFF, %f260, %p56; -BB25_53: +BB31_53: add.f32 %f166, %f20, %f21; mov.b32 %r26, %f166; setp.lt.s32 %p59, %r26, 2139095040; - @%p59 bra BB25_60; + @%p59 bra BB31_60; setp.gtu.f32 %p60, %f20, 0f7F800000; setp.gtu.f32 %p61, %f21, 0f7F800000; or.pred %p62, %p60, %p61; - @%p62 bra BB25_59; - bra.uni BB25_55; + @%p62 bra BB31_59; + bra.uni BB31_55; -BB25_59: +BB31_59: add.f32 %f260, %f1, %f2; - bra.uni BB25_60; + bra.uni BB31_60; -BB25_55: +BB31_55: setp.eq.f32 %p63, %f21, 0f7F800000; - @%p63 bra BB25_58; - bra.uni BB25_56; + @%p63 bra BB31_58; + bra.uni BB31_56; -BB25_58: +BB31_58: setp.gt.f32 %p66, %f20, 0f3F800000; selp.b32 %r30, 2139095040, 0, %p66; xor.b32 %r31, %r30, 2139095040; @@ -3817,38 +4505,38 @@ BB25_58: mov.b32 %f167, %r32; setp.eq.f32 %p68, %f1, 0fBF800000; selp.f32 %f260, 0f3F800000, %f167, %p68; - bra.uni BB25_60; + bra.uni BB31_60; 
-BB25_112: +BB31_112: setp.geu.f32 %p122, %f2, 0f00000000; - @%p122 bra BB25_115; + @%p122 bra BB31_115; cvt.rzi.f32.f32 %f255, %f1; setp.neu.f32 %p123, %f255, %f1; selp.f32 %f263, 0f7FFFFFFF, %f263, %p123; -BB25_115: +BB31_115: add.f32 %f257, %f57, %f58; mov.b32 %r51, %f257; setp.lt.s32 %p126, %r51, 2139095040; - @%p126 bra BB25_122; + @%p126 bra BB31_122; setp.gtu.f32 %p127, %f57, 0f7F800000; setp.gtu.f32 %p128, %f58, 0f7F800000; or.pred %p129, %p127, %p128; - @%p129 bra BB25_121; - bra.uni BB25_117; + @%p129 bra BB31_121; + bra.uni BB31_117; -BB25_121: +BB31_121: add.f32 %f263, %f1, %f2; - bra.uni BB25_122; + bra.uni BB31_122; -BB25_117: +BB31_117: setp.eq.f32 %p130, %f58, 0f7F800000; - @%p130 bra BB25_120; - bra.uni BB25_118; + @%p130 bra BB31_120; + bra.uni BB31_118; -BB25_120: +BB31_120: setp.gt.f32 %p133, %f57, 0f3F800000; selp.b32 %r55, 2139095040, 0, %p133; xor.b32 %r56, %r55, 2139095040; @@ -3857,11 +4545,11 @@ BB25_120: mov.b32 %f258, %r57; setp.eq.f32 %p135, %f2, 0fBF800000; selp.f32 %f263, 0f3F800000, %f258, %p135; - bra.uni BB25_122; + bra.uni BB31_122; -BB25_56: +BB31_56: setp.neu.f32 %p64, %f20, 0f7F800000; - @%p64 bra BB25_60; + @%p64 bra BB31_60; setp.ge.f32 %p65, %f2, 0f00000000; selp.b32 %r27, 2139095040, 0, %p65; @@ -3869,19 +4557,19 @@ BB25_56: selp.b32 %r29, %r28, %r27, %p1; mov.b32 %f260, %r29; -BB25_60: +BB31_60: setp.eq.f32 %p69, %f2, 0f00000000; setp.eq.f32 %p70, %f1, 0f3F800000; or.pred %p71, %p70, %p69; selp.f32 %f261, 0f3F800000, %f260, %p71; -BB25_63: +BB31_63: st.global.f32 [%rd1], %f261; - bra.uni BB25_126; + bra.uni BB31_126; -BB25_118: +BB31_118: setp.neu.f32 %p131, %f57, 0f7F800000; - @%p131 bra BB25_122; + @%p131 bra BB31_122; setp.ge.f32 %p132, %f1, 0f00000000; selp.b32 %r52, 2139095040, 0, %p132; @@ -3889,16 +4577,16 @@ BB25_118: selp.b32 %r54, %r53, %r52, %p2; mov.b32 %f263, %r54; -BB25_122: +BB31_122: setp.eq.f32 %p136, %f1, 0f00000000; setp.eq.f32 %p137, %f2, 0f3F800000; or.pred %p138, %p137, %p136; selp.f32 %f264, 
0f3F800000, %f263, %p138; -BB25_125: +BB31_125: st.global.f32 [%rd1], %f264; -BB25_126: +BB31_126: bar.sync 0; ret; } @@ -3924,14 +4612,14 @@ BB25_126: mov.u32 %r5, %tid.x; mad.lo.s32 %r1, %r4, %r3, %r5; setp.ge.s32 %p1, %r1, %r2; - @%p1 bra BB26_2; + @%p1 bra BB32_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1,
<TRUNCATED>
