https://github.com/petar-avramovic created https://github.com/llvm/llvm-project/pull/128702
In preparation for implementing temporal divergence lowering for global-isel, switch the llvm-ir tests for amdgpu divergence lowering to the new reg bank select. This requires adding a few simple regbanklegalize rules for these tests to work. >From ebc51390f1b9b788303978872fdb8cd5b84b84ae Mon Sep 17 00:00:00 2001 From: Petar Avramovic <petar.avramo...@amd.com> Date: Tue, 25 Feb 2025 12:19:52 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Update divergence lowering tests In preparations for implementing temporal divergence lowering for global-isel, switch llvm-ir tests for amdgpu divergence lowering to new reg bank select. Requires adding few simple regbanklegalize rules for these tests to work. --- .../Target/AMDGPU/AMDGPURegBankLegalize.cpp | 6 + .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 28 +- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 39 +- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 5 + ...-divergent-i1-phis-no-lane-mask-merging.ll | 97 ++-- ...vergence-divergent-i1-used-outside-loop.ll | 368 ++++++++------- .../GlobalISel/divergence-structurizer.ll | 418 ++++++++++-------- .../divergence-temporal-divergent-i1.ll | 304 ++++++------- .../divergence-temporal-divergent-reg.ll | 40 +- 9 files changed, 683 insertions(+), 622 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index 8d3e7829e10e1..eb2ece7bece51 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -312,6 +312,12 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { } // Opcodes that also support S1. 
+ if (Opc == G_FREEZE && + MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) { + RBLHelper.applyMappingTrivial(*MI); + continue; + } + if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT || Opc == AMDGPU::G_IMPLICIT_DEF)) { Register Dst = MI->getOperand(0).getReg(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 3c007987b8494..3383175fc1bdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -134,6 +134,26 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, switch (Mapping.LoweringMethod) { case DoNotLower: return; + case VccExtToSel: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + Register Src = MI.getOperand(1).getReg(); + unsigned Opc = MI.getOpcode(); + if (Ty == S32 || Ty == S16) { + auto True = B.buildConstant({VgprRB, Ty}, Opc == G_SEXT ? -1 : 1); + auto False = B.buildConstant({VgprRB, Ty}, 0); + B.buildSelect(MI.getOperand(0).getReg(), Src, True, False); + } + if (Ty == S64) { + auto True = B.buildConstant({VgprRB, S32}, Opc == G_SEXT ? -1 : 1); + auto False = B.buildConstant({VgprRB, S32}, 0); + auto Sel = B.buildSelect({VgprRB, S32}, Src, True, False); + B.buildMergeValues( + MI.getOperand(0).getReg(), + {Sel.getReg(0), Opc == G_SEXT ? 
Sel.getReg(0) : False.getReg(0)}); + } + MI.eraseFromParent(); + return; + } case UniExtToSel: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); auto True = B.buildConstant({SgprRB, Ty}, @@ -276,6 +296,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Sgpr64: case Vgpr64: return LLT::scalar(64); + case VgprP0: + return LLT::pointer(0, 64); case SgprP1: case VgprP1: return LLT::pointer(1, 64); @@ -383,6 +405,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { return SgprRB; case Vgpr32: case Vgpr64: + case VgprP0: case VgprP1: case VgprP3: case VgprP4: @@ -425,6 +448,7 @@ void RegBankLegalizeHelper::applyMappingDst( case SgprV4S32: case Vgpr32: case Vgpr64: + case VgprP0: case VgprP1: case VgprP3: case VgprP4: @@ -555,6 +579,7 @@ void RegBankLegalizeHelper::applyMappingSrc( // vgpr scalars, pointers and vectors case Vgpr32: case Vgpr64: + case VgprP0: case VgprP1: case VgprP3: case VgprP4: @@ -653,7 +678,8 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { // We accept all types that can fit in some register class. // Uniform G_PHIs have all sgpr registers. // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr. 
- if (Ty == LLT::scalar(32) || Ty == LLT::pointer(4, 64)) { + if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) || + Ty == LLT::pointer(4, 64)) { return; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index f293b3aba7b79..fd40e765a4b3e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -50,6 +50,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::scalar(32); case S64: return MRI.getType(Reg) == LLT::scalar(64); + case P0: + return MRI.getType(Reg) == LLT::pointer(0, 64); case P1: return MRI.getType(Reg) == LLT::pointer(1, 64); case P3: @@ -58,6 +60,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64); case P5: return MRI.getType(Reg) == LLT::pointer(5, 32); + case V4S32: + return MRI.getType(Reg) == LLT::fixed_vector(4, 32); case B32: return MRI.getType(Reg).getSizeInBits() == 32; case B64: @@ -78,6 +82,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg); case UniS64: return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg); + case UniP0: + return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg); case UniP1: return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg); case UniP3: @@ -104,6 +110,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg); case DivS64: return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg); + case DivP0: + return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg); case DivP1: return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg); case DivP3: @@ -431,16 +439,21 @@ 
RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB) .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}}) .Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}}) + .Div(B32, {{VgprB32}, {VgprB32, VgprB32}}) + .Uni(B64, {{SgprB64}, {SgprB64, SgprB64}}) .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32}); addRulesForGOpcs({G_SHL}, Standard) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}}) .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT // and G_FREEZE here, rest is trivially regbankselected earlier + addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}}); addRulesForGOpcs({G_CONSTANT}) .Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}}); + addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}}); addRulesForGOpcs({G_ICMP}) .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}}) @@ -471,6 +484,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ZEXT, G_SEXT}) .Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}}) + .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}}) .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}}) .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}}); @@ -525,9 +539,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, // clang-format off addRulesForGOpcs({G_LOAD}) + .Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}}) + .Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}}) .Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}}) .Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}}) + .Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}) .Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}}) .Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}}) @@ -556,15 +573,26 @@ 
RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, // clang-format on addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector) + .Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) + .Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) .Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) .Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}); addRulesForGOpcs({G_STORE}) + .Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}}) .Any({{S32, P1}, {{}, {Vgpr32, VgprP1}}}) .Any({{S64, P1}, {{}, {Vgpr64, VgprP1}}}) .Any({{V4S32, P1}, {{}, {VgprV4S32, VgprP1}}}); - addRulesForGOpcs({G_PTR_ADD}).Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}); + addRulesForGOpcs({G_AMDGPU_BUFFER_STORE}) + .Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}}); + + addRulesForGOpcs({G_PTR_ADD}) + .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}}) + .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}) + .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}); + + addRulesForGOpcs({G_INTTOPTR}).Any({{UniP4}, {{SgprP4}, {Sgpr64}}}); addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); @@ -580,15 +608,24 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat); addRulesForGOpcs({G_UITOFP}) + .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat); using namespace Intrinsic; + addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}}); + // This is "intrinsic lane mask" it was set to i32/i64 in llvm-ir. 
addRulesForIOpcs({amdgcn_end_cf}).Any({{_, S32}, {{}, {None, Sgpr32}}}); addRulesForIOpcs({amdgcn_if_break}, Standard) .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}}); + addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard) + .Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}}); + + addRulesForIOpcs({amdgcn_readfirstlane}) + .Any({{UniS32, _, DivS32}, {{}, {Sgpr32, None, Vgpr32}}}); + } // end initialize rules diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 8280751e1dbdd..6bde7f2cd676d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -50,16 +50,19 @@ enum UniformityLLTOpPredicateID { DivS64, // pointers + P0, P1, P3, P4, P5, + UniP0, UniP1, UniP3, UniP4, UniP5, + DivP0, DivP1, DivP3, DivP4, @@ -124,6 +127,7 @@ enum RegBankLLTMappingApplyID { // vgpr scalars, pointers, vectors and B-types Vgpr32, Vgpr64, + VgprP0, VgprP1, VgprP3, VgprP4, @@ -162,6 +166,7 @@ enum RegBankLLTMappingApplyID { // vgpr. Lower it to two S32 vgpr ANDs. 
enum LoweringMethodID { DoNotLower, + VccExtToSel, UniExtToSel, VgprToVccCopy, SplitTo32, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index c5ded11c7d323..65c96a3db5bbf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; Divergent phis that don't require lowering using lane mask merging @@ -101,27 +101,23 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) { ; GFX10-LABEL: divergent_i1_phi_used_inside_loop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 1 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 -; GFX10-NEXT: ; implicit-def: $sgpr6 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s5, 1 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB2_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 1, v3 -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v6 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 -; GFX10-NEXT: s_or_b32 s6, s6, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: 
v_cvt_f32_u32_e32 v3, s6 +; GFX10-NEXT: s_xor_b32 s5, s5, 1 +; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 +; GFX10-NEXT: s_cselect_b32 s4, exec_lo, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -147,29 +143,25 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa ; GFX10-LABEL: divergent_i1_phi_used_inside_loop_bigger_loop_body: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1 +; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 1.0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x3e8 -; GFX10-NEXT: v_mov_b32_e32 v8, s4 -; GFX10-NEXT: ; implicit-def: $sgpr6 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_branch .LBB3_2 ; GFX10-NEXT: .LBB3_1: ; %loop_body ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GFX10-NEXT: s_xor_b32 s5, s5, -1 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, s5 -; GFX10-NEXT: s_or_b32 s6, s6, s7 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v8, s6 +; GFX10-NEXT: s_xor_b32 s4, s4, exec_lo +; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v0 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execz .LBB3_6 ; 
GFX10-NEXT: .LBB3_2: ; %loop_start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_cmpk_le_i32 s6, 0x3e8 ; GFX10-NEXT: s_mov_b32 s7, 1 -; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8 -; GFX10-NEXT: s_cbranch_vccz .LBB3_4 +; GFX10-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10-NEXT: ; %bb.3: ; %else ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX10-NEXT: s_mov_b32 s7, 0 @@ -177,7 +169,6 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa ; GFX10-NEXT: .LBB3_4: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX10-NEXT: s_xor_b32 s7, s7, 1 -; GFX10-NEXT: s_and_b32 s7, s7, 1 ; GFX10-NEXT: s_cmp_lg_u32 s7, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.5: ; %if @@ -185,8 +176,8 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa ; GFX10-NEXT: flat_store_dword v[4:5], v1 ; GFX10-NEXT: s_branch .LBB3_1 ; GFX10-NEXT: .LBB3_6: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -234,45 +225,47 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0 ; GFX10-NEXT: s_or_b64 s[12:13], s[4:5], s[0:1] -; GFX10-NEXT: s_mov_b32 s3, -1 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0 ; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1 -; GFX10-NEXT: v_xor_b32_e32 v3, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: ; implicit-def: $vgpr3 +; GFX10-NEXT: s_xor_b32 s3, vcc_lo, exec_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen +; GFX10-NEXT: s_and_b32 vcc_lo, 
exec_lo, s3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_4 ; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: .LBB4_2: ; %.preheader ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: buffer_load_dword v5, v3, s[4:7], 0 offen +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v3, 4, v3 +; GFX10-NEXT: s_add_i32 s1, s1, 4 +; GFX10-NEXT: buffer_load_dword v3, v3, s[4:7], 0 offen ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v4, v5, v4 +; GFX10-NEXT: v_readfirstlane_b32 s12, v3 +; GFX10-NEXT: s_add_i32 s3, s12, s3 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_2 ; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX10-NEXT: s_or_b32 s1, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1 -; GFX10-NEXT: .LBB4_4: ; %Flow -; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: s_branch .LBB4_6 +; GFX10-NEXT: .LBB4_4: +; GFX10-NEXT: s_mov_b32 s1, exec_lo +; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s1 ; GFX10-NEXT: s_cbranch_vccz .LBB4_6 ; GFX10-NEXT: ; %bb.5: ; %.19 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_or_b32_e32 v3, 2, v1 +; GFX10-NEXT: v_or_b32_e32 v1, 2, v1 ; GFX10-NEXT: .LBB4_6: ; %.22 ; GFX10-NEXT: v_add_lshl_u32 v0, v0, s2, 2 -; GFX10-NEXT: buffer_store_dword v3, v0, s[8:11], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen ; GFX10-NEXT: s_endpgm .entry: %.0 = call i64 @llvm.amdgcn.s.getpc() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index 45a1b25f12ff1..b902c23a3982e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; This file contains various tests that have divergent i1s used outside of ; the loop. These are lane masks is sgpr and need to have correct value in @@ -14,31 +14,28 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: s_andn2_b32 s5, s4, exec_lo +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo ; GFX10-NEXT: s_or_b32 s6, s5, s6 -; GFX10-NEXT: ; implicit-def: $sgpr5 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB0_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v1 -; GFX10-NEXT: s_xor_b32 s7, s6, -1 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX10-NEXT: s_mov_b32 s8, exec_lo +; GFX10-NEXT: s_mov_b32 s7, s6 +; GFX10-NEXT: s_add_i32 s5, s5, 1 +; GFX10-NEXT: s_xor_b32 s6, s6, s8 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 s8, s6, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, s7 -; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo +; 
GFX10-NEXT: s_andn2_b32 s8, s7, exec_lo ; GFX10-NEXT: s_and_b32 s6, exec_lo, s6 -; GFX10-NEXT: s_or_b32 s7, s8, s7 -; GFX10-NEXT: s_or_b32 s5, s5, s6 -; GFX10-NEXT: s_mov_b32 s6, s7 +; GFX10-NEXT: s_or_b32 s6, s8, s6 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s7 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -65,43 +62,44 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr ; GFX10-LABEL: divergent_i1_phi_used_outside_loop_larger_loop_body: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, -1 -; GFX10-NEXT: ; implicit-def: $sgpr6 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: s_andn2_b32 s5, s4, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s4, s5, s4 +; GFX10-NEXT: s_and_b32 s6, exec_lo, exec_lo +; GFX10-NEXT: s_mov_b32 s4, -1 +; GFX10-NEXT: s_or_b32 s7, s5, s6 +; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB1_2 ; GFX10-NEXT: .LBB1_1: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v1, 4 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v2, s4 -; GFX10-NEXT: s_andn2_b32 s7, s5, exec_lo -; GFX10-NEXT: s_and_b32 s8, exec_lo, s6 -; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0 -; GFX10-NEXT: s_or_b32 s4, s7, s8 -; GFX10-NEXT: s_cbranch_vccz .LBB1_4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; GFX10-NEXT: s_add_i32 s4, s4, 1 +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, 4 +; GFX10-NEXT: s_cmp_ge_i32 s4, 10 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: s_andn2_b32 
s7, s6, exec_lo +; GFX10-NEXT: s_and_b32 s9, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s7, s7, s9 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX10-NEXT: .LBB1_2: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo -; GFX10-NEXT: s_and_b32 s6, exec_lo, s5 -; GFX10-NEXT: s_or_b32 s6, s4, s6 -; GFX10-NEXT: s_and_saveexec_b32 s4, s5 +; GFX10-NEXT: s_mov_b32 s6, s7 +; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo +; GFX10-NEXT: s_and_b32 s7, exec_lo, s7 +; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_and_saveexec_b32 s7, s6 ; GFX10-NEXT: s_cbranch_execz .LBB1_1 ; GFX10-NEXT: ; %bb.3: ; %is.eq.zero ; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GFX10-NEXT: global_load_dword v5, v[1:2], off -; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo +; GFX10-NEXT: global_load_dword v0, v[1:2], off +; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 -; GFX10-NEXT: s_and_b32 s7, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_b32 s8, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s5, s5, s8 ; GFX10-NEXT: s_branch .LBB1_1 ; GFX10-NEXT: .LBB1_4: ; %exit -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[3:4], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -137,25 +135,21 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-LABEL: divergent_i1_xor_used_outside_loop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: ; implicit-def: $sgpr6 +; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 1.0, v1 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: 
.LBB2_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v1 -; GFX10-NEXT: s_xor_b32 s5, s5, -1 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, s5 -; GFX10-NEXT: s_or_b32 s6, s6, s7 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: s_xor_b32 s4, s4, exec_lo +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -182,26 +176,22 @@ define void @divergent_i1_xor_used_outside_loop_twice(float %val, float %pre.con ; GFX10-LABEL: divergent_i1_xor_used_outside_loop_twice: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: ; implicit-def: $sgpr6 +; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 1.0, v1 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB3_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v1 -; GFX10-NEXT: s_xor_b32 s5, s5, -1 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, s5 -; GFX10-NEXT: s_or_b32 s6, s6, s7 -; GFX10-NEXT: s_andn2_b32 exec_lo, 
exec_lo, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: s_xor_b32 s4, s4, exec_lo +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, -1.0, 2.0, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, -1.0, 2.0, s4 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: flat_store_dword v[4:5], v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -241,62 +231,60 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB4_6 ; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader -; GFX10-NEXT: v_mov_b32_e32 v5, s5 -; GFX10-NEXT: ; implicit-def: $sgpr6 -; GFX10-NEXT: ; implicit-def: $sgpr7 -; GFX10-NEXT: ; implicit-def: $sgpr8 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: ; implicit-def: $sgpr9 +; GFX10-NEXT: ; implicit-def: $sgpr10 ; GFX10-NEXT: s_branch .LBB4_3 ; GFX10-NEXT: .LBB4_2: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX10-NEXT: s_xor_b32 s9, s8, -1 -; GFX10-NEXT: s_and_b32 s10, exec_lo, s7 -; GFX10-NEXT: s_or_b32 s5, s10, s5 -; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo -; GFX10-NEXT: s_and_b32 s9, exec_lo, s9 -; GFX10-NEXT: s_or_b32 s6, s6, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s5 +; GFX10-NEXT: s_xor_b32 s5, s10, exec_lo +; GFX10-NEXT: s_and_b32 s11, exec_lo, s9 +; GFX10-NEXT: s_or_b32 s8, s11, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execz .LBB4_5 ; GFX10-NEXT: .LBB4_3: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo -; GFX10-NEXT: s_and_b32 s9, exec_lo, -1 -; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo -; GFX10-NEXT: s_or_b32 s8, s8, s9 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6] -; GFX10-NEXT: s_or_b32 s7, s7, s9 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo -; GFX10-NEXT: global_load_dword v6, v[6:7], off +; GFX10-NEXT: s_ashr_i32 s5, s4, 31 +; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo +; GFX10-NEXT: s_lshl_b64 s[12:13], s[4:5], 2 +; GFX10-NEXT: s_andn2_b32 s5, s10, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s12 +; GFX10-NEXT: v_mov_b32_e32 v6, s13 +; GFX10-NEXT: s_and_b32 s10, exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 s11, exec_lo, exec_lo +; GFX10-NEXT: s_or_b32 s10, s5, s10 +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v1, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v2, v6, vcc_lo +; GFX10-NEXT: s_or_b32 s9, s9, s11 +; GFX10-NEXT: global_load_dword v5, v[5:6], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX10-NEXT: s_and_saveexec_b32 s9, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v5 +; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB4_2 ; GFX10-NEXT: ; %bb.4: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v5 -; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0 -; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo -; GFX10-NEXT: s_and_b32 s10, exec_lo, 0 -; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: s_and_b32 s11, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s8, s8, s10 -; 
GFX10-NEXT: s_or_b32 s7, s7, s11 +; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, s4, v0 +; GFX10-NEXT: s_andn2_b32 s10, s10, exec_lo +; GFX10-NEXT: s_and_b32 s11, exec_lo, 0 +; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo +; GFX10-NEXT: s_add_i32 s4, s4, 1 +; GFX10-NEXT: s_and_b32 s12, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s10, s10, s11 +; GFX10-NEXT: s_or_b32 s9, s9, s12 ; GFX10-NEXT: s_branch .LBB4_2 ; GFX10-NEXT: .LBB4_5: ; %loop.exit.guard -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo -; GFX10-NEXT: s_and_b32 s6, exec_lo, s6 -; GFX10-NEXT: s_or_b32 s6, s5, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo +; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s6, s4, s5 ; GFX10-NEXT: .LBB4_6: ; %Flow1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; GFX10-NEXT: s_and_saveexec_b32 s4, s6 ; GFX10-NEXT: s_cbranch_execz .LBB4_8 ; GFX10-NEXT: ; %bb.7: ; %block.after.loop @@ -346,23 +334,19 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: v_mov_b32_e32 v4, s5 ; GFX10-NEXT: s_branch .LBB5_2 ; GFX10-NEXT: .LBB5_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_and_b32 s4, exec_lo, s7 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; GFX10-NEXT: s_and_b32 s4, exec_lo, s6 ; GFX10-NEXT: s_or_b32 s5, s4, s5 -; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo -; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s6, s4, s6 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execz .LBB5_6 ; GFX10-NEXT: .LBB5_2: ; %cond.block.0 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo +; 
GFX10-NEXT: s_and_saveexec_b32 s6, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB5_4 ; GFX10-NEXT: ; %bb.3: ; %if.block.0 ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 @@ -374,21 +358,21 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10-NEXT: .LBB5_4: ; %loop.break.block ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4 -; GFX10-NEXT: s_mov_b32 s7, -1 -; GFX10-NEXT: s_and_saveexec_b32 s8, s4 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: s_and_saveexec_b32 s7, s4 ; GFX10-NEXT: s_cbranch_execz .LBB5_1 ; GFX10-NEXT: ; %bb.5: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4 -; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, 0 -; GFX10-NEXT: s_or_b32 s7, s4, s7 +; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo +; GFX10-NEXT: s_and_b32 s6, exec_lo, 0 +; GFX10-NEXT: s_or_b32 s6, s4, s6 ; GFX10-NEXT: s_branch .LBB5_1 ; GFX10-NEXT: .LBB5_6: ; %cond.block.1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_and_saveexec_b32 s4, s6 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB5_8 ; GFX10-NEXT: ; %bb.7: ; %if.block.1 ; GFX10-NEXT: global_store_dword v[6:7], v4, off @@ -452,49 +436,48 @@ exit: define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspace(1) %a, ptr %addr) { ; GFX10-LABEL: divergent_i1_freeze_used_outside_loop: ; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s1, exec_lo +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_mov_b32 s3, -1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: ; implicit-def: $sgpr1 -; GFX10-NEXT: ; implicit-def: $sgpr2 +; GFX10-NEXT: ; implicit-def: $sgpr3 ; GFX10-NEXT: s_branch .LBB6_2 ; GFX10-NEXT: .LBB6_1: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB6_2 
Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v5 -; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, s2 +; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v0 +; GFX10-NEXT: s_add_i32 s0, s0, 1 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 ; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execz .LBB6_4 ; GFX10-NEXT: .LBB6_2: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 -; GFX10-NEXT: s_or_b32 s2, s2, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s3 +; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, s1 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s1 ; GFX10-NEXT: s_cbranch_execz .LBB6_1 ; GFX10-NEXT: ; %bb.3: ; %is.eq.zero ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6] -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo -; GFX10-NEXT: global_load_dword v6, v[6:7], off +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], 2 +; GFX10-NEXT: s_andn2_b32 s1, s3, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_mov_b32_e32 v6, s7 +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v1, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v2, v6, vcc_lo +; GFX10-NEXT: global_load_dword v5, v[5:6], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 ; GFX10-NEXT: s_and_b32 s3, 
exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: ; implicit-def: $sgpr3 +; GFX10-NEXT: s_or_b32 s3, s1, s3 +; GFX10-NEXT: ; implicit-def: $sgpr1 ; GFX10-NEXT: s_branch .LBB6_1 ; GFX10-NEXT: .LBB6_4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s3 ; GFX10-NEXT: flat_store_dword v[3:4], v0 ; GFX10-NEXT: s_endpgm entry: @@ -528,60 +511,63 @@ exit: define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) { ; GFX10-LABEL: loop_with_1break: ; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: ; implicit-def: $sgpr1 -; GFX10-NEXT: ; implicit-def: $sgpr2 -; GFX10-NEXT: ; implicit-def: $sgpr3 -; GFX10-NEXT: v_mov_b32_e32 v6, s0 +; GFX10-NEXT: ; implicit-def: $sgpr5 +; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: s_branch .LBB7_2 ; GFX10-NEXT: .LBB7_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_and_b32 s4, exec_lo, s2 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 -; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s1, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s4, s1, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-NEXT: .LBB7_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 -; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] -; GFX10-NEXT: s_or_b32 s2, s2, s4 -; GFX10-NEXT: 
v_add_co_u32 v9, vcc_lo, v2, v7 -; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo -; GFX10-NEXT: global_load_dword v9, v[9:10], off +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_mov_b32 s7, exec_lo +; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 +; GFX10-NEXT: s_andn2_b32 s1, s6, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v7, s3 +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: s_and_b32 s6, exec_lo, s7 +; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo +; GFX10-NEXT: s_and_b32 s7, exec_lo, exec_lo +; GFX10-NEXT: s_or_b32 s6, s1, s6 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo +; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB7_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6 -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6 -; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: global_load_dword v9, v[7:8], off -; GFX10-NEXT: s_and_b32 s5, exec_lo, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, v10 -; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s3, s3, s5 -; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: s_add_i32 s2, s0, 1 +; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 +; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo +; GFX10-NEXT: s_andn2_b32 s3, s6, exec_lo +; GFX10-NEXT: s_and_b32 s6, exec_lo, 0 +; GFX10-NEXT: 
s_andn2_b32 s5, s5, exec_lo +; GFX10-NEXT: global_load_dword v8, v[6:7], off +; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 +; GFX10-NEXT: s_or_b32 s6, s3, s6 +; GFX10-NEXT: s_or_b32 s5, s5, s0 +; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 -; GFX10-NEXT: global_store_dword v[7:8], v9, off +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 +; GFX10-NEXT: global_store_dword v[6:7], v8, off ; GFX10-NEXT: s_branch .LBB7_1 ; GFX10-NEXT: .LBB7_4: ; %loop.exit.guard -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_and_saveexec_b32 s0, s1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s0, s6 ; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB7_6 ; GFX10-NEXT: ; %bb.5: ; %break.body diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll index a70926a7de36d..9b1cc719d5567 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; Simples case, if - then, that requires lane mask merging, ; %phi lane mask will hold %val_A at %A. 
Lanes that are active in %B @@ -41,9 +41,10 @@ exit: define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %cond) { ; GFX10-LABEL: divergent_i1_phi_if_else: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: s_and_b32 s0, s0, 1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-NEXT: ; %bb.1: ; %B @@ -105,46 +106,51 @@ exit: define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) { ; GFX10-LABEL: loop_with_1break: ; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: ; implicit-def: $sgpr1 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB2_2 ; GFX10-NEXT: .LBB2_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_and_b32 s2, exec_lo, s1 -; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s1, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s4, s1, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB2_4 ; GFX10-NEXT: .LBB2_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s2, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: v_lshlrev_b64 v[5:6], 2, v[4:5] -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v2, v5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo -; GFX10-NEXT: global_load_dword v7, v[7:8], off +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 +; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo 
+; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: s_and_b32 s5, exec_lo, exec_lo +; GFX10-NEXT: s_or_b32 s5, s1, s5 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v5, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[4:5], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB2_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v0, v5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v1, v6, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v4 -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v4 -; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: global_load_dword v7, v[5:6], off -; GFX10-NEXT: v_mov_b32_e32 v4, v8 -; GFX10-NEXT: s_and_b32 s3, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: s_add_i32 s2, s0, 1 +; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 +; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo +; GFX10-NEXT: s_andn2_b32 s3, s5, exec_lo +; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 +; GFX10-NEXT: s_or_b32 s5, s3, s0 +; GFX10-NEXT: global_load_dword v6, v[4:5], off +; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 1, v7 -; GFX10-NEXT: global_store_dword v[5:6], v7, off +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v6 +; GFX10-NEXT: global_store_dword v[4:5], v6, off ; GFX10-NEXT: s_branch .LBB2_1 ; GFX10-NEXT: .LBB2_4: ; %exit ; GFX10-NEXT: s_endpgm @@ -174,62 +180,69 @@ exit: define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; 
GFX10-LABEL: loop_with_2breaks: ; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: ; implicit-def: $sgpr1 -; GFX10-NEXT: v_mov_b32_e32 v6, s0 +; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB3_3 ; GFX10-NEXT: .LBB3_1: ; %Flow3 ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s3, exec_lo, s4 -; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; GFX10-NEXT: s_andn2_b32 s2, s5, exec_lo +; GFX10-NEXT: s_and_b32 s3, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s5, s2, s3 ; GFX10-NEXT: .LBB3_2: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_and_b32 s2, exec_lo, s1 -; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s1, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s4, s1, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB3_6 ; GFX10-NEXT: .LBB3_3: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s2, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 -; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo -; GFX10-NEXT: global_load_dword v9, v[9:10], off +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 +; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v7, s3 +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: s_and_b32 s5, exec_lo, exec_lo +; GFX10-NEXT: s_or_b32 s5, s1, s5 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo +; 
GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB3_2 ; GFX10-NEXT: ; %bb.4: ; %B ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7 -; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, -1 -; GFX10-NEXT: global_load_dword v9, v[9:10], off +; GFX10-NEXT: v_mov_b32_e32 v7, s3 +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v5, v7, vcc_lo +; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB3_1 ; GFX10-NEXT: ; %bb.5: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6 -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6 -; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo -; GFX10-NEXT: global_load_dword v9, v[7:8], off -; GFX10-NEXT: v_mov_b32_e32 v6, v10 -; GFX10-NEXT: s_and_b32 s5, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s4, s4, s5 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: s_add_i32 s2, s0, 1 +; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 +; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo +; GFX10-NEXT: s_andn2_b32 s3, s6, exec_lo +; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 +; GFX10-NEXT: s_or_b32 s6, s3, s0 +; 
GFX10-NEXT: global_load_dword v8, v[6:7], off +; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 -; GFX10-NEXT: global_store_dword v[7:8], v9, off +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 +; GFX10-NEXT: global_store_dword v[6:7], v8, off ; GFX10-NEXT: s_branch .LBB3_1 ; GFX10-NEXT: .LBB3_6: ; %exit ; GFX10-NEXT: s_endpgm @@ -265,78 +278,87 @@ exit: define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) { ; GFX10-LABEL: loop_with_3breaks: ; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: ; implicit-def: $sgpr1 -; GFX10-NEXT: v_mov_b32_e32 v8, s0 +; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB4_4 ; GFX10-NEXT: .LBB4_1: ; %Flow5 ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo -; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 -; GFX10-NEXT: s_or_b32 s4, s4, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_andn2_b32 s2, s6, exec_lo +; GFX10-NEXT: s_and_b32 s3, exec_lo, s8 +; GFX10-NEXT: s_or_b32 s6, s2, s3 ; GFX10-NEXT: .LBB4_2: ; %Flow4 ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s3, exec_lo, s4 -; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; GFX10-NEXT: s_andn2_b32 s2, s5, exec_lo +; GFX10-NEXT: s_and_b32 s3, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s5, s2, s3 ; GFX10-NEXT: .LBB4_3: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_and_b32 s2, exec_lo, s1 -; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s1, exec_lo, s5 +; GFX10-NEXT: 
s_or_b32 s4, s1, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB4_8 ; GFX10-NEXT: .LBB4_4: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s2, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: v_lshlrev_b64 v[9:10], 2, v[8:9] -; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v2, v9 -; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo -; GFX10-NEXT: global_load_dword v11, v[11:12], off +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 +; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v9, s3 +; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: s_and_b32 s5, exec_lo, exec_lo +; GFX10-NEXT: s_or_b32 s5, s1, s5 +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo +; GFX10-NEXT: global_load_dword v8, v[8:9], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB4_3 ; GFX10-NEXT: ; %bb.5: ; %B ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v4, v9 -; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, -1 -; GFX10-NEXT: global_load_dword v11, v[11:12], off +; GFX10-NEXT: v_mov_b32_e32 v9, s3 +; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v9, vcc_lo +; GFX10-NEXT: global_load_dword v8, v[8:9], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX10-NEXT: s_and_saveexec_b32 s7, 
vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB4_2 ; GFX10-NEXT: ; %bb.6: ; %C ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v6, v9 -; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo -; GFX10-NEXT: s_mov_b32 s5, -1 -; GFX10-NEXT: global_load_dword v11, v[11:12], off +; GFX10-NEXT: v_mov_b32_e32 v9, s3 +; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: s_mov_b32 s8, exec_lo +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v6, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v7, v9, vcc_lo +; GFX10-NEXT: global_load_dword v8, v[8:9], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX10-NEXT: s_and_saveexec_b32 s9, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB4_1 ; GFX10-NEXT: ; %bb.7: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v0, v9 -; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v1, v10, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v8 -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v8 -; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo -; GFX10-NEXT: global_load_dword v11, v[9:10], off -; GFX10-NEXT: v_mov_b32_e32 v8, v12 -; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s5, s5, s6 +; GFX10-NEXT: v_mov_b32_e32 v9, s3 +; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: s_add_i32 s2, s0, 1 +; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 +; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo +; GFX10-NEXT: s_andn2_b32 s3, s8, exec_lo +; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 +; GFX10-NEXT: s_or_b32 s8, s3, s0 +; GFX10-NEXT: global_load_dword v10, v[8:9], off +; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v11 -; GFX10-NEXT: global_store_dword v[9:10], v11, off +; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, 
v10 +; GFX10-NEXT: global_store_dword v[8:9], v10, off ; GFX10-NEXT: s_branch .LBB4_1 ; GFX10-NEXT: .LBB4_8: ; %exit ; GFX10-NEXT: s_endpgm @@ -382,60 +404,63 @@ exit: define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) { ; GFX10-LABEL: loop_with_div_break_with_body: ; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: ; implicit-def: $sgpr1 -; GFX10-NEXT: ; implicit-def: $sgpr2 -; GFX10-NEXT: ; implicit-def: $sgpr3 -; GFX10-NEXT: v_mov_b32_e32 v6, s0 +; GFX10-NEXT: ; implicit-def: $sgpr5 +; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: s_branch .LBB5_2 ; GFX10-NEXT: .LBB5_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_and_b32 s4, exec_lo, s2 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 -; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s1, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s4, s1, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB5_4 ; GFX10-NEXT: .LBB5_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 -; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] -; GFX10-NEXT: s_or_b32 s2, s2, s4 -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 -; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo -; GFX10-NEXT: global_load_dword v9, v[9:10], off +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_mov_b32 s7, exec_lo +; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 +; GFX10-NEXT: s_andn2_b32 s1, s6, exec_lo +; GFX10-NEXT: 
v_mov_b32_e32 v7, s3 +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: s_and_b32 s6, exec_lo, s7 +; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo +; GFX10-NEXT: s_and_b32 s7, exec_lo, exec_lo +; GFX10-NEXT: s_or_b32 s6, s1, s6 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo +; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB5_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6 -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6 -; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: global_load_dword v9, v[7:8], off -; GFX10-NEXT: s_and_b32 s5, exec_lo, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, v10 -; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s3, s3, s5 -; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: s_add_i32 s2, s0, 1 +; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 +; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo +; GFX10-NEXT: s_andn2_b32 s3, s6, exec_lo +; GFX10-NEXT: s_and_b32 s6, exec_lo, 0 +; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo +; GFX10-NEXT: global_load_dword v8, v[6:7], off +; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 +; GFX10-NEXT: s_or_b32 s6, s3, s6 +; GFX10-NEXT: s_or_b32 s5, s5, s0 +; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 -; GFX10-NEXT: 
global_store_dword v[7:8], v9, off +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 +; GFX10-NEXT: global_store_dword v[6:7], v8, off ; GFX10-NEXT: s_branch .LBB5_1 ; GFX10-NEXT: .LBB5_4: ; %loop.exit.guard -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_and_saveexec_b32 s0, s1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s0, s6 ; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB5_6 ; GFX10-NEXT: ; %bb.5: ; %break.body @@ -494,74 +519,75 @@ define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2, ; GFX10-LABEL: irreducible_cfg: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: s_and_b32 s0, 1, s0 -; GFX10-NEXT: ; implicit-def: $sgpr2 -; GFX10-NEXT: v_cmp_ne_u32_e64 s3, 0, s0 -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_xor_b32 s1, vcc_lo, -1 -; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_mov_b32 s0, exec_lo +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_and_b32 s3, s0, 1 +; GFX10-NEXT: s_xor_b32 s1, vcc_lo, s0 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 ; GFX10-NEXT: s_branch .LBB6_2 ; GFX10-NEXT: .LBB6_1: ; %Flow2 ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_and_b32 s4, exec_lo, s7 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_and_b32 s4, 1, s6 -; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 -; GFX10-NEXT: s_and_b32 s5, exec_lo, s3 -; GFX10-NEXT: s_or_b32 s2, s2, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s3, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s1, s1, 1 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_cselect_b32 s1, exec_lo, 0 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execz .LBB6_8 ; GFX10-NEXT: .LBB6_2: ; %irr.guard ; GFX10-NEXT: ; =>This Loop 
Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB6_6 Depth 2 -; GFX10-NEXT: s_andn2_b32 s5, s3, exec_lo -; GFX10-NEXT: s_and_b32 s3, exec_lo, s3 -; GFX10-NEXT: s_or_b32 s3, s5, s3 -; GFX10-NEXT: s_mov_b32 s5, -1 -; GFX10-NEXT: s_and_saveexec_b32 s6, s4 -; GFX10-NEXT: s_xor_b32 s4, exec_lo, s6 +; GFX10-NEXT: s_andn2_b32 s4, s0, exec_lo +; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_or_b32 s0, s4, s0 +; GFX10-NEXT: s_and_saveexec_b32 s4, s1 +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: ; %bb.3: ; %.loopexit ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, v5, v0 +; GFX10-NEXT: v_cmp_gt_i32_e64 s1, v5, v0 +; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: s_xor_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: s_or_b32 s5, s1, s5 +; GFX10-NEXT: s_and_b32 s1, exec_lo, s1 +; GFX10-NEXT: s_xor_b32 s5, s5, s6 ; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: s_andn2_b32 s7, -1, exec_lo -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s1 -; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo -; GFX10-NEXT: s_xor_b32 s5, s5, -1 -; GFX10-NEXT: s_or_b32 s3, s3, s6 ; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 -; GFX10-NEXT: s_or_b32 s5, s7, s5 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: ; %bb.4: ; %Flow1 ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_mov_b32 s7, -1 -; GFX10-NEXT: ; implicit-def: $sgpr6 -; GFX10-NEXT: s_and_saveexec_b32 s4, s5 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: ; implicit-def: $sgpr1 +; GFX10-NEXT: s_and_saveexec_b32 s5, s3 ; GFX10-NEXT: s_cbranch_execz .LBB6_1 ; GFX10-NEXT: ; %bb.5: ; %.preheader ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, v4, v0 +; GFX10-NEXT: v_cmp_le_i32_e64 s1, v4, v0 +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: .LBB6_6: 
; %.inner_loop ; GFX10-NEXT: ; Parent Loop BB6_2 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s5, s6, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s6, exec_lo, s1 +; GFX10-NEXT: s_or_b32 s3, s6, s3 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: s_cbranch_execnz .LBB6_6 ; GFX10-NEXT: ; %bb.7: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, 0 -; GFX10-NEXT: s_mov_b32 s6, 1 -; GFX10-NEXT: s_or_b32 s7, s5, s7 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: s_andn2_b32 s3, s4, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, 0 +; GFX10-NEXT: s_mov_b32 s1, 1 +; GFX10-NEXT: s_or_b32 s4, s3, s4 ; GFX10-NEXT: s_branch .LBB6_1 ; GFX10-NEXT: .LBB6_8: ; %.exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v3, s2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v3, s0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog .entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index ede0db0e3662e..1caecb599ffed 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -1,31 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s define void 
@temporal_divergent_i1_phi(float %val, ptr %addr) { ; GFX10-LABEL: temporal_divergent_i1_phi: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s6, 1 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 1 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 -; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: .LBB0_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 1, v3 -; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4 -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v6 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 -; GFX10-NEXT: s_or_b32 s6, s6, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX10-NEXT: s_mov_b32 s7, s6 +; GFX10-NEXT: s_add_i32 s5, s5, 1 +; GFX10-NEXT: s_xor_b32 s6, s6, 1 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cmp_lg_u32 s7, 0 +; GFX10-NEXT: s_cselect_b32 s4, exec_lo, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -51,27 +48,23 @@ define void @temporal_divergent_i1_non_phi(float %val, ptr %addr) { ; GFX10-LABEL: temporal_divergent_i1_non_phi: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 1 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 -; GFX10-NEXT: ; implicit-def: $sgpr6 +; 
GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s5, 1 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB1_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 1, v3 -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v6 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 -; GFX10-NEXT: s_or_b32 s6, s6, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX10-NEXT: s_xor_b32 s5, s5, 1 +; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 +; GFX10-NEXT: s_cselect_b32 s4, exec_lo, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -98,60 +91,60 @@ exit: define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr addrspace(1) inreg %a, ptr addrspace(1) inreg %a.break) { ; GFX10-LABEL: loop_with_1break: ; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 -; GFX10-NEXT: ; implicit-def: $sgpr0 -; GFX10-NEXT: ; implicit-def: $sgpr1 +; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: s_branch .LBB2_3 ; GFX10-NEXT: .LBB2_1: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB2_3 Depth=1 -; GFX10-NEXT: v_add_co_u32 
v6, vcc_lo, v0, v6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v5 -; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v5, v2 -; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: global_load_dword v8, v[6:7], off +; GFX10-NEXT: v_mov_b32_e32 v4, s6 +; GFX10-NEXT: v_mov_b32_e32 v5, s7 +; GFX10-NEXT: s_andn2_b32 s6, s9, exec_lo ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, v9 -; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo +; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, s4, v2 +; GFX10-NEXT: s_add_i32 s4, s4, 1 +; GFX10-NEXT: global_load_dword v6, v[4:5], off +; GFX10-NEXT: s_and_b32 s7, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s9, s6, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 -; GFX10-NEXT: global_store_dword v[6:7], v8, off +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v6 +; GFX10-NEXT: global_store_dword v[4:5], v6, off ; GFX10-NEXT: .LBB2_2: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB2_3 Depth=1 -; GFX10-NEXT: s_and_b32 s5, 1, s5 -; GFX10-NEXT: s_and_b32 s6, exec_lo, s1 -; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s5 -; GFX10-NEXT: s_or_b32 s4, s6, s4 -; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo -; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 -; GFX10-NEXT: s_or_b32 s0, s0, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 +; GFX10-NEXT: s_cselect_b32 s5, exec_lo, 0 +; GFX10-NEXT: s_and_b32 s6, exec_lo, s9 +; GFX10-NEXT: s_or_b32 s8, s6, s8 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execz .LBB2_5 ; GFX10-NEXT: .LBB2_3: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s5, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s1, s1, s5 -; GFX10-NEXT: 
v_lshlrev_b64 v[6:7], 2, v[5:6] -; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v3, v6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v4, v7, vcc_lo -; GFX10-NEXT: global_load_dword v8, v[8:9], off +; GFX10-NEXT: s_ashr_i32 s5, s4, 31 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[4:5], 2 +; GFX10-NEXT: s_add_u32 s10, s0, s6 +; GFX10-NEXT: s_addc_u32 s11, s1, s7 +; GFX10-NEXT: global_load_dword v4, v3, s[10:11] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: s_cbranch_vccnz .LBB2_1 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo +; GFX10-NEXT: s_and_b32 s10, exec_lo, exec_lo +; GFX10-NEXT: s_or_b32 s9, s9, s10 +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB2_3 Depth=1 ; GFX10-NEXT: s_mov_b32 s5, 1 -; GFX10-NEXT: ; implicit-def: $vgpr5 +; GFX10-NEXT: ; implicit-def: $sgpr4 ; GFX10-NEXT: s_branch .LBB2_2 ; GFX10-NEXT: .LBB2_5: ; %loop.exit.guard -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s1, s0 -; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_and_saveexec_b32 s0, s5 +; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB2_7 ; GFX10-NEXT: ; %bb.6: ; %break.body ; GFX10-NEXT: v_mov_b32_e32 v0, 10 @@ -191,52 +184,50 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n. 
; GFX10-LABEL: nested_loops_temporal_divergence_inner: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_cmp_lt_f32_e64 s6, 1.0, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, s5 -; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB3_1: ; %OuterHeader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB3_2 Depth 2 -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX10-NEXT: s_mov_b32 s9, s6 +; GFX10-NEXT: s_ashr_i32 s7, s6, 31 +; GFX10-NEXT: s_mov_b32 s9, s8 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ; implicit-def: $sgpr8 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], 2, v[6:7] -; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo -; GFX10-NEXT: flat_load_dword v0, v[8:9] -; GFX10-NEXT: v_mov_b32_e32 v8, s5 +; GFX10-NEXT: v_mov_b32_e32 v6, s10 +; GFX10-NEXT: v_mov_b32_e32 v7, s11 +; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[6:7] ; GFX10-NEXT: .LBB3_2: ; %InnerHeader ; GFX10-NEXT: ; Parent Loop BB3_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GFX10-NEXT: s_xor_b32 s9, s9, -1 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 +; GFX10-NEXT: v_cvt_f32_u32_e32 v6, s10 +; GFX10-NEXT: s_add_i32 s10, s10, 1 +; GFX10-NEXT: s_xor_b32 s9, s9, exec_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo -; GFX10-NEXT: s_and_b32 s10, exec_lo, s9 -; GFX10-NEXT: s_or_b32 s8, s8, s10 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_2 ; GFX10-NEXT: ; %bb.3: 
; %UseInst ; GFX10-NEXT: ; in Loop: Header=BB3_1 Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v6 -; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v6, v1 -; GFX10-NEXT: v_add_co_u32 v8, s4, v4, v6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s8 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: s_or_b32 s7, vcc_lo, s7 -; GFX10-NEXT: flat_store_byte v[8:9], v7 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s7 +; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s9 +; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: v_add_co_u32 v6, s4, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v5, v7, s4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: flat_store_byte v[6:7], v0 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -279,52 +270,50 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n. 
; GFX10-LABEL: nested_loops_temporal_divergence_outer: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_cmp_lt_f32_e64 s6, 1.0, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, s5 -; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB4_1: ; %OuterHeader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB4_2 Depth 2 -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX10-NEXT: s_mov_b32 s9, s6 +; GFX10-NEXT: s_ashr_i32 s7, s6, 31 +; GFX10-NEXT: s_mov_b32 s9, s8 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ; implicit-def: $sgpr8 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], 2, v[6:7] -; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo -; GFX10-NEXT: flat_load_dword v0, v[8:9] -; GFX10-NEXT: v_mov_b32_e32 v8, s5 +; GFX10-NEXT: v_mov_b32_e32 v6, s10 +; GFX10-NEXT: v_mov_b32_e32 v7, s11 +; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[6:7] ; GFX10-NEXT: .LBB4_2: ; %InnerHeader ; GFX10-NEXT: ; Parent Loop BB4_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GFX10-NEXT: s_xor_b32 s9, s9, -1 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 +; GFX10-NEXT: v_cvt_f32_u32_e32 v6, s10 +; GFX10-NEXT: s_add_i32 s10, s10, 1 +; GFX10-NEXT: s_xor_b32 s9, s9, exec_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo -; GFX10-NEXT: s_and_b32 s10, exec_lo, s9 -; GFX10-NEXT: s_or_b32 s8, s8, s10 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB4_2 ; GFX10-NEXT: ; %bb.3: 
; %UseInst ; GFX10-NEXT: ; in Loop: Header=BB4_1 Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v6 -; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v6, v1 -; GFX10-NEXT: v_add_co_u32 v8, s4, v4, v6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v5, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s8 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: s_or_b32 s7, vcc_lo, s7 -; GFX10-NEXT: flat_store_byte v[8:9], v7 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s7 +; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s9 +; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: v_add_co_u32 v6, s4, v4, v6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v5, v7, s4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: flat_store_byte v[6:7], v0 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB4_1 ; GFX10-NEXT: ; %bb.4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -367,57 +356,50 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i ; GFX10-LABEL: nested_loops_temporal_divergence_both: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_cmp_lt_f32_e64 s6, 1.0, v0 -; GFX10-NEXT: v_mov_b32_e32 v8, s5 -; GFX10-NEXT: s_mov_b32 s7, 0 -; GFX10-NEXT: ; implicit-def: $sgpr9 -; GFX10-NEXT: ; implicit-def: $sgpr8 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB5_1: ; %OuterHeader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB5_2 Depth 2 -; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_ashr_i32 s7, s6, 31 +; GFX10-NEXT: s_mov_b32 s9, s8 
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], 2, v[8:9] -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v10 -; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v11, vcc_lo -; GFX10-NEXT: flat_load_dword v0, v[10:11] -; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, s10 +; GFX10-NEXT: v_mov_b32_e32 v9, s11 +; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[8:9] ; GFX10-NEXT: .LBB5_2: ; %InnerHeader ; GFX10-NEXT: ; Parent Loop BB5_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v11, v10 -; GFX10-NEXT: s_xor_b32 s10, s10, -1 -; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v10 +; GFX10-NEXT: v_cvt_f32_u32_e32 v8, s10 +; GFX10-NEXT: s_add_i32 s10, s10, 1 +; GFX10-NEXT: s_xor_b32 s9, s9, exec_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v11, v0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo -; GFX10-NEXT: s_and_b32 s11, exec_lo, s10 -; GFX10-NEXT: s_or_b32 s8, s8, s11 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB5_2 ; GFX10-NEXT: ; %bb.3: ; %UseInst ; GFX10-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v8 -; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v8, v1 -; GFX10-NEXT: v_add_co_u32 v10, s4, v4, v8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s4, v5, v9, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 -; GFX10-NEXT: s_or_b32 s7, vcc_lo, s7 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: s_andn2_b32 s4, s9, exec_lo -; GFX10-NEXT: s_and_b32 s9, exec_lo, s8 -; GFX10-NEXT: flat_store_byte v[10:11], v9 -; GFX10-NEXT: s_or_b32 s9, s4, s9 +; GFX10-NEXT: v_mov_b32_e32 v9, s7 +; GFX10-NEXT: v_mov_b32_e32 v8, 
s6 +; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s9 +; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: v_add_co_u32 v8, s4, v4, v8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v5, v9, s4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: flat_store_byte v[8:9], v0 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB5_1 ; GFX10-NEXT: ; %bb.4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s9 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: flat_store_byte v[6:7], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll index b26c646d40e3d..bff3ed9228e05 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll @@ -1,24 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s define void @temporal_divergent_i32(float %val, ptr %addr) { ; GFX10-LABEL: temporal_divergent_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s4, -1 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB0_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v3 -; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3 -; 
GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_add_i32 s4, s4, 1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: flat_store_dword v[1:2], v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -41,20 +41,20 @@ define void @temporal_divergent_i32_multiple_use(float %val, ptr %addr, ptr %add ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s4, -1 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB1_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v5 -; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v5 -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_add_i32 s4, s4, 1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v5, s4 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: flat_store_dword v[1:2], v5 -; GFX10-NEXT: flat_store_dword v[3:4], v5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: flat_store_dword v[1:2], v0 +; GFX10-NEXT: flat_store_dword v[3:4], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: 
_______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits