https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/132385
>From 0bc832089bf02e0069f441d70728943de51766c6 Mon Sep 17 00:00:00 2001 From: Petar Avramovic <petar.avramo...@amd.com> Date: Mon, 14 Apr 2025 16:35:19 +0200 Subject: [PATCH] AMDGPU/GlobalISel: add RegBankLegalize rules for bit shifts and sext-inreg Uniform S16 shifts have to be extended to S32 using appropriate Extend before lowering to S32 instruction. Uniform packed V2S16 are lowered to SGPR S32 instructions, other option is to use VALU packed V2S16 and ReadAnyLane. For uniform S32 and S64 and divergent S16, S32, S64 and V2S16 there are instructions available. --- .../Target/AMDGPU/AMDGPURegBankLegalize.cpp | 2 +- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 108 +++++++++ .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 43 +++- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 11 + llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 35 ++- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 218 ++++++++++-------- .../AMDGPU/GlobalISel/regbankselect-ashr.mir | 6 +- .../AMDGPU/GlobalISel/regbankselect-lshr.mir | 17 +- .../GlobalISel/regbankselect-sext-inreg.mir | 24 +- .../AMDGPU/GlobalISel/regbankselect-shl.mir | 6 +- .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 34 +-- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 31 ++- 13 files changed, 375 insertions(+), 165 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index 9544c9f43eeaf..53b8d64d2935d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -310,7 +310,7 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { // Opcodes that support pretty much all combinations of reg banks and LLTs // (except S1). There is no point in writing rules for them. if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES || - Opc == AMDGPU::G_MERGE_VALUES) { + Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) { RBLHelper.applyMappingTrivial(*MI); continue; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index a7c1d7ab98adf..7ff822c6f6580 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -171,6 +171,62 @@ void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { MI.eraseFromParent(); } +const std::pair<Register, Register> +RegBankLegalizeHelper::unpackZExt(Register Reg) { + auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg); + auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff); + auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask); + auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16)); + return {Lo.getReg(0), Hi.getReg(0)}; +} + +const std::pair<Register, Register> +RegBankLegalizeHelper::unpackSExt(Register Reg) { + auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg); + auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16); + auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16)); + return {Lo.getReg(0), Hi.getReg(0)}; +} + +const std::pair<Register, Register> +RegBankLegalizeHelper::unpackAExt(Register Reg) { + auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg); + auto Lo = PackedS32; + auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16)); + return {Lo.getReg(0), Hi.getReg(0)}; +} + +void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { + Register Lo, Hi; + switch (MI.getOpcode()) { + case AMDGPU::G_SHL: { + auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg()); + auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg()); + Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0); + Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0); + break; + } + case AMDGPU::G_LSHR: { + auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg()); + auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg()); + Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0); + Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0); + break; + } + case AMDGPU::G_ASHR: { + auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg()); + auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg()); + Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0); + Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0); + break; + } + default: + llvm_unreachable("Unpack lowering not implemented"); + } + B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi}); + MI.eraseFromParent(); +} + static bool isSignedBFE(MachineInstr &MI) { if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI)) return (GI->is(Intrinsic::amdgcn_sbfe)); @@ -306,6 +362,33 @@ void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { MI.eraseFromParent(); } +void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) { + auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg()); + int Amt = MI.getOperand(2).getImm(); + Register Lo, Hi; + // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend + if (Amt <= 32) { + auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0)); + if (Amt == 32) { + // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx + Lo = Freeze.getReg(0); + } else { + // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx + Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0); + } + + auto SignExtCst = B.buildConstant(SgprRB_S32, 31); + Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0); + } else { + // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx + Lo = Op1.getReg(0); + Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0); + } + + B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi}); + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet<Register, 4> &WaterfallSgprs) { @@ -328,6 +411,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); return; } + case UnpackBitShift: + return lowerUnpackBitShift(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; @@ -394,6 +479,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, return lowerSplitTo32(MI); case SplitTo32Select: return lowerSplitTo32Select(MI); + case SplitTo32SExtInReg: + return lowerSplitTo32SExtInReg(MI); case SplitLoad: { LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); unsigned Size = DstTy.getSizeInBits(); @@ -483,6 +570,13 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case SgprP5: case VgprP5: return LLT::pointer(5, 32); + case SgprV2S16: + case VgprV2S16: + case UniInVgprV2S16: + return LLT::fixed_vector(2, 16); + case SgprV2S32: + case VgprV2S32: + return LLT::fixed_vector(2, 32); case SgprV4S32: case VgprV4S32: case UniInVgprV4S32: @@ -556,6 +650,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprP3: case SgprP4: case SgprP5: + case SgprV2S16: + case SgprV2S32: case SgprV4S32: case SgprB32: case SgprB64: @@ -565,6 +661,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprB512: case UniInVcc: case UniInVgprS32: + case UniInVgprV2S16: case UniInVgprV4S32: case UniInVgprB32: case UniInVgprB64: @@ -586,6 +683,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case VgprP3: case VgprP4: case VgprP5: + case VgprV2S16: + case VgprV2S32: case VgprV4S32: case VgprB32: case VgprB64: @@ -623,6 +722,8 @@ void RegBankLegalizeHelper::applyMappingDst( case SgprP3: case SgprP4: case SgprP5: + case SgprV2S16: + case SgprV2S32: case SgprV4S32: case Vgpr16: case Vgpr32: @@ -632,6 +733,8 @@ void RegBankLegalizeHelper::applyMappingDst( case VgprP3: case VgprP4: case VgprP5: + case VgprV2S16: + case VgprV2S32: case VgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); assert(RB == getRegBankFromID(MethodIDs[OpIdx])); @@ -666,6 +769,7 @@ void RegBankLegalizeHelper::applyMappingDst( break; } case UniInVgprS32: + case UniInVgprV2S16: case UniInVgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); assert(RB == SgprRB); @@ -739,6 +843,8 @@ void RegBankLegalizeHelper::applyMappingSrc( case SgprP3: case SgprP4: case SgprP5: + case SgprV2S16: + case SgprV2S32: case SgprV4S32: { assert(Ty == getTyFromID(MethodIDs[i])); assert(RB == getRegBankFromID(MethodIDs[i])); @@ -764,6 +870,8 @@ void RegBankLegalizeHelper::applyMappingSrc( case VgprP3: case VgprP4: case VgprP5: + case VgprV2S16: + case VgprV2S32: case VgprV4S32: { assert(Ty == getTyFromID(MethodIDs[i])); if (RB != VgprRB) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index a9011ba07b8e6..50bd86dc15a1f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -111,10 +111,15 @@ class RegBankLegalizeHelper { SmallSet<Register, 4> &SgprWaterfallOperandRegs); void lowerVccExtToSel(MachineInstr &MI); + const std::pair<Register, Register> unpackZExt(Register Reg); + const std::pair<Register, Register> unpackSExt(Register Reg); + const std::pair<Register, Register> unpackAExt(Register Reg); + void lowerUnpackBitShift(MachineInstr &MI); void lowerV_BFE(MachineInstr &MI); void lowerS_BFE(MachineInstr &MI); void lowerSplitTo32(MachineInstr &MI); void lowerSplitTo32Select(MachineInstr &MI); + void lowerSplitTo32SExtInReg(MachineInstr &MI); }; } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index f803217f82e6c..89056b0271f12 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -60,6 +60,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64); case P5: return MRI.getType(Reg) == LLT::pointer(5, 32); + case V2S32: + return MRI.getType(Reg) == LLT::fixed_vector(2, 32); case V4S32: return MRI.getType(Reg) == LLT::fixed_vector(4, 32); case B32: @@ -92,6 +94,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg); case UniP5: return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg); + case UniV2S16: + return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg); case UniB32: return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg); case UniB64: @@ -122,6 +126,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg); case DivP5: return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg); + case DivV2S16: + return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg); case DivB32: return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg); case DivB64: @@ -435,7 +441,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, MachineRegisterInfo &_MRI) : ST(&_ST), MRI(&_MRI) { - addRulesForGOpcs({G_ADD}, Standard) + addRulesForGOpcs({G_ADD, G_SUB}, Standard) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); @@ -452,11 +458,36 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32}); addRulesForGOpcs({G_SHL}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackBitShift}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}}) .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); + + addRulesForGOpcs({G_LSHR}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackBitShift}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); - addRulesForGOpcs({G_LSHR}, Standard).Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}); + addRulesForGOpcs({G_ASHR}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackBitShift}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); + + addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}}); addRulesForGOpcs({G_UBFX, G_SBFX}, Standard) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, S_BFE}) @@ -515,6 +546,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}}) .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}}) .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}}) + .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}}) + .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}}) // This is non-trivial. VgprToVccCopy is done using compare instruction. .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}) .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}}) @@ -550,6 +583,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}) .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}}); + addRulesForGOpcs({G_SEXT_INREG}) + .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}) + .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}}) + .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}}) + .Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SplitTo32SExtInReg}}); + bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12; bool hasSMRDSmall = ST->hasScalarSubwordLoads(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 1c70597024b6a..bddfb8dd1913f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -75,6 +75,10 @@ enum UniformityLLTOpPredicateID { V3S32, V4S32, + UniV2S16, + + DivV2S16, + // B types B32, B64, @@ -117,7 +121,9 @@ enum RegBankLLTMappingApplyID { SgprP3, SgprP4, SgprP5, + SgprV2S16, SgprV4S32, + SgprV2S32, SgprB32, SgprB64, SgprB96, @@ -134,6 +140,8 @@ enum RegBankLLTMappingApplyID { VgprP3, VgprP4, VgprP5, + VgprV2S16, + VgprV2S32, VgprB32, VgprB64, VgprB96, @@ -145,6 +153,7 @@ enum RegBankLLTMappingApplyID { // Dst only modifiers: read-any-lane and truncs UniInVcc, UniInVgprS32, + UniInVgprV2S16, UniInVgprV4S32, UniInVgprB32, UniInVgprB64, @@ -173,11 +182,13 @@ enum LoweringMethodID { DoNotLower, VccExtToSel, UniExtToSel, + UnpackBitShift, S_BFE, V_BFE, VgprToVccCopy, SplitTo32, SplitTo32Select, + SplitTo32SExtInReg, Ext32To64, UniCstExt, SplitLoad, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index aea32b3fedba7..dfeb1bbd708cb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_ashr_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_ashr_i8: @@ -70,14 +70,29 @@ define i8 @v_ashr_i8_7(i8 %value) { } define amdgpu_ps i8 @s_ashr_i8(i8 inreg %value, i8 inreg %amount) { -; GCN-LABEL: s_ashr_i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_sext_i32_i8 s0, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_ashr_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_sext_i32_i8 s0, s0 +; GFX6-NEXT: s_ashr_i32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_ashr_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_sext_i32_i8 s0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_ashr_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i8: ; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff ; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index be1dc7f0c67f9..6baa10bb48621 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_lshr_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_lshr_i8: @@ -69,15 +69,33 @@ define i8 @v_lshr_i8_7(i8 %value) { } define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) { -; GCN-LABEL: s_lshr_i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_and_b32 s0, s0, 0xff -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_lshr_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s0, s0, 0xff +; GFX6-NEXT: s_lshr_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_lshr_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_lshr_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i8: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i8 %value, %amount @@ -93,18 +111,21 @@ define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) { ; GFX8-LABEL: s_lshr_i8_7: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshr_b32 s0, s0, 7 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_i8_7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_lshr_b32 s0, s0, 7 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i8_7: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 7 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i8 %value, 7 @@ -831,22 +852,22 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s2, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, s1 -; GFX9-NEXT: s_lshr_b32 s1, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: s_lshr_b32 s1, s2, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_v2i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s2, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshr_b32 s1, s2, s3 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s2, s1 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s3 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s1, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to i32 @@ -1024,34 +1045,34 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; ; GFX9-LABEL: s_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s4, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s2, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s2, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s4, s2 +; GFX9-NEXT: s_lshr_b32 s0, s0, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX9-NEXT: s_and_b32 s2, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_lshr_b32 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s2, s2, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_v4i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s4, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s4, s5 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s2, s4, s2 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s5 +; GFX10PLUS-NEXT: s_and_b32 s4, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s3 -; GFX10PLUS-NEXT: s_lshr_b32 s3, s4, s5 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s4, s3 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s5 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s3, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x i32> @@ -1221,58 +1242,58 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; ; GFX9-LABEL: s_lshr_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s8, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s8, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_lshr_b32 s9, s4, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s4, s8, s4 +; GFX9-NEXT: s_lshr_b32 s0, s0, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s0 +; GFX9-NEXT: s_and_b32 s4, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s5, 16 -; GFX9-NEXT: s_lshr_b32 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s4, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-NEXT: s_lshr_b32 s5, s6, 16 -; GFX9-NEXT: s_lshr_b32 s2, s2, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: s_lshr_b32 s1, s1, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s1 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: s_lshr_b32 s5, s6, 16 +; GFX9-NEXT: s_lshr_b32 s4, s4, s6 +; GFX9-NEXT: s_lshr_b32 s2, s2, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX9-NEXT: s_and_b32 s4, s3, 0xffff +; GFX9-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NEXT: s_lshr_b32 s5, s7, 16 -; GFX9-NEXT: s_lshr_b32 s3, s3, s7 -; GFX9-NEXT: s_lshr_b32 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_lshr_b32 s4, s4, s7 +; GFX9-NEXT: s_lshr_b32 s3, s3, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_v8i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshr_b32 s8, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s8, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s9, s4, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s4 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s8, s9 -; GFX10PLUS-NEXT: s_lshr_b32 s8, s1, 16 -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s4, s8, s4 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s9 +; GFX10PLUS-NEXT: s_and_b32 s8, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s9, s5, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10PLUS-NEXT: s_lshr_b32 s5, s8, s9 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10PLUS-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s5, s8, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s9 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s4, s0 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s5, s1 +; GFX10PLUS-NEXT: s_and_b32 s4, s2, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s5, s6, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s4, s5 -; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10PLUS-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, s5 +; GFX10PLUS-NEXT: s_and_b32 s5, s3, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s6, s7, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, s7 -; GFX10PLUS-NEXT: s_lshr_b32 s5, s5, s6 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s5, s7 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, s6 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s3, s5, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x i32> @@ -1605,8 +1626,9 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX6-LABEL: v_lshr_i65: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, 1 ; GFX6-NEXT: v_mov_b32_e32 v5, 0 +; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3 @@ -1627,8 +1649,9 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX8-LABEL: v_lshr_i65: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1649,8 +1672,9 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX9-LABEL: v_lshr_i65: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3 ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1671,6 +1695,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX10-LABEL: v_lshr_i65: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v3 @@ -1693,21 +1718,22 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX11-LABEL: v_lshr_i65: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 +; GFX11-NEXT: v_mov_b32_e32 v4, 1 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v4, 1, v2 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v2, v[4:5] -; GFX11-NEXT: v_or_b32_e32 v2, v6, v8 -; GFX11-NEXT: v_or_b32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] ; GFX11-NEXT: v_lshrrev_b64 v[4:5], v3, v[4:5] -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v2, v6, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v9 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = lshr i65 %value, %amount @@ -1719,8 +1745,9 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, 1 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1731,8 +1758,9 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1743,8 +1771,9 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1755,6 +1784,7 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v3 @@ -1766,8 +1796,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX11-LABEL: v_lshr_i65_33: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v2 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, 1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir index 615cfec2b31cf..a0cb85f710443 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s --- name: ashr_s32_ss @@ -206,8 +205,7 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32) + ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C]](s32) ; CHECK-NEXT: [[ASHR2:%[0-9]+]]:sgpr(s32) = G_ASHR [[SEXT_INREG]], [[SEXT_INREG1]](s32) ; CHECK-NEXT: [[ASHR3:%[0-9]+]]:sgpr(s32) = G_ASHR [[ASHR]], [[ASHR1]](s32) ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ASHR2]](s32), [[ASHR3]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir index c5024924a4d32..60b89bf42031d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s --- name: lshr_s32_ss @@ -201,15 +200,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32) - ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:sgpr(s32) = G_LSHR [[AND]], [[AND1]](s32) ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:sgpr(s32) = G_LSHR [[LSHR]], [[LSHR1]](s32) ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir index cf0ca2c9eb634..1a8fa56a7f799 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s --- name: sext_inreg_s_s32_1 @@ -137,7 +136,7 @@ body: | ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:vgpr(s32) = G_FREEZE [[UV]] ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[FREEZE]], 1 - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[SEXT_INREG]], [[C]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SEXT_INREG]](s32), [[ASHR]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) @@ -162,7 +161,7 @@ body: | ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:vgpr(s32) = G_FREEZE [[UV]] ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[FREEZE]], 31 - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[SEXT_INREG]], [[C]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SEXT_INREG]](s32), [[ASHR]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) @@ -186,7 +185,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:vgpr(s32) = G_FREEZE [[UV]] - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[FREEZE]], [[C]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[FREEZE]](s32), [[ASHR]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) @@ -209,9 +208,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[COPY1]], 1 - ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY1]](s32), [[SEXT_INREG]](s32) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[UV1]], 1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[UV]](s32), [[SEXT_INREG]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_SEXT_INREG %0, 33 @@ -232,9 +230,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[COPY1]], 3 - ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY1]](s32), [[SEXT_INREG]](s32) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[UV1]], 3 + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[UV]](s32), [[SEXT_INREG]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_SEXT_INREG %0, 35 @@ -255,9 +252,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[COPY1]], 31 - ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY1]](s32), [[SEXT_INREG]](s32) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[UV1]], 31 + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[UV]](s32), [[SEXT_INREG]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_SEXT_INREG %0, 63 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-shl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-shl.mir index b4290ea0a4203..6bdf8e7e1de6f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-shl.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-shl.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s --- name: shl_s32_ss @@ -204,8 +203,7 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[BITCAST]], [[BITCAST1]](s32) ; CHECK-NEXT: [[SHL1:%[0-9]+]]:sgpr(s32) = G_SHL [[LSHR]], [[LSHR1]](s32) ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SHL]](s32), [[SHL1]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index 46b75eb55cb52..a9b3deb3e49f4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_sext_inreg_i8_4(i8 %value) { ; GCN-LABEL: v_sext_inreg_i8_4: @@ -1077,13 +1077,13 @@ define i64 @v_sext_inreg_i64_23(i64 %value) { ; GCN-LABEL: v_sext_inreg_i64_23: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_i32 v1, v0, 0, 9 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 9 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_sext_inreg_i64_23: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 9 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 9 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %shl = shl i64 %value, 23 %ashr = ashr i64 %shl, 23 @@ -1170,13 +1170,13 @@ define i64 @v_sext_inreg_i64_31(i64 %value) { ; GCN-LABEL: v_sext_inreg_i64_31: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_i32 v1, v0, 0, 1 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_sext_inreg_i64_31: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 1 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %shl = shl i64 %value, 31 %ashr = ashr i64 %shl, 31 @@ -1262,15 +1262,15 @@ define <2 x i64> @v_sext_inreg_v2i64_16(<2 x i64> %value) { ; GCN-LABEL: v_sext_inreg_v2i64_16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GCN-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GCN-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_sext_inreg_v2i64_16: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX10PLUS-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX10PLUS-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %shl = shl <2 x i64> %value, <i64 16, i64 16> %ashr = ashr <2 x i64> %shl, <i64 16, i64 16> @@ -1281,15 +1281,15 @@ define <2 x i64> @v_sext_inreg_v2i64_31(<2 x i64> %value) { ; GCN-LABEL: v_sext_inreg_v2i64_31: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_i32 v1, v0, 0, 1 -; GCN-NEXT: v_bfe_i32 v3, v2, 0, 1 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GCN-NEXT: v_bfe_i32 v3, v3, 0, 1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_sext_inreg_v2i64_31: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 1 -; GFX10PLUS-NEXT: v_bfe_i32 v3, v2, 0, 1 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GFX10PLUS-NEXT: v_bfe_i32 v3, v3, 0, 1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %shl = shl <2 x i64> %value, <i64 31, i64 31> %ashr = ashr <2 x i64> %shl, <i64 31, i64 31> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 139652eb55e3d..2f03c7156babc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_shl_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_shl_i8: @@ -64,13 +64,26 @@ define i8 @v_shl_i8_7(i8 %value) { } define amdgpu_ps i8 @s_shl_i8(i8 inreg %value, i8 inreg %amount) { -; GCN-LABEL: s_shl_i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_lshl_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_shl_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshl_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_shl_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_shl_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i8: ; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i8 %value, %amount _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits