llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Diana Picus (rovka)

<details>
<summary>Changes</summary>

In dynamic VGPR mode, we can allocate up to 8 blocks of either 16 or 32 VGPRs (based on a chip-wide setting, which we can model with a subtarget feature). Update some of the subtarget helpers to reflect this. In particular:
- getVGPRAllocGranule is set to the block size
- getAddressableNumVGPRs limits itself to 8 * the size of a block

We also try to be more careful about how many VGPR blocks we allocate: when deciding whether to revert scheduling after a given stage, we check that we haven't increased the number of VGPR blocks that need to be allocated.

---

Full diff: https://github.com/llvm/llvm-project/pull/130047.diff

6 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+6)
- (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (+10)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+1)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+6)
- (modified) llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp (+62)
- (modified) llvm/unittests/Target/AMDGPU/CMakeLists.txt (+1)
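To make the block arithmetic concrete, here is a minimal standalone sketch. It is not the actual LLVM code: the function names are only illustrative stand-ins for getVGPRAllocGranule, getAddressableNumVGPRs and getAllocatedNumVGPRBlocks, and the non-dynamic fallback values are placeholders.

```cpp
#include <cassert>

// Stand-in for getVGPRAllocGranule: in dynamic VGPR mode the granule is the
// block size (16, or 32 with dynamic-vgpr-block-size-32).
unsigned vgprAllocGranule(bool DynamicVGPR, bool BlockSize32) {
  if (DynamicVGPR)
    return BlockSize32 ? 32 : 16;
  return 24; // placeholder for the non-dynamic path (depends on wave size)
}

// Stand-in for getAddressableNumVGPRs: at most 8 blocks per wave.
unsigned addressableNumVGPRs(bool DynamicVGPR, bool BlockSize32) {
  if (DynamicVGPR)
    return 8 * vgprAllocGranule(DynamicVGPR, BlockSize32); // 128 or 256
  return 256; // placeholder for the non-dynamic path
}

// Stand-in for getAllocatedNumVGPRBlocks: round the VGPR count up to a
// whole number of blocks.
unsigned allocatedNumVGPRBlocks(unsigned NumVGPRs, unsigned Granule) {
  return (NumVGPRs + Granule - 1) / Granule;
}

int main() {
  assert(addressableNumVGPRs(true, false) == 128); // 8 x 16
  assert(addressableNumVGPRs(true, true) == 256);  // 8 x 32
  // The scheduler reverts a stage if the pressure after it needs more
  // blocks than the pressure before it, e.g. 3 blocks vs 4 blocks here.
  assert(allocatedNumVGPRBlocks(33, 16) == 3);
  assert(allocatedNumVGPRBlocks(49, 16) == 4);
  return 0;
}
```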
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 31a98ee132bf6..339eeec72da46 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1245,6 +1245,12 @@ def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
   "Enable dynamic VGPR mode"
 >;
 
+def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
+  "DynamicVGPRBlockSize32",
+  "true",
+  "Use a block size of 32 for dynamic VGPR allocation (default is 16)"
+>;
+
 // Dummy feature used to disable assembler instructions.
 def FeatureDisable : SubtargetFeature<"",
   "FeatureDisable","true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index c277223de13ac..4cc71f321f8f2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1452,6 +1452,16 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
   if (WavesAfter < DAG.MinOccupancy)
     return true;
 
+  // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
+  if (ST.isDynamicVGPREnabled()) {
+    unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
+        &ST, PressureBefore.getVGPRNum(false));
+    unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
+        &ST, PressureAfter.getVGPRNum(false));
+    if (BlocksAfter > BlocksBefore)
+      return true;
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 1254cbad83b60..9ccf38fb4dbbe 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -191,6 +191,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   unsigned MaxHardClauseLength = 0;
   bool SupportsSRAMECC = false;
   bool DynamicVGPR = false;
+  bool DynamicVGPRBlockSize32 = false;
 
   // This should not be used directly. 'TargetID' tracks the dynamic settings
   // for SRAMECC.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b51cf536467b9..bebbb0dde0b9b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1154,6 +1154,9 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
   if (STI->getFeatureBits().test(FeatureGFX90AInsts))
     return 8;
 
+  if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+    return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
+
   bool IsWave32 = EnableWavefrontSize32 ?
                       *EnableWavefrontSize32 :
                       STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -1195,6 +1198,9 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
 
 unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
   if (STI->getFeatureBits().test(FeatureGFX90AInsts))
     return 512;
+  if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+    // On GFX12 we can allocate at most 8 blocks of VGPRs.
+    return 8 * getVGPRAllocGranule(STI);
   return getAddressableNumArchVGPRs(STI);
 }
 
diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
index 8fbd470815b79..21f45443281e7 100644
--- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
+++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
@@ -152,6 +152,24 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
   EXPECT_TRUE(ErrStr.empty()) << ErrStr;
 }
 
+static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
+                                  TestFuncTy test) {
+  auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName,
+                                      "+dynamic-vgpr," + FS.str());
+  ASSERT_TRUE(TM) << "No target machine";
+
+  GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+                  std::string(TM->getTargetFeatureString()), *TM);
+  ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
+
+  std::stringstream Table;
+  bool Success = testAndRecord(Table, ST, test);
+  EXPECT_TRUE(Success && !PrintCpuRegLimits)
+      << CPUName << " dynamic VGPR " << FS
+      << ":\nOcc MinVGPR MaxVGPR\n"
+      << Table.str() << '\n';
+}
+
 TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
   auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) {
     unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
@@ -163,6 +181,50 @@ TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
   };
 
   testGPRLimits("VGPR", true, test);
+
+  testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test);
+  testDynamicVGPRLimits("gfx1200",
+                        "+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
+}
+
+static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
+                               unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
+                               unsigned ExpectedMaxVGPRs) {
+  auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
+  ASSERT_TRUE(TM) << "No target machine";
+
+  GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+                  std::string(TM->getTargetFeatureString()), *TM);
+
+  // Test function without attributes.
+  LLVMContext Context;
+  Module M("", Context);
+  Function *Func =
+      Function::Create(FunctionType::get(Type::getVoidTy(Context), false),
+                       GlobalValue::ExternalLinkage, "testFunc", &M);
+  Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
+  Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
+
+  auto Range = ST.getWavesPerEU(*Func);
+  EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
+  EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
+  EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
+  EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs())
+      << CPUName << ' ' << FS;
+
+  // Function with requested 'amdgpu-waves-per-eu' in a valid range.
+  Func->addFnAttr("amdgpu-waves-per-eu", "10,12");
+  Range = ST.getWavesPerEU(*Func);
+  EXPECT_EQ(10u, Range.first) << CPUName << ' ' << FS;
+  EXPECT_EQ(12u, Range.second) << CPUName << ' ' << FS;
+}
+
+TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
+  testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256);
+  testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128);
+  testAbsoluteLimits(
+      "gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32",
+      1, 16, 256);
 }
 
 static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) {
diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
index ca8f48bc393ef..6d6f17883a07e 100644
--- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
@@ -13,6 +13,7 @@ set(LLVM_LINK_COMPONENTS
   Core
   GlobalISel
   MC
+  MIRParser
   Support
   TargetParser
   )
``````````
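As a usage note (not part of this patch): the new behaviour should be reachable through the standard subtarget-feature mechanism, for example something along the lines of `llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32`. Only the feature names come from this patch; the invocation itself is just an illustration, mirroring the feature strings exercised by the unit tests above.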
</details>

https://github.com/llvm/llvm-project/pull/130047

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits