[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/124298 >From 9a5bfd368f40d1c313dc1783df4f43f12b88258b Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 28 Feb 2025 15:56:04 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 44 +++- llvm/include/llvm/ADT/GenericUniformityInfo.h | 5 ++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 6 +-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 52 ++- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 25 +++-- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 +++ .../divergence-temporal-divergent-reg.ll | 38 +++--- .../divergence-temporal-divergent-reg.mir | 8 +-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++--- 12 files changed, 176 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e08..51e9ac30391fe 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -51,6 +51,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "uniformity" @@ 
-342,6 +343,9 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -396,6 +400,11 @@ template class GenericUniformityAnalysisImpl { void print(raw_ostream &out) const; + SmallVector TemporalDivergenceList; + + void recordTemporalDivergence(ConstValueRefT, const InstructionT *, +const CycleT *); + protected: /// \brief Value/block pair representing a single phi input. struct PhiInput { @@ -1129,6 +1138,13 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl::recordTemporalDivergence( +ConstValueRefT Val, const InstructionT *User, const CycleT *Cycle) { + TemporalDivergenceList.emplace_back(Val, const_cast(User), + Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1146,6 +1162,12 @@ template void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { bool haveDivergentArgs = false; + // When we print Value, LLVM IR instruction, we want to print extra new line. + // In LLVM IR print function for Value does not print new line at the end. + // In MIR print for MachineInstr prints new line at the end. + constexpr bool IsMIR = std::is_same::value; + std::string NewLine = IsMIR ? "" : "\n"; + // Control flow instructions may be divergent even if their inputs are // uniform. Thus, although exceedingly rare, it is possible to have a program // with no divergent values but with divergent control structures. 
@@ -1180,6 +1202,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!TemporalDivergenceList.empty()) { +OS << "\nTEMPORAL DIVERGENCE LIST:\n"; + +for (auto [Val, UseInst, Cycle] : TemporalDivergenceList) { + OS << "Value :" << Context.print(Val) << NewLine + << "Used by :" << Context.print(UseInst) << NewLine + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1191,7 +1223,7 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { OS << " DIVERGENT: "; else OS << " "; - OS << Context.print(value) << '\n'; + OS << Context.print(value) << NewLine; } OS << "TERMINATORS\n"; @@ -1203,13 +1235,21 @@ void GenericUniformityAnalysisImpl::prin
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/124298 >From 31f4387623100e815d478245ca78b6dcc52f869e Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 28 Feb 2025 15:56:04 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 44 +++- llvm/include/llvm/ADT/GenericUniformityInfo.h | 5 ++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 6 +-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 52 ++- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 25 +++-- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 +++ .../divergence-temporal-divergent-reg.ll | 38 +++--- .../divergence-temporal-divergent-reg.mir | 8 +-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++--- 12 files changed, 176 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e08..51e9ac30391fe 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -51,6 +51,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "uniformity" @@ 
-342,6 +343,9 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -396,6 +400,11 @@ template class GenericUniformityAnalysisImpl { void print(raw_ostream &out) const; + SmallVector TemporalDivergenceList; + + void recordTemporalDivergence(ConstValueRefT, const InstructionT *, +const CycleT *); + protected: /// \brief Value/block pair representing a single phi input. struct PhiInput { @@ -1129,6 +1138,13 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl::recordTemporalDivergence( +ConstValueRefT Val, const InstructionT *User, const CycleT *Cycle) { + TemporalDivergenceList.emplace_back(Val, const_cast(User), + Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1146,6 +1162,12 @@ template void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { bool haveDivergentArgs = false; + // When we print Value, LLVM IR instruction, we want to print extra new line. + // In LLVM IR print function for Value does not print new line at the end. + // In MIR print for MachineInstr prints new line at the end. + constexpr bool IsMIR = std::is_same::value; + std::string NewLine = IsMIR ? "" : "\n"; + // Control flow instructions may be divergent even if their inputs are // uniform. Thus, although exceedingly rare, it is possible to have a program // with no divergent values but with divergent control structures. 
@@ -1180,6 +1202,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!TemporalDivergenceList.empty()) { +OS << "\nTEMPORAL DIVERGENCE LIST:\n"; + +for (auto [Val, UseInst, Cycle] : TemporalDivergenceList) { + OS << "Value :" << Context.print(Val) << NewLine + << "Used by :" << Context.print(UseInst) << NewLine + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1191,7 +1223,7 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { OS << " DIVERGENT: "; else OS << " "; - OS << Context.print(value) << '\n'; + OS << Context.print(value) << NewLine; } OS << "TERMINATORS\n"; @@ -1203,13 +1235,21 @@ void GenericUniformityAnalysisImpl::prin
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/124298 >From 31f4387623100e815d478245ca78b6dcc52f869e Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 28 Feb 2025 15:56:04 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 44 +++- llvm/include/llvm/ADT/GenericUniformityInfo.h | 5 ++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 6 +-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 52 ++- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 25 +++-- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 +++ .../divergence-temporal-divergent-reg.ll | 38 +++--- .../divergence-temporal-divergent-reg.mir | 8 +-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++--- 12 files changed, 176 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e08..51e9ac30391fe 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -51,6 +51,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "uniformity" @@ 
-342,6 +343,9 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -396,6 +400,11 @@ template class GenericUniformityAnalysisImpl { void print(raw_ostream &out) const; + SmallVector TemporalDivergenceList; + + void recordTemporalDivergence(ConstValueRefT, const InstructionT *, +const CycleT *); + protected: /// \brief Value/block pair representing a single phi input. struct PhiInput { @@ -1129,6 +1138,13 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl::recordTemporalDivergence( +ConstValueRefT Val, const InstructionT *User, const CycleT *Cycle) { + TemporalDivergenceList.emplace_back(Val, const_cast(User), + Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1146,6 +1162,12 @@ template void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { bool haveDivergentArgs = false; + // When we print Value, LLVM IR instruction, we want to print extra new line. + // In LLVM IR print function for Value does not print new line at the end. + // In MIR print for MachineInstr prints new line at the end. + constexpr bool IsMIR = std::is_same::value; + std::string NewLine = IsMIR ? "" : "\n"; + // Control flow instructions may be divergent even if their inputs are // uniform. Thus, although exceedingly rare, it is possible to have a program // with no divergent values but with divergent control structures. 
@@ -1180,6 +1202,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!TemporalDivergenceList.empty()) { +OS << "\nTEMPORAL DIVERGENCE LIST:\n"; + +for (auto [Val, UseInst, Cycle] : TemporalDivergenceList) { + OS << "Value :" << Context.print(Val) << NewLine + << "Used by :" << Context.print(UseInst) << NewLine + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1191,7 +1223,7 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { OS << " DIVERGENT: "; else OS << " "; - OS << Context.print(value) << '\n'; + OS << Context.print(value) << NewLine; } OS << "TERMINATORS\n"; @@ -1203,13 +1235,21 @@ void GenericUniformityAnalysisImpl::prin
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/124298 >From f084882197a92f537c38ec19dfabdafdd9f15d09 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 28 Feb 2025 15:56:04 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 44 +++- llvm/include/llvm/ADT/GenericUniformityInfo.h | 5 ++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 6 +-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 52 ++- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 25 +++-- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 +++ .../divergence-temporal-divergent-reg.ll | 38 +++--- .../divergence-temporal-divergent-reg.mir | 8 +-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++--- 12 files changed, 176 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e08..51e9ac30391fe 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -51,6 +51,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "uniformity" @@ 
-342,6 +343,9 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -396,6 +400,11 @@ template class GenericUniformityAnalysisImpl { void print(raw_ostream &out) const; + SmallVector TemporalDivergenceList; + + void recordTemporalDivergence(ConstValueRefT, const InstructionT *, +const CycleT *); + protected: /// \brief Value/block pair representing a single phi input. struct PhiInput { @@ -1129,6 +1138,13 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl::recordTemporalDivergence( +ConstValueRefT Val, const InstructionT *User, const CycleT *Cycle) { + TemporalDivergenceList.emplace_back(Val, const_cast(User), + Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1146,6 +1162,12 @@ template void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { bool haveDivergentArgs = false; + // When we print Value, LLVM IR instruction, we want to print extra new line. + // In LLVM IR print function for Value does not print new line at the end. + // In MIR print for MachineInstr prints new line at the end. + constexpr bool IsMIR = std::is_same::value; + std::string NewLine = IsMIR ? "" : "\n"; + // Control flow instructions may be divergent even if their inputs are // uniform. Thus, although exceedingly rare, it is possible to have a program // with no divergent values but with divergent control structures. 
@@ -1180,6 +1202,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!TemporalDivergenceList.empty()) { +OS << "\nTEMPORAL DIVERGENCE LIST:\n"; + +for (auto [Val, UseInst, Cycle] : TemporalDivergenceList) { + OS << "Value :" << Context.print(Val) << NewLine + << "Used by :" << Context.print(UseInst) << NewLine + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1191,7 +1223,7 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { OS << " DIVERGENT: "; else OS << " "; - OS << Context.print(value) << '\n'; + OS << Context.print(value) << NewLine; } OS << "TERMINATORS\n"; @@ -1203,13 +1235,21 @@ void GenericUniformityAnalysisImpl::prin
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/nhaehnle approved this pull request. Thanks! https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/124298 >From f084882197a92f537c38ec19dfabdafdd9f15d09 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 28 Feb 2025 15:56:04 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 44 +++- llvm/include/llvm/ADT/GenericUniformityInfo.h | 5 ++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 6 +-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 52 ++- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 25 +++-- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 +++ .../divergence-temporal-divergent-reg.ll | 38 +++--- .../divergence-temporal-divergent-reg.mir | 8 +-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++--- 12 files changed, 176 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e08..51e9ac30391fe 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -51,6 +51,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "uniformity" @@ 
-342,6 +343,9 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -396,6 +400,11 @@ template class GenericUniformityAnalysisImpl { void print(raw_ostream &out) const; + SmallVector TemporalDivergenceList; + + void recordTemporalDivergence(ConstValueRefT, const InstructionT *, +const CycleT *); + protected: /// \brief Value/block pair representing a single phi input. struct PhiInput { @@ -1129,6 +1138,13 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl::recordTemporalDivergence( +ConstValueRefT Val, const InstructionT *User, const CycleT *Cycle) { + TemporalDivergenceList.emplace_back(Val, const_cast(User), + Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1146,6 +1162,12 @@ template void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { bool haveDivergentArgs = false; + // When we print Value, LLVM IR instruction, we want to print extra new line. + // In LLVM IR print function for Value does not print new line at the end. + // In MIR print for MachineInstr prints new line at the end. + constexpr bool IsMIR = std::is_same::value; + std::string NewLine = IsMIR ? "" : "\n"; + // Control flow instructions may be divergent even if their inputs are // uniform. Thus, although exceedingly rare, it is possible to have a program // with no divergent values but with divergent control structures. 
@@ -1180,6 +1202,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!TemporalDivergenceList.empty()) { +OS << "\nTEMPORAL DIVERGENCE LIST:\n"; + +for (auto [Val, UseInst, Cycle] : TemporalDivergenceList) { + OS << "Value :" << Context.print(Val) << NewLine + << "Used by :" << Context.print(UseInst) << NewLine + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1191,7 +1223,7 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { OS << " DIVERGENT: "; else OS << " "; - OS << Context.print(value) << '\n'; + OS << Context.print(value) << NewLine; } OS << "TERMINATORS\n"; @@ -1203,13 +1235,21 @@ void GenericUniformityAnalysisImpl::prin
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/124298 >From 4afb1c71ac30cf952fc48c7f92abfa012a0c4efa Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 28 Feb 2025 15:56:04 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 44 +++- llvm/include/llvm/ADT/GenericUniformityInfo.h | 5 ++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 6 +-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 52 ++- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 25 +++-- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 +++ .../divergence-temporal-divergent-reg.ll | 38 +++--- .../divergence-temporal-divergent-reg.mir | 8 +-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++--- 12 files changed, 176 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e08..51e9ac30391fe 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -51,6 +51,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "uniformity" @@ 
-342,6 +343,9 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -396,6 +400,11 @@ template class GenericUniformityAnalysisImpl { void print(raw_ostream &out) const; + SmallVector TemporalDivergenceList; + + void recordTemporalDivergence(ConstValueRefT, const InstructionT *, +const CycleT *); + protected: /// \brief Value/block pair representing a single phi input. struct PhiInput { @@ -1129,6 +1138,13 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl::recordTemporalDivergence( +ConstValueRefT Val, const InstructionT *User, const CycleT *Cycle) { + TemporalDivergenceList.emplace_back(Val, const_cast(User), + Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1146,6 +1162,12 @@ template void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { bool haveDivergentArgs = false; + // When we print Value, LLVM IR instruction, we want to print extra new line. + // In LLVM IR print function for Value does not print new line at the end. + // In MIR print for MachineInstr prints new line at the end. + constexpr bool IsMIR = std::is_same::value; + std::string NewLine = IsMIR ? "" : "\n"; + // Control flow instructions may be divergent even if their inputs are // uniform. Thus, although exceedingly rare, it is possible to have a program // with no divergent values but with divergent control structures. 
@@ -1180,6 +1202,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!TemporalDivergenceList.empty()) { +OS << "\nTEMPORAL DIVERGENCE LIST:\n"; + +for (auto [Val, UseInst, Cycle] : TemporalDivergenceList) { + OS << "Value :" << Context.print(Val) << NewLine + << "Used by :" << Context.print(UseInst) << NewLine + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1191,7 +1223,7 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { OS << " DIVERGENT: "; else OS << " "; - OS << Context.print(value) << '\n'; + OS << Context.print(value) << NewLine; } OS << "TERMINATORS\n"; @@ -1203,13 +1235,21 @@ void GenericUniformityAnalysisImpl::prin
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/124298 >From a637e5612bfe21aa7e71209ac05924309ffc9fe5 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 21 Feb 2025 14:33:44 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 46 +++- llvm/include/llvm/ADT/GenericUniformityInfo.h | 5 ++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 6 +-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 53 ++- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 25 +++-- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 +++ .../divergence-temporal-divergent-reg.ll | 38 ++--- .../divergence-temporal-divergent-reg.mir | 8 +-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++--- 12 files changed, 179 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e08..6411fc9b4b974 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -51,7 +51,10 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" 
+#include #define DEBUG_TYPE "uniformity" @@ -342,6 +345,9 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -396,6 +402,11 @@ template class GenericUniformityAnalysisImpl { void print(raw_ostream &out) const; + SmallVector TemporalDivergenceList; + + void recordTemporalDivergence(ConstValueRefT, const InstructionT *, +const CycleT *); + protected: /// \brief Value/block pair representing a single phi input. struct PhiInput { @@ -1129,6 +1140,13 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl::recordTemporalDivergence( +ConstValueRefT Val, const InstructionT *User, const CycleT *Cycle) { + TemporalDivergenceList.emplace_back(Val, const_cast(User), + Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1146,6 +1164,12 @@ template void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { bool haveDivergentArgs = false; + // When we print Value, LLVM IR instruction, we want to print extra new line. + // In LLVM IR print function for Value does not print new line at the end. + // In MIR print for MachineInstr prints new line at the end. + constexpr bool IsMIR = std::is_same::value; + std::string NewLine = IsMIR ? "" : "\n"; + // Control flow instructions may be divergent even if their inputs are // uniform. Thus, although exceedingly rare, it is possible to have a program // with no divergent values but with divergent control structures. 
@@ -1180,6 +1204,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!TemporalDivergenceList.empty()) { +OS << "\nTEMPORAL DIVERGENCE LIST:\n"; + +for (auto [Val, UseInst, Cycle] : TemporalDivergenceList) { + OS << "Value :" << Context.print(Val) << NewLine + << "Used by :" << Context.print(UseInst) << NewLine + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1191,7 +1225,7 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { OS << " DIVERGENT: "; else OS << " "; - OS << Context.print(value) << '\n'; + OS << Context.print(value) << NewLine; } OS << "TERMINATORS\n"; @@ -1203,13 +1237,2
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
nhaehnle wrote: How about this comment from earlier: > Every Inst may potentially appear with many UseInsts in the temporal > divergence list. The current code will create multiple new registers and > multiple COPY instructions, which seems wasteful even if downstream passes > can often clean it up. > > I would suggest capturing the created register in a DenseMap<MachineInstr *, Register> for re-use. > > Also, how about inserting the COPY at the end of Inst->getParent()? That way, > the live range of the VGPR is reduced. ? https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -188,6 +190,35 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { In.Reg = Copy.getReg(0); } +void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst, +Register NewReg) { + for (MachineOperand &Op : Inst->operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + +bool DivergenceLoweringHelper::lowerTemporalDivergence() { + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF); + + for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) { petar-avramovic wrote: Updated types for recording TemporalDivergence and prints, improved new line prints. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/124298 >From 3f039f909b91cc5ad1f92208944e0b66447346df Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 21 Feb 2025 14:33:44 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 46 ++- llvm/include/llvm/ADT/GenericUniformityInfo.h | 5 ++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 6 +-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 44 +- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 25 -- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 .../divergence-temporal-divergent-reg.ll | 18 .../divergence-temporal-divergent-reg.mir | 3 +- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++ 12 files changed, 157 insertions(+), 42 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e08..6411fc9b4b974 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -51,7 +51,10 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include #define 
DEBUG_TYPE "uniformity" @@ -342,6 +345,9 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -396,6 +402,11 @@ template class GenericUniformityAnalysisImpl { void print(raw_ostream &out) const; + SmallVector TemporalDivergenceList; + + void recordTemporalDivergence(ConstValueRefT, const InstructionT *, +const CycleT *); + protected: /// \brief Value/block pair representing a single phi input. struct PhiInput { @@ -1129,6 +1140,13 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl::recordTemporalDivergence( +ConstValueRefT Val, const InstructionT *User, const CycleT *Cycle) { + TemporalDivergenceList.emplace_back(Val, const_cast(User), + Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1146,6 +1164,12 @@ template void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { bool haveDivergentArgs = false; + // When we print Value, LLVM IR instruction, we want to print extra new line. + // In LLVM IR print function for Value does not print new line at the end. + // In MIR print for MachineInstr prints new line at the end. + constexpr bool IsMIR = std::is_same::value; + std::string NewLine = IsMIR ? "" : "\n"; + // Control flow instructions may be divergent even if their inputs are // uniform. Thus, although exceedingly rare, it is possible to have a program // with no divergent values but with divergent control structures. 
@@ -1180,6 +1204,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!TemporalDivergenceList.empty()) { +OS << "\nTEMPORAL DIVERGENCE LIST:\n"; + +for (auto [Val, UseInst, Cycle] : TemporalDivergenceList) { + OS << "Value :" << Context.print(Val) << NewLine + << "Used by :" << Context.print(UseInst) << NewLine + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1191,7 +1225,7 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { OS << " DIVERGENT: "; else OS << " "; - OS << Context.print(value) << '\n'; + OS << Context.print(value) << NewLine; } OS << "TERMINATORS\n"; @@ -1203,13 +1237,21
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/124298 >From 538c0b43558d091886b844c8a1603e83d7aaf864 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 31 Jan 2025 13:04:17 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 33 ++ llvm/include/llvm/ADT/GenericUniformityInfo.h | 5 +++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 6 +-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 45 ++- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 25 +-- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 .../divergence-temporal-divergent-reg.ll | 18 .../divergence-temporal-divergent-reg.mir | 3 +- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++ 12 files changed, 146 insertions(+), 41 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e08..d0f7bd1412065 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -342,6 +342,9 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + using TemporalDivergenceTuple = + std::tuple; + 
GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -396,6 +399,11 @@ template class GenericUniformityAnalysisImpl { void print(raw_ostream &out) const; + SmallVector TemporalDivergenceList; + + void recordTemporalDivergence(const InstructionT *, const InstructionT *, +const CycleT *); + protected: /// \brief Value/block pair representing a single phi input. struct PhiInput { @@ -1129,6 +1137,13 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl::recordTemporalDivergence( +const InstructionT *Inst, const InstructionT *User, const CycleT *Cycle) { + TemporalDivergenceList.emplace_back(const_cast(Inst), + const_cast(User), Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1180,6 +1195,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!TemporalDivergenceList.empty()) { +OS << "\nTEMPORAL DIVERGENCE LIST:\n"; + +for (auto [Inst, UseInst, Cycle] : TemporalDivergenceList) { + OS << "Inst:" << Context.print(Inst) + << "Used by :" << Context.print(UseInst) + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1210,6 +1235,14 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } +template +iterator_range< +typename GenericUniformityInfo::TemporalDivergenceTuple *> +GenericUniformityInfo::getTemporalDivergenceList() const { + return make_range(DA->TemporalDivergenceList.begin(), +DA->TemporalDivergenceList.end()); +} + template bool GenericUniformityInfo::hasDivergence() const { return DA->hasDivergence(); diff --git a/llvm/include/llvm/ADT/GenericUniformityInfo.h b/llvm/include/llvm/ADT/GenericUniformityInfo.h index e53afccc020b4..8d3b141aaeded 100644 --- 
a/llvm/include/llvm/ADT/GenericUniformityInfo.h +++ b/llvm/include/llvm/ADT/GenericUniformityInfo.h @@ -40,6 +40,9 @@ template class GenericUniformityInfo { using CycleInfoT = GenericCycleInfo; using CycleT = typename CycleInfoT::CycleT; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI = nullptr); GenericUniformityInfo() = default; @@ -78,6 +81,8 @@ template class GenericUniformityInfo { void print(raw_ostream &Out) const; + iterator_range getTemporalDivergenceList() const; + private: using ImplT = Gene
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -188,6 +190,35 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { In.Reg = Copy.getReg(0); } +void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst, +Register NewReg) { + for (MachineOperand &Op : Inst->operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + +bool DivergenceLoweringHelper::lowerTemporalDivergence() { + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF); + + for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) { petar-avramovic wrote: Yes, true, it should map Register instead of Inst. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -188,6 +190,35 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { In.Reg = Copy.getReg(0); } +void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst, +Register NewReg) { + for (MachineOperand &Op : Inst->operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + +bool DivergenceLoweringHelper::lowerTemporalDivergence() { + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF); + + for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) { +Register Reg = Inst->getOperand(0).getReg(); +if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) || +ILMA.isS32S64LaneMask(Reg)) + continue; + +MachineBasicBlock *MBB = Inst->getParent(); +B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator(; + +Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg)); petar-avramovic wrote: It unnecessarily complicates the new RegBankSelect; regbankselect will set vgpr there. Also, the copy has an implicit exec, which should be special enough to indicate what we are doing. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -188,6 +190,35 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { In.Reg = Copy.getReg(0); } +void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst, +Register NewReg) { + for (MachineOperand &Op : Inst->operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + +bool DivergenceLoweringHelper::lowerTemporalDivergence() { + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF); + + for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) { ruiling wrote: I am not sure if it is ok to assume only the first `def` can be temporally divergent. Maybe hold a map from the `Register` to an array of temporal divergence users? https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -188,6 +190,35 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { In.Reg = Copy.getReg(0); } +void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst, +Register NewReg) { + for (MachineOperand &Op : Inst->operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + +bool DivergenceLoweringHelper::lowerTemporalDivergence() { + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF); + + for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) { +Register Reg = Inst->getOperand(0).getReg(); +if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) || +ILMA.isS32S64LaneMask(Reg)) + continue; + +MachineBasicBlock *MBB = Inst->getParent(); +B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator(; + +Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg)); ruiling wrote: I am not sure how it works in global-isel; can we set the RegisterClass of VgprReg to a vector register here to make it more obvious that this is a copy from sgpr to vgpr? https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/ssahasra commented: The changes to UA look good to me. I can't comment much about the actual patch itself. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/nhaehnle commented: I haven't done a detailed review of the code, but from a high-level algorithmic view this change already looks pretty reasonable to me. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -188,6 +190,35 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { In.Reg = Copy.getReg(0); } +void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst, +Register NewReg) { + for (MachineOperand &Op : Inst->operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + +bool DivergenceLoweringHelper::lowerTemporalDivergence() { + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF); + + for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) { +Register Reg = Inst->getOperand(0).getReg(); +if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) || +ILMA.isS32S64LaneMask(Reg)) + continue; + +MachineBasicBlock *MBB = Inst->getParent(); +B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator(; + +Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg)); +B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg}) +.addUse(ExecReg, RegState::Implicit); + +replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg); + } + return false; +} nhaehnle wrote: I do have one high-level comment about this. Every `Inst` may potentially appear with many `UseInst`s in the temporal divergence list. The current code will create multiple new registers and multiple `COPY` instructions, which seems wasteful even if downstream passes can often clean it up. I would suggest capturing the created register in a `DenseMap` for re-use. Also, how about inserting the `COPY` at the end of `Inst->getParent()`? That way, the live range of the VGPR is reduced. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/124298 >From a5c340d0301c3b36fadd352d7ed1c332789cb73b Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 31 Jan 2025 13:04:17 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 33 ++ llvm/include/llvm/ADT/GenericUniformityInfo.h | 5 +++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 6 +-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 45 ++- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 25 +-- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 .../divergence-temporal-divergent-reg.ll | 18 .../divergence-temporal-divergent-reg.mir | 3 +- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++ 12 files changed, 146 insertions(+), 41 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e087..d0f7bd14120651 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -342,6 +342,9 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + using TemporalDivergenceTuple = + std::tuple; + 
GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -396,6 +399,11 @@ template class GenericUniformityAnalysisImpl { void print(raw_ostream &out) const; + SmallVector TemporalDivergenceList; + + void recordTemporalDivergence(const InstructionT *, const InstructionT *, +const CycleT *); + protected: /// \brief Value/block pair representing a single phi input. struct PhiInput { @@ -1129,6 +1137,13 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl::recordTemporalDivergence( +const InstructionT *Inst, const InstructionT *User, const CycleT *Cycle) { + TemporalDivergenceList.emplace_back(const_cast(Inst), + const_cast(User), Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1180,6 +1195,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!TemporalDivergenceList.empty()) { +OS << "\nTEMPORAL DIVERGENCE LIST:\n"; + +for (auto [Inst, UseInst, Cycle] : TemporalDivergenceList) { + OS << "Inst:" << Context.print(Inst) + << "Used by :" << Context.print(UseInst) + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1210,6 +1235,14 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } +template +iterator_range< +typename GenericUniformityInfo::TemporalDivergenceTuple *> +GenericUniformityInfo::getTemporalDivergenceList() const { + return make_range(DA->TemporalDivergenceList.begin(), +DA->TemporalDivergenceList.end()); +} + template bool GenericUniformityInfo::hasDivergence() const { return DA->hasDivergence(); diff --git a/llvm/include/llvm/ADT/GenericUniformityInfo.h b/llvm/include/llvm/ADT/GenericUniformityInfo.h index e53afccc020b46..8d3b141aaeded7 100644 --- 
a/llvm/include/llvm/ADT/GenericUniformityInfo.h +++ b/llvm/include/llvm/ADT/GenericUniformityInfo.h @@ -40,6 +40,9 @@ template class GenericUniformityInfo { using CycleInfoT = GenericCycleInfo; using CycleT = typename CycleInfoT::CycleT; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI = nullptr); GenericUniformityInfo() = default; @@ -78,6 +81,8 @@ template class GenericUniformityInfo { void print(raw_ostream &Out) const; + iterator_range getTemporalDivergenceList() const; + private: using ImplT =
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -40,6 +40,10 @@ template class GenericUniformityInfo { using CycleInfoT = GenericCycleInfo; using CycleT = typename CycleInfoT::CycleT; + // Use outside cycle with divergent exit + using UOCWDE = petar-avramovic wrote: My guess is that GenericUniformityAnalysisImpl and GenericUniformityInfo repeat typedefs because of the terrible line break. This would work: typename GenericUniformityInfo::TemporalDivergenceTuple https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -342,6 +342,10 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + // Use outside cycle with divergent exit + using UOCWDE = ssahasra wrote: Alternatively, UOCWDE can be renamed to ``TemporalDivergenceTuple``? https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -188,6 +190,37 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { In.Reg = Copy.getReg(0); } +void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst, +Register NewReg) { + for (MachineOperand &Op : Inst->operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + +bool DivergenceLoweringHelper::lowerTempDivergence() { + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF); + + for (auto [Inst, UseInst, _] : MUI->getUsesOutsideCycleWithDivergentExit()) { +Register Reg = Inst->getOperand(0).getReg(); +if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) || +ILMA.isS32S64LaneMask(Reg)) + continue; + +MachineInstr *MI = const_cast(Inst); ssahasra wrote: I lean on the other side. If you look at LoopInfoBase or LoopBase, their functions take const pointers as arguments but return non-const pointers when asked. Sure, an analysis should treat its inputs as const, but when it returns something to the client, that client owns it anyway, so forcing that to be const is just an inconvenience. I would rather have the analysis do the const_cast before returning a list of pointers to something I already own. This seems to be the first time that uniformity analysis is returning something. Until now, the public interface has simply been a bunch of predicates like "isUniform" that take a const pointer as arguments. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -188,6 +190,37 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { In.Reg = Copy.getReg(0); } +void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst, +Register NewReg) { + for (MachineOperand &Op : Inst->operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + +bool DivergenceLoweringHelper::lowerTempDivergence() { + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF); + + for (auto [Inst, UseInst, _] : MUI->getUsesOutsideCycleWithDivergentExit()) { +Register Reg = Inst->getOperand(0).getReg(); +if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) || +ILMA.isS32S64LaneMask(Reg)) + continue; + +MachineInstr *MI = const_cast(Inst); nhaehnle wrote: These come out of the analysis. The analysis itself uses const pointers/references in its implementation, which I believe is a good idea for const correctness. I wouldn't change that. So a `const_cast` is needed at some point. The only question is where. I think here is as good a place as any, though perhaps grouping them together with a small explanation is in order. Something like: ```c++ // As an analysis, UniformityAnalysis treats instructions as const. We have the parent function // as non-const, so casting const away here is inelegant but justified. MachineInstr *MI = const_cast(Inst); MachineInstr *UseMI = const_cast(UseInst); ``` https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -342,6 +342,10 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + // Use outside cycle with divergent exit + using UOCWDE = ssahasra wrote: Just a suggestion, I would consider giving the name "TemporalDivergenceList" to the entire type ``SmallVector<UOCWDE>``. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -1210,6 +1240,13 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } +template +iterator_range::UOCWDE *> ssahasra wrote: Just say ``auto`` as the return type here? Or if this needs to be exposed in an outer header file, then name a new type such as ``temporal_divergence_range``? https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -40,6 +40,10 @@ template class GenericUniformityInfo { using CycleInfoT = GenericCycleInfo; using CycleT = typename CycleInfoT::CycleT; + // Use outside cycle with divergent exit + using UOCWDE = ssahasra wrote: This declaration got repeated. One of them can be eliminated? https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -188,6 +190,37 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { In.Reg = Copy.getReg(0); } +void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst, +Register NewReg) { + for (MachineOperand &Op : Inst->operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + +bool DivergenceLoweringHelper::lowerTempDivergence() { + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF); + + for (auto [Inst, UseInst, _] : MUI->getUsesOutsideCycleWithDivergentExit()) { +Register Reg = Inst->getOperand(0).getReg(); +if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) || +ILMA.isS32S64LaneMask(Reg)) + continue; + +MachineInstr *MI = const_cast(Inst); arsenm wrote: Fix the const_casts, why is this const? https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -342,6 +342,10 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + // Use outside cycle with divergent exit + using UOCWDE = arsenm wrote: Use a less inscrutable type name? https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/ssahasra edited https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -395,6 +399,14 @@ template class GenericUniformityAnalysisImpl { } void print(raw_ostream &out) const; + SmallVector UsesOutsideCycleWithDivergentExit; + void recordUseOutsideCycleWithDivergentExit(const InstructionT *, ssahasra wrote: You're right. The LLVM doc does not actually define the term "temporal divergence". But it has always been used in a way that means "uniform inside cycle, divergent outside cycle, due to divergent cycle exit". But whether the value is uniform inside the cycle is less important. What matters is that values arrive at the use on exits from different iterations by different threads. I think we should use the name TemporalDivergence here. It's shorter and will show up when someone greps for temporal divergence. Let's also not add "Candidate" ... it just makes the name longer with only a little bit of new information. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -395,6 +399,14 @@ template class GenericUniformityAnalysisImpl { } void print(raw_ostream &out) const; + SmallVector UsesOutsideCycleWithDivergentExit; + void recordUseOutsideCycleWithDivergentExit(const InstructionT *, petar-avramovic wrote: I was considering TemporalDivergenceCandidate. I did not find a strict definition of Temporal Divergence, so I ended up using UseOutsideCycleWithDivergentExit since it is more technical and, I assume, not target dependent. It is not Temporal Divergence until we check the uniformity of the Src used OutsideCycleWithDivergentExit and it turns out to be uniform or, in the other case, we check the type and it is i1. For us, a divergent i1 is also technically Temporal Divergence since it will end up in an sgpr. I am fine with using a different name instead of "UseOutsideCycleWithDivergentExit" if you think it is more appropriate. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -395,6 +399,14 @@ template class GenericUniformityAnalysisImpl { } void print(raw_ostream &out) const; + SmallVector UsesOutsideCycleWithDivergentExit; + void recordUseOutsideCycleWithDivergentExit(const InstructionT *, ssahasra wrote: Everywhere in this patch, is there some reason to precisely say "UseOutsideCycleWithDivergentExit"? Can't we just say "TemporalDivergence"? https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
github-actions[bot] wrote: :warning: C/C++ code formatter, clang-format found issues in your code. :warning: You can test this locally with the following command: ``bash git-clang-format --diff 1728ab49b46a31b63d8ecdc81fe87851aa40a725 3e04401258c91639105b1f2f17a84badbdf928ae --extensions cpp,h -- llvm/include/llvm/ADT/GenericUniformityImpl.h llvm/include/llvm/ADT/GenericUniformityInfo.h llvm/lib/Analysis/UniformityAnalysis.cpp llvm/lib/CodeGen/MachineUniformityAnalysis.cpp llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp llvm/lib/Target/AMDGPU/SILowerI1Copies.h `` View the diff from clang-format here. ``diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp index 452d754985..8a0c9faa34 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp @@ -225,7 +225,8 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) { getAnalysis().getUniformityInfo(); MachineRegisterInfo &MRI = *B.getMRI(); const GCNSubtarget &ST = MF.getSubtarget(); - RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegisterInfo(), *ST.getRegBankInfo()); + RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegisterInfo(), +*ST.getRegBankInfo()); // Virtual registers at this point don't have register banks. // Virtual registers in def and use operands of already inst-selected // instruction have register class. `` https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
llvmbot wrote: @llvm/pr-subscribers-llvm-globalisel Author: Petar Avramovic (petar-avramovic) Changes Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- Patch is 23.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124298.diff 12 Files Affected: - (modified) llvm/include/llvm/ADT/GenericUniformityImpl.h (+37) - (modified) llvm/include/llvm/ADT/GenericUniformityInfo.h (+6) - (modified) llvm/lib/Analysis/UniformityAnalysis.cpp (+1-2) - (modified) llvm/lib/CodeGen/MachineUniformityAnalysis.cpp (+4-4) - (modified) llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp (+45-2) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp (+20-4) - (modified) llvm/lib/Target/AMDGPU/SILowerI1Copies.h (+6) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir (+4-3) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir (+10-9) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll (+9-9) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir (+2-1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll (+9-8) ``diff diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e087..91ee0e41332199 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -342,6 +342,10 @@ template 
class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + // Use outside cycle with divergent exit + using UOCWDE = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -395,6 +399,14 @@ template class GenericUniformityAnalysisImpl { } void print(raw_ostream &out) const; + SmallVector UsesOutsideCycleWithDivergentExit; + void recordUseOutsideCycleWithDivergentExit(const InstructionT *, + const InstructionT *, + const CycleT *); + inline iterator_range getUsesOutsideCycleWithDivergentExit() const { +return make_range(UsesOutsideCycleWithDivergentExit.begin(), + UsesOutsideCycleWithDivergentExit.end()); + } protected: /// \brief Value/block pair representing a single phi input. @@ -1129,6 +1141,14 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl< +ContextT>::recordUseOutsideCycleWithDivergentExit(const InstructionT *Inst, + const InstructionT *User, + const CycleT *Cycle) { + UsesOutsideCycleWithDivergentExit.emplace_back(Inst, User, Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1180,6 +1200,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!UsesOutsideCycleWithDivergentExit.empty()) { +OS << "\nUSES OUTSIDE CYCLES WITH DIVERGENT EXIT:\n"; + +for (auto [Inst, UseInst, Cycle] : UsesOutsideCycleWithDivergentExit) { + OS << "Inst:" << Context.print(Inst) + << "Used by :" << Context.print(UseInst) + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1210,6 +1240,13 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } +template +iterator_range::UOCWDE *> 
+GenericUniformityInfo::getUsesOutsideCycleWithDivergentExit() const { + return make_range(DA->UsesOutsideCycleWithDivergentExit.begin(), +DA->UsesOutsideCycleWithDivergentExit.end()); +} + template bool GenericUniformityInfo::hasDivergence() const { return DA->hasDivergence(); diff --git a/llvm/include/llvm/ADT/GenericUniformityInfo.h b/llvm/include/llvm/ADT/GenericUniformityInfo.h index e53afccc020b46..660fd6d46114d7 100644 --- a/llvm/include/llvm/ADT/GenericUniformityInfo.h +++ b/llvm/include/llvm/ADT/GenericUniformityInfo.h @@ -40
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
llvmbot wrote: @llvm/pr-subscribers-llvm-analysis Author: Petar Avramovic (petar-avramovic) Changes Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- Patch is 23.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124298.diff 12 Files Affected: - (modified) llvm/include/llvm/ADT/GenericUniformityImpl.h (+37) - (modified) llvm/include/llvm/ADT/GenericUniformityInfo.h (+6) - (modified) llvm/lib/Analysis/UniformityAnalysis.cpp (+1-2) - (modified) llvm/lib/CodeGen/MachineUniformityAnalysis.cpp (+4-4) - (modified) llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp (+45-2) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp (+20-4) - (modified) llvm/lib/Target/AMDGPU/SILowerI1Copies.h (+6) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir (+4-3) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir (+10-9) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll (+9-9) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir (+2-1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll (+9-8) ``diff diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e087..91ee0e41332199 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -342,6 +342,10 @@ template 
class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + // Use outside cycle with divergent exit + using UOCWDE = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -395,6 +399,14 @@ template class GenericUniformityAnalysisImpl { } void print(raw_ostream &out) const; + SmallVector UsesOutsideCycleWithDivergentExit; + void recordUseOutsideCycleWithDivergentExit(const InstructionT *, + const InstructionT *, + const CycleT *); + inline iterator_range getUsesOutsideCycleWithDivergentExit() const { +return make_range(UsesOutsideCycleWithDivergentExit.begin(), + UsesOutsideCycleWithDivergentExit.end()); + } protected: /// \brief Value/block pair representing a single phi input. @@ -1129,6 +1141,14 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl< +ContextT>::recordUseOutsideCycleWithDivergentExit(const InstructionT *Inst, + const InstructionT *User, + const CycleT *Cycle) { + UsesOutsideCycleWithDivergentExit.emplace_back(Inst, User, Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1180,6 +1200,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!UsesOutsideCycleWithDivergentExit.empty()) { +OS << "\nUSES OUTSIDE CYCLES WITH DIVERGENT EXIT:\n"; + +for (auto [Inst, UseInst, Cycle] : UsesOutsideCycleWithDivergentExit) { + OS << "Inst:" << Context.print(Inst) + << "Used by :" << Context.print(UseInst) + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1210,6 +1240,13 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } +template +iterator_range::UOCWDE *> 
+GenericUniformityInfo::getUsesOutsideCycleWithDivergentExit() const { + return make_range(DA->UsesOutsideCycleWithDivergentExit.begin(), +DA->UsesOutsideCycleWithDivergentExit.end()); +} + template bool GenericUniformityInfo::hasDivergence() const { return DA->hasDivergence(); diff --git a/llvm/include/llvm/ADT/GenericUniformityInfo.h b/llvm/include/llvm/ADT/GenericUniformityInfo.h index e53afccc020b46..660fd6d46114d7 100644 --- a/llvm/include/llvm/ADT/GenericUniformityInfo.h +++ b/llvm/include/llvm/ADT/GenericUniformityInfo.h @@ -40,6
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic ready_for_review https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic created https://github.com/llvm/llvm-project/pull/124298 Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. >From 3e04401258c91639105b1f2f17a84badbdf928ae Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 24 Jan 2025 16:56:30 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. 
--- llvm/include/llvm/ADT/GenericUniformityImpl.h | 37 +++ llvm/include/llvm/ADT/GenericUniformityInfo.h | 6 +++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 8 ++-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 47 ++- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 24 -- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 .../divergence-temporal-divergent-reg.ll | 18 +++ .../divergence-temporal-divergent-reg.mir | 3 +- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++ 12 files changed, 153 insertions(+), 42 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e087..91ee0e41332199 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -342,6 +342,10 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + // Use outside cycle with divergent exit + using UOCWDE = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -395,6 +399,14 @@ template class GenericUniformityAnalysisImpl { } void print(raw_ostream &out) const; + SmallVector UsesOutsideCycleWithDivergentExit; + void recordUseOutsideCycleWithDivergentExit(const InstructionT *, + const InstructionT *, + const CycleT *); + inline iterator_range getUsesOutsideCycleWithDivergentExit() const { +return make_range(UsesOutsideCycleWithDivergentExit.begin(), + UsesOutsideCycleWithDivergentExit.end()); + } protected: /// \brief Value/block pair representing a single phi input. 
@@ -1129,6 +1141,14 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl< +ContextT>::recordUseOutsideCycleWithDivergentExit(const InstructionT *Inst, + const InstructionT *User, + const CycleT *Cycle) { + UsesOutsideCycleWithDivergentExit.emplace_back(Inst, User, Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1180,6 +1200,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!UsesOutsideCycleWithDivergentExit.empty()) { +OS << "\nUSES OUTSIDE CYCLES WITH DIVERGENT EXIT:\n"; + +for (auto [Inst, UseInst, Cycle] : UsesOutsideCycleWithDivergentExit) { + OS << "Inst:" << Context.print(Inst) + << "Used by :" << Context.print(UseInst) + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1210,6 +1240,13 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } +template +iterator_range::UOCWDE *> +GenericUniformityInfo::getUsesOutsideCycleWithDivergentExit() const { + return make_range(DA->UsesOutsideCycleWithDivergentExit.begin(), +DA->UsesOutsideCycleWithDivergentExit.end());
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
petar-avramovic wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/124298?utm_source=stack-comment-downstack-mergeability-warning">on Graphite</a>. > <a href="https://graphite.dev/docs/merge-pull-requests">Learn more</a> * **#124299** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/124299?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * **#124298** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/124298?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> 👈 <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/124298?utm_source=stack-comment-view-in-graphite" target="_blank">(View in Graphite)</a> * **#124297** <a href="https://app.graphite.dev/github/pr/llvm/llvm-project/124297?utm_source=stack-comment-icon" target="_blank"><img src="https://static.graphite.dev/graphite-32x32-black.png" alt="Graphite" width="10px" height="10px"/></a> * `main` This stack of pull requests is managed by <a href="https://graphite.dev?utm-source=stack-comment">Graphite</a>. Learn more about <a href="https://stacking.dev/?utm_source=stack-comment">stacking</a>. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits