https://github.com/bwendling updated https://github.com/llvm/llvm-project/pull/92040
>From 94e01760b8363ad59a860c9c036918e670cc3783 Mon Sep 17 00:00:00 2001 From: Bill Wendling <[email protected]> Date: Mon, 29 Apr 2024 14:40:54 -0700 Subject: [PATCH 1/9] [Clang][inlineasm] Add special support for "rm" output constraints Clang isn't able to support multiple constraints on inputs and outputs. Instead, it picks the "safest" one to use, i.e. the most conseravite. In the case of "rm" it picks the memory constraint. This leads to obviously horrible code: asm __volatile__ ("pushf\n\t" "popq %0" : "=rm" (x)); is converted to: #APP pushf popq -8(%rsp) #NO_APP movq -8(%rsp), %rax Blech! This hack^Wchange, makes a special exception for "rm" to use "r" if at all possible. The "RegMayBeFolded" flag is then used by the register allocators to allow for the old behavior if register pressure is too great. Fixes: https://github.com/llvm/llvm-project/issues/20571 Cc: Nick Desaulniers <[email protected]> Cc: Kees Cook <[email protected]> Cc: [email protected] --- llvm/include/llvm/CodeGen/TargetLowering.h | 5 + llvm/include/llvm/CodeGen/TargetPassConfig.h | 2 + .../SelectionDAG/SelectionDAGBuilder.cpp | 25 +- .../SelectionDAG/SelectionDAGBuilder.h | 5 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 30 +- llvm/lib/CodeGen/TargetPassConfig.cpp | 6 + llvm/test/CodeGen/X86/asm-constraints-rm.ll | 363 ++++++++++++++++++ llvm/test/CodeGen/X86/inlineasm-sched-bug.ll | 5 +- 8 files changed, 424 insertions(+), 17 deletions(-) create mode 100644 llvm/test/CodeGen/X86/asm-constraints-rm.ll diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 50a8c7eb75af5..ff321f6aa0f62 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4939,6 +4939,11 @@ class TargetLowering : public TargetLoweringBase { /// Memory, Other, Unknown. TargetLowering::ConstraintType ConstraintType = TargetLowering::C_Unknown; + /// The register may be folded. 
This is used if the constraint is "rm", + /// where we prefer using a register, but can fall back to a memory slot + /// under register pressure. + bool MayFoldRegister = false; + /// If this is the result output operand or a clobber, this is null, /// otherwise it is the incoming operand to the CallInst. This gets /// modified as the asm is processed. diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h index d00e0bed91a45..c1f4199536409 100644 --- a/llvm/include/llvm/CodeGen/TargetPassConfig.h +++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h @@ -496,6 +496,8 @@ class TargetPassConfig : public ImmutablePass { void registerCodeGenCallback(PassInstrumentationCallbacks &PIC, LLVMTargetMachine &); +bool usesGreedyOrDefaultRegisterAllocator(); + } // end namespace llvm #endif // LLVM_CODEGEN_TARGETPASSCONFIG_H diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ca352da5d36eb..7bc03becf1a5a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1008,7 +1008,8 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, } void RegsForValue::AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching, - unsigned MatchingIdx, const SDLoc &dl, + unsigned MatchingIdx, + bool MayFoldRegister, const SDLoc &dl, SelectionDAG &DAG, std::vector<SDValue> &Ops) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -1024,7 +1025,9 @@ void RegsForValue::AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching, // from the def. 
const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(Regs.front()); + Flag.setRegClass(RC->getID()); + Flag.setRegMayBeFolded(MayFoldRegister); } SDValue Res = DAG.getTargetConstant(Flag, dl, MVT::i32); @@ -9775,8 +9778,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, AsmNodeOperands.push_back(OpInfo.CallOperand); } else { // Otherwise, this outputs to a register (directly for C_Register / - // C_RegisterClass, and a target-defined fashion for - // C_Immediate/C_Other). Find a register that we can use. + // C_RegisterClass, and a target-defined fashion for C_Immediate / + // C_Other). Find a register that we can use. if (OpInfo.AssignedRegs.Regs.empty()) { emitInlineAsmError( Call, "couldn't allocate output register for constraint '" + @@ -9792,7 +9795,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, OpInfo.AssignedRegs.AddInlineAsmOperands( OpInfo.isEarlyClobber ? InlineAsm::Kind::RegDefEarlyClobber : InlineAsm::Kind::RegDef, - false, 0, getCurSDLoc(), DAG, AsmNodeOperands); + false, 0, OpInfo.MayFoldRegister, getCurSDLoc(), DAG, + AsmNodeOperands); } break; @@ -9834,9 +9838,9 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, SDLoc dl = getCurSDLoc(); // Use the produced MatchedRegs object to MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Glue, &Call); - MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind::RegUse, true, - OpInfo.getMatchedOperand(), dl, DAG, - AsmNodeOperands); + MatchedRegs.AddInlineAsmOperands( + InlineAsm::Kind::RegUse, true, OpInfo.getMatchedOperand(), + OpInfo.MayFoldRegister, dl, DAG, AsmNodeOperands); break; } @@ -9965,7 +9969,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, &Call); OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind::RegUse, false, - 0, dl, DAG, AsmNodeOperands); + 0, OpInfo.MayFoldRegister, dl, + DAG, AsmNodeOperands); break; } case InlineAsm::isClobber: @@ -9973,8 +9978,8 
@@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, // allocator is aware that the physreg got clobbered. if (!OpInfo.AssignedRegs.Regs.empty()) OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind::Clobber, - false, 0, getCurSDLoc(), DAG, - AsmNodeOperands); + false, 0, false, getCurSDLoc(), + DAG, AsmNodeOperands); break; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index ae361f8c500a0..daf9cfbbe1279 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -783,8 +783,9 @@ struct RegsForValue { /// code marker, matching input operand index (if applicable), and includes /// the number of values added into it. void AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching, - unsigned MatchingIdx, const SDLoc &dl, - SelectionDAG &DAG, std::vector<SDValue> &Ops) const; + unsigned MatchingIdx, bool MayFoldRegister, + const SDLoc &dl, SelectionDAG &DAG, + std::vector<SDValue> &Ops) const; /// Check if the total RegCount is greater than one. 
bool occupiesMultipleRegs() const { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 7beaeb9b7a171..cadb609ec72f5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -33,6 +34,7 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/Triple.h" #include <cctype> using namespace llvm; @@ -5668,6 +5670,7 @@ TargetLowering::ParseConstraints(const DataLayout &DL, unsigned ResNo = 0; // ResNo - The result number of the next output. unsigned LabelNo = 0; // LabelNo - CallBr indirect dest number. + const Triple &T = getTargetMachine().getTargetTriple(); for (InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) { ConstraintOperands.emplace_back(std::move(CI)); AsmOperandInfo &OpInfo = ConstraintOperands.back(); @@ -5678,6 +5681,16 @@ TargetLowering::ParseConstraints(const DataLayout &DL, OpInfo.ConstraintVT = MVT::Other; + // Special treatment for all platforms (currently only x86) that can fold a + // register into a spill. This is used for the "rm" constraint, where we + // would vastly prefer to use 'r' over 'm', but can't because of LLVM's + // architecture picks the most "conservative" constraint to ensure that (in + // the case of "rm") register pressure cause bad things to happen. + if (T.isX86() && !OpInfo.hasMatchingInput() && OpInfo.Codes.size() == 2 && + llvm::is_contained(OpInfo.Codes, "r") && + llvm::is_contained(OpInfo.Codes, "m")) + OpInfo.MayFoldRegister = true; + // Compute the value type for each operand. 
switch (OpInfo.Type) { case InlineAsm::isOutput: @@ -5954,7 +5967,12 @@ TargetLowering::ConstraintWeight /// 1) If there is an 'other' constraint, and if the operand is valid for /// that constraint, use it. This makes us take advantage of 'i' /// constraints when available. -/// 2) Otherwise, pick the most general constraint present. This prefers +/// 2) Special processing is done for the "rm" constraint. If specified, we +/// opt for the 'r' constraint, but mark the operand as being "foldable." +/// In the face of register exhaustion, the register allocator is free to +/// choose to use a stack slot. This only applies to the greedy and default +/// register allocators. FIXME: Support other allocators (fast?). +/// 3) Otherwise, pick the most general constraint present. This prefers /// 'm' over 'r', for example. /// TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences( @@ -5962,6 +5980,16 @@ TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences( ConstraintGroup Ret; Ret.reserve(OpInfo.Codes.size()); + + // If we can fold the register (i.e. it has an "rm" constraint), opt for the + // 'r' constraint, and allow the register allocator to spill if need be. + // Applies only to the greedy and default register allocators. 
+ if (OpInfo.MayFoldRegister && usesGreedyOrDefaultRegisterAllocator()) { + Ret.emplace_back(ConstraintPair("r", getConstraintType("r"))); + Ret.emplace_back(ConstraintPair("m", getConstraintType("m"))); + return Ret; + } + for (StringRef Code : OpInfo.Codes) { TargetLowering::ConstraintType CType = getConstraintType(Code); diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 8832b51333d91..b768cde55d79f 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1077,6 +1077,12 @@ static cl::opt<RegisterRegAlloc::FunctionPassCtor, false, RegAlloc("regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), cl::desc("Register allocator to use")); +bool llvm::usesGreedyOrDefaultRegisterAllocator() { + return RegAlloc == (RegisterRegAlloc:: + FunctionPassCtor)&createGreedyRegisterAllocator || + RegAlloc == &useDefaultRegisterAllocator; +} + /// Add the complete set of target-independent postISel code generator passes. /// /// This can be read as the standard order of major LLVM CodeGen stages. 
Stages diff --git a/llvm/test/CodeGen/X86/asm-constraints-rm.ll b/llvm/test/CodeGen/X86/asm-constraints-rm.ll new file mode 100644 index 0000000000000..f718f6b26abb3 --- /dev/null +++ b/llvm/test/CodeGen/X86/asm-constraints-rm.ll @@ -0,0 +1,363 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "^\t#" --version 4 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-X86_64 %s +; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-I386 %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=basic < %s | FileCheck --check-prefix=BASIC-X86_64 %s +; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=basic < %s | FileCheck --check-prefix=BASIC-I386 %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=fast < %s | FileCheck --check-prefix=FAST-X86_64 %s +; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=fast < %s | FileCheck --check-prefix=FAST-I386 %s + +; The Greedy register allocator should use registers when there isn't register +; pressure. 
+ +define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 { +; GREEDY-X86_64-LABEL: test1: +; GREEDY-X86_64: #APP +; GREEDY-X86_64: # 'rm' input no pressure -> %eax %ecx +; GREEDY-X86_64: #NO_APP +; +; GREEDY-I386-LABEL: test1: +; GREEDY-I386: #APP +; GREEDY-I386: # 'rm' input no pressure -> %ecx %edx +; GREEDY-I386: #NO_APP +; +; BASIC-X86_64-LABEL: test1: +; BASIC-X86_64: #APP +; BASIC-X86_64: # 'rm' input no pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) +; BASIC-X86_64: #NO_APP +; +; BASIC-I386-LABEL: test1: +; BASIC-I386: #APP +; BASIC-I386: # 'rm' input no pressure -> {{[0-9]+}}(%esp) (%esp) +; BASIC-I386: #NO_APP +; +; FAST-X86_64-LABEL: test1: +; FAST-X86_64: #APP +; FAST-X86_64: # 'rm' input no pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) +; FAST-X86_64: #NO_APP +; +; FAST-I386-LABEL: test1: +; FAST-I386: #APP +; FAST-I386: # 'rm' input no pressure -> {{[0-9]+}}(%esp) (%esp) +; FAST-I386: #NO_APP +entry: + %b = getelementptr inbounds i8, ptr %ptr, i64 4 + %0 = load i32, ptr %b, align 4 + %d = getelementptr inbounds i8, ptr %ptr, i64 12 + %1 = load i32, ptr %d, align 4 + tail call void asm sideeffect "# 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1 + %2 = load i32, ptr %ptr, align 4 + ret i32 %2 +} + +define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 { +; GREEDY-X86_64-LABEL: test2: +; GREEDY-X86_64: #APP # 8-byte Folded Reload +; GREEDY-X86_64: # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) +; GREEDY-X86_64: #NO_APP +; +; GREEDY-I386-LABEL: test2: +; GREEDY-I386: #APP # 8-byte Folded Reload +; GREEDY-I386: # 'rm' input pressure -> {{[0-9]+}}(%esp) (%esp) +; GREEDY-I386: #NO_APP +; +; BASIC-X86_64-LABEL: test2: +; BASIC-X86_64: #APP +; BASIC-X86_64: # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) +; BASIC-X86_64: #NO_APP +; +; BASIC-I386-LABEL: test2: +; BASIC-I386: #APP +; BASIC-I386: # 'rm' input pressure -> 
{{[0-9]+}}(%esp) (%esp) +; BASIC-I386: #NO_APP +; +; FAST-X86_64-LABEL: test2: +; FAST-X86_64: #APP +; FAST-X86_64: # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) +; FAST-X86_64: #NO_APP +; +; FAST-I386-LABEL: test2: +; FAST-I386: #APP +; FAST-I386: # 'rm' input pressure -> {{[0-9]+}}(%esp) {{[0-9]+}}(%esp) +; FAST-I386: #NO_APP +entry: + %b = getelementptr inbounds i8, ptr %ptr, i64 4 + %0 = load i32, ptr %b, align 4 + %d = getelementptr inbounds i8, ptr %ptr, i64 12 + %1 = load i32, ptr %d, align 4 + tail call void asm sideeffect "# 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1 + %2 = load i32, ptr %ptr, align 4 + ret i32 %2 +} + +define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr #0 { +; GREEDY-X86_64-LABEL: test3: +; GREEDY-X86_64: #APP +; GREEDY-X86_64: # 'rm' output no pressure -> %eax %ecx +; GREEDY-X86_64: #NO_APP +; +; GREEDY-I386-LABEL: test3: +; GREEDY-I386: #APP +; GREEDY-I386: # 'rm' output no pressure -> %ecx %edx +; GREEDY-I386: #NO_APP +; +; BASIC-X86_64-LABEL: test3: +; BASIC-X86_64: #APP +; BASIC-X86_64: # 'rm' output no pressure -> 4(%rdi) 12(%rdi) +; BASIC-X86_64: #NO_APP +; +; BASIC-I386-LABEL: test3: +; BASIC-I386: #APP +; BASIC-I386: # 'rm' output no pressure -> 4(%eax) 12(%eax) +; BASIC-I386: #NO_APP +; +; FAST-X86_64-LABEL: test3: +; FAST-X86_64: #APP +; FAST-X86_64: # 'rm' output no pressure -> 4(%rdi) 12(%rdi) +; FAST-X86_64: #NO_APP +; +; FAST-I386-LABEL: test3: +; FAST-I386: #APP +; FAST-I386: # 'rm' output no pressure -> 4(%eax) 12(%eax) +; FAST-I386: #NO_APP +entry: + %b = getelementptr inbounds i8, ptr %ptr, i64 4 + %d = getelementptr inbounds i8, ptr %ptr, i64 12 + tail call void asm sideeffect "# 'rm' output no pressure -> $0 $1", "=*rm,=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d) #1 + %0 = load i32, ptr %ptr, align 
4 + ret i32 %0 +} + +define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr #0 { +; GREEDY-X86_64-LABEL: test4: +; GREEDY-X86_64: #APP +; GREEDY-X86_64: # tied 'rm' no pressure -> %eax %ecx %eax %ecx +; GREEDY-X86_64: #NO_APP +; +; GREEDY-I386-LABEL: test4: +; GREEDY-I386: #APP +; GREEDY-I386: # tied 'rm' no pressure -> %ecx %edx %ecx %edx +; GREEDY-I386: #NO_APP +; +; BASIC-X86_64-LABEL: test4: +; BASIC-X86_64: #APP +; BASIC-X86_64: # tied 'rm' no pressure -> %eax %ecx %eax %ecx +; BASIC-X86_64: #NO_APP +; +; BASIC-I386-LABEL: test4: +; BASIC-I386: #APP +; BASIC-I386: # tied 'rm' no pressure -> %eax %ecx %eax %ecx +; BASIC-I386: #NO_APP +; +; FAST-X86_64-LABEL: test4: +; FAST-X86_64: #APP +; FAST-X86_64: # tied 'rm' no pressure -> %ecx %eax %ecx %eax +; FAST-X86_64: #NO_APP +; +; FAST-I386-LABEL: test4: +; FAST-I386: #APP +; FAST-I386: # tied 'rm' no pressure -> %edx %ecx %edx %ecx +; FAST-I386: #NO_APP +entry: + %b = getelementptr inbounds i8, ptr %ptr, i64 4 + %0 = load i32, ptr %b, align 4 + %d = getelementptr inbounds i8, ptr %ptr, i64 12 + %1 = load i32, ptr %d, align 4 + tail call void asm sideeffect "# tied 'rm' no pressure -> $0 $1 $2 $3", "=*rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d, i32 %0, i32 %1) #1 + %2 = load i32, ptr %ptr, align 4 + ret i32 %2 +} + +define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 { +; GREEDY-X86_64-LABEL: test5: +; GREEDY-X86_64: #APP +; GREEDY-X86_64: # 'rm' input -> %eax +; GREEDY-X86_64: #NO_APP +; +; GREEDY-I386-LABEL: test5: +; GREEDY-I386: #APP +; GREEDY-I386: # 'rm' input -> %ecx +; GREEDY-I386: #NO_APP +; +; BASIC-X86_64-LABEL: test5: +; BASIC-X86_64: #APP +; BASIC-X86_64: # 'rm' input -> -{{[0-9]+}}(%rsp) +; BASIC-X86_64: #NO_APP +; +; BASIC-I386-LABEL: test5: +; BASIC-I386: #APP +; BASIC-I386: # 'rm' input -> (%esp) +; BASIC-I386: #NO_APP +; +; FAST-X86_64-LABEL: test5: +; FAST-X86_64: #APP +; FAST-X86_64: # 
'rm' input -> -{{[0-9]+}}(%rsp) +; FAST-X86_64: #NO_APP +; +; FAST-I386-LABEL: test5: +; FAST-I386: #APP +; FAST-I386: # 'rm' input -> (%esp) +; FAST-I386: #NO_APP +entry: + %b = getelementptr inbounds i8, ptr %ptr, i64 4 + %0 = load i32, ptr %b, align 4 + tail call void asm sideeffect "# 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %0) #1 + %1 = load i32, ptr %ptr, align 4 + ret i32 %1 +} + +define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 { +; GREEDY-X86_64-LABEL: test6: +; GREEDY-X86_64: #APP +; GREEDY-X86_64: # 'rm' and 'r' input -> %eax %ecx +; GREEDY-X86_64: #NO_APP +; +; GREEDY-I386-LABEL: test6: +; GREEDY-I386: #APP +; GREEDY-I386: # 'rm' and 'r' input -> %ecx %edx +; GREEDY-I386: #NO_APP +; +; BASIC-X86_64-LABEL: test6: +; BASIC-X86_64: #APP +; BASIC-X86_64: # 'rm' and 'r' input -> -{{[0-9]+}}(%rsp) %ecx +; BASIC-X86_64: #NO_APP +; +; BASIC-I386-LABEL: test6: +; BASIC-I386: #APP +; BASIC-I386: # 'rm' and 'r' input -> (%esp) %ecx +; BASIC-I386: #NO_APP +; +; FAST-X86_64-LABEL: test6: +; FAST-X86_64: #APP +; FAST-X86_64: # 'rm' and 'r' input -> -{{[0-9]+}}(%rsp) %eax +; FAST-X86_64: #NO_APP +; +; FAST-I386-LABEL: test6: +; FAST-I386: #APP +; FAST-I386: # 'rm' and 'r' input -> (%esp) %ecx +; FAST-I386: #NO_APP +entry: + %b = getelementptr inbounds i8, ptr %ptr, i64 4 + %0 = load i32, ptr %b, align 4 + %d = getelementptr inbounds i8, ptr %ptr, i64 12 + %1 = load i32, ptr %d, align 4 + tail call void asm sideeffect "# 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1 + %2 = load i32, ptr %ptr, align 4 + ret i32 %2 +} + +define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 { +; GREEDY-X86_64-LABEL: test7: +; GREEDY-X86_64: #APP +; GREEDY-X86_64: # 'rm' output -> %eax +; GREEDY-X86_64: #NO_APP +; +; GREEDY-I386-LABEL: test7: +; GREEDY-I386: #APP +; GREEDY-I386: # 'rm' output -> %ecx +; GREEDY-I386: #NO_APP +; +; BASIC-X86_64-LABEL: test7: +; BASIC-X86_64: #APP +; 
BASIC-X86_64: # 'rm' output -> 4(%rdi) +; BASIC-X86_64: #NO_APP +; +; BASIC-I386-LABEL: test7: +; BASIC-I386: #APP +; BASIC-I386: # 'rm' output -> 4(%eax) +; BASIC-I386: #NO_APP +; +; FAST-X86_64-LABEL: test7: +; FAST-X86_64: #APP +; FAST-X86_64: # 'rm' output -> 4(%rdi) +; FAST-X86_64: #NO_APP +; +; FAST-I386-LABEL: test7: +; FAST-I386: #APP +; FAST-I386: # 'rm' output -> 4(%eax) +; FAST-I386: #NO_APP +entry: + %b = getelementptr inbounds i8, ptr %ptr, i64 4 + tail call void asm sideeffect "# 'rm' output -> $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b) #1 + %0 = load i32, ptr %ptr, align 4 + ret i32 %0 +} + +define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr #0 { +; GREEDY-X86_64-LABEL: test8: +; GREEDY-X86_64: #APP +; GREEDY-X86_64: # 'rm' tied -> %eax +; GREEDY-X86_64: #NO_APP +; +; GREEDY-I386-LABEL: test8: +; GREEDY-I386: #APP +; GREEDY-I386: # 'rm' tied -> %ecx +; GREEDY-I386: #NO_APP +; +; BASIC-X86_64-LABEL: test8: +; BASIC-X86_64: #APP +; BASIC-X86_64: # 'rm' tied -> %eax +; BASIC-X86_64: #NO_APP +; +; BASIC-I386-LABEL: test8: +; BASIC-I386: #APP +; BASIC-I386: # 'rm' tied -> %eax +; BASIC-I386: #NO_APP +; +; FAST-X86_64-LABEL: test8: +; FAST-X86_64: #APP +; FAST-X86_64: # 'rm' tied -> %eax +; FAST-X86_64: #NO_APP +; +; FAST-I386-LABEL: test8: +; FAST-I386: #APP +; FAST-I386: # 'rm' tied -> %ecx +; FAST-I386: #NO_APP +entry: + %b = getelementptr inbounds i8, ptr %ptr, i64 4 + %0 = load i32, ptr %b, align 4 + tail call void asm sideeffect "# 'rm' tied -> $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, i32 %0) #1 + %1 = load i32, ptr %ptr, align 4 + ret i32 %1 +} + +define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr #0 { +; GREEDY-X86_64-LABEL: test9: +; GREEDY-X86_64: #APP +; GREEDY-X86_64: # 'r' output == input location -> %eax +; GREEDY-X86_64: #NO_APP +; +; GREEDY-I386-LABEL: test9: +; GREEDY-I386: #APP +; GREEDY-I386: # 'r' output == input location -> %ecx +; 
GREEDY-I386: #NO_APP +; +; BASIC-X86_64-LABEL: test9: +; BASIC-X86_64: #APP +; BASIC-X86_64: # 'r' output == input location -> %eax +; BASIC-X86_64: #NO_APP +; +; BASIC-I386-LABEL: test9: +; BASIC-I386: #APP +; BASIC-I386: # 'r' output == input location -> %eax +; BASIC-I386: #NO_APP +; +; FAST-X86_64-LABEL: test9: +; FAST-X86_64: #APP +; FAST-X86_64: # 'r' output == input location -> %eax +; FAST-X86_64: #NO_APP +; +; FAST-I386-LABEL: test9: +; FAST-I386: #APP +; FAST-I386: # 'r' output == input location -> %ecx +; FAST-I386: #NO_APP +entry: + %b = getelementptr inbounds i8, ptr %ptr, i64 4 + %0 = load i32, ptr %b, align 4 + %1 = tail call i32 asm sideeffect "# 'r' output == input location -> $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #1 + store i32 %1, ptr %b, align 4 + %2 = load i32, ptr %ptr, align 4 + ret i32 %2 +} + +attributes #0 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll b/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll index be4d1c29332f7..a322bd3003a58 100644 --- a/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll +++ b/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll @@ -6,16 +6,13 @@ define i32 @foo(i32 %treemap) nounwind { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushl %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: negl %ecx ; CHECK-NEXT: andl %eax, %ecx -; CHECK-NEXT: movl %ecx, (%esp) ; CHECK-NEXT: #APP -; CHECK-NEXT: bsfl (%esp), %eax +; CHECK-NEXT: bsfl %ecx, %eax ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: popl %ecx ; CHECK-NEXT: retl entry: %sub = sub i32 0, %treemap >From 9378b7ae2fa44c977bd8e1ab500db3883ecdb3da Mon Sep 17 00:00:00 2001 From: Bill Wendling <[email protected]> Date: Wed, 10 Jul 2024 11:00:13 -0700 Subject: [PATCH 2/9] Remove 
function identifying the register allocator used. --- llvm/include/llvm/CodeGen/TargetPassConfig.h | 2 -- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +- llvm/lib/CodeGen/TargetPassConfig.cpp | 6 ------ 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h index c1f4199536409..d00e0bed91a45 100644 --- a/llvm/include/llvm/CodeGen/TargetPassConfig.h +++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h @@ -496,8 +496,6 @@ class TargetPassConfig : public ImmutablePass { void registerCodeGenCallback(PassInstrumentationCallbacks &PIC, LLVMTargetMachine &); -bool usesGreedyOrDefaultRegisterAllocator(); - } // end namespace llvm #endif // LLVM_CODEGEN_TARGETPASSCONFIG_H diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e10014a64b25e..fa74f2789bfdb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6001,7 +6001,7 @@ TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences( // If we can fold the register (i.e. it has an "rm" constraint), opt for the // 'r' constraint, and allow the register allocator to spill if need be. // Applies only to the greedy and default register allocators. 
- if (OpInfo.MayFoldRegister && usesGreedyOrDefaultRegisterAllocator()) { + if (OpInfo.MayFoldRegister) { Ret.emplace_back(ConstraintPair("r", getConstraintType("r"))); Ret.emplace_back(ConstraintPair("m", getConstraintType("m"))); return Ret; diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 558dd6523aeec..3658e8320a0cc 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1080,12 +1080,6 @@ static cl::opt<RegisterRegAlloc::FunctionPassCtor, false, RegAlloc("regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), cl::desc("Register allocator to use")); -bool llvm::usesGreedyOrDefaultRegisterAllocator() { - return RegAlloc == (RegisterRegAlloc:: - FunctionPassCtor)&createGreedyRegisterAllocator || - RegAlloc == &useDefaultRegisterAllocator; -} - /// Add the complete set of target-independent postISel code generator passes. /// /// This can be read as the standard order of major LLVM CodeGen stages. Stages >From c1cfcefd1901d57306d77ee0397c98670caf56af Mon Sep 17 00:00:00 2001 From: Bill Wendling <[email protected]> Date: Thu, 25 Jul 2024 11:30:17 -0700 Subject: [PATCH 3/9] Run instnamer and remove unneeded '-O2'. 
--- llvm/test/CodeGen/X86/asm-constraints-rm.ll | 86 ++++++++++----------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/llvm/test/CodeGen/X86/asm-constraints-rm.ll b/llvm/test/CodeGen/X86/asm-constraints-rm.ll index f718f6b26abb3..6031eb7b22e6d 100644 --- a/llvm/test/CodeGen/X86/asm-constraints-rm.ll +++ b/llvm/test/CodeGen/X86/asm-constraints-rm.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "^\t#" --version 4 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-X86_64 %s -; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-I386 %s -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=basic < %s | FileCheck --check-prefix=BASIC-X86_64 %s -; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=basic < %s | FileCheck --check-prefix=BASIC-I386 %s -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=fast < %s | FileCheck --check-prefix=FAST-X86_64 %s -; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=fast < %s | FileCheck --check-prefix=FAST-I386 %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-X86_64 %s +; RUN: llc -mtriple=i386-unknown-linux-gnu -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-I386 %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -regalloc=basic < %s | FileCheck --check-prefix=BASIC-X86_64 %s +; RUN: llc -mtriple=i386-unknown-linux-gnu -regalloc=basic < %s | FileCheck --check-prefix=BASIC-I386 %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -regalloc=fast < %s | FileCheck --check-prefix=FAST-X86_64 %s +; RUN: llc -mtriple=i386-unknown-linux-gnu -regalloc=fast < %s | FileCheck --check-prefix=FAST-I386 %s ; The Greedy register allocator should use registers when there isn't register ; pressure. 
@@ -41,12 +41,12 @@ define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_a ; FAST-I386: #NO_APP entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 - %0 = load i32, ptr %b, align 4 + %i = load i32, ptr %b, align 4 %d = getelementptr inbounds i8, ptr %ptr, i64 12 - %1 = load i32, ptr %d, align 4 - tail call void asm sideeffect "# 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1 - %2 = load i32, ptr %ptr, align 4 - ret i32 %2 + %i1 = load i32, ptr %d, align 4 + tail call void asm sideeffect "# 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) #1 + %i2 = load i32, ptr %ptr, align 4 + ret i32 %i2 } define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 { @@ -81,12 +81,12 @@ define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_a ; FAST-I386: #NO_APP entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 - %0 = load i32, ptr %b, align 4 + %i = load i32, ptr %b, align 4 %d = getelementptr inbounds i8, ptr %ptr, i64 12 - %1 = load i32, ptr %d, align 4 - tail call void asm sideeffect "# 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1 - %2 = load i32, ptr %ptr, align 4 - ret i32 %2 + %i1 = load i32, ptr %d, align 4 + tail call void asm sideeffect "# 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) #1 + %i2 = load i32, ptr %ptr, align 4 + ret i32 %i2 } define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr #0 { @@ -123,8 +123,8 @@ entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 %d = getelementptr inbounds i8, ptr %ptr, i64 12 tail call void asm sideeffect "# 'rm' output no pressure -> $0 $1", "=*rm,=*rm,~{dirflag},~{fpsr},~{flags}"(ptr 
nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d) #1 - %0 = load i32, ptr %ptr, align 4 - ret i32 %0 + %i = load i32, ptr %ptr, align 4 + ret i32 %i } define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr #0 { @@ -159,12 +159,12 @@ define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr #0 { ; FAST-I386: #NO_APP entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 - %0 = load i32, ptr %b, align 4 + %i = load i32, ptr %b, align 4 %d = getelementptr inbounds i8, ptr %ptr, i64 12 - %1 = load i32, ptr %d, align 4 - tail call void asm sideeffect "# tied 'rm' no pressure -> $0 $1 $2 $3", "=*rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d, i32 %0, i32 %1) #1 - %2 = load i32, ptr %ptr, align 4 - ret i32 %2 + %i1 = load i32, ptr %d, align 4 + tail call void asm sideeffect "# tied 'rm' no pressure -> $0 $1 $2 $3", "=*rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d, i32 %i, i32 %i1) #1 + %i2 = load i32, ptr %ptr, align 4 + ret i32 %i2 } define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 { @@ -199,10 +199,10 @@ define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_a ; FAST-I386: #NO_APP entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 - %0 = load i32, ptr %b, align 4 - tail call void asm sideeffect "# 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %0) #1 - %1 = load i32, ptr %ptr, align 4 - ret i32 %1 + %i = load i32, ptr %b, align 4 + tail call void asm sideeffect "# 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %i) #1 + %i1 = load i32, ptr %ptr, align 4 + ret i32 %i1 } define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 { @@ -237,12 +237,12 @@ define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_a ; FAST-I386: #NO_APP entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 - %0 = load 
i32, ptr %b, align 4 + %i = load i32, ptr %b, align 4 %d = getelementptr inbounds i8, ptr %ptr, i64 12 - %1 = load i32, ptr %d, align 4 - tail call void asm sideeffect "# 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1 - %2 = load i32, ptr %ptr, align 4 - ret i32 %2 + %i1 = load i32, ptr %d, align 4 + tail call void asm sideeffect "# 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) #1 + %i2 = load i32, ptr %ptr, align 4 + ret i32 %i2 } define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 { @@ -278,8 +278,8 @@ define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 { entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 tail call void asm sideeffect "# 'rm' output -> $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b) #1 - %0 = load i32, ptr %ptr, align 4 - ret i32 %0 + %i = load i32, ptr %ptr, align 4 + ret i32 %i } define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr #0 { @@ -314,10 +314,10 @@ define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr #0 { ; FAST-I386: #NO_APP entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 - %0 = load i32, ptr %b, align 4 - tail call void asm sideeffect "# 'rm' tied -> $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, i32 %0) #1 - %1 = load i32, ptr %ptr, align 4 - ret i32 %1 + %i = load i32, ptr %b, align 4 + tail call void asm sideeffect "# 'rm' tied -> $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, i32 %i) #1 + %i1 = load i32, ptr %ptr, align 4 + ret i32 %i1 } define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr #0 { @@ -352,11 +352,11 @@ define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr #0 { ; FAST-I386: #NO_APP entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 - %0 = load i32, ptr %b, align 4 - %1 = tail call i32 asm sideeffect "# 'r' output == input location -> $0", 
"=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #1 - store i32 %1, ptr %b, align 4 - %2 = load i32, ptr %ptr, align 4 - ret i32 %2 + %i = load i32, ptr %b, align 4 + %i1 = tail call i32 asm sideeffect "# 'r' output == input location -> $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %i) #1 + store i32 %i1, ptr %b, align 4 + %i2 = load i32, ptr %ptr, align 4 + ret i32 %i2 } attributes #0 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } >From 03d62f1f9fb7b47ff55cbaae0f7619c78491a3f1 Mon Sep 17 00:00:00 2001 From: Bill Wendling <[email protected]> Date: Thu, 15 Jan 2026 01:21:53 -0800 Subject: [PATCH 4/9] [CodeGen] Add InlineAsmPrepare pass to convert "rm" constraints to "m" for fast regalloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a new IR-level pass, InlineAsmPrepare, that converts inline assembly "rm" (register-or-memory) constraints to "m" (memory-only) constraints when using the fast register allocator on x86 platforms. Background: The "rm" constraint allows the compiler to choose between a register or memory operand. However, LLVM's architecture conservatively picks the most restrictive interpretation to avoid issues with register pressure. This causes the fast register allocator to always select registers for "rm" constraints, even when memory would be more appropriate, leading to unnecessary spills and suboptimal code. Solution: The InlineAsmPrepare pass runs at -O0 before instruction selection and transforms inline asm calls by: 1. Converting "rm" constraints to "m" (memory-only) 2. Creating allocas for each converted operand 3. For inputs: storing values to allocas and passing pointers 4. For outputs: passing alloca pointers and loading results afterward 5. 
For tied constraints (e.g., "=rm,0"): storing input to output's alloca and passing the same pointer for both the output and tied input Implementation details: - InlineAsmPrepare.cpp: New pass that processes CallInst nodes with inline asm * Parses constraint strings and identifies "rm" patterns * Tracks tied input-output pairs via TiedOutput mapping * Reconstructs return values by loading from allocas for converted outputs * Properly handles struct returns with mixed converted/unconverted outputs - SelectionDAG integration: * Added MayFoldRegister flag to mark "rm" constraints * Modified constraint preference to prefer 'r' over 'm' when MayFoldRegister is set, allowing the register allocator to spill to memory if needed * Updated AddInlineAsmOperands to propagate the MayFoldRegister flag - Pass integration: * Registered in PassBuilder and PassRegistry * Added to TargetPassConfig for -O0 compilation Test coverage: Added inline-asm-prepare-memory.ll with three scenarios: - Input-only: "rm" input → "m" with alloca + store + pointer arg - Output-only: "=rm" output → "=*m" with alloca + pointer arg + load - Tied/read-write: "=rm,0" → "=*m,0" with store + dual pointer args + load The pass only runs on x86 platforms at -O0 to improve code generation for the fast register allocator without impacting optimized builds. 
--- clang/lib/CodeGen/CGStmt.cpp | 13 +- clang/test/CodeGen/asm.c | 2 +- llvm/include/llvm/CodeGen/InlineAsmPrepare.h | 23 ++ llvm/include/llvm/CodeGen/Passes.h | 5 + llvm/include/llvm/CodeGen/TargetLowering.h | 10 +- llvm/include/llvm/IR/InlineAsm.h | 8 + llvm/include/llvm/InitializePasses.h | 1 + llvm/lib/CodeGen/CMakeLists.txt | 1 + llvm/lib/CodeGen/CodeGen.cpp | 1 + llvm/lib/CodeGen/InlineAsmPrepare.cpp | 334 ++++++++++++++++++ llvm/lib/CodeGen/TargetPassConfig.cpp | 3 + llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 1 + llvm/test/CodeGen/AArch64/O0-pipeline.ll | 1 + llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 1 + llvm/test/CodeGen/LoongArch/O0-pipeline.ll | 1 + llvm/test/CodeGen/PowerPC/O0-pipeline.ll | 1 + llvm/test/CodeGen/RISCV/O0-pipeline.ll | 1 + llvm/test/CodeGen/SPIRV/llc-pipeline.ll | 1 + llvm/test/CodeGen/X86/O0-pipeline.ll | 1 + llvm/test/CodeGen/X86/asm-constraints-rm.ll | 69 ++-- .../CodeGen/X86/inline-asm-prepare-memory.ll | 38 ++ 22 files changed, 472 insertions(+), 45 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/InlineAsmPrepare.h create mode 100644 llvm/lib/CodeGen/InlineAsmPrepare.cpp create mode 100644 llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index cf5ddb78c3a1d..64eba8040f113 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -2921,13 +2921,20 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { if (!Constraints.empty()) Constraints += ','; - // If this is a register output, then make the inline asm return it - // by-value. If this is a memory result, return the value by-reference. + // - If this is a register output, then make the inline asm return it + // by-value. + // - If this is an "rm" constraint on x86, then treat it like a register + // output. (We'll correct this before ISel if using the FastRA.) + // - If this is a memory result, return the value by-reference. 
QualType QTy = OutExpr->getType(); const bool IsScalarOrAggregate = hasScalarEvaluationKind(QTy) || hasAggregateEvaluationKind(QTy); - if (!Info.allowsMemory() && IsScalarOrAggregate) { + const bool X86RegisterMemoryConstraints = + getTarget().getTriple().isX86() && + (OutputConstraint == "rm" || OutputConstraint == "mr"); + if (IsScalarOrAggregate && + (!Info.allowsMemory() || X86RegisterMemoryConstraints)) { Constraints += "=" + OutputConstraint; ResultRegQualTys.push_back(QTy); ResultRegDests.push_back(Dest); diff --git a/clang/test/CodeGen/asm.c b/clang/test/CodeGen/asm.c index 9687c993e6464..66a7142ee7fca 100644 --- a/clang/test/CodeGen/asm.c +++ b/clang/test/CodeGen/asm.c @@ -259,7 +259,7 @@ void t31(int len) { __asm__ volatile("" : "+%%rm"(len), "+rm"(len)); // CHECK: @t31 - // CHECK: call void asm sideeffect "", "=*%rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}" + // CHECK: call i32 asm sideeffect "", "=*%rm,=rm,0,1,~{dirflag},~{fpsr},~{flags}" } // CHECK: @t32 diff --git a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h new file mode 100644 index 0000000000000..a400a78390dff --- /dev/null +++ b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h @@ -0,0 +1,23 @@ +//===-- InlineAsmPrepare - Prepare inline asm for code gen ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_INLINEASMPREPARE_H +#define LLVM_CODEGEN_INLINEASMPREPARE_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class InlineAsmPreparePass : public PassInfoMixin<InlineAsmPreparePass> { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_INLINEASMPREPARE_H diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index 303b9076131e3..9e1e34269baca 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -630,6 +630,11 @@ LLVM_ABI ModulePass *createWindowsSecureHotPatchingPass(); /// Lowers KCFI operand bundles for indirect calls. LLVM_ABI FunctionPass *createKCFIPass(); + +/// Modify inline asms with "rm" constraints to "m" for the fast register +/// allocator. +LLVM_ABI FunctionPass *createInlineAsmPass(); + } // namespace llvm #endif diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index e94668a5d7a76..76a790d057115 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5222,11 +5222,6 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { /// Memory, Other, Unknown. TargetLowering::ConstraintType ConstraintType = TargetLowering::C_Unknown; - /// The register may be folded. This is used if the constraint is "rm", - /// where we prefer using a register, but can fall back to a memory slot - /// under register pressure. - bool MayFoldRegister = false; - /// If this is the result output operand or a clobber, this is null, /// otherwise it is the incoming operand to the CallInst. This gets /// modified as the asm is processed. 
@@ -5235,6 +5230,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { /// The ValueType for the operand value. MVT ConstraintVT = MVT::Other; + /// The register may be folded. This is used if the constraint is "rm", + /// where we prefer using a register, but can fall back to a memory slot + /// under register pressure. + bool MayFoldRegister = false; + /// Copy constructor for copying from a ConstraintInfo. AsmOperandInfo(InlineAsm::ConstraintInfo Info) : InlineAsm::ConstraintInfo(std::move(Info)) {} diff --git a/llvm/include/llvm/IR/InlineAsm.h b/llvm/include/llvm/IR/InlineAsm.h index 96887d129a69f..6491b0ff5e82b 100644 --- a/llvm/include/llvm/IR/InlineAsm.h +++ b/llvm/include/llvm/IR/InlineAsm.h @@ -181,6 +181,14 @@ class InlineAsm final : public Value { bool hasArg() const { return Type == isInput || (Type == isOutput && isIndirect); } + + /// hasRegMemConstraints - Returns true if and only if the constraint + /// codes are "rm". This is useful when converting from a register form + /// to a memory form.
+ bool hasRegMemConstraints() const { + return Codes.size() == 2 && is_contained(Codes, "r") && + is_contained(Codes, "m"); + } }; /// ParseConstraints - Split up the constraint string into the specific diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index e9e3ca3cc93a0..c3b550beb1e7f 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -145,6 +145,7 @@ initializeImmutableModuleSummaryIndexWrapperPassPass(PassRegistry &); LLVM_ABI void initializeImplicitNullChecksPass(PassRegistry &); LLVM_ABI void initializeIndirectBrExpandLegacyPassPass(PassRegistry &); LLVM_ABI void initializeInferAddressSpacesPass(PassRegistry &); +LLVM_ABI void initializeInlineAsmPreparePass(PassRegistry &); LLVM_ABI void initializeInstSimplifyLegacyPassPass(PassRegistry &); LLVM_ABI void initializeInstructionCombiningPassPass(PassRegistry &); LLVM_ABI void initializeInstructionSelectPass(PassRegistry &); diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index f26b2cb6fddf5..9a1561402adfd 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -77,6 +77,7 @@ add_llvm_component_library(LLVMCodeGen IfConversion.cpp ImplicitNullChecks.cpp IndirectBrExpandPass.cpp + InlineAsmPrepare.cpp InitUndef.cpp InlineSpiller.cpp InsertCodePrefetch.cpp diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 3550eea13979a..a837f1c54f82e 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -53,6 +53,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeImplicitNullChecksPass(Registry); initializeIndirectBrExpandLegacyPassPass(Registry); initializeInitUndefLegacyPass(Registry); + initializeInlineAsmPreparePass(Registry); initializeInterleavedLoadCombinePass(Registry); initializeInterleavedAccessPass(Registry); initializeJMCInstrumenterPass(Registry); diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp 
b/llvm/lib/CodeGen/InlineAsmPrepare.cpp new file mode 100644 index 0000000000000..9524bcb302f8f --- /dev/null +++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp @@ -0,0 +1,334 @@ +//===-- InlineAsmPrepare - Prepare inline asm for code generation ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass converts inline asm "rm" constraints to memory-only "m" +// constraints so that the fast register allocator can use memory operands. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/InlineAsmPrepare.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/DerivedTypes.h" +#include <sstream> + +using namespace llvm; + +#define DEBUG_TYPE "inline-asm-prepare" + +namespace { + +class InlineAsmPrepare : public FunctionPass { + InlineAsmPrepare(InlineAsmPrepare &) = delete; + +public: + InlineAsmPrepare() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override {} + bool runOnFunction(Function &F) override; + + static char ID; +}; + +char InlineAsmPrepare::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS(InlineAsmPrepare, DEBUG_TYPE, + "Convert inline asm \"rm\" insts for fast register allocation", + false, false) +FunctionPass *llvm::createInlineAsmPass() { return new InlineAsmPrepare(); } + +// For each inline asm, the "rm" constraint needs to default to "m" for the +// fast register allocator.
+static SmallVector<CallBase *, 4> findInlineAsms(Function &F) { + SmallVector<CallBase *, 4> InlineAsms; + + for_each(F, [&](BasicBlock &BB) { + for_each(BB, [&](Instruction &I) { + CallBase *CB = dyn_cast<CallBase>(&I); + if (!CB || !CB->isInlineAsm()) + return; + InlineAsms.push_back(CB); + }); + }); + + return InlineAsms; +} + +static bool isRegMemConstraint(StringRef Constraint) { + return Constraint.size() == 2 && (Constraint == "rm" || Constraint == "mr"); +} + +// Convert instances of the "rm" constraints into "m". +static std::string convertConstraintsToMemory(StringRef ConstraintStr) { + auto I = ConstraintStr.begin(), E = ConstraintStr.end(); + std::ostringstream Out; + + while (I != E) { + bool IsOutput = false; + bool HasIndirect = false; + if (*I == '=') { + Out << *I; + IsOutput = true; + ++I; + } + if (*I == '*') { + Out << '*'; + HasIndirect = true; + ++I; + } + if (*I == '+') { + Out << '+'; + IsOutput = true; + ++I; + } + + auto Comma = std::find(I, E, ','); + std::string Sub(I, Comma); + if (isRegMemConstraint(Sub)) { + if (IsOutput && !HasIndirect) + Out << '*'; + Out << 'm'; + } else { + Out << Sub; + } + + if (Comma == E) + break; + + Out << ','; + I = Comma + 1; + } + + return Out.str(); +} + +bool InlineAsmPrepare::runOnFunction(Function &F) { + // Only process "rm" on x86 platforms. + if (!F.getParent()->getTargetTriple().isX86()) + return false; + + SmallVector<CallBase *, 4> IAs = findInlineAsms(F); + if (IAs.empty()) + return false; + + bool Changed = false; + for (CallBase *CB : IAs) { + InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand()); + const InlineAsm::ConstraintInfoVector &Constraints = IA->ParseConstraints(); + + std::string NewConstraintStr = + convertConstraintsToMemory(IA->getConstraintString()); + if (NewConstraintStr == IA->getConstraintString()) + continue; + + IRBuilder<> Builder(CB); + // IRBuilder<> EntryBuilder(&F.getEntryBlock(), F.getEntryBlock().begin()); + + // Collect new arguments and return types. 
+ SmallVector<Value *, 8> NewArgs; + SmallVector<Type *, 8> NewArgTypes; + SmallVector<Type *, 2> NewRetTypes; + + SmallVector<std::pair<unsigned, Type *>, 8> ElementTypeAttrs; + + // Track allocas created for converted outputs. + // Maps constraint index to the AllocaInst created for it (if any). + SmallVector<AllocaInst *, 8> OutputAllocas(Constraints.size(), nullptr); + + // Track pairs of Input-Output tied constraints. + // TiedOutput[i] = j means Constraint i is an Input tied to Output Constraint j. + SmallVector<int, 8> TiedOutput(Constraints.size(), -1); + for (unsigned I = 0, E = Constraints.size(); I != E; ++I) { + const auto &C = Constraints[I]; + if (C.Type == InlineAsm::isOutput && C.hasMatchingInput()) { + int InputIdx = C.MatchingInput; + if (InputIdx >= 0 && InputIdx < (int)Constraints.size()) + TiedOutput[InputIdx] = I; + } + if (C.Type == InlineAsm::isInput && C.hasMatchingInput()) { + int OutputIdx = C.MatchingInput; + if (OutputIdx >= 0 && OutputIdx < (int)Constraints.size()) + TiedOutput[I] = OutputIdx; + } + } + + unsigned ArgNo = 0; + unsigned OutputIdx = 0; + for (unsigned I = 0, E = Constraints.size(); I != E; ++I) { + const auto &C = Constraints[I]; + + if (C.Type == InlineAsm::isOutput) { + // Output-only or Output with matching input (Read-Write) + Type *RetTy = CB->getType(); + Type *SlotTy = RetTy; + + if (StructType *ST = dyn_cast<StructType>(RetTy)) + SlotTy = ST->getElementType(OutputIdx); + + if (C.hasRegMemConstraints()) { + // Converted to memory constraint. Create alloca and pass pointer as + // argument. + AllocaInst *Slot = Builder.CreateAlloca(SlotTy, nullptr, "asm_mem"); + NewArgs.push_back(Slot); + NewArgTypes.push_back(Slot->getType()); + ElementTypeAttrs.push_back({NewArgs.size() - 1, SlotTy}); + OutputAllocas[I] = Slot; + // No return value for this output since it's now an out-parameter. + } else { + // Unchanged, still an output return value. 
+ NewRetTypes.push_back(SlotTy); + } + + OutputIdx++; + } else if (C.Type == InlineAsm::isInput) { + // Input + Value *ArgVal = CB->getArgOperand(ArgNo); + Type *ArgTy = ArgVal->getType(); + bool Handled = false; + + if (TiedOutput[I] != -1) { + int MatchIdx = TiedOutput[I]; + if (AllocaInst *Slot = OutputAllocas[MatchIdx]) { + // The matched output was converted to memory. + // Store this input into the alloca. + Builder.CreateStore(ArgVal, Slot); + // Pass the alloca pointer as the argument, instead of ArgVal. + // This ensures the tied "0" constraint matches the "*m" output. + NewArgs.push_back(Slot); + NewArgTypes.push_back(Slot->getType()); + Handled = true; + } + } + + if (!Handled) { + if (C.hasRegMemConstraints()) { + // Converted to memory constraint. + // Create alloca, store input, pass pointer as argument. + AllocaInst *Slot = Builder.CreateAlloca(ArgTy, nullptr, "asm_mem"); + Builder.CreateStore(ArgVal, Slot); + NewArgs.push_back(Slot); + NewArgTypes.push_back(Slot->getType()); + } else { + // Unchanged + NewArgs.push_back(ArgVal); + NewArgTypes.push_back(ArgTy); + } + } + ArgNo++; + } + } + + Type *NewRetTy = nullptr; + if (NewRetTypes.empty()) { + NewRetTy = Type::getVoidTy(F.getContext()); + } else if (NewRetTypes.size() == 1) { + NewRetTy = NewRetTypes[0]; + } else { + NewRetTy = StructType::get(F.getContext(), NewRetTypes); + } + + FunctionType *NewFTy = FunctionType::get(NewRetTy, NewArgTypes, false); + auto *NewIA = InlineAsm::get( + NewFTy, IA->getAsmString(), NewConstraintStr, + IA->hasSideEffects(), IA->isAlignStack(), IA->getDialect(), + IA->canThrow()); + + CallInst *NewCall = Builder.CreateCall(NewFTy, NewIA, NewArgs); + NewCall->setCallingConv(CB->getCallingConv()); + NewCall->setAttributes(CB->getAttributes()); + NewCall->setDebugLoc(CB->getDebugLoc()); + + for (const auto &Item : ElementTypeAttrs) + NewCall->addParamAttr(Item.first, + Attribute::get(F.getContext(), + Attribute::ElementType, + Item.second)); + + // Reconstruct the return 
value and update users. + if (!CB->use_empty()) { + Value *Replacement = nullptr; + Type *RetTy = CB->getType(); + + if (RetTy->isVoidTy()) { + // No return value, nothing to replace. + } else if (isa<StructType>(RetTy)) { + // Multiple outputs. Reconstruct the struct. + Value *Res = UndefValue::get(RetTy); + unsigned NewRetIdx = 0; + unsigned OriginalOutIdx = 0; + + for (unsigned I = 0, E = Constraints.size(); I != E; ++I) { + if (Constraints[I].Type != InlineAsm::isOutput) + continue; + + Value *Val = nullptr; + if (AllocaInst *Slot = OutputAllocas[I]) { + // Converted to memory. Load from alloca. + Val = Builder.CreateLoad(Slot->getAllocatedType(), Slot); + } else { + // Not converted. Extract from NewCall return. + if (NewRetTypes.size() == 1) { + Val = NewCall; + } else { + Val = Builder.CreateExtractValue(NewCall, NewRetIdx); + } + NewRetIdx++; + } + + Res = Builder.CreateInsertValue(Res, Val, OriginalOutIdx++); + } + Replacement = Res; + } else { + // Single output. + // Find the output constraint (should be the first one). 
+ unsigned OutConstraintIdx = 0; + for (unsigned I = 0; I < Constraints.size(); ++I) { + if (Constraints[I].Type == InlineAsm::isOutput) { + OutConstraintIdx = I; + break; + } + } + + if (AllocaInst *Slot = OutputAllocas[OutConstraintIdx]) { + Replacement = Builder.CreateLoad(Slot->getAllocatedType(), Slot); + } else { + Replacement = NewCall; + } + } + + if (Replacement) { + CB->replaceAllUsesWith(Replacement); + } + } + + CB->eraseFromParent(); + Changed = true; + } + + return Changed; +} + +PreservedAnalyses InlineAsmPreparePass::run(Function &F, + FunctionAnalysisManager &FAM) { + InlineAsmPrepare IAP; + + bool Changed = IAP.runOnFunction(F); + if (!Changed) + return PreservedAnalyses::all(); + + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 27e1afdcd7724..56ea3e12a528c 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -983,6 +983,9 @@ void TargetPassConfig::addISelPrepare() { if (getOptLevel() != CodeGenOptLevel::None) addPass(createObjCARCContractPass()); + if (getOptLevel() == CodeGenOptLevel::None) + addPass(createInlineAsmPass()); + addPass(createCallBrPass()); // Add both the safe stack and the stack protection passes: each of them will diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 8bb78c8c7df63..6e8237c571d6f 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -106,6 +106,7 @@ #include "llvm/CodeGen/HardwareLoops.h" #include "llvm/CodeGen/IndirectBrExpand.h" #include "llvm/CodeGen/InitUndef.h" +#include "llvm/CodeGen/InlineAsmPrepare.h" #include "llvm/CodeGen/InterleavedAccess.h" #include "llvm/CodeGen/InterleavedLoadCombine.h" #include "llvm/CodeGen/JMCInstrumenter.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 2cfb5b2592601..1b6774157e291 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ 
b/llvm/lib/Passes/PassRegistry.def @@ -450,6 +450,7 @@ FUNCTION_PASS("helloworld", HelloWorldPass()) FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass(*TM)) FUNCTION_PASS("infer-address-spaces", InferAddressSpacesPass()) FUNCTION_PASS("infer-alignment", InferAlignmentPass()) +FUNCTION_PASS("inline-asm-prepare", InlineAsmPreparePass()) FUNCTION_PASS("inject-tli-mappings", InjectTLIMappings()) FUNCTION_PASS("instcount", InstCountPass()) FUNCTION_PASS("instnamer", InstructionNamerPass()) diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll index cc0655b31d892..5f09a0fb04247 100644 --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -33,6 +33,7 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: AArch64 Stack Tagging ; CHECK-NEXT: Exception handling preparation +; CHECK-NEXT: Convert inline asm "rm" insts for fast register allocation ; CHECK-NEXT: Prepare callbr ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 6940c1b238e1d..a2273c1ae93c6 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -94,6 +94,7 @@ ; GCN-O0-NEXT: Call Graph SCC Pass Manager ; GCN-O0-NEXT: DummyCGSCCPass ; GCN-O0-NEXT: FunctionPass Manager +; GCN-O0-NEXT: Convert inline asm "rm" insts for fast register allocation ; GCN-O0-NEXT: Prepare callbr ; GCN-O0-NEXT: Safe Stack instrumentation pass ; GCN-O0-NEXT: Insert stack protectors diff --git a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll index ad7eee3f975f6..eeb1488dcf4f3 100644 --- a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll @@ -31,6 +31,7 @@ ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Exception handling 
preparation +; CHECK-NEXT: Convert inline asm "rm" insts for fast register allocation ; CHECK-NEXT: Prepare callbr ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors diff --git a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll index d586328c5062e..fd2595a3c181b 100644 --- a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll @@ -30,6 +30,7 @@ ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Exception handling preparation +; CHECK-NEXT: Convert inline asm "rm" insts for fast register allocation ; CHECK-NEXT: Prepare callbr ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll index c3e0ed9b85ec7..d8c899ddafb2a 100644 --- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll @@ -32,6 +32,7 @@ ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Exception handling preparation +; CHECK-NEXT: Convert inline asm "rm" insts for fast register allocation ; CHECK-NEXT: Prepare callbr ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors diff --git a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll index cbd06ae1eec4e..f20c224b3e1d4 100644 --- a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll +++ b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll @@ -44,6 +44,7 @@ ; SPIRV-O0-NEXT: SPIRV emit intrinsics ; SPIRV-O0-NEXT: FunctionPass Manager ; SPIRV-O0-NEXT: SPIRV legalize bitcast pass +; SPIRV-O0-NEXT: Convert inline asm "rm" insts for fast register allocation ; SPIRV-O0-NEXT: Prepare callbr ; SPIRV-O0-NEXT: Safe Stack instrumentation pass ; SPIRV-O0-NEXT: Insert stack protectors diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll 
index 673b36968bdeb..4310ca2c4403d 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -32,6 +32,7 @@ ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Expand indirectbr instructions ; CHECK-NEXT: Exception handling preparation +; CHECK-NEXT: Convert inline asm "rm" insts for fast register allocation ; CHECK-NEXT: Prepare callbr ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors diff --git a/llvm/test/CodeGen/X86/asm-constraints-rm.ll b/llvm/test/CodeGen/X86/asm-constraints-rm.ll index 6031eb7b22e6d..66ca437317997 100644 --- a/llvm/test/CodeGen/X86/asm-constraints-rm.ll +++ b/llvm/test/CodeGen/X86/asm-constraints-rm.ll @@ -6,10 +6,10 @@ ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -regalloc=fast < %s | FileCheck --check-prefix=FAST-X86_64 %s ; RUN: llc -mtriple=i386-unknown-linux-gnu -regalloc=fast < %s | FileCheck --check-prefix=FAST-I386 %s -; The Greedy register allocator should use registers when there isn't register -; pressure. +; The non-fast register allocators should use registers when there isn't +; register pressure. 
-define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 { +define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_addr { ; GREEDY-X86_64-LABEL: test1: ; GREEDY-X86_64: #APP ; GREEDY-X86_64: # 'rm' input no pressure -> %eax %ecx @@ -22,12 +22,12 @@ define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_a ; ; BASIC-X86_64-LABEL: test1: ; BASIC-X86_64: #APP -; BASIC-X86_64: # 'rm' input no pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) +; BASIC-X86_64: # 'rm' input no pressure -> %ecx %eax ; BASIC-X86_64: #NO_APP ; ; BASIC-I386-LABEL: test1: ; BASIC-I386: #APP -; BASIC-I386: # 'rm' input no pressure -> {{[0-9]+}}(%esp) (%esp) +; BASIC-I386: # 'rm' input no pressure -> %ecx %eax ; BASIC-I386: #NO_APP ; ; FAST-X86_64-LABEL: test1: @@ -44,12 +44,12 @@ entry: %i = load i32, ptr %b, align 4 %d = getelementptr inbounds i8, ptr %ptr, i64 12 %i1 = load i32, ptr %d, align 4 - tail call void asm sideeffect "# 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) #1 + tail call void asm sideeffect "# 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) %i2 = load i32, ptr %ptr, align 4 ret i32 %i2 } -define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 { +define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_addr { ; GREEDY-X86_64-LABEL: test2: ; GREEDY-X86_64: #APP # 8-byte Folded Reload ; GREEDY-X86_64: # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) @@ -61,13 +61,13 @@ define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_a ; GREEDY-I386: #NO_APP ; ; BASIC-X86_64-LABEL: test2: -; BASIC-X86_64: #APP +; BASIC-X86_64: #APP # 8-byte Folded Reload ; BASIC-X86_64: # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) ; BASIC-X86_64: #NO_APP ; ; BASIC-I386-LABEL: test2: -; BASIC-I386: #APP -; BASIC-I386: # 'rm' input pressure -> 
{{[0-9]+}}(%esp) (%esp) +; BASIC-I386: #APP # 8-byte Folded Reload +; BASIC-I386: # 'rm' input pressure -> (%esp) {{[0-9]+}}(%esp) ; BASIC-I386: #NO_APP ; ; FAST-X86_64-LABEL: test2: @@ -84,12 +84,12 @@ entry: %i = load i32, ptr %b, align 4 %d = getelementptr inbounds i8, ptr %ptr, i64 12 %i1 = load i32, ptr %d, align 4 - tail call void asm sideeffect "# 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) #1 + tail call void asm sideeffect "# 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) %i2 = load i32, ptr %ptr, align 4 ret i32 %i2 } -define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr #0 { +define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr { ; GREEDY-X86_64-LABEL: test3: ; GREEDY-X86_64: #APP ; GREEDY-X86_64: # 'rm' output no pressure -> %eax %ecx @@ -102,12 +102,12 @@ define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr #0 { ; ; BASIC-X86_64-LABEL: test3: ; BASIC-X86_64: #APP -; BASIC-X86_64: # 'rm' output no pressure -> 4(%rdi) 12(%rdi) +; BASIC-X86_64: # 'rm' output no pressure -> %eax %ecx ; BASIC-X86_64: #NO_APP ; ; BASIC-I386-LABEL: test3: ; BASIC-I386: #APP -; BASIC-I386: # 'rm' output no pressure -> 4(%eax) 12(%eax) +; BASIC-I386: # 'rm' output no pressure -> %eax %ecx ; BASIC-I386: #NO_APP ; ; FAST-X86_64-LABEL: test3: @@ -122,12 +122,12 @@ define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr #0 { entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 %d = getelementptr inbounds i8, ptr %ptr, i64 12 - tail call void asm sideeffect "# 'rm' output no pressure -> $0 $1", "=*rm,=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d) #1 + tail call void asm sideeffect "# 'rm' output no pressure -> $0 $1", 
"=*rm,=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d) %i = load i32, ptr %ptr, align 4 ret i32 %i } -define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr #0 { +define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr { ; GREEDY-X86_64-LABEL: test4: ; GREEDY-X86_64: #APP ; GREEDY-X86_64: # tied 'rm' no pressure -> %eax %ecx %eax %ecx @@ -162,12 +162,12 @@ entry: %i = load i32, ptr %b, align 4 %d = getelementptr inbounds i8, ptr %ptr, i64 12 %i1 = load i32, ptr %d, align 4 - tail call void asm sideeffect "# tied 'rm' no pressure -> $0 $1 $2 $3", "=*rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d, i32 %i, i32 %i1) #1 + tail call void asm sideeffect "# tied 'rm' no pressure -> $0 $1 $2 $3", "=*rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d, i32 %i, i32 %i1) %i2 = load i32, ptr %ptr, align 4 ret i32 %i2 } -define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 { +define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_addr { ; GREEDY-X86_64-LABEL: test5: ; GREEDY-X86_64: #APP ; GREEDY-X86_64: # 'rm' input -> %eax @@ -180,12 +180,12 @@ define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_a ; ; BASIC-X86_64-LABEL: test5: ; BASIC-X86_64: #APP -; BASIC-X86_64: # 'rm' input -> -{{[0-9]+}}(%rsp) +; BASIC-X86_64: # 'rm' input -> %eax ; BASIC-X86_64: #NO_APP ; ; BASIC-I386-LABEL: test5: ; BASIC-I386: #APP -; BASIC-I386: # 'rm' input -> (%esp) +; BASIC-I386: # 'rm' input -> %eax ; BASIC-I386: #NO_APP ; ; FAST-X86_64-LABEL: test5: @@ -200,12 +200,12 @@ define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_a entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 %i = load i32, ptr %b, align 4 - tail call void asm sideeffect "# 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %i) #1 + 
tail call void asm sideeffect "# 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %i) %i1 = load i32, ptr %ptr, align 4 ret i32 %i1 } -define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 { +define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_addr { ; GREEDY-X86_64-LABEL: test6: ; GREEDY-X86_64: #APP ; GREEDY-X86_64: # 'rm' and 'r' input -> %eax %ecx @@ -218,12 +218,12 @@ define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_a ; ; BASIC-X86_64-LABEL: test6: ; BASIC-X86_64: #APP -; BASIC-X86_64: # 'rm' and 'r' input -> -{{[0-9]+}}(%rsp) %ecx +; BASIC-X86_64: # 'rm' and 'r' input -> %ecx %eax ; BASIC-X86_64: #NO_APP ; ; BASIC-I386-LABEL: test6: ; BASIC-I386: #APP -; BASIC-I386: # 'rm' and 'r' input -> (%esp) %ecx +; BASIC-I386: # 'rm' and 'r' input -> %ecx %eax ; BASIC-I386: #NO_APP ; ; FAST-X86_64-LABEL: test6: @@ -240,12 +240,12 @@ entry: %i = load i32, ptr %b, align 4 %d = getelementptr inbounds i8, ptr %ptr, i64 12 %i1 = load i32, ptr %d, align 4 - tail call void asm sideeffect "# 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) #1 + tail call void asm sideeffect "# 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) %i2 = load i32, ptr %ptr, align 4 ret i32 %i2 } -define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 { +define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr { ; GREEDY-X86_64-LABEL: test7: ; GREEDY-X86_64: #APP ; GREEDY-X86_64: # 'rm' output -> %eax @@ -258,12 +258,12 @@ define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 { ; ; BASIC-X86_64-LABEL: test7: ; BASIC-X86_64: #APP -; BASIC-X86_64: # 'rm' output -> 4(%rdi) +; BASIC-X86_64: # 'rm' output -> %eax ; BASIC-X86_64: #NO_APP ; ; BASIC-I386-LABEL: test7: ; BASIC-I386: #APP -; BASIC-I386: # 'rm' output -> 4(%eax) +; BASIC-I386: # 'rm' output -> %eax ; BASIC-I386: #NO_APP ; ; FAST-X86_64-LABEL: test7: @@ 
-277,12 +277,12 @@ define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 { ; FAST-I386: #NO_APP entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 - tail call void asm sideeffect "# 'rm' output -> $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b) #1 + tail call void asm sideeffect "# 'rm' output -> $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b) %i = load i32, ptr %ptr, align 4 ret i32 %i } -define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr #0 { +define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr { ; GREEDY-X86_64-LABEL: test8: ; GREEDY-X86_64: #APP ; GREEDY-X86_64: # 'rm' tied -> %eax @@ -315,12 +315,12 @@ define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr #0 { entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 %i = load i32, ptr %b, align 4 - tail call void asm sideeffect "# 'rm' tied -> $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, i32 %i) #1 + tail call void asm sideeffect "# 'rm' tied -> $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, i32 %i) %i1 = load i32, ptr %ptr, align 4 ret i32 %i1 } -define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr #0 { +define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr { ; GREEDY-X86_64-LABEL: test9: ; GREEDY-X86_64: #APP ; GREEDY-X86_64: # 'r' output == input location -> %eax @@ -353,11 +353,8 @@ define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr #0 { entry: %b = getelementptr inbounds i8, ptr %ptr, i64 4 %i = load i32, ptr %b, align 4 - %i1 = tail call i32 asm sideeffect "# 'r' output == input location -> $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %i) #1 + %i1 = tail call i32 asm sideeffect "# 'r' output == input location -> $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %i) store i32 %i1, ptr %b, align 4 %i2 = load i32, ptr %ptr, align 4 ret i32 %i2 } - -attributes #0 = { nounwind uwtable 
"min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll b/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll new file mode 100644 index 0000000000000..ce1e16a6518e6 --- /dev/null +++ b/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll @@ -0,0 +1,38 @@ +; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -stop-after=inline-asm-prepare < %s | FileCheck %s + +define void @func_rm_input(i32 %x) { +; CHECK-LABEL: @func_rm_input +; CHECK: %asm_mem = alloca i32 +; CHECK: store i32 %x, ptr %asm_mem +; CHECK: call i32 asm sideeffect "mov $1, $0", "=r,m,~{dirflag},~{fpsr},~{flags}"(ptr %asm_mem) +entry: + %0 = call i32 asm sideeffect "mov $1, $0", "=r,rm,~{dirflag},~{fpsr},~{flags}"(i32 %x) + ret void +} + +define void @func_rm_output(ptr %p) { +; CHECK-LABEL: @func_rm_output +; CHECK: %asm_mem = alloca i32 +; CHECK: call void asm sideeffect "mov $1, $0", "=*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %asm_mem) +; CHECK: %[[VAL:.*]] = load i32, ptr %asm_mem +; CHECK: store i32 %[[VAL]], ptr %p +entry: + %0 = call i32 asm sideeffect "mov $1, $0", "=rm,~{dirflag},~{fpsr},~{flags}"() + store i32 %0, ptr %p + ret void +} + +define void @func_rm_inout(ptr %x_ptr) { +; CHECK-LABEL: @func_rm_inout +; CHECK: %x = load i32, ptr %x_ptr +; CHECK: %asm_mem = alloca i32 +; CHECK: store i32 %x, ptr %asm_mem +; CHECK: call void asm sideeffect "inc $0", "=*m,0,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %asm_mem, ptr %asm_mem) +; CHECK: %[[VAL2:.*]] = load i32, ptr %asm_mem +; CHECK: store i32 %[[VAL2]], ptr %x_ptr +entry: + %x = load i32, ptr %x_ptr + %0 = call i32 asm sideeffect "inc $0", "=rm,0,~{dirflag},~{fpsr},~{flags}"(i32 %x) + store i32 %0, ptr %x_ptr + ret void +} >From 00d53a67c0cab310cfd6924eee72027668ba1e8a Mon Sep 17 00:00:00 2001 From: 
Bill Wendling <[email protected]> Date: Thu, 15 Jan 2026 02:52:08 -0800 Subject: [PATCH 5/9] Reformat --- llvm/lib/CodeGen/InlineAsmPrepare.cpp | 141 +++++++++++++------------- 1 file changed, 70 insertions(+), 71 deletions(-) diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp index 9524bcb302f8f..15a4b2827c9b0 100644 --- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp +++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp @@ -14,14 +14,14 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/DerivedTypes.h" #include <sstream> using namespace llvm; @@ -150,7 +150,8 @@ bool InlineAsmPrepare::runOnFunction(Function &F) { SmallVector<AllocaInst *, 8> OutputAllocas(Constraints.size(), nullptr); // Track pairs of Input-Output tied constraints. - // TiedOutput[i] = j means Constraint i is an Input tied to Output Constraint j. + // TiedOutput[i] = j means Constraint i is an Input tied to Output + // Constraint j. SmallVector<int, 8> TiedOutput(Constraints.size(), -1); for (unsigned I = 0, E = Constraints.size(); I != E; ++I) { const auto &C = Constraints[I]; @@ -215,18 +216,18 @@ bool InlineAsmPrepare::runOnFunction(Function &F) { } if (!Handled) { - if (C.hasRegMemConstraints()) { - // Converted to memory constraint. - // Create alloca, store input, pass pointer as argument. - AllocaInst *Slot = Builder.CreateAlloca(ArgTy, nullptr, "asm_mem"); - Builder.CreateStore(ArgVal, Slot); - NewArgs.push_back(Slot); - NewArgTypes.push_back(Slot->getType()); - } else { - // Unchanged - NewArgs.push_back(ArgVal); - NewArgTypes.push_back(ArgTy); - } + if (C.hasRegMemConstraints()) { + // Converted to memory constraint. 
+ // Create alloca, store input, pass pointer as argument. + AllocaInst *Slot = Builder.CreateAlloca(ArgTy, nullptr, "asm_mem"); + Builder.CreateStore(ArgVal, Slot); + NewArgs.push_back(Slot); + NewArgTypes.push_back(Slot->getType()); + } else { + // Unchanged + NewArgs.push_back(ArgVal); + NewArgTypes.push_back(ArgTy); + } } ArgNo++; } @@ -242,10 +243,9 @@ bool InlineAsmPrepare::runOnFunction(Function &F) { } FunctionType *NewFTy = FunctionType::get(NewRetTy, NewArgTypes, false); - auto *NewIA = InlineAsm::get( - NewFTy, IA->getAsmString(), NewConstraintStr, - IA->hasSideEffects(), IA->isAlignStack(), IA->getDialect(), - IA->canThrow()); + auto *NewIA = InlineAsm::get(NewFTy, IA->getAsmString(), NewConstraintStr, + IA->hasSideEffects(), IA->isAlignStack(), + IA->getDialect(), IA->canThrow()); CallInst *NewCall = Builder.CreateCall(NewFTy, NewIA, NewArgs); NewCall->setCallingConv(CB->getCallingConv()); @@ -253,66 +253,65 @@ bool InlineAsmPrepare::runOnFunction(Function &F) { NewCall->setDebugLoc(CB->getDebugLoc()); for (const auto &Item : ElementTypeAttrs) - NewCall->addParamAttr(Item.first, - Attribute::get(F.getContext(), - Attribute::ElementType, - Item.second)); + NewCall->addParamAttr( + Item.first, + Attribute::get(F.getContext(), Attribute::ElementType, Item.second)); // Reconstruct the return value and update users. if (!CB->use_empty()) { - Value *Replacement = nullptr; - Type *RetTy = CB->getType(); - - if (RetTy->isVoidTy()) { - // No return value, nothing to replace. - } else if (isa<StructType>(RetTy)) { - // Multiple outputs. Reconstruct the struct. - Value *Res = UndefValue::get(RetTy); - unsigned NewRetIdx = 0; - unsigned OriginalOutIdx = 0; - - for (unsigned I = 0, E = Constraints.size(); I != E; ++I) { - if (Constraints[I].Type != InlineAsm::isOutput) - continue; - - Value *Val = nullptr; - if (AllocaInst *Slot = OutputAllocas[I]) { - // Converted to memory. Load from alloca. 
- Val = Builder.CreateLoad(Slot->getAllocatedType(), Slot); - } else { - // Not converted. Extract from NewCall return. - if (NewRetTypes.size() == 1) { - Val = NewCall; - } else { - Val = Builder.CreateExtractValue(NewCall, NewRetIdx); - } - NewRetIdx++; - } - - Res = Builder.CreateInsertValue(Res, Val, OriginalOutIdx++); - } - Replacement = Res; - } else { - // Single output. - // Find the output constraint (should be the first one). - unsigned OutConstraintIdx = 0; - for (unsigned I = 0; I < Constraints.size(); ++I) { - if (Constraints[I].Type == InlineAsm::isOutput) { - OutConstraintIdx = I; - break; - } - } - - if (AllocaInst *Slot = OutputAllocas[OutConstraintIdx]) { - Replacement = Builder.CreateLoad(Slot->getAllocatedType(), Slot); + Value *Replacement = nullptr; + Type *RetTy = CB->getType(); + + if (RetTy->isVoidTy()) { + // No return value, nothing to replace. + } else if (isa<StructType>(RetTy)) { + // Multiple outputs. Reconstruct the struct. + Value *Res = UndefValue::get(RetTy); + unsigned NewRetIdx = 0; + unsigned OriginalOutIdx = 0; + + for (unsigned I = 0, E = Constraints.size(); I != E; ++I) { + if (Constraints[I].Type != InlineAsm::isOutput) + continue; + + Value *Val = nullptr; + if (AllocaInst *Slot = OutputAllocas[I]) { + // Converted to memory. Load from alloca. + Val = Builder.CreateLoad(Slot->getAllocatedType(), Slot); + } else { + // Not converted. Extract from NewCall return. + if (NewRetTypes.size() == 1) { + Val = NewCall; } else { - Replacement = NewCall; + Val = Builder.CreateExtractValue(NewCall, NewRetIdx); } + NewRetIdx++; + } + + Res = Builder.CreateInsertValue(Res, Val, OriginalOutIdx++); + } + Replacement = Res; + } else { + // Single output. + // Find the output constraint (should be the first one). 
+ unsigned OutConstraintIdx = 0; + for (unsigned I = 0; I < Constraints.size(); ++I) { + if (Constraints[I].Type == InlineAsm::isOutput) { + OutConstraintIdx = I; + break; + } } - if (Replacement) { - CB->replaceAllUsesWith(Replacement); + if (AllocaInst *Slot = OutputAllocas[OutConstraintIdx]) { + Replacement = Builder.CreateLoad(Slot->getAllocatedType(), Slot); + } else { + Replacement = NewCall; } + } + + if (Replacement) { + CB->replaceAllUsesWith(Replacement); + } } CB->eraseFromParent(); >From 9fb25404cda7fd098955e930847e77fa31ac97b5 Mon Sep 17 00:00:00 2001 From: Bill Wendling <[email protected]> Date: Thu, 15 Jan 2026 02:56:17 -0800 Subject: [PATCH 6/9] Use poison instead of undef --- llvm/lib/CodeGen/InlineAsmPrepare.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp index 15a4b2827c9b0..c6848d1fe3539 100644 --- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp +++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp @@ -266,7 +266,7 @@ bool InlineAsmPrepare::runOnFunction(Function &F) { // No return value, nothing to replace. } else if (isa<StructType>(RetTy)) { // Multiple outputs. Reconstruct the struct. - Value *Res = UndefValue::get(RetTy); + Value *Res = PoisonValue::get(RetTy); unsigned NewRetIdx = 0; unsigned OriginalOutIdx = 0; >From ca0ae4918c1c83a4c4ae098fe9d031303b3c42ce Mon Sep 17 00:00:00 2001 From: Bill Wendling <[email protected]> Date: Thu, 15 Jan 2026 03:02:21 -0800 Subject: [PATCH 7/9] fix LLVM ABI issues. 
--- llvm/include/llvm/CodeGen/InlineAsmPrepare.h | 3 ++- llvm/include/llvm/CodeGen/Passes.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h index a400a78390dff..5ff22cde3dc67 100644 --- a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h +++ b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h @@ -9,13 +9,14 @@ #ifndef LLVM_CODEGEN_INLINEASMPREPARE_H #define LLVM_CODEGEN_INLINEASMPREPARE_H +#include "llvm/Support/Compiler.h" #include "llvm/IR/PassManager.h" namespace llvm { class InlineAsmPreparePass : public PassInfoMixin<InlineAsmPreparePass> { public: - PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index 9e1e34269baca..ae37c5b4ba272 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -567,7 +567,7 @@ LLVM_ABI FunctionPass *createCFIFixup(); LLVM_ABI FunctionPass *createCFIInstrInserter(); // Expands floating point instructions. -FunctionPass *createExpandIRInstsPass(CodeGenOptLevel); +LLVM_ABI FunctionPass *createExpandIRInstsPass(CodeGenOptLevel); /// Creates CFGuard longjmp target identification pass. /// \see CFGuardLongjmp.cpp >From 189d0242323928b903e2d81f9d046152eccfa567 Mon Sep 17 00:00:00 2001 From: Bill Wendling <[email protected]> Date: Thu, 15 Jan 2026 03:04:48 -0800 Subject: [PATCH 8/9] Correct header order. 
--- llvm/include/llvm/CodeGen/InlineAsmPrepare.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h index 5ff22cde3dc67..e5ff4db562577 100644 --- a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h +++ b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h @@ -9,8 +9,8 @@ #ifndef LLVM_CODEGEN_INLINEASMPREPARE_H #define LLVM_CODEGEN_INLINEASMPREPARE_H -#include "llvm/Support/Compiler.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/Compiler.h" namespace llvm { >From 431a09a7c8fbc4c1af7f0b51441e63a36959f30c Mon Sep 17 00:00:00 2001 From: Bill Wendling <[email protected]> Date: Fri, 16 Jan 2026 18:25:37 -0800 Subject: [PATCH 9/9] Follow the style guide re-for loops. --- llvm/lib/CodeGen/InlineAsmPrepare.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp index c6848d1fe3539..a8b92a960e78b 100644 --- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp +++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp @@ -56,14 +56,14 @@ FunctionPass *llvm::createInlineAsmPass() { return new InlineAsmPrepare(); } static SmallVector<CallBase *, 4> findInlineAsms(Function &F) { SmallVector<CallBase *, 4> InlineAsms; - for_each(F, [&](BasicBlock &BB) { - for_each(BB, [&](Instruction &I) { + for (auto &BB : F) { + for (auto &I : BB) { CallBase *CB = dyn_cast<CallBase>(&I); if (!CB || !CB->isInlineAsm()) - return; + continue; InlineAsms.push_back(CB); - }); - }); + } + } return InlineAsms; } _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
