https://github.com/c-rhodes updated https://github.com/llvm/llvm-project/pull/177974
>From 2dc6da5e9ac93690b315f6620c5838c5b750bd0d Mon Sep 17 00:00:00 2001 From: Croose <[email protected]> Date: Fri, 16 Jan 2026 12:06:20 +0000 Subject: [PATCH] [ARM] Fix inlining issue in ARM (#169337) There is an issue on ARM where a function won't be inlined due to mismatching target features between caller and callee. The caller has `HasV8Ops` and `FeatureDotProd` and the callee does not, but AFAIK this should not be a problem. https://godbolt.org/z/f19h3zT66 is an example showing how the call is not inlined on armv7. The expected asm output would be something like: ```asm .fnstart vsdot.s8 q0, q1, d4[0] bx lr .Lfunc_end0: ``` Thanks to @Amichaxx we managed to narrow it down and can now resolve this problem by adding `ARM::FeatureDotProd, ARM::HasV8Ops` to InlineFeaturesAllowed in llvm/lib/Target/ARM/ARMTargetTransformInfo.h, after which the inlining occurs successfully. Whilst we're at it we have also added some debugging to make it easier to tell why (or why not) a function is being inlined for ARM, and a couple of other features that seem to be missing from the list. This patch was motivated by an issue experienced with Rust that was traced back to LLVM, and thus was designed to address that. (cherry picked from commit fab06fae0064a2f1208331f9c355a26a4f9777f0) --- .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 49 ++++++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 166 ++++++++++++++---- .../Transforms/Inline/ARM/inline-dotprod.ll | 35 ++++ 3 files changed, 216 insertions(+), 34 deletions(-) create mode 100644 llvm/test/Transforms/Inline/ARM/inline-dotprod.ll diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 88a7fb185bf16..b947c8a10e2d8 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -107,6 +107,55 @@ bool ARMTTIImpl::areInlineCompatible(const Function *Caller, // the callers'.
bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) == (CalleeBits & InlineFeaturesAllowed); + + LLVM_DEBUG({ + if (!MatchExact || !MatchSubset) { + dbgs() << "=== Inline compatibility debug ===\n"; + dbgs() << "Caller: " << Caller->getName() << "\n"; + dbgs() << "Callee: " << Callee->getName() << "\n"; + + // Bit diffs + FeatureBitset MissingInCaller = CalleeBits & ~CallerBits; // callee-only + FeatureBitset ExtraInCaller = CallerBits & ~CalleeBits; // caller-only + + // Counts + dbgs() << "Only-in-caller bit count: " << ExtraInCaller.count() << "\n"; + dbgs() << "Only-in-callee bit count: " << MissingInCaller.count() << "\n"; + + dbgs() << "Only-in-caller feature indices ["; + { + bool First = true; + for (size_t I = 0, E = ExtraInCaller.size(); I < E; ++I) { + if (ExtraInCaller.test(I)) { + if (!First) + dbgs() << ", "; + dbgs() << I; + First = false; + } + } + } + dbgs() << "]\n"; + + dbgs() << "Only-in-callee feature indices ["; + { + bool First = true; + for (size_t I = 0, E = MissingInCaller.size(); I < E; ++I) { + if (MissingInCaller.test(I)) { + if (!First) + dbgs() << ", "; + dbgs() << I; + First = false; + } + } + } + dbgs() << "]\n"; + + // Indices map to features as found in + // llvm-project/(your_build)/lib/Target/ARM/ARMGenSubtargetInfo.inc + dbgs() << "MatchExact=" << (MatchExact ? "true" : "false") + << " MatchSubset=" << (MatchSubset ? 
"true" : "false") << "\n"; + } + }); return MatchExact && MatchSubset; } diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index a23256364dd9a..fafd2d44a818c 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -40,13 +40,13 @@ class Type; class Value; namespace TailPredication { - enum Mode { - Disabled = 0, - EnabledNoReductions, - Enabled, - ForceEnabledNoReductions, - ForceEnabled - }; +enum Mode { + Disabled = 0, + EnabledNoReductions, + Enabled, + ForceEnabledNoReductions, + ForceEnabled +}; } // For controlling conversion of memcpy into Tail Predicated loop. @@ -64,37 +64,135 @@ class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> { const ARMTargetLowering *TLI; // Currently the following features are excluded from InlineFeaturesAllowed. - // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureFP64, FeatureD32 + // ModeThumb, FeatureNoARM, ModeSoftFloat. // Depending on whether they are set or unset, different // instructions/registers are available. For example, inlining a callee with // -thumb-mode in a caller with +thumb-mode, may cause the assembler to // fail if the callee uses ARM only instructions, e.g. in inline asm. 
- const FeatureBitset InlineFeaturesAllowed = { - ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, ARM::FeatureThumb2, - ARM::FeatureFP16, ARM::FeatureVFP4, ARM::FeatureFPARMv8, - ARM::FeatureFullFP16, ARM::FeatureFP16FML, ARM::FeatureHWDivThumb, - ARM::FeatureHWDivARM, ARM::FeatureDB, ARM::FeatureV7Clrex, - ARM::FeatureAcquireRelease, ARM::FeatureSlowFPBrcc, - ARM::FeaturePerfMon, ARM::FeatureTrustZone, ARM::Feature8MSecExt, - ARM::FeatureCrypto, ARM::FeatureCRC, ARM::FeatureRAS, - ARM::FeatureFPAO, ARM::FeatureFuseAES, ARM::FeatureZCZeroing, - ARM::FeatureProfUnpredicate, ARM::FeatureSlowVGETLNi32, - ARM::FeatureSlowVDUP32, ARM::FeaturePreferVMOVSR, - ARM::FeaturePrefISHSTBarrier, ARM::FeatureMuxedUnits, - ARM::FeatureSlowOddRegister, ARM::FeatureSlowLoadDSubreg, - ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx, - ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs, - ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign, - ARM::FeatureHasSlowFPVMLx, ARM::FeatureHasSlowFPVFMx, - ARM::FeatureVMLxForwarding, ARM::FeaturePref32BitThumb, - ARM::FeatureAvoidPartialCPSR, ARM::FeatureCheapPredicableCPSR, - ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack, - ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP, - ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass, - ARM::FeatureAClass, ARM::FeatureStrictAlign, ARM::FeatureLongCalls, - ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt, - ARM::FeatureNoNegativeImmediates - }; + const FeatureBitset InlineFeaturesAllowed = {ARM::Feature8MSecExt, + ARM::FeatureAClass, + ARM::FeatureAES, + ARM::FeatureAcquireRelease, + ARM::FeatureAvoidMOVsShOp, + ARM::FeatureAvoidMULS, + ARM::FeatureAvoidPartialCPSR, + ARM::FeatureBF16, + ARM::FeatureCRC, + ARM::FeatureCheapPredicableCPSR, + ARM::FeatureCheckVLDnAlign, + ARM::FeatureCrypto, + ARM::FeatureD32, + ARM::FeatureDB, + ARM::FeatureDFB, + ARM::FeatureDSP, + ARM::FeatureDontWidenVMOVS, + ARM::FeatureDotProd, + 
ARM::FeatureExecuteOnly, + ARM::FeatureExpandMLx, + ARM::FeatureFP16, + ARM::FeatureFP16FML, + ARM::FeatureFP64, + ARM::FeatureFPAO, + ARM::FeatureFPARMv8, + ARM::FeatureFPARMv8_D16, + ARM::FeatureFPARMv8_D16_SP, + ARM::FeatureFPARMv8_SP, + ARM::FeatureFPRegs, + ARM::FeatureFPRegs16, + ARM::FeatureFPRegs64, + ARM::FeatureFullFP16, + ARM::FeatureFuseAES, + ARM::FeatureFuseLiterals, + ARM::FeatureHWDivARM, + ARM::FeatureHWDivThumb, + ARM::FeatureHasNoBranchPredictor, + ARM::FeatureHasRetAddrStack, + ARM::FeatureHasSlowFPVFMx, + ARM::FeatureHasSlowFPVMLx, + ARM::FeatureHasVMLxHazards, + ARM::FeatureLOB, + ARM::FeatureLongCalls, + ARM::FeatureMClass, + ARM::FeatureMP, + ARM::FeatureMVEVectorCostFactor1, + ARM::FeatureMVEVectorCostFactor2, + ARM::FeatureMVEVectorCostFactor4, + ARM::FeatureMatMulInt8, + ARM::FeatureMuxedUnits, + ARM::FeatureNEON, + ARM::FeatureNEONForFP, + ARM::FeatureNEONForFPMovs, + ARM::FeatureNoMovt, + ARM::FeatureNoNegativeImmediates, + ARM::FeatureNoPostRASched, + ARM::FeaturePerfMon, + ARM::FeaturePref32BitThumb, + ARM::FeaturePrefISHSTBarrier, + ARM::FeaturePreferBranchAlign32, + ARM::FeaturePreferBranchAlign64, + ARM::FeaturePreferVMOVSR, + ARM::FeatureProfUnpredicate, + ARM::FeatureRAS, + ARM::FeatureRClass, + ARM::FeatureReserveR9, + ARM::FeatureSB, + ARM::FeatureSHA2, + ARM::FeatureSlowFPBrcc, + ARM::FeatureSlowLoadDSubreg, + ARM::FeatureSlowOddRegister, + ARM::FeatureSlowVDUP32, + ARM::FeatureSlowVGETLNi32, + ARM::FeatureSplatVFPToNeon, + ARM::FeatureStrictAlign, + ARM::FeatureThumb2, + ARM::FeatureTrustZone, + ARM::FeatureUseMIPipeliner, + ARM::FeatureUseMISched, + ARM::FeatureUseWideStrideVFP, + ARM::FeatureV7Clrex, + ARM::FeatureVFP2, + ARM::FeatureVFP2_SP, + ARM::FeatureVFP3, + ARM::FeatureVFP3_D16, + ARM::FeatureVFP3_D16_SP, + ARM::FeatureVFP3_SP, + ARM::FeatureVFP4, + ARM::FeatureVFP4_D16, + ARM::FeatureVFP4_D16_SP, + ARM::FeatureVFP4_SP, + ARM::FeatureVMLxForwarding, + ARM::FeatureVirtualization, + ARM::FeatureZCZeroing, + 
ARM::HasMVEFloatOps, + ARM::HasMVEIntegerOps, + ARM::HasV5TEOps, + ARM::HasV5TOps, + ARM::HasV6KOps, + ARM::HasV6MOps, + ARM::HasV6Ops, + ARM::HasV6T2Ops, + ARM::HasV7Ops, + ARM::HasV8MBaselineOps, + ARM::HasV8MMainlineOps, + ARM::HasV8Ops, + ARM::HasV8_1MMainlineOps, + ARM::HasV8_1aOps, + ARM::HasV8_2aOps, + ARM::HasV8_3aOps, + ARM::HasV8_4aOps, + ARM::HasV8_5aOps, + ARM::HasV8_6aOps, + ARM::HasV8_7aOps, + ARM::HasV8_8aOps, + ARM::HasV8_9aOps, + ARM::HasV9_0aOps, + ARM::HasV9_1aOps, + ARM::HasV9_2aOps, + ARM::HasV9_3aOps, + ARM::HasV9_4aOps, + ARM::HasV9_5aOps, + ARM::HasV9_6aOps, + ARM::HasV9_7aOps}; const ARMSubtarget *getST() const { return ST; } const ARMTargetLowering *getTLI() const { return TLI; } diff --git a/llvm/test/Transforms/Inline/ARM/inline-dotprod.ll b/llvm/test/Transforms/Inline/ARM/inline-dotprod.ll new file mode 100644 index 0000000000000..2f8dbb7f01822 --- /dev/null +++ b/llvm/test/Transforms/Inline/ARM/inline-dotprod.ll @@ -0,0 +1,35 @@ +; RUN: opt < %s -mtriple=arm-unknown-linux-gnu -S -passes=inline | FileCheck %s +; RUN: opt < %s -mtriple=arm-unknown-linux-gnu -S -passes='cgscc(inline)' | FileCheck %s + +declare i32 @foo(...) #0 + +define i32 @callee() #0 { +entry: + %call = call i32 (...) @foo() + ret i32 %call +} + +define i32 @dotcallee() #1 { +entry: + %call = call i32 (...) @foo() + ret i32 %call +} + +define i32 @dotcaller() #1 { +entry: + %call = call i32 @callee() + ret i32 %call +; CHECK-LABEL: dotcaller +; CHECK: call i32 (...) @foo() +} + +define i32 @caller() #0 { +entry: + %call = call i32 @dotcallee() + ret i32 %call +; CHECK-LABEL: caller +; CHECK: call i32 @dotcallee() +} + +attributes #0 = { "target-cpu"="generic" "target-features"="+dsp,+neon" } +attributes #1 = { "target-cpu"="generic" "target-features"="+dsp,+neon,+dotprod" } _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
