llvmbot wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang-codegen

Author: 王品呈 (wpcwzy)

<details>
<summary>Changes</summary>

This PR addresses an issue #<!-- -->120868 where the LLVM loop vectorizer fails 
to utilize SIMD counterparts of statndard math functions (such as acos, sin 
etc.) even when they are properly annotated with "#pragma omp declare simd" in 
system headers like glibc.

The failures were mostly caused by the following two components. First, clang 
emmitted LLVM intrinsics for some math functions, ignoring the "omp declare 
simd" metadata. Second, LoopAccessAnalysis and the cost model strictly rejected 
vectorization for any call marked "no-builtin", even if the call possessed an 
explicit vector variant via "vector-function-abi-variant" and can legally 
execute sequentially.

Changes contained in this PR are listed below:
- For clang codegen: Inhibits lowering to LLVM math intrinsics (e.g. 
llvm.acos.*) when an "OMPDeclareSimdDeclAttr" is present. Emits the mapping 
attribute using "VFABI::MappingAttrName" to the function declaration itself 
when appropriate.
- For LAA, LV &amp; VFABIDemangler: Permits vectorizing calls that possess 
no-builtin so long as "VFDatabase::getMappings(*Call)" is not empty. Modifies 
VFABIDemangler to query the “Function”  for “vector-function-abi-variant” 
attributes when the “CallInst” misses them.
- For transform warnings: Stops emitting FailedRequestedVectorization remarks 
for OpenMP annotated parallel loops, avoiding false positives since 
unvectorized execution is a legal fallback. (glilbc uses -Werror option so this 
false postive could cause a failure in compiling.)

Regression tests for the suppresed warnings and the no-builtin variant mapping 
extraction are added.

Before submitting this PR, the LLVM unit tests, regression tests, 
llvm-test-suite and glibc tests are ran without failing, and compiling glibc 
with this patch enabled successfully emitted call instructions to glibc's 
libmvec functions, solving issue #<!-- -->120868 .

---
Full diff: https://github.com/llvm/llvm-project/pull/189523.diff


8 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+17) 
- (modified) clang/lib/CodeGen/CGOpenMPRuntime.cpp (+75) 
- (modified) llvm/lib/Analysis/LoopAccessAnalysis.cpp (+4-3) 
- (modified) llvm/lib/IR/VFABIDemangler.cpp (+5-1) 
- (modified) llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp (+27-21) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+1-1) 
- (added) 
llvm/test/Transforms/LoopTransformWarning/omp-simd-vectorization-remarks-suppressed.ll
 (+30) 
- (added) llvm/test/Transforms/LoopVectorize/no-builtin-vfabi.ll (+51) 


``````````diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index da72a43643a54..23c89bb49a303 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -69,6 +69,19 @@ static bool shouldEmitBuiltinAsIR(unsigned BuiltinID,
   return false;
 }
 
+static bool shouldPreserveLibCallForDeclareSimd(const FunctionDecl *FD,
+                                                const LangOptions &LangOpts) {
+  if (!FD || !LangOpts.OpenMP)
+    return false;
+
+  for (const FunctionDecl *Redecl : FD->redecls()) {
+    if (Redecl->hasAttr<OMPDeclareSimdDeclAttr>())
+      return true;
+  }
+
+  return false;
+}
+
 static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF,
                                         unsigned BuiltinID, const CallExpr *E,
                                         ReturnValueSlot ReturnValue,
@@ -2647,6 +2660,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
           BuiltinID, CGM.getTriple(), ErrnoOverriden, getLangOpts().MathErrno,
           OptNone, IsOptimizationEnabled);
 
+  if (GenerateFPMathIntrinsics &&
+      shouldPreserveLibCallForDeclareSimd(FD, getLangOpts()))
+    GenerateFPMathIntrinsics = false;
+
   if (GenerateFPMathIntrinsics) {
     switch (BuiltinIDIfNoAsmLabel) {
     case Builtin::BIacos:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index a8255ac74cfcf..338340602d30f 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -36,6 +36,7 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Value.h"
+#include "llvm/IR/VFABIDemangler.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
@@ -11311,6 +11312,75 @@ emitX86DeclareSimdFunction(const FunctionDecl *FD, 
llvm::Function *Fn,
   }
 }
 
+static void emitDeclareSimdVariantMetadata(CodeGenModule &CGM,
+                                           llvm::Function *Fn) {
+  llvm::SmallVector<std::string, 8> VariantMappings;
+  for (llvm::Attribute Attr : Fn->getAttributes().getFnAttrs()) {
+    if (!Attr.isStringAttribute())
+      continue;
+
+    StringRef VariantName = Attr.getKindAsString();
+    if (!VariantName.starts_with("_ZGV"))
+      continue;
+
+    std::optional<llvm::VFInfo> Info =
+        llvm::VFABI::tryDemangleForVFABI(VariantName, Fn->getFunctionType());
+    if (!Info)
+      continue;
+
+    llvm::FunctionType *VectorTy =
+        llvm::VFABI::createFunctionType(*Info, Fn->getFunctionType());
+    llvm::Function *VecFn = llvm::cast<llvm::Function>(
+        Fn->getParent()->getOrInsertFunction(Info->VectorName, VectorTy)
+            .getCallee());
+    CGM.addCompilerUsedGlobal(VecFn);
+    VariantMappings.push_back(std::string(VariantName));
+  }
+
+  if (VariantMappings.empty())
+    return;
+
+  SmallString<256> Buffer;
+  llvm::raw_svector_ostream Out(Buffer);
+  for (const std::string &VariantName : VariantMappings)
+    Out << VariantName << ',';
+  Buffer.pop_back();
+
+  Fn->removeFnAttr(llvm::VFABI::MappingsAttrName);
+  Fn->addFnAttr(llvm::VFABI::MappingsAttrName, Buffer.str());
+}
+
+static bool shouldEmitDeclareSimdVariantMetadata(const FunctionDecl *FD,
+                                                 const SourceManager &SM) {
+  if (!FD)
+    return false;
+
+  for (const FunctionDecl *Redecl : FD->redecls()) {
+    if (!Redecl->hasAttr<OMPDeclareSimdDeclAttr>())
+      continue;
+
+    unsigned BuiltinID = Redecl->getBuiltinID();
+    if (BuiltinID &&
+      Redecl->getASTContext().BuiltinInfo.isLibFunction(BuiltinID))
+      return true;
+
+    SourceLocation Loc = SM.getExpansionLoc(Redecl->getLocation());
+    if (!Loc.isValid())
+      continue;
+
+    if (SM.isInSystemHeader(Loc))
+      return true;
+
+    // Keep support for library-style declarations coming from headers that are
+    // not marked as system headers (e.g. staged glibc build directories), but
+    // avoid enabling this for every non-main-file declaration.
+    if (!SM.isWrittenInMainFile(Loc) && Redecl->hasExternalFormalLinkage())
+      return true;
+  }
+
+  return false;
+}
+
 // This are the Functions that are needed to mangle the name of the
 // vector functions generated by the compiler, according to the rules
 // defined in the "Vector Function ABI specifications for AArch64",
@@ -11577,6 +11647,7 @@ void CGOpenMPRuntime::emitDeclareSimdFunction(const 
FunctionDecl *FD,
                                               llvm::Function *Fn) {
   ASTContext &C = CGM.getContext();
   FD = FD->getMostRecentDecl();
+  const FunctionDecl *MostRecentFD = FD;
   while (FD) {
     // Map params to their positions in function decl.
     llvm::DenseMap<const Decl *, unsigned> ParamPositions;
@@ -11724,6 +11795,10 @@ void CGOpenMPRuntime::emitDeclareSimdFunction(const 
FunctionDecl *FD,
     }
     FD = FD->getPreviousDecl();
   }
+
+  if (shouldEmitDeclareSimdVariantMetadata(
+          MostRecentFD, CGM.getContext().getSourceManager()))
+    emitDeclareSimdVariantMetadata(CGM, Fn);
 }
 
 namespace {
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp 
b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 5d88e5f54e3d6..15eee6fc2054c 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2575,9 +2575,10 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const 
LoopInfo *LI,
 
         // If the function has an explicit vectorized counterpart, and does not
         // take output/input pointers, we can safely assume that it can be
-        // vectorized.
-        if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
-            !hasPointerArgs(Call) && !VFDatabase::getMappings(*Call).empty())
+        // vectorized. This remains valid even when the scalar call is marked
+        // no-builtin, because the explicit mapping supplies the vector 
variant.
+        if (Call && Call->getCalledFunction() && !hasPointerArgs(Call) &&
+            !VFDatabase::getMappings(*Call).empty())
           continue;
 
         auto *Ld = dyn_cast<LoadInst>(&I);
diff --git a/llvm/lib/IR/VFABIDemangler.cpp b/llvm/lib/IR/VFABIDemangler.cpp
index 4fcf43616d60c..ab213d3034446 100644
--- a/llvm/lib/IR/VFABIDemangler.cpp
+++ b/llvm/lib/IR/VFABIDemangler.cpp
@@ -534,7 +534,11 @@ VFParamKind VFABI::getVFParamKindFromString(const 
StringRef Token) {
 
 void VFABI::getVectorVariantNames(
     const CallInst &CI, SmallVectorImpl<std::string> &VariantMappings) {
-  const StringRef S = CI.getFnAttr(VFABI::MappingsAttrName).getValueAsString();
+  StringRef S = CI.getFnAttr(VFABI::MappingsAttrName).getValueAsString();
+  if (S.empty()) {
+    if (const Function *F = CI.getCalledFunction())
+      S = F->getFnAttribute(VFABI::MappingsAttrName).getValueAsString();
+  }
   if (S.empty())
     return;
 
diff --git a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp 
b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
index e53019768e881..a56459842914c 100644
--- a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
+++ b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
@@ -46,28 +46,34 @@ static void warnAboutLeftoverTransformations(Loop *L,
   }
 
   if (hasVectorizeTransformation(L) == TM_ForcedByUser) {
-    LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n");
-    std::optional<ElementCount> VectorizeWidth =
-        getOptionalElementCountLoopAttribute(L);
-    std::optional<int> InterleaveCount =
-        getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
+    // OpenMP SIMD loops are represented as annotated parallel loops.  They can
+    // still legally execute without loop-vectorization, so avoid turning these
+    // cases into transform-warning diagnostics that are commonly promoted to
+    // hard errors by -Werror builds.
+    if (!L->isAnnotatedParallel()) {
+      LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n");
+      std::optional<ElementCount> VectorizeWidth =
+          getOptionalElementCountLoopAttribute(L);
+      std::optional<int> InterleaveCount =
+          getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
 
-    if (!VectorizeWidth || VectorizeWidth->isVector())
-      ORE->emit(
-          DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
-                                            "FailedRequestedVectorization",
-                                            L->getStartLoc(), L->getHeader())
-          << "loop not vectorized: the optimizer was unable to perform the "
-             "requested transformation; the transformation might be disabled "
-             "or specified as part of an unsupported transformation ordering");
-    else if (InterleaveCount.value_or(0) != 1)
-      ORE->emit(
-          DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
-                                            "FailedRequestedInterleaving",
-                                            L->getStartLoc(), L->getHeader())
-          << "loop not interleaved: the optimizer was unable to perform the "
-             "requested transformation; the transformation might be disabled "
-             "or specified as part of an unsupported transformation ordering");
+      if (!VectorizeWidth || VectorizeWidth->isVector())
+        ORE->emit(
+            DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+                                              "FailedRequestedVectorization",
+                                              L->getStartLoc(), L->getHeader())
+            << "loop not vectorized: the optimizer was unable to perform the "
+               "requested transformation; the transformation might be disabled 
"
+               "or specified as part of an unsupported transformation 
ordering");
+      else if (InterleaveCount.value_or(0) != 1)
+        ORE->emit(
+            DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+                                              "FailedRequestedInterleaving",
+                                              L->getStartLoc(), L->getHeader())
+            << "loop not interleaved: the optimizer was unable to perform the "
+               "requested transformation; the transformation might be disabled 
"
+               "or specified as part of an unsupported transformation 
ordering");
+    }
   }
 
   if (hasDistributeTransformation(L) == TM_ForcedByUser) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 
b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a63956c0cba6b..c2097691e45b9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5963,7 +5963,7 @@ void 
LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
         break;
       }
 
-      if (TLI && VecFunc && !CI->isNoBuiltin())
+      if (VecFunc)
         VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
 
       // Find the cost of an intrinsic; some targets may have instructions that
diff --git 
a/llvm/test/Transforms/LoopTransformWarning/omp-simd-vectorization-remarks-suppressed.ll
 
b/llvm/test/Transforms/LoopTransformWarning/omp-simd-vectorization-remarks-suppressed.ll
new file mode 100644
index 0000000000000..6f7773ac034ae
--- /dev/null
+++ 
b/llvm/test/Transforms/LoopTransformWarning/omp-simd-vectorization-remarks-suppressed.ll
@@ -0,0 +1,30 @@
+; RUN: opt -passes=transform-warning -disable-output 
-pass-remarks-missed=transform-warning -pass-remarks-analysis=transform-warning 
< %s 2>&1 | FileCheck -allow-empty %s
+;
+; OpenMP SIMD loops are represented as annotated parallel loops. Keep
+; transform-warning from emitting FailedRequestedVectorization for these loops.
+;
+; CHECK-NOT: FailedRequestedVectorization
+; CHECK-NOT: loop not vectorized
+
+define void @test(ptr %x, ptr %c) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %next, %loop ]
+  %src.ptr = getelementptr inbounds double, ptr %c, i64 %i
+  %v = load double, ptr %src.ptr, align 8, !llvm.access.group !1
+  %dst.ptr = getelementptr inbounds double, ptr %x, i64 %i
+  store double %v, ptr %dst.ptr, align 8, !llvm.access.group !1
+  %next = add nuw nsw i64 %i, 1
+  %done = icmp eq i64 %next, 64
+  br i1 %done, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = distinct !{!0, !2, !3}
+!1 = distinct !{}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
+!3 = !{!"llvm.loop.parallel_accesses", !1}
diff --git a/llvm/test/Transforms/LoopVectorize/no-builtin-vfabi.ll 
b/llvm/test/Transforms/LoopVectorize/no-builtin-vfabi.ll
new file mode 100644
index 0000000000000..78d843f5b0e0d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/no-builtin-vfabi.ll
@@ -0,0 +1,51 @@
+; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s
+
+; NOTE: This is a focused reproducer for OpenMP declare-simd style VFABI 
mapping.
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @test_vector_abi() local_unnamed_addr #0 {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %src = getelementptr inbounds nuw double, ptr @c, i64 %iv
+  %v = load double, ptr %src, align 8, !tbaa !4, !llvm.access.group !8
+  %r = tail call double @acosh(double noundef %v) #2, !llvm.access.group !8
+  %dst = getelementptr inbounds nuw double, ptr @x, i64 %iv
+  store double %r, ptr %dst, align 8, !tbaa !4, !llvm.access.group !8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %done = icmp eq i64 %iv.next, 1000
+  br i1 %done, label %exit, label %loop, !llvm.loop !9
+
+exit:
+  ret i32 0
+}
+
+@x = dso_local local_unnamed_addr global [1000 x double] zeroinitializer, 
align 16
+@c = dso_local local_unnamed_addr global [1000 x double] zeroinitializer, 
align 16
+
+; CHECK-LABEL: @test_vector_abi(
+; CHECK: vector.body:
+; CHECK: call <2 x double> @_ZGVbN2v_acosh
+
+declare double @acosh(double noundef) local_unnamed_addr #1
+
+declare <2 x double> @_ZGVbN2v_acosh(<2 x double>)
+declare <4 x double> @_ZGVcN4v_acosh(<4 x double>)
+declare <4 x double> @_ZGVdN4v_acosh(<4 x double>)
+declare <8 x double> @_ZGVeN8v_acosh(<8 x double>)
+
+attributes #0 = { noinline nounwind strictfp }
+attributes #1 = { nounwind "_ZGVbN2v_acosh" "_ZGVcN4v_acosh" "_ZGVdN4v_acosh" 
"_ZGVeN8v_acosh" "no-builtins" 
"vector-function-abi-variant"="_ZGVbN2v_acosh,_ZGVcN4v_acosh,_ZGVdN4v_acosh,_ZGVeN8v_acosh"
 }
+attributes #2 = { nobuiltin nounwind strictfp "no-builtins" }
+
+!4 = !{!5, !5, i64 0}
+!5 = !{!"double", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = distinct !{}
+!9 = distinct !{!9, !10, !11}
+!10 = !{!"llvm.loop.parallel_accesses", !8}
+!11 = !{!"llvm.loop.vectorize.enable", i1 true}

``````````

</details>


https://github.com/llvm/llvm-project/pull/189523
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to