Author: Craig Topper
Date: 2020-07-27T16:20:07+02:00
New Revision: 152c2b1befb1879e690f228d95bbfe1bd381a1da

URL: 
https://github.com/llvm/llvm-project/commit/152c2b1befb1879e690f228d95bbfe1bd381a1da
DIFF: 
https://github.com/llvm/llvm-project/commit/152c2b1befb1879e690f228d95bbfe1bd381a1da.diff

LOG: [LegalizeTypes] Teach DAGTypeLegalizer::GenWidenVectorLoads to pad with 
undef if needed when concatenating small or loads to match a larger load

In the included test case the align 16 allowed the v23f32 load to handled as 
load v16f32, load v4f32, and load v4f32(one element not used). These loads all 
need to be concatenated together into a final vector. In this case we tried to 
concatenate the two v4f32 loads to match the type of the v16f32 load so we 
could do a second concat_vectors, but those loads alone only add up to v8f32. 
So we need to two v4f32 undefs to pad it.

It appears we've tried to hack around a similar issue in this code before by 
adding undef padding to loads in one of the earlier loops in this function. 
Originally in r147964 by padding all loads narrower than previous loads to the 
same size. Later modifed to only the last load in r293088. This patch removes 
that earlier code and just handles it on demand where we know we need it.

Fixes PR46820

Differential Revision: https://reviews.llvm.org/D84463

(cherry picked from commit 8131e190647ac2b5b085b48a6e3b48c1d7520a66)

Added: 
    llvm/test/CodeGen/X86/pr46820.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 414ba25ffd5f..b2299931021c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4913,7 +4913,8 @@ SDValue 
DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
 
   int LdWidth = LdVT.getSizeInBits();
   int WidthDiff = WidenWidth - LdWidth;
-  // Allow wider loads.
+  // Allow wider loads if they are sufficiently aligned to avoid memory faults
+  // and if the original load is simple.
   unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
 
   // Find the vector type that can load from.
@@ -4965,19 +4966,6 @@ SDValue 
DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
                       LD->getPointerInfo().getWithOffset(Offset),
                       LD->getOriginalAlign(), MMOFlags, AAInfo);
       LdChain.push_back(L.getValue(1));
-      if (L->getValueType(0).isVector() && NewVTWidth >= LdWidth) {
-        // Later code assumes the vector loads produced will be mergeable, so 
we
-        // must pad the final entry up to the previous width. Scalars are
-        // combined separately.
-        SmallVector<SDValue, 16> Loads;
-        Loads.push_back(L);
-        unsigned size = L->getValueSizeInBits(0);
-        while (size < LdOp->getValueSizeInBits(0)) {
-          Loads.push_back(DAG.getUNDEF(L->getValueType(0)));
-          size += L->getValueSizeInBits(0);
-        }
-        L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads);
-      }
     } else {
       L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
                       LD->getPointerInfo().getWithOffset(Offset),
@@ -5018,8 +5006,17 @@ SDValue 
DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
     EVT NewLdTy = LdOps[i].getValueType();
     if (NewLdTy != LdTy) {
       // Create a larger vector.
+      unsigned NumOps = NewLdTy.getSizeInBits() / LdTy.getSizeInBits();
+      assert(NewLdTy.getSizeInBits() % LdTy.getSizeInBits() == 0);
+      SmallVector<SDValue, 16> WidenOps(NumOps);
+      unsigned j = 0;
+      for (; j != End-Idx; ++j)
+        WidenOps[j] = ConcatOps[Idx+j];
+      for (; j != NumOps; ++j)
+        WidenOps[j] = DAG.getUNDEF(LdTy);
+
       ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
-                                     makeArrayRef(&ConcatOps[Idx], End - Idx));
+                                     WidenOps);
       Idx = End - 1;
       LdTy = NewLdTy;
     }

diff  --git a/llvm/test/CodeGen/X86/pr46820.ll 
b/llvm/test/CodeGen/X86/pr46820.ll
new file mode 100644
index 000000000000..76093801f9d0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr46820.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s
+
+; The alignment of 16 causes type legalization to split this as 3 loads,
+; v16f32, v4f32, and v4f32. This loads 24 elements, but the load is aligned
+; to 16 bytes so this i safe. There was an issue with type legalization 
building
+; the proper concat_vectors for this because the two v4f32s don't add up to
+; v16f32 and require padding.
+
+define <23 x float> @load23(<23 x float>* %p) {
+; CHECK-LABEL: load23:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    vmovups 64(%rsi), %ymm0
+; CHECK-NEXT:    vmovups (%rsi), %zmm1
+; CHECK-NEXT:    vmovaps 64(%rsi), %xmm2
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss %xmm3, 88(%rdi)
+; CHECK-NEXT:    vmovaps %xmm2, 64(%rdi)
+; CHECK-NEXT:    vmovaps %zmm1, (%rdi)
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovlps %xmm0, 80(%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %t0 = load <23 x float>, <23 x float>* %p, align 16
+  ret <23 x float> %t0
+}
+
+; Same test as above with minimal alignment just to demonstrate the 
diff erent
+; codegen.
+define <23 x float> @load23_align_1(<23 x float>* %p) {
+; CHECK-LABEL: load23_align_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    vmovups (%rsi), %zmm0
+; CHECK-NEXT:    vmovups 64(%rsi), %xmm1
+; CHECK-NEXT:    movq 80(%rsi), %rcx
+; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss %xmm2, 88(%rdi)
+; CHECK-NEXT:    movq %rcx, 80(%rdi)
+; CHECK-NEXT:    vmovaps %xmm1, 64(%rdi)
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %t0 = load <23 x float>, <23 x float>* %p, align 1
+  ret <23 x float> %t0
+}


        
_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to