[PATCH] D96265: [PowerPC] Change target data layout for 16-byte stack alignment

2021-03-08 Thread Ahsan Saghir via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rGacce401068e7: [PowerPC] Change target data layout for 
16-byte stack alignment (authored by saghir).

Changed prior to commit:
  https://reviews.llvm.org/D96265?vs=323451&id=328994#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96265/new/

https://reviews.llvm.org/D96265

Files:
  clang/lib/Basic/Targets/PPC.h
  clang/test/CodeGen/target-data.c
  lld/test/ELF/common-archive-lookup.s
  llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
  llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll

Index: llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll
===
--- /dev/null
+++ llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll
@@ -0,0 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s --check-prefix=CHECK-LE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s --check-prefix=CHECK-BE
+; RUN: opt --passes=sroa,loop-vectorize,loop-unroll,instcombine -S \
+; RUN: -vectorizer-maximize-bandwidth --mtriple=powerpc64le-- -mcpu=pwr10 < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-OPT
+
+target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512"
+
+define dso_local signext i32 @test_32byte_vector() nounwind {
+; CHECK-LE-LABEL: test_32byte_vector:
+; CHECK-LE:   # %bb.0: # %entry
+; CHECK-LE-NEXT:mflr r0
+; CHECK-LE-NEXT:std r30, -16(r1)
+; CHECK-LE-NEXT:mr r30, r1
+; CHECK-LE-NEXT:std r0, 16(r1)
+; CHECK-LE-NEXT:clrldi r0, r1, 59
+; CHECK-LE-NEXT:subfic r0, r0, -96
+; CHECK-LE-NEXT:stdux r1, r1, r0
+; CHECK-LE-NEXT:addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-LE-NEXT:addis r4, r2, .LCPI0_1@toc@ha
+; CHECK-LE-NEXT:addi r3, r3, .LCPI0_0@toc@l
+; CHECK-LE-NEXT:addi r4, r4, .LCPI0_1@toc@l
+; CHECK-LE-NEXT:lvx v2, 0, r3
+; CHECK-LE-NEXT:lvx v3, 0, r4
+; CHECK-LE-NEXT:addi r4, r1, 48
+; CHECK-LE-NEXT:addi r3, r1, 32
+; CHECK-LE-NEXT:stvx v2, 0, r4
+; CHECK-LE-NEXT:stvx v3, 0, r3
+; CHECK-LE-NEXT:bl test
+; CHECK-LE-NEXT:nop
+; CHECK-LE-NEXT:lwa r3, 32(r1)
+; CHECK-LE-NEXT:mr r1, r30
+; CHECK-LE-NEXT:ld r0, 16(r1)
+; CHECK-LE-NEXT:ld r30, -16(r1)
+; CHECK-LE-NEXT:mtlr r0
+; CHECK-LE-NEXT:blr
+;
+; CHECK-BE-LABEL: test_32byte_vector:
+; CHECK-BE:   # %bb.0: # %entry
+; CHECK-BE-NEXT:mflr r0
+; CHECK-BE-NEXT:std r30, -16(r1)
+; CHECK-BE-NEXT:std r0, 16(r1)
+; CHECK-BE-NEXT:clrldi r0, r1, 59
+; CHECK-BE-NEXT:mr r30, r1
+; CHECK-BE-NEXT:subfic r0, r0, -192
+; CHECK-BE-NEXT:stdux r1, r1, r0
+; CHECK-BE-NEXT:lis r3, -8192
+; CHECK-BE-NEXT:li r4, 5
+; CHECK-BE-NEXT:lis r5, -16384
+; CHECK-BE-NEXT:lis r6, -32768
+; CHECK-BE-NEXT:ori r3, r3, 1
+; CHECK-BE-NEXT:rldic r4, r4, 32, 29
+; CHECK-BE-NEXT:ori r5, r5, 1
+; CHECK-BE-NEXT:ori r6, r6, 1
+; CHECK-BE-NEXT:rldic r3, r3, 3, 29
+; CHECK-BE-NEXT:ori r4, r4, 6
+; CHECK-BE-NEXT:rldic r5, r5, 2, 30
+; CHECK-BE-NEXT:rldic r6, r6, 1, 31
+; CHECK-BE-NEXT:std r3, 152(r1)
+; CHECK-BE-NEXT:addi r3, r1, 128
+; CHECK-BE-NEXT:std r4, 144(r1)
+; CHECK-BE-NEXT:std r5, 136(r1)
+; CHECK-BE-NEXT:std r6, 128(r1)
+; CHECK-BE-NEXT:bl test
+; CHECK-BE-NEXT:nop
+; CHECK-BE-NEXT:lwa r3, 128(r1)
+; CHECK-BE-NEXT:mr r1, r30
+; CHECK-BE-NEXT:ld r0, 16(r1)
+; CHECK-BE-NEXT:ld r30, -16(r1)
+; CHECK-BE-NEXT:mtlr r0
+; CHECK-BE-NEXT:blr
+entry:
+  %a = alloca <8 x i32>, align 32
+  %0 = bitcast <8 x i32>* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %0)
+  store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %a, align 32
+  call void @test(<8 x i32>* %a)
+  %1 = load <8 x i32>, <8 x i32>* %a, align 32
+  %vecext = extractelement <8 x i32> %1, i32 0
+  %2 = bitcast <8 x i32>* %a to i8*
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %2)
+  ret i32 %vecext
+}
+
+define dso_local signext i32 @test_32byte_aligned_vector() nounwind {
+; CHECK-LE-LABEL: test_32byte_aligned_vector:
+; CHECK-LE:   # %bb.0: # %entry
+; CHECK-LE-NEXT:mflr r0
+; CHECK-LE-NEXT:std r30, -16(r1)
+; CHECK-LE-NEXT:mr r30, r1
+; CHECK-LE-NEXT:std r0, 16(r1)
+; CHECK-LE-NEXT:clrldi r0, r1, 59
+; CHECK-LE-NEXT:subfic r0, r0, -64
+; CHECK-LE-NEXT:stdux r1, r1, r0
+; CHECK-LE-NEXT:addis r3, r2, .LCPI1_0@toc@ha
+; CHECK-LE-NEXT:addi r3, r3, .LCPI1_0@toc@l
+; CHECK-LE-NEXT:lvx v2, 0, r3
+; CHECK-LE-NEXT:addi r3, r1, 32
+; CHECK-LE-NEXT:stvx v2, 0, r3
+; CHECK-LE-NEXT:bl test1
+; CHECK-LE-NEXT:nop
+; CHECK-LE-NEXT:lwa 

[PATCH] D96265: [PowerPC] Change target data layout for 16-byte stack alignment

2021-02-12 Thread Nemanja Ivanovic via Phabricator via cfe-commits
nemanjai accepted this revision.
nemanjai added a comment.
This revision is now accepted and ready to land.

Thank you for your patience. LGTM now.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96265/new/

https://reviews.llvm.org/D96265

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D96265: [PowerPC] Change target data layout for 16-byte stack alignment

2021-02-12 Thread Ahsan Saghir via Phabricator via cfe-commits
saghir updated this revision to Diff 323451.
saghir added a comment.

Updated tests to add run line for llc and target datalayout.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96265/new/

https://reviews.llvm.org/D96265

Files:
  clang/lib/Basic/Targets/PPC.h
  clang/test/CodeGen/target-data.c
  lld/test/ELF/common-archive-lookup.s
  llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
  llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll

Index: llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll
===
--- /dev/null
+++ llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s --check-prefix=CHECK-LE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s --check-prefix=CHECK-BE
+; RUN: opt --passes=sroa,loop-vectorize,loop-unroll,instcombine -S \
+; RUN: -vectorizer-maximize-bandwidth --mtriple=powerpc64le-- -mcpu=pwr10 < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-OPT
+
+target datalayout = "e-m:e-i64:64-n32:64-S128-v256:256:256-v512:512:512"
+
+define dso_local signext i32 @test_32byte_vector() nounwind {
+; CHECK-LE-LABEL: test_32byte_vector:
+; CHECK-LE:   # %bb.0: # %entry
+; CHECK-LE-NEXT:mflr r0
+; CHECK-LE-NEXT:std r30, -16(r1)
+; CHECK-LE-NEXT:mr r30, r1
+; CHECK-LE-NEXT:std r0, 16(r1)
+; CHECK-LE-NEXT:clrldi r0, r1, 59
+; CHECK-LE-NEXT:subfic r0, r0, -96
+; CHECK-LE-NEXT:stdux r1, r1, r0
+; CHECK-LE-NEXT:addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-LE-NEXT:addis r4, r2, .LCPI0_1@toc@ha
+; CHECK-LE-NEXT:addi r3, r3, .LCPI0_0@toc@l
+; CHECK-LE-NEXT:addi r4, r4, .LCPI0_1@toc@l
+; CHECK-LE-NEXT:lvx v2, 0, r3
+; CHECK-LE-NEXT:lvx v3, 0, r4
+; CHECK-LE-NEXT:addi r4, r1, 48
+; CHECK-LE-NEXT:addi r3, r1, 32
+; CHECK-LE-NEXT:stvx v2, 0, r4
+; CHECK-LE-NEXT:stvx v3, 0, r3
+; CHECK-LE-NEXT:bl test
+; CHECK-LE-NEXT:nop
+; CHECK-LE-NEXT:lwa r3, 32(r1)
+; CHECK-LE-NEXT:mr r1, r30
+; CHECK-LE-NEXT:ld r0, 16(r1)
+; CHECK-LE-NEXT:ld r30, -16(r1)
+; CHECK-LE-NEXT:mtlr r0
+; CHECK-LE-NEXT:blr
+;
+; CHECK-BE-LABEL: test_32byte_vector:
+; CHECK-BE:   # %bb.0: # %entry
+; CHECK-BE-NEXT:mflr r0
+; CHECK-BE-NEXT:std r30, -16(r1)
+; CHECK-BE-NEXT:std r0, 16(r1)
+; CHECK-BE-NEXT:clrldi r0, r1, 59
+; CHECK-BE-NEXT:mr r30, r1
+; CHECK-BE-NEXT:subfic r0, r0, -192
+; CHECK-BE-NEXT:stdux r1, r1, r0
+; CHECK-BE-NEXT:lis r3, -8192
+; CHECK-BE-NEXT:li r4, 5
+; CHECK-BE-NEXT:lis r5, -16384
+; CHECK-BE-NEXT:lis r6, -32768
+; CHECK-BE-NEXT:ori r3, r3, 1
+; CHECK-BE-NEXT:rldic r4, r4, 32, 29
+; CHECK-BE-NEXT:ori r5, r5, 1
+; CHECK-BE-NEXT:ori r6, r6, 1
+; CHECK-BE-NEXT:rldic r3, r3, 3, 29
+; CHECK-BE-NEXT:ori r4, r4, 6
+; CHECK-BE-NEXT:rldic r5, r5, 2, 30
+; CHECK-BE-NEXT:rldic r6, r6, 1, 31
+; CHECK-BE-NEXT:std r3, 152(r1)
+; CHECK-BE-NEXT:addi r3, r1, 128
+; CHECK-BE-NEXT:std r4, 144(r1)
+; CHECK-BE-NEXT:std r5, 136(r1)
+; CHECK-BE-NEXT:std r6, 128(r1)
+; CHECK-BE-NEXT:bl test
+; CHECK-BE-NEXT:nop
+; CHECK-BE-NEXT:lwa r3, 128(r1)
+; CHECK-BE-NEXT:mr r1, r30
+; CHECK-BE-NEXT:ld r0, 16(r1)
+; CHECK-BE-NEXT:ld r30, -16(r1)
+; CHECK-BE-NEXT:mtlr r0
+; CHECK-BE-NEXT:blr
+entry:
+  %a = alloca <8 x i32>, align 32
+  %0 = bitcast <8 x i32>* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %0)
+  store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %a, align 32
+  call void @test(<8 x i32>* %a)
+  %1 = load <8 x i32>, <8 x i32>* %a, align 32
+  %vecext = extractelement <8 x i32> %1, i32 0
+  %2 = bitcast <8 x i32>* %a to i8*
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %2)
+  ret i32 %vecext
+}
+
+define dso_local signext i32 @test_32byte_aligned_vector() nounwind {
+; CHECK-LE-LABEL: test_32byte_aligned_vector:
+; CHECK-LE:   # %bb.0: # %entry
+; CHECK-LE-NEXT:mflr r0
+; CHECK-LE-NEXT:std r30, -16(r1)
+; CHECK-LE-NEXT:mr r30, r1
+; CHECK-LE-NEXT:std r0, 16(r1)
+; CHECK-LE-NEXT:clrldi r0, r1, 59
+; CHECK-LE-NEXT:subfic r0, r0, -64
+; CHECK-LE-NEXT:stdux r1, r1, r0
+; CHECK-LE-NEXT:addis r3, r2, .LCPI1_0@toc@ha
+; CHECK-LE-NEXT:addi r3, r3, .LCPI1_0@toc@l
+; CHECK-LE-NEXT:lvx v2, 0, r3
+; CHECK-LE-NEXT:addi r3, r1, 32
+; CHECK-LE-NEXT:stvx v2, 0, r3
+; CHECK-LE-NEXT:bl test1
+; CHECK-LE-NEXT:nop
+; CHECK-LE-NEXT:lwa r3, 32(r1)
+; CHECK-LE-NEXT:mr r1, r30
+; CHECK-LE-NEXT:ld r0, 16(r1)
+; CHECK-LE-NEXT:ld r30, -16(r1)
+; CHECK-LE-NEXT:mtlr r0
+; CHECK-LE-NEXT:blr
+;
+; CHECK-BE-LABEL: 

[PATCH] D96265: [PowerPC] Change target data layout for 16-byte stack alignment

2021-02-12 Thread Ahsan Saghir via Phabricator via cfe-commits
saghir updated this revision to Diff 323307.
saghir added a comment.

Merged tests into one file.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96265/new/

https://reviews.llvm.org/D96265

Files:
  clang/lib/Basic/Targets/PPC.h
  clang/test/CodeGen/target-data.c
  lld/test/ELF/common-archive-lookup.s
  llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
  llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll

Index: llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll
===
--- /dev/null
+++ llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll
@@ -0,0 +1,106 @@
+; RUN: opt --passes=sroa,loop-vectorize,loop-unroll,instcombine -S \
+; RUN: -vectorizer-maximize-bandwidth --mtriple=powerpc64le-- < %s | \
+; RUN: FileCheck %s
+
+
+define dso_local signext i32 @test_32byte_vector() nounwind {
+; CHECK-LABEL: @test_32byte_vector(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = alloca <8 x i32>, align 32
+; CHECK: store <8 x i32> , <8 x i32>* [[TMP0:%.*]], align 32
+; CHECK: load <8 x i32>, <8 x i32>* [[TMP0:%.*]], align 32
+entry:
+  %a = alloca <8 x i32>, align 32
+  %0 = bitcast <8 x i32>* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %0)
+  store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %a, align 32
+  call void @test(<8 x i32>* %a)
+  %1 = load <8 x i32>, <8 x i32>* %a, align 32
+  %vecext = extractelement <8 x i32> %1, i32 0
+  %2 = bitcast <8 x i32>* %a to i8*
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %2)
+  ret i32 %vecext
+}
+
+define dso_local signext i32 @test_32byte_aligned_vector() nounwind {
+; CHECK-LABEL: @test_32byte_aligned_vector(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = alloca <4 x i32>, align 32
+; CHECK: store <4 x i32> , <4 x i32>* [[TMP0:%.*]], align 32
+
+entry:
+  %a = alloca <4 x i32>, align 32
+  %0 = bitcast <4 x i32>* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %0)
+  store <4 x i32> , <4 x i32>* %a, align 32
+  call void @test1(<4 x i32>* %a)
+  %1 = load <4 x i32>, <4 x i32>* %a, align 32
+  %vecext = extractelement <4 x i32> %1, i32 0
+  %2 = bitcast <4 x i32>* %a to i8*
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %2)
+  ret i32 %vecext
+}
+
+
+@Arr1 = dso_local global [64 x i8] zeroinitializer, align 1
+
+define dso_local void @test_Array() nounwind {
+; CHECK-LABEL: @test_Array(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %Arr2 = alloca [64 x i16], align 32
+; CHECK: store <16 x i16> [[TMP0:%.*]], <16 x i16>* [[TMP0:%.*]], align 32
+entry:
+  %Arr2 = alloca [64 x i16], align 2
+  %i = alloca i32, align 4
+  %0 = bitcast [64 x i16]* %Arr2 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 128, i8* %0)
+  %1 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %1)
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+  %2 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %2, 64
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond
+  %3 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %3)
+  br label %for.end
+
+for.body: ; preds = %for.cond
+  %4 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %4 to i64
+  %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* @Arr1, i64 0, i64 %idxprom
+  %5 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %5 to i16
+  %6 = load i32, i32* %i, align 4
+  %idxprom1 = sext i32 %6 to i64
+  %arrayidx2 = getelementptr inbounds [64 x i16], [64 x i16]* %Arr2, i64 0, i64 %idxprom1
+  store i16 %conv, i16* %arrayidx2, align 2
+  br label %for.inc
+
+for.inc:  ; preds = %for.body
+  %7 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:  ; preds = %for.cond.cleanup
+  %arraydecay = getelementptr inbounds [64 x i16], [64 x i16]* %Arr2, i64 0, i64 0
+  call void @test_arr(i16* %arraydecay)
+  %8 = bitcast [64 x i16]* %Arr2 to i8*
+  call void @llvm.lifetime.end.p0i8(i64 128, i8* %8)
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) nounwind
+
+declare void @test(<8 x i32>*) nounwind
+declare void @test1(<4 x i32>*) nounwind
+declare void @test_arr(i16*)
+
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) nounwind
+
+
Index: llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
===
--- llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -157,9 +157,8 @@
   // Specify the vector alignment explicitly. For v256i1 and v512i1, the
   // calculated alignment would be 256*alignment(i1) and 512*alignment(i1),
   // which is 256 and 512 bytes - way over aligned.
-  if ((T.getArch() == 

[PATCH] D96265: [PowerPC] Change target data layout for 16-byte stack alignment

2021-02-11 Thread Nemanja Ivanovic via Phabricator via cfe-commits
nemanjai requested changes to this revision.
nemanjai added a comment.
This revision now requires changes to proceed.

Can you please merge the tests into one file? There is no compelling reason to
split them up, and the split makes it more difficult to review and make sense
of what is going on. The test case should have:

1. A function that allocates a 32-byte vector (i.e. defined with 
`__attribute__((vector_size(32)))`)
2. A function that allocates a 32-byte aligned vector (i.e. defined with 
`__attribute__((aligned(32)))`)
3. A function that allocates an array that wouldn't be aligned to something in 
excess of stack alignment but gets over-aligned due to vectorization

You can achieve 3. above with something like this:

  $ cat t.c
  char Arr1[64];
  void test(short *);
  void cpy() {
    short Arr2[64];
    for (int i = 0; i < 64; i++)
      Arr2[i] = Arr1[i];
    test(Arr2);
  }
  
  clang -O3 -S t.c -emit-llvm -Xclang -disable-llvm-passes

Then the test case should run something like the following:
`opt --passes=sroa,loop-vectorize,loop-unroll,instcombine t.ll -S -o t.opt.ll 
-vectorizer-maximize-bandwidth --mtriple=powerpc64le--`
And check the alignment of the `alloca` and vector store. They should be 16 and 
not 32 even though the vector is 32 bytes wide.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D96265/new/

https://reviews.llvm.org/D96265

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D96265: [PowerPC] Change target data layout for 16-byte stack alignment

2021-02-08 Thread Ahsan Saghir via Phabricator via cfe-commits
saghir created this revision.
Herald added subscribers: shchenz, kbarton, hiraditya, nemanjai, emaste.
saghir requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, MaskRay.
Herald added projects: clang, LLVM.

This changes the target data layout to make the stack align to 16 bytes
on Power10. Before this change, the stack was being aligned to 32 bytes.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D96265

Files:
  clang/lib/Basic/Targets/PPC.h
  clang/test/CodeGen/target-data.c
  lld/test/ELF/common-archive-lookup.s
  llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
  llvm/test/CodeGen/PowerPC/32byte-aligned-vector-stack-alignment.ll
  llvm/test/CodeGen/PowerPC/32byte-array-stack-alignment.ll
  llvm/test/CodeGen/PowerPC/32byte-vector-stack-alignment.ll
  llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll

Index: llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll
===
--- /dev/null
+++ llvm/test/CodeGen/PowerPC/P10-stack-alignment.ll
@@ -0,0 +1,316 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s --check-prefix=CHECK-BE
+
+@.str = private unnamed_addr constant [7 x i8] c"%d %d\0A\00", align 1
+
+define dso_local signext i32 @test() local_unnamed_addr nounwind {
+; CHECK-LABEL: test:
+; CHECK:   # %bb.0: # %entry
+; CHECK-NEXT:mflr r0
+; CHECK-NEXT:std r30, -16(r1)
+; CHECK-NEXT:mr r30, r1
+; CHECK-NEXT:std r0, 16(r1)
+; CHECK-NEXT:clrldi r0, r1, 59
+; CHECK-NEXT:subfic r0, r0, -448
+; CHECK-NEXT:stdux r1, r1, r0
+; CHECK-NEXT:addi r3, r1, 96
+; CHECK-NEXT:li r4, 0
+; CHECK-NEXT:li r5, 200
+; CHECK-NEXT:bl memset@notoc
+; CHECK-NEXT:plxv v2, .LCPI0_1@PCREL(0), 1
+; CHECK-NEXT:plxv v5, .LCPI0_0@PCREL(0), 1
+; CHECK-NEXT:pli r3, 1684234849
+; CHECK-NEXT:plxv v1, .LCPI0_3@PCREL(0), 1
+; CHECK-NEXT:plxv v0, .LCPI0_2@PCREL(0), 1
+; CHECK-NEXT:plxv v7, .LCPI0_5@PCREL(0), 1
+; CHECK-NEXT:plxv v6, .LCPI0_4@PCREL(0), 1
+; CHECK-NEXT:stw r3, 416(r1)
+; CHECK-NEXT:vmrghb v4, v2, v2
+; CHECK-NEXT:vmrglb v3, v2, v2
+; CHECK-NEXT:stxv v2, 320(r1)
+; CHECK-NEXT:xxspltiw v2, 524296
+; CHECK-NEXT:stxv v5, 336(r1)
+; CHECK-NEXT:stxv v0, 368(r1)
+; CHECK-NEXT:stxv v1, 352(r1)
+; CHECK-NEXT:stxv v6, 400(r1)
+; CHECK-NEXT:stxv v7, 384(r1)
+; CHECK-NEXT:vslh v4, v4, v2
+; CHECK-NEXT:vslh v3, v3, v2
+; CHECK-NEXT:vsrh v4, v4, v2
+; CHECK-NEXT:vsrh v3, v3, v2
+; CHECK-NEXT:stxv v4, 112(r1)
+; CHECK-NEXT:vmrglb v4, v5, v5
+; CHECK-NEXT:stxv v3, 96(r1)
+; CHECK-NEXT:vmrglb v3, v1, v1
+; CHECK-NEXT:li r3, 97
+; CHECK-NEXT:vmrghb v5, v5, v5
+; CHECK-NEXT:lha r5, 96(r1)
+; CHECK-NEXT:sth r3, 288(r1)
+; CHECK-NEXT:pli r3, 6488162
+; CHECK-NEXT:vslh v4, v4, v2
+; CHECK-NEXT:vslh v3, v3, v2
+; CHECK-NEXT:stw r3, 290(r1)
+; CHECK-NEXT:li r3, 100
+; CHECK-NEXT:vslh v5, v5, v2
+; CHECK-NEXT:vsrh v4, v4, v2
+; CHECK-NEXT:vsrh v3, v3, v2
+; CHECK-NEXT:sth r3, 294(r1)
+; CHECK-NEXT:lbz r3, 320(r1)
+; CHECK-NEXT:vsrh v5, v5, v2
+; CHECK-NEXT:stxv v4, 128(r1)
+; CHECK-NEXT:vmrghb v4, v1, v1
+; CHECK-NEXT:stxv v3, 160(r1)
+; CHECK-NEXT:vmrglb v3, v0, v0
+; CHECK-NEXT:stxv v5, 144(r1)
+; CHECK-NEXT:extsb r4, r3
+; CHECK-NEXT:paddi r3, 0, .L.str@PCREL, 1
+; CHECK-NEXT:vslh v4, v4, v2
+; CHECK-NEXT:vslh v3, v3, v2
+; CHECK-NEXT:vsrh v4, v4, v2
+; CHECK-NEXT:vsrh v3, v3, v2
+; CHECK-NEXT:stxv v4, 176(r1)
+; CHECK-NEXT:vmrghb v4, v0, v0
+; CHECK-NEXT:stxv v3, 192(r1)
+; CHECK-NEXT:vslh v4, v4, v2
+; CHECK-NEXT:vsrh v4, v4, v2
+; CHECK-NEXT:stxv v4, 208(r1)
+; CHECK-NEXT:vmrglb v4, v7, v7
+; CHECK-NEXT:vslh v4, v4, v2
+; CHECK-NEXT:vsrh v3, v4, v2
+; CHECK-NEXT:vmrghb v4, v7, v7
+; CHECK-NEXT:stxv v3, 224(r1)
+; CHECK-NEXT:vmrglb v3, v6, v6
+; CHECK-NEXT:vslh v4, v4, v2
+; CHECK-NEXT:vsrh v4, v4, v2
+; CHECK-NEXT:vslh v3, v3, v2
+; CHECK-NEXT:stxv v4, 240(r1)
+; CHECK-NEXT:vmrghb v4, v6, v6
+; CHECK-NEXT:vsrh v3, v3, v2
+; CHECK-NEXT:stxv v3, 256(r1)
+; CHECK-NEXT:vslh v4, v4, v2
+; CHECK-NEXT:vsrh v2, v4, v2
+; CHECK-NEXT:stxv v2, 272(r1)
+; CHECK-NEXT:bl printf@notoc
+; CHECK-NEXT:li r3, 0
+; CHECK-NEXT:mr r1, r30
+; CHECK-NEXT:ld r0, 16(r1)
+; CHECK-NEXT:ld r30, -16(r1)
+; CHECK-NEXT:mtlr r0
+; CHECK-NEXT:blr
+;
+; CHECK-BE-LABEL: test:
+; CHECK-BE:   # %bb.0: # %entry
+; CHECK-BE-NEXT:mflr r0
+; CHECK-BE-NEXT: