[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-23 Thread Reid Kleckner via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG2af74e27ed7d: [MS] Overhaul how clang passes overaligned 
args on x86_32 (authored by rnk).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72114/new/

https://reviews.llvm.org/D72114

Files:
  clang/include/clang/CodeGen/CGFunctionInfo.h
  clang/lib/CodeGen/CGCall.cpp
  clang/lib/CodeGen/TargetInfo.cpp
  clang/test/CodeGen/x86_32-arguments-win32.c
  clang/test/CodeGenCXX/inalloca-overaligned.cpp
  clang/test/CodeGenCXX/inalloca-vector.cpp

Index: clang/test/CodeGenCXX/inalloca-vector.cpp
===
--- /dev/null
+++ clang/test/CodeGenCXX/inalloca-vector.cpp
@@ -0,0 +1,79 @@
+// RUN: %clang_cc1 -w -triple i686-pc-win32 -emit-llvm -o - %s | FileCheck %s
+
+// PR44395
+// MSVC passes up to three vectors in registers, and the rest indirectly. Check
+// that both are compatible with an inalloca prototype.
+
+struct NonTrivial {
+  NonTrivial();
+  NonTrivial(const NonTrivial &o);
+  unsigned handle;
+};
+
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+__m128 gv128;
+
+// nt, w, and q will be in the inalloca pack.
+void receive_vec_128(NonTrivial nt, __m128 x, __m128 y, __m128 z, __m128 w, __m128 q) {
+  gv128 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local void  @"?receive_vec_128@@YAXUNonTrivial@@T__m128@@@Z"
+// CHECK-SAME: (<4 x float> inreg %x,
+// CHECK-SAME: <4 x float> inreg %y,
+// CHECK-SAME: <4 x float> inreg %z,
+// CHECK-SAME: <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* inalloca %0)
+
+void pass_vec_128() {
+  __m128 z = {0};
+  receive_vec_128(NonTrivial(), z, z, z, z, z);
+}
+// CHECK-LABEL: define dso_local void @"?pass_vec_128@@YAXXZ"()
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 0
+// CHECK: call x86_thiscallcc %struct.NonTrivial* @"??0NonTrivial@@QAE@XZ"(%struct.NonTrivial* %{{.*}})
+
+// Store q, store temp alloca.
+// CHECK: store <4 x float> %{{[^,]*}}, <4 x float>* %{{[^,]*}}, align 16
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 1
+// CHECK: store <4 x float>* %{{[^,]*}}, <4 x float>** %{{[^,]*}}, align 4
+
+// Store w, store temp alloca.
+// CHECK: store <4 x float> %{{[^,]*}}, <4 x float>* %{{[^,]*}}, align 16
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 2
+// CHECK: store <4 x float>* %{{[^,]*}}, <4 x float>** %{{[^,]*}}, align 4
+
+// CHECK: call void @"?receive_vec_128@@YAXUNonTrivial@@T__m128@@@Z"
+// CHECK-SAME: (<4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* inalloca %{{[^,]*}})
+
+// w will be passed indirectly by register, and q will be passed indirectly, but
+// the pointer will be in memory.
+void __fastcall fastcall_receive_vec(__m128 x, __m128 y, __m128 z, __m128 w, int edx, __m128 q, NonTrivial nt) {
+  gv128 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local x86_fastcallcc void @"?fastcall_receive_vec@@Y{{[^"]*}}"
+// CHECK-SAME: (<4 x float> inreg %x,
+// CHECK-SAME: <4 x float> inreg %y,
+// CHECK-SAME: <4 x float> inreg %z,
+// CHECK-SAME: <4 x float>* inreg %0,
+// CHECK-SAME: i32 inreg %edx,
+// CHECK-SAME: <{ <4 x float>*, %struct.NonTrivial }>* inalloca %1)
+
+
+void __vectorcall vectorcall_receive_vec(double xmm0, double xmm1, double xmm2,
+ __m128 x, __m128 y, __m128 z,
+ __m128 w, int edx, __m128 q, NonTrivial nt) {
+  gv128 = x + y + z + w + q;
+}
+// FIXME: Enable these checks, clang generates wrong IR.
+// CHECK-LABEL: define dso_local x86_vectorcallcc void @"?vectorcall_receive_vec@@Y{{[^"]*}}"
+// CHECKX-SAME: (double inreg %xmm0,
+// CHECKX-SAME: double inreg %xmm1,
+// CHECKX-SAME: double inreg %xmm2,
+// CHECKX-SAME: <4 x float> inreg %x,
+// CHECKX-SAME: <4 x float> inreg %y,
+// CHECKX-SAME: <4 x float> inreg %z,
+// CHECKX-SAME: <4 x float>* inreg %0,
+// CHECKX-SAME: i32 inreg %edx,
+// CHECKX-SAME: <{ <4 x float>*, %struct.NonTrivial }>* inalloca %1)
Index: clang/test/CodeGenCXX/inalloca-overaligned.cpp
===
--- /dev/null
+++ clang/test/CodeGenCXX/inalloca-overaligned.cpp
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -fms-extensions -w -triple i386-pc-win32 -emit-llvm -o - %s | FileCheck %s
+
+// PR44395
+// MSVC passes overaligned types indirectly since MSVC 2015. Make sure that
+// works with inalloca.
+
+// FIXME: Pass non-trivial *and* overaligned types indirectly. Right now the C++

[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-22 Thread Craig Topper via Phabricator via cfe-commits
craig.topper accepted this revision.
craig.topper added a comment.
This revision is now accepted and ready to land.

LGTM


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72114/new/

https://reviews.llvm.org/D72114



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-16 Thread John McCall via Phabricator via cfe-commits
rjmccall added inline comments.



Comment at: clang/include/clang/CodeGen/CGFunctionInfo.h:91
   bool InAllocaSRet : 1;// isInAlloca()
+  bool InAllocaIndirect : 1;// isInAlloca()
   bool IndirectByVal : 1;   // isIndirect()

rnk wrote:
> rjmccall wrote:
> > rnk wrote:
> > > rnk wrote:
> > > > rjmccall wrote:
> > > > > Would it be better to handle `inalloca` differently, maybe as a flag 
> > > > > rather than as a top-level kind?  I'm concerned about gradually 
> > > > > duplicating a significant amount of the expressivity of other kinds.
> > > > In the past, I've drafted more than one unfinished design for how we 
> > > > could remodel inalloca with tokens so that it can be per-argument 
> > > > instead of something that applies to all argument memory. 
> > > > Unfortunately, I never found the time to finish or implement one.
> > > > 
> > > > As I was working on this patch, I was thinking to myself that this 
> > > > could be the moment to implement one of those designs, but it would be 
> > > > pretty involved. Part of the issue is that, personally, I have very 
> > > > little interest in improving x86_32 code quality, so a big redesign 
> > > > wouldn't deliver much benefit. The benefits would all be code 
> > > > simplifications and maintenance cost reductions, which are nice, but 
> > > > seem to only get me through the prototype design stage.
> > > > 
> > > > I'll go dig up my last doc and try to share it, but for now, I think we 
> > > > have to suffer the extra inalloca code in this patch.
> > > Here's what I wrote, with some sketches of possible LLVM IR that could 
> > > replace inalloca:
> > > https://reviews.llvm.org/P8183
> > > 
> > > The basic idea is that we need a call setup instruction that forms a 
> > > region with the call. During CodeGen, we can look forward (potentially 
> > > across BBs) to determine how much argument stack memory to allocate, 
> > > allocate it (perhaps in pieces as we go along), and then skip the normal 
> > > call stack argument adjustment during regular call lowering.
> > > 
> > > Suggestions for names better than "argmem" are welcome.
> > > 
> > > The major complication comes from edges that exit the call setup region. 
> > > These could be exceptional exits or normal exits with statement 
> > > expressions and break, return, or goto. Along these paths we need to 
> > > adjust SP, or risk leaking stack memory. Today, with inalloca, I think we 
> > > leak stack memory.
> > > In the past, I've drafted more than one unfinished design for how we 
> > > could remodel inalloca with tokens so that it can be per-argument instead 
> > > of something that applies to all argument memory. Unfortunately, I never 
> > > found the time to finish or implement one.
> > 
> > Sorry!  I think it would be great to rethink `inalloca` to avoid the 
> > duplication and so on, but I certainly didn't mean to suggest that we 
> > should do that as part of this patch.  (I'll look at your proposal later.)  
> > I was trying to ask if it would make sense to change how `inalloca` 
> > arguments are represented by `ABIInfo`, so that we could e.g. build a 
> > normal indirect `ABIInfo` and then flag that it also needs to be written 
> > into an `inalloca` buffer.
> I see. I think implementing that would require a greater refactoring of 
> ClangToLLVMArgMapping. We'd need a new place to store the inalloca struct 
> field index. That is currently put in a union with some other stuff, which 
> would no longer work.
Okay.  Then in the short term, I guess this is fine, and the long-term 
improvement is to change the `inalloca` design.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72114/new/

https://reviews.llvm.org/D72114



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-14 Thread pre-merge checks [bot] via Phabricator via cfe-commits
merge_guards_bot added a comment.

{icon times-circle color=red} Unit tests: fail. 61871 tests passed, 1 failed 
and 781 were skipped.

  failed: Clang.PCH/ms-pch-macro.c

{icon question-circle color=gray} clang-tidy: unknown.

{icon times-circle color=red} clang-format: fail. Please format your changes 
with clang-format by running `git-clang-format HEAD^` or applying this patch 
.

Build artifacts 
: 
diff.json 
,
 clang-format.patch 
,
 CMakeCache.txt 
,
 console-log.txt 
,
 test-results.xml 



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72114/new/

https://reviews.llvm.org/D72114



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-14 Thread Reid Kleckner via Phabricator via cfe-commits
rnk updated this revision to Diff 238152.
rnk marked 2 inline comments as done.
rnk added a comment.

- rebase


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72114/new/

https://reviews.llvm.org/D72114

Files:
  clang/include/clang/CodeGen/CGFunctionInfo.h
  clang/lib/CodeGen/CGCall.cpp
  clang/lib/CodeGen/TargetInfo.cpp
  clang/test/CodeGen/x86_32-arguments-win32.c
  clang/test/CodeGenCXX/inalloca-overaligned.cpp
  clang/test/CodeGenCXX/inalloca-vector.cpp

Index: clang/test/CodeGenCXX/inalloca-vector.cpp
===
--- /dev/null
+++ clang/test/CodeGenCXX/inalloca-vector.cpp
@@ -0,0 +1,79 @@
+// RUN: %clang_cc1 -w -triple i686-pc-win32 -emit-llvm -o - %s | FileCheck %s
+
+// PR44395
+// MSVC passes up to three vectors in registers, and the rest indirectly. Check
+// that both are compatible with an inalloca prototype.
+
+struct NonTrivial {
+  NonTrivial();
+  NonTrivial(const NonTrivial &o);
+  unsigned handle;
+};
+
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+__m128 gv128;
+
+// nt, w, and q will be in the inalloca pack.
+void receive_vec_128(NonTrivial nt, __m128 x, __m128 y, __m128 z, __m128 w, __m128 q) {
+  gv128 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local void  @"?receive_vec_128@@YAXUNonTrivial@@T__m128@@@Z"
+// CHECK-SAME: (<4 x float> inreg %x,
+// CHECK-SAME: <4 x float> inreg %y,
+// CHECK-SAME: <4 x float> inreg %z,
+// CHECK-SAME: <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* inalloca %0)
+
+void pass_vec_128() {
+  __m128 z = {0};
+  receive_vec_128(NonTrivial(), z, z, z, z, z);
+}
+// CHECK-LABEL: define dso_local void @"?pass_vec_128@@YAXXZ"()
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 0
+// CHECK: call x86_thiscallcc %struct.NonTrivial* @"??0NonTrivial@@QAE@XZ"(%struct.NonTrivial* %{{.*}})
+
+// Store q, store temp alloca.
+// CHECK: store <4 x float> %{{[^,]*}}, <4 x float>* %{{[^,]*}}, align 16
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 1
+// CHECK: store <4 x float>* %{{[^,]*}}, <4 x float>** %{{[^,]*}}, align 4
+
+// Store w, store temp alloca.
+// CHECK: store <4 x float> %{{[^,]*}}, <4 x float>* %{{[^,]*}}, align 16
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 2
+// CHECK: store <4 x float>* %{{[^,]*}}, <4 x float>** %{{[^,]*}}, align 4
+
+// CHECK: call void @"?receive_vec_128@@YAXUNonTrivial@@T__m128@@@Z"
+// CHECK-SAME: (<4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* inalloca %{{[^,]*}})
+
+// w will be passed indirectly by register, and q will be passed indirectly, but
+// the pointer will be in memory.
+void __fastcall fastcall_receive_vec(__m128 x, __m128 y, __m128 z, __m128 w, int edx, __m128 q, NonTrivial nt) {
+  gv128 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local x86_fastcallcc void @"?fastcall_receive_vec@@Y{{[^"]*}}"
+// CHECK-SAME: (<4 x float> inreg %x,
+// CHECK-SAME: <4 x float> inreg %y,
+// CHECK-SAME: <4 x float> inreg %z,
+// CHECK-SAME: <4 x float>* inreg %0,
+// CHECK-SAME: i32 inreg %edx,
+// CHECK-SAME: <{ <4 x float>*, %struct.NonTrivial }>* inalloca %1)
+
+
+void __vectorcall vectorcall_receive_vec(double xmm0, double xmm1, double xmm2,
+ __m128 x, __m128 y, __m128 z,
+ __m128 w, int edx, __m128 q, NonTrivial nt) {
+  gv128 = x + y + z + w + q;
+}
+// FIXME: Enable these checks, clang generates wrong IR.
+// CHECK-LABEL: define dso_local x86_vectorcallcc void @"?vectorcall_receive_vec@@Y{{[^"]*}}"
+// CHECKX-SAME: (double inreg %xmm0,
+// CHECKX-SAME: double inreg %xmm1,
+// CHECKX-SAME: double inreg %xmm2,
+// CHECKX-SAME: <4 x float> inreg %x,
+// CHECKX-SAME: <4 x float> inreg %y,
+// CHECKX-SAME: <4 x float> inreg %z,
+// CHECKX-SAME: <4 x float>* inreg %0,
+// CHECKX-SAME: i32 inreg %edx,
+// CHECKX-SAME: <{ <4 x float>*, %struct.NonTrivial }>* inalloca %1)
Index: clang/test/CodeGenCXX/inalloca-overaligned.cpp
===
--- /dev/null
+++ clang/test/CodeGenCXX/inalloca-overaligned.cpp
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -fms-extensions -w -triple i386-pc-win32 -emit-llvm -o - %s | FileCheck %s
+
+// PR44395
+// MSVC passes overaligned types indirectly since MSVC 2015. Make sure that
+// works with inalloca.
+
+// FIXME: Pass non-trivial *and* overaligned types indirectly. Right now the C++
+// ABI rules say to use inalloca, and they take p

[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-14 Thread Reid Kleckner via Phabricator via cfe-commits
rnk added inline comments.



Comment at: clang/include/clang/CodeGen/CGFunctionInfo.h:91
   bool InAllocaSRet : 1;// isInAlloca()
+  bool InAllocaIndirect : 1;// isInAlloca()
   bool IndirectByVal : 1;   // isIndirect()

rjmccall wrote:
> rnk wrote:
> > rnk wrote:
> > > rjmccall wrote:
> > > > Would it be better to handle `inalloca` differently, maybe as a flag 
> > > > rather than as a top-level kind?  I'm concerned about gradually 
> > > > duplicating a significant amount of the expressivity of other kinds.
> > > In the past, I've drafted more than one unfinished design for how we 
> > > could remodel inalloca with tokens so that it can be per-argument instead 
> > > of something that applies to all argument memory. Unfortunately, I never 
> > > found the time to finish or implement one.
> > > 
> > > As I was working on this patch, I was thinking to myself that this could 
> > > be the moment to implement one of those designs, but it would be pretty 
> > > involved. Part of the issue is that, personally, I have very little 
> > > interest in improving x86_32 code quality, so a big redesign wouldn't 
> > > deliver much benefit. The benefits would all be code simplifications and 
> > > maintenance cost reductions, which are nice, but seem to only get me 
> > > through the prototype design stage.
> > > 
> > > I'll go dig up my last doc and try to share it, but for now, I think we 
> > > have to suffer the extra inalloca code in this patch.
> > Here's what I wrote, with some sketches of possible LLVM IR that could 
> > replace inalloca:
> > https://reviews.llvm.org/P8183
> > 
> > The basic idea is that we need a call setup instruction that forms a region 
> > with the call. During CodeGen, we can look forward (potentially across BBs) 
> > to determine how much argument stack memory to allocate, allocate it 
> > (perhaps in pieces as we go along), and then skip the normal call stack 
> > argument adjustment during regular call lowering.
> > 
> > Suggestions for names better than "argmem" are welcome.
> > 
> > The major complication comes from edges that exit the call setup region. 
> > These could be exceptional exits or normal exits with statement expressions 
> > and break, return, or goto. Along these paths we need to adjust SP, or risk 
> > leaking stack memory. Today, with inalloca, I think we leak stack memory.
> > In the past, I've drafted more than one unfinished design for how we 
> > could remodel inalloca with tokens so that it can be per-argument instead 
> > of something that applies to all argument memory. Unfortunately, I never 
> > found the time to finish or implement one.
> 
> Sorry!  I think it would be great to rethink `inalloca` to avoid the 
> duplication and so on, but I certainly didn't mean to suggest that we should 
> do that as part of this patch.  (I'll look at your proposal later.)  I was 
> trying to ask if it would make sense to change how `inalloca` arguments are 
> represented by `ABIInfo`, so that we could e.g. build a normal indirect 
> `ABIInfo` and then flag that it also needs to be written into an `inalloca` 
> buffer.
I see. I think implementing that would require a greater refactoring of 
ClangToLLVMArgMapping. We'd need a new place to store the inalloca struct field 
index. That is currently put in a union with some other stuff, which would no 
longer work.



Comment at: clang/test/CodeGen/x86_32-arguments-win32.c:77
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, 
<8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, 
<16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* 
%1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 
x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)

AntonYudintsev wrote:
> rjmccall wrote:
> > AntonYudintsev wrote:
> > > rjmccall wrote:
> > > > AntonYudintsev wrote:
> > > > > rjmccall wrote:
> > > > > > rnk wrote:
> > > > > > > rjmccall wrote:
> > > > > > > > rnk wrote:
> > > > > > > > > craig.topper wrote:
> > > > > > > > > > What happens in the backend with inreg if 512-bit vectors 
> > > > > > > > > > aren't legal?
> > > > > > > > > LLVM splits the vector up using the largest legal vector 
> > > > > > > > > size. As many pieces as possible are passed in available 
> > > > > > > > > XMM/YMM registers, and the rest are passed in memory. MSVC, 
> > > > > > > > > of course, assumes the user wanted the larger vector size, 
> > > > > > > > > and uses whatever vector instructions it needs to move the 
> > > > > > > > > arguments around.
> > > > > > > > > 
> > > > > > > > > Previously, I advocated for a model where calling an Intel 
> > > > > > > > > intrinsic function had the effect of implicitly marking the 
> > > > > > > > > caller with the target attributes of the intr

[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-08 Thread Anton Yudintsev via Phabricator via cfe-commits
AntonYudintsev added inline comments.



Comment at: clang/test/CodeGen/x86_32-arguments-win32.c:77
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, 
<8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, 
<16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* 
%1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 
x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)

rjmccall wrote:
> AntonYudintsev wrote:
> > rjmccall wrote:
> > > AntonYudintsev wrote:
> > > > rjmccall wrote:
> > > > > rnk wrote:
> > > > > > rjmccall wrote:
> > > > > > > rnk wrote:
> > > > > > > > craig.topper wrote:
> > > > > > > > > What happens in the backend with inreg if 512-bit vectors 
> > > > > > > > > aren't legal?
> > > > > > > > LLVM splits the vector up using the largest legal vector size. 
> > > > > > > > As many pieces as possible are passed in available XMM/YMM 
> > > > > > > > registers, and the rest are passed in memory. MSVC, of course, 
> > > > > > > > assumes the user wanted the larger vector size, and uses 
> > > > > > > > whatever vector instructions it needs to move the arguments 
> > > > > > > > around.
> > > > > > > > 
> > > > > > > > Previously, I advocated for a model where calling an Intel 
> > > > > > > > intrinsic function had the effect of implicitly marking the 
> > > > > > > > caller with the target attributes of the intrinsic. This falls 
> > > > > > > > down if the user tries to write a single function that 
> > > > > > > > conditionally branches between code that uses different 
> > > > > > > > instruction set extensions. You can imagine the SSE2 codepath 
> > > > > > > > accidentally using AVX instructions because the compiler thinks 
> > > > > > > > they are better. I'm told that ICC models CPU 
> > > > > > > > micro-architectural features in the CFG, but I don't ever 
> > > > > > > > expect that LLVM will do that. If we're stuck with per-function 
> > > > > > > > CPU feature settings, it seems like it would be nice to try to 
> > > > > > > > do what the user asked by default, and warn the user if we see 
> > > > > > > > them doing a cpuid check in a function that has been implicitly 
> > > > > > > > blessed with some target attributes. You could imagine doing a 
> > > > > > > > similar thing when large vector type variables are used: if a 
> > > > > > > > large vector argument or local is used, implicitly enable the 
> > > > > > > > appropriate target features to move vectors of that size around.
> > > > > > > > 
> > > > > > > > This idea didn't get anywhere, and the current situation has 
> > > > > > > > persisted.
> > > > > > > > 
> > > > > > > > ---
> > > > > > > > 
> > > > > > > > You know, maybe we should just keep clang the way it is, and 
> > > > > > > > just set up a warning in the backend that says "hey, I split 
> > > > > > > > your large vector. You probably didn't want that." And then we 
> > > > > > > > just continue doing what we do now. Nobody likes backend 
> > > > > > > > warnings, but it seems better than the current direction of the 
> > > > > > > > frontend knowing every detail of x86 vector extensions.
> > > > > > > If target attributes affect ABI, it seems really dangerous to 
> > > > > > > implicitly set attributes based on what intrinsics are called.
> > > > > > > 
> > > > > > > The local CPU-testing problem seems similar to the problems with 
> > > > > > > local `#pragma STDC FENV_ACCESS` blocks that the constrained-FP 
> > > > > > > people are looking into.  They both have a "this operation is 
> > > > > > > normally fully optimizable, but we might need to be more careful 
> > > > > > > in specific functions" aspect to them.  I wonder if there's a 
> > > > > > > reasonable way to unify the approaches, or at least benefit from 
> > > > > > > lessons learned.
> > > > > > I agree, we wouldn't want intrinsic usage to change ABI. But, does 
> > > > > > anybody actually want the behavior that LLVM implements today where 
> > > > > > large vectors get split across registers and memory? I think most 
> > > > > > users who pass a 512 bit vector want it to either be passed in ZMM 
> > > > > > registers or fail to compile. Why do we even support passing 1024 
> > > > > > bit vectors? Could we make that an error?
> > > > > > 
> > > > > > Anyway, major redesigns aside, should clang do something when large 
> > > > > > vectors are passed? Maybe we should warn here? Passing by address 
> > > > > > is usually the safest way to pass something, so that's an option. 
> > > > > > Implementing splitting logic in clang doesn't seem worth it.
> > > > > > I agree, we wouldn't want intrinsic usage to change ABI. But, does 
> > > > > > anybody actually want the behavior that LLVM implements today where 
> > > > > > large vectors get split across registers and memory?
> > > 

[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-07 Thread John McCall via Phabricator via cfe-commits
rjmccall added inline comments.



Comment at: clang/test/CodeGen/x86_32-arguments-win32.c:77
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, 
<8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, 
<16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* 
%1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 
x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)

AntonYudintsev wrote:
> rjmccall wrote:
> > AntonYudintsev wrote:
> > > rjmccall wrote:
> > > > rnk wrote:
> > > > > rjmccall wrote:
> > > > > > rnk wrote:
> > > > > > > craig.topper wrote:
> > > > > > > > What happens in the backend with inreg if 512-bit vectors 
> > > > > > > > aren't legal?
> > > > > > > LLVM splits the vector up using the largest legal vector size. As 
> > > > > > > many pieces as possible are passed in available XMM/YMM 
> > > > > > > registers, and the rest are passed in memory. MSVC, of course, 
> > > > > > > assumes the user wanted the larger vector size, and uses whatever 
> > > > > > > vector instructions it needs to move the arguments around.
> > > > > > > 
> > > > > > > Previously, I advocated for a model where calling an Intel 
> > > > > > > intrinsic function had the effect of implicitly marking the 
> > > > > > > caller with the target attributes of the intrinsic. This falls 
> > > > > > > down if the user tries to write a single function that 
> > > > > > > conditionally branches between code that uses different 
> > > > > > > instruction set extensions. You can imagine the SSE2 codepath 
> > > > > > > accidentally using AVX instructions because the compiler thinks 
> > > > > > > they are better. I'm told that ICC models CPU micro-architectural 
> > > > > > > features in the CFG, but I don't ever expect that LLVM will do 
> > > > > > > that. If we're stuck with per-function CPU feature settings, it 
> > > > > > > seems like it would be nice to try to do what the user asked by 
> > > > > > > default, and warn the user if we see them doing a cpuid check in 
> > > > > > > a function that has been implicitly blessed with some target 
> > > > > > > attributes. You could imagine doing a similar thing when large 
> > > > > > > vector type variables are used: if a large vector argument or 
> > > > > > > local is used, implicitly enable the appropriate target features 
> > > > > > > to move vectors of that size around.
> > > > > > > 
> > > > > > > This idea didn't get anywhere, and the current situation has 
> > > > > > > persisted.
> > > > > > > 
> > > > > > > ---
> > > > > > > 
> > > > > > > You know, maybe we should just keep clang the way it is, and just 
> > > > > > > set up a warning in the backend that says "hey, I split your 
> > > > > > > large vector. You probably didn't want that." And then we just 
> > > > > > > continue doing what we do now. Nobody likes backend warnings, but 
> > > > > > > it seems better than the current direction of the frontend 
> > > > > > > knowing every detail of x86 vector extensions.
> > > > > > If target attributes affect ABI, it seems really dangerous to 
> > > > > > implicitly set attributes based on what intrinsics are called.
> > > > > > 
> > > > > > The local CPU-testing problem seems similar to the problems with 
> > > > > > local `#pragma STDC FENV_ACCESS` blocks that the constrained-FP 
> > > > > > people are looking into.  They both have a "this operation is 
> > > > > > normally fully optimizable, but we might need to be more careful in 
> > > > > > specific functions" aspect to them.  I wonder if there's a 
> > > > > > reasonable way to unify the approaches, or at least benefit from 
> > > > > > lessons learned.
> > > > > I agree, we wouldn't want intrinsic usage to change ABI. But, does 
> > > > > anybody actually want the behavior that LLVM implements today where 
> > > > > large vectors get split across registers and memory? I think most 
> > > > > users who pass a 512 bit vector want it to either be passed in ZMM 
> > > > > registers or fail to compile. Why do we even support passing 1024 bit 
> > > > > vectors? Could we make that an error?
> > > > > 
> > > > > Anyway, major redesigns aside, should clang do something when large 
> > > > > vectors are passed? Maybe we should warn here? Passing by address is 
> > > > > usually the safest way to pass something, so that's an option. 
> > > > > Implementing splitting logic in clang doesn't seem worth it.
> > > > > I agree, we wouldn't want intrinsic usage to change ABI. But, does 
> > > > > anybody actually want the behavior that LLVM implements today where 
> > > > > large vectors get split across registers and memory?
> > > > 
> > > > I take it you're implying that the actual (Windows-only?) platform ABI 
> > > > doesn't say anything about this because other compilers don't allow 
> > > > large vec

[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-07 Thread Anton Yudintsev via Phabricator via cfe-commits
AntonYudintsev added inline comments.



Comment at: clang/test/CodeGen/x86_32-arguments-win32.c:77
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, 
<8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, 
<16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* 
%1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 
x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)

rjmccall wrote:
> AntonYudintsev wrote:
> > rjmccall wrote:
> > > rnk wrote:
> > > > rjmccall wrote:
> > > > > rnk wrote:
> > > > > > craig.topper wrote:
> > > > > > > What happens in the backend with inreg if 512-bit vectors aren't 
> > > > > > > legal?
> > > > > > LLVM splits the vector up using the largest legal vector size. As 
> > > > > > many pieces as possible are passed in available XMM/YMM registers, 
> > > > > > and the rest are passed in memory. MSVC, of course, assumes the 
> > > > > > user wanted the larger vector size, and uses whatever vector 
> > > > > > instructions it needs to move the arguments around.
> > > > > > 
> > > > > > Previously, I advocated for a model where calling an Intel 
> > > > > > intrinsic function had the effect of implicitly marking the caller 
> > > > > > with the target attributes of the intrinsic. This falls down if the 
> > > > > > user tries to write a single function that conditionally branches 
> > > > > > between code that uses different instruction set extensions. You 
> > > > > > can imagine the SSE2 codepath accidentally using AVX instructions 
> > > > > > because the compiler thinks they are better. I'm told that ICC 
> > > > > > models CPU micro-architectural features in the CFG, but I don't 
> > > > > > ever expect that LLVM will do that. If we're stuck with 
> > > > > > per-function CPU feature settings, it seems like it would be nice 
> > > > > > to try to do what the user asked by default, and warn the user if 
> > > > > > we see them doing a cpuid check in a function that has been 
> > > > > > implicitly blessed with some target attributes. You could imagine 
> > > > > > doing a similar thing when large vector type variables are used: if 
> > > > > > a large vector argument or local is used, implicitly enable the 
> > > > > > appropriate target features to move vectors of that size around.
> > > > > > 
> > > > > > This idea didn't get anywhere, and the current situation has 
> > > > > > persisted.
> > > > > > 
> > > > > > ---
> > > > > > 
> > > > > > You know, maybe we should just keep clang the way it is, and just 
> > > > > > set up a warning in the backend that says "hey, I split your large 
> > > > > > vector. You probably didn't want that." And then we just continue 
> > > > > > doing what we do now. Nobody likes backend warnings, but it seems 
> > > > > > better than the current direction of the frontend knowing every 
> > > > > > detail of x86 vector extensions.
> > > > > If target attributes affect ABI, it seems really dangerous to 
> > > > > implicitly set attributes based on what intrinsics are called.
> > > > > 
> > > > > The local CPU-testing problem seems similar to the problems with 
> > > > > local `#pragma STDC FENV_ACCESS` blocks that the constrained-FP 
> > > > > people are looking into.  They both have a "this operation is 
> > > > > normally fully optimizable, but we might need to be more careful in 
> > > > > specific functions" aspect to them.  I wonder if there's a reasonable 
> > > > > way to unify the approaches, or at least benefit from lessons learned.
> > > > I agree, we wouldn't want intrinsic usage to change ABI. But, does 
> > > > anybody actually want the behavior that LLVM implements today where 
> > > > large vectors get split across registers and memory? I think most users 
> > > > who pass a 512 bit vector want it to either be passed in ZMM registers 
> > > > or fail to compile. Why do we even support passing 1024 bit vectors? 
> > > > Could we make that an error?
> > > > 
> > > > Anyway, major redesigns aside, should clang do something when large 
> > > > vectors are passed? Maybe we should warn here? Passing by address is 
> > > > usually the safest way to pass something, so that's an option. 
> > > > Implementing splitting logic in clang doesn't seem worth it.
> > > > I agree, we wouldn't want intrinsic usage to change ABI. But, does 
> > > > anybody actually want the behavior that LLVM implements today where 
> > > > large vectors get split across registers and memory?
> > > 
> > > I take it you're implying that the actual (Windows-only?) platform ABI 
> > > doesn't say anything about this because other compilers don't allow large 
> > > vectors.  How large are the vectors we do have ABI rules for?  Do they 
> > > have the same problem as the SysV ABI where the ABI rules are sensitive to 
> > > compiler flags?
> > > 
> > > Any

[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-07 Thread John McCall via Phabricator via cfe-commits
rjmccall added inline comments.



Comment at: clang/test/CodeGen/x86_32-arguments-win32.c:77
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, 
<8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, 
<16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* 
%1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 
x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)

AntonYudintsev wrote:
> rjmccall wrote:
> > rnk wrote:
> > > rjmccall wrote:
> > > > rnk wrote:
> > > > > craig.topper wrote:
> > > > > > What happens in the backend with inreg if 512-bit vectors aren't 
> > > > > > legal?
> > > > > LLVM splits the vector up using the largest legal vector size. As 
> > > > > many pieces as possible are passed in available XMM/YMM registers, 
> > > > > and the rest are passed in memory. MSVC, of course, assumes the user 
> > > > > wanted the larger vector size, and uses whatever vector instructions 
> > > > > it needs to move the arguments around.
> > > > > 
> > > > > Previously, I advocated for a model where calling an Intel intrinsic 
> > > > > function had the effect of implicitly marking the caller with the 
> > > > > target attributes of the intrinsic. This falls down if the user tries 
> > > > > to write a single function that conditionally branches between code 
> > > > > that uses different instruction set extensions. You can imagine the 
> > > > > SSE2 codepath accidentally using AVX instructions because the 
> > > > > compiler thinks they are better. I'm told that ICC models CPU 
> > > > > micro-architectural features in the CFG, but I don't ever expect that 
> > > > > LLVM will do that. If we're stuck with per-function CPU feature 
> > > > > settings, it seems like it would be nice to try to do what the user 
> > > > > asked by default, and warn the user if we see them doing a cpuid 
> > > > > check in a function that has been implicitly blessed with some target 
> > > > > attributes. You could imagine doing a similar thing when large vector 
> > > > > type variables are used: if a large vector argument or local is used, 
> > > > > implicitly enable the appropriate target features to move vectors of 
> > > > > that size around.
> > > > > 
> > > > > This idea didn't get anywhere, and the current situation has 
> > > > > persisted.
> > > > > 
> > > > > ---
> > > > > 
> > > > > You know, maybe we should just keep clang the way it is, and just set 
> > > > > up a warning in the backend that says "hey, I split your large 
> > > > > vector. You probably didn't want that." And then we just continue 
> > > > > doing what we do now. Nobody likes backend warnings, but it seems 
> > > > > better than the current direction of the frontend knowing every 
> > > > > detail of x86 vector extensions.
> > > > If target attributes affect ABI, it seems really dangerous to 
> > > > implicitly set attributes based on what intrinsics are called.
> > > > 
> > > > The local CPU-testing problem seems similar to the problems with local 
> > > > `#pragma STDC FENV_ACCESS` blocks that the constrained-FP people are 
> > > > looking into.  They both have a "this operation is normally fully 
> > > > optimizable, but we might need to be more careful in specific 
> > > > functions" aspect to them.  I wonder if there's a reasonable way to 
> > > > unify the approaches, or at least benefit from lessons learned.
> > > I agree, we wouldn't want intrinsic usage to change ABI. But, does 
> > > anybody actually want the behavior that LLVM implements today where large 
> > > vectors get split across registers and memory? I think most users who 
> > > pass a 512 bit vector want it to either be passed in ZMM registers or 
> > > fail to compile. Why do we even support passing 1024 bit vectors? Could 
> > > we make that an error?
> > > 
> > > Anyway, major redesigns aside, should clang do something when large 
> > > vectors are passed? Maybe we should warn here? Passing by address is 
> > > usually the safest way to pass something, so that's an option. 
> > > Implementing splitting logic in clang doesn't seem worth it.
> > > I agree, we wouldn't want intrinsic usage to change ABI. But, does 
> > > anybody actually want the behavior that LLVM implements today where large 
> > > vectors get split across registers and memory?
> > 
> > I take it you're implying that the actual (Windows-only?) platform ABI 
> > doesn't say anything about this because other compilers don't allow large 
> > vectors.  How large are the vectors we do have ABI rules for?  Do they have 
> > the same problem as the SysV ABI where the ABI rules are sensitive to compiler 
> > flags?
> > 
> > Anyway, I didn't realize the i386 Windows ABI *ever* used registers for 
> > arguments.  (Whether you can convince LLVM to do so  for a function 
> > signature that Clang 

[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-06 Thread Anton Yudintsev via Phabricator via cfe-commits
AntonYudintsev added inline comments.



Comment at: clang/test/CodeGen/x86_32-arguments-win32.c:77
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, 
<8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, 
<16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* 
%1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 
x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)

rjmccall wrote:
> rnk wrote:
> > rjmccall wrote:
> > > rnk wrote:
> > > > craig.topper wrote:
> > > > > What happens in the backend with inreg if 512-bit vectors aren't 
> > > > > legal?
> > > > LLVM splits the vector up using the largest legal vector size. As many 
> > > > pieces as possible are passed in available XMM/YMM registers, and the 
> > > > rest are passed in memory. MSVC, of course, assumes the user wanted the 
> > > > larger vector size, and uses whatever vector instructions it needs to 
> > > > move the arguments around.
> > > > 
> > > > Previously, I advocated for a model where calling an Intel intrinsic 
> > > > function had the effect of implicitly marking the caller with the 
> > > > target attributes of the intrinsic. This falls down if the user tries 
> > > > to write a single function that conditionally branches between code 
> > > > that uses different instruction set extensions. You can imagine the 
> > > > SSE2 codepath accidentally using AVX instructions because the compiler 
> > > > thinks they are better. I'm told that ICC models CPU 
> > > > micro-architectural features in the CFG, but I don't ever expect that 
> > > > LLVM will do that. If we're stuck with per-function CPU feature 
> > > > settings, it seems like it would be nice to try to do what the user 
> > > > asked by default, and warn the user if we see them doing a cpuid check 
> > > > in a function that has been implicitly blessed with some target 
> > > > attributes. You could imagine doing a similar thing when large vector 
> > > > type variables are used: if a large vector argument or local is used, 
> > > > implicitly enable the appropriate target features to move vectors of 
> > > > that size around.
> > > > 
> > > > This idea didn't get anywhere, and the current situation has persisted.
> > > > 
> > > > ---
> > > > 
> > > > You know, maybe we should just keep clang the way it is, and just set 
> > > > up a warning in the backend that says "hey, I split your large vector. 
> > > > You probably didn't want that." And then we just continue doing what we 
> > > > do now. Nobody likes backend warnings, but it seems better than the 
> > > > current direction of the frontend knowing every detail of x86 vector 
> > > > extensions.
> > > If target attributes affect ABI, it seems really dangerous to implicitly 
> > > set attributes based on what intrinsics are called.
> > > 
> > > The local CPU-testing problem seems similar to the problems with local 
> > > `#pragma STDC FENV_ACCESS` blocks that the constrained-FP people are 
> > > looking into.  They both have a "this operation is normally fully 
> > > optimizable, but we might need to be more careful in specific functions" 
> > > aspect to them.  I wonder if there's a reasonable way to unify the 
> > > approaches, or at least benefit from lessons learned.
> > I agree, we wouldn't want intrinsic usage to change ABI. But, does anybody 
> > actually want the behavior that LLVM implements today where large vectors 
> > get split across registers and memory? I think most users who pass a 512 
> > bit vector want it to either be passed in ZMM registers or fail to compile. 
> > Why do we even support passing 1024 bit vectors? Could we make that an 
> > error?
> > 
> > Anyway, major redesigns aside, should clang do something when large vectors 
> > are passed? Maybe we should warn here? Passing by address is usually the 
> > safest way to pass something, so that's an option. Implementing splitting 
> > logic in clang doesn't seem worth it.
> > I agree, we wouldn't want intrinsic usage to change ABI. But, does anybody 
> > actually want the behavior that LLVM implements today where large vectors 
> > get split across registers and memory?
> 
> I take it you're implying that the actual (Windows-only?) platform ABI 
> doesn't say anything about this because other compilers don't allow large 
> vectors.  How large are the vectors we do have ABI rules for?  Do they have 
> the problem as the SysV ABI where the ABI rules are sensitive to compiler 
> flags?
> 
> Anyway, I didn't realize the i386 Windows ABI *ever* used registers for 
> arguments.  (Whether you can convince LLVM to do so  for a function signature 
> that Clang isn't supposed to emit for ABI-conforming functions is a separate 
> story.)   You're saying it uses them for vectors?  Presumably up to some 
> limit, and only when they're

[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-03 Thread John McCall via Phabricator via cfe-commits
rjmccall added inline comments.



Comment at: clang/include/clang/CodeGen/CGFunctionInfo.h:91
   bool InAllocaSRet : 1;// isInAlloca()
+  bool InAllocaIndirect : 1;// isInAlloca()
   bool IndirectByVal : 1;   // isIndirect()

rnk wrote:
> rnk wrote:
> > rjmccall wrote:
> > > Would it be better to handle `inalloca` differently, maybe as a flag 
> > > rather than as a top-level kind?  I'm concerned about gradually 
> > > duplicating a significant amount of the expressivity of other kinds.
> > In the past, I've drafted more than one unfinished design for how we 
> > could remodel inalloca with tokens so that it can be per-argument instead 
> > of something that applies to all argument memory. Unfortunately, I never 
> > found the time to finish or implement one.
> > 
> > As I was working on this patch, I was thinking to myself that this could be 
> > the moment to implement one of those designs, but it would be pretty 
> > involved. Part of the issue is that, personally, I have very little 
> > interest in improving x86_32 code quality, so a big redesign wouldn't 
> > deliver much benefit. The benefits would all be code simplifications and 
> > maintenance cost reductions, which are nice, but seem to only get me 
> > through the prototype design stage.
> > 
> > I'll go dig up my last doc and try to share it, but for now, I think we 
> > have to suffer the extra inalloca code in this patch.
> Here's what I wrote, with some sketches of possible LLVM IR that could 
> replace inalloca:
> https://reviews.llvm.org/P8183
> 
> The basic idea is that we need a call setup instruction that forms a region 
> with the call. During CodeGen, we can look forward (potentially across BBs) 
> to determine how much argument stack memory to allocate, allocate it (perhaps 
> in pieces as we go along), and then skip the normal call stack argument 
> adjustment during regular call lowering.
> 
> Suggestions for names better than "argmem" are welcome.
> 
> The major complication comes from edges that exit the call setup region. 
> These could be exceptional exits or normal exits with statement expressions 
> and break, return, or goto. Along these paths we need to adjust SP, or risk 
> leaking stack memory. Today, with inalloca, I think we leak stack memory.
> In the past, I've drafted more than one unfinished design for how we could 
> remodel inalloca with tokens so that it can be per-argument instead of 
> something that applies to all argument memory. Unfortunately, I never found 
> the time to finish or implement one.

Sorry!  I think it would be great to rethink `inalloca` to avoid the 
duplication and so on, but I certainly didn't mean to suggest that we should do 
that as part of this patch.  (I'll look at your proposal later.)  I was trying 
to ask if it would make sense to change how `inalloca` arguments are 
represented by `ABIInfo`, so that we could e.g. build a normal indirect 
`ABIInfo` and then flag that it also needs to be written into an `inalloca` 
buffer.



Comment at: clang/test/CodeGen/x86_32-arguments-win32.c:77
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, 
<8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, 
<16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* 
%1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 
x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)

rnk wrote:
> rjmccall wrote:
> > rnk wrote:
> > > craig.topper wrote:
> > > > What happens in the backend with inreg if 512-bit vectors aren't legal?
> > > LLVM splits the vector up using the largest legal vector size. As many 
> > > pieces as possible are passed in available XMM/YMM registers, and the 
> > > rest are passed in memory. MSVC, of course, assumes the user wanted the 
> > > larger vector size, and uses whatever vector instructions it needs to 
> > > move the arguments around.
> > > 
> > > Previously, I advocated for a model where calling an Intel intrinsic 
> > > function had the effect of implicitly marking the caller with the target 
> > > attributes of the intrinsic. This falls down if the user tries to write a 
> > > single function that conditionally branches between code that uses 
> > > different instruction set extensions. You can imagine the SSE2 codepath 
> > > accidentally using AVX instructions because the compiler thinks they are 
> > > better. I'm told that ICC models CPU micro-architectural features in the 
> > > CFG, but I don't ever expect that LLVM will do that. If we're stuck with 
> > > per-function CPU feature settings, it seems like it would be nice to try 
> > > to do what the user asked by default, and warn the user if we see them 
> > > doing a cpuid check in a function that has been implicitly blessed with 
> > > some target attrib

[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-03 Thread Reid Kleckner via Phabricator via cfe-commits
rnk marked an inline comment as done.
rnk added inline comments.



Comment at: clang/include/clang/CodeGen/CGFunctionInfo.h:91
   bool InAllocaSRet : 1;// isInAlloca()
+  bool InAllocaIndirect : 1;// isInAlloca()
   bool IndirectByVal : 1;   // isIndirect()

rnk wrote:
> rjmccall wrote:
> > Would it be better to handle `inalloca` differently, maybe as a flag rather 
> > than as a top-level kind?  I'm concerned about gradually duplicating a 
> > significant amount of the expressivity of other kinds.
> In the past, I've drafted more than one unfinished design for how we could 
> remodel inalloca with tokens so that it can be per-argument instead of 
> something that applies to all argument memory. Unfortunately, I never found 
> the time to finish or implement one.
> 
> As I was working on this patch, I was thinking to myself that this could be 
> the moment to implement one of those designs, but it would be pretty 
> involved. Part of the issue is that, personally, I have very little interest 
> in improving x86_32 code quality, so a big redesign wouldn't deliver much 
> benefit. The benefits would all be code simplifications and maintenance cost 
> reductions, which are nice, but seem to only get me through the prototype 
> design stage.
> 
> I'll go dig up my last doc and try to share it, but for now, I think we have 
> to suffer the extra inalloca code in this patch.
Here's what I wrote, with some sketches of possible LLVM IR that could replace 
inalloca:
https://reviews.llvm.org/P8183

The basic idea is that we need a call setup instruction that forms a region 
with the call. During CodeGen, we can look forward (potentially across BBs) to 
determine how much argument stack memory to allocate, allocate it (perhaps in 
pieces as we go along), and then skip the normal call stack argument adjustment 
during regular call lowering.

Suggestions for names better than "argmem" are welcome.

The major complication comes from edges that exit the call setup region. These 
could be exceptional exits or normal exits with statement expressions and 
break, return, or goto. Along these paths we need to adjust SP, or risk leaking 
stack memory. Today, with inalloca, I think we leak stack memory.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72114/new/

https://reviews.llvm.org/D72114



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-03 Thread Reid Kleckner via Phabricator via cfe-commits
rnk marked 2 inline comments as done.
rnk added inline comments.



Comment at: clang/include/clang/CodeGen/CGFunctionInfo.h:91
   bool InAllocaSRet : 1;// isInAlloca()
+  bool InAllocaIndirect : 1;// isInAlloca()
   bool IndirectByVal : 1;   // isIndirect()

rjmccall wrote:
> Would it be better to handle `inalloca` differently, maybe as a flag rather 
> than as a top-level kind?  I'm concerned about gradually duplicating a 
> significant amount of the expressivity of other kinds.
In the past, I've drafted more than one unfinished design for how we could 
remodel inalloca with tokens so that it can be per-argument instead of 
something that applies to all argument memory. Unfortunately, I never found the 
time to finish or implement one.

As I was working on this patch, I was thinking to myself that this could be the 
moment to implement one of those designs, but it would be pretty involved. Part 
of the issue is that, personally, I have very little interest in improving 
x86_32 code quality, so a big redesign wouldn't deliver much benefit. The 
benefits would all be code simplifications and maintenance cost reductions, 
which are nice, but seem to only get me through the prototype design stage.

I'll go dig up my last doc and try to share it, but for now, I think we have to 
suffer the extra inalloca code in this patch.



Comment at: clang/test/CodeGen/x86_32-arguments-win32.c:77
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, 
<8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, 
<16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* 
%1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 
x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)

rjmccall wrote:
> rnk wrote:
> > craig.topper wrote:
> > > What happens in the backend with inreg if 512-bit vectors aren't legal?
> > LLVM splits the vector up using the largest legal vector size. As many 
> > pieces as possible are passed in available XMM/YMM registers, and the rest 
> > are passed in memory. MSVC, of course, assumes the user wanted the larger 
> > vector size, and uses whatever vector instructions it needs to move the 
> > arguments around.
> > 
> > Previously, I advocated for a model where calling an Intel intrinsic 
> > function had the effect of implicitly marking the caller with the target 
> > attributes of the intrinsic. This falls down if the user tries to write a 
> > single function that conditionally branches between code that uses 
> > different instruction set extensions. You can imagine the SSE2 codepath 
> > accidentally using AVX instructions because the compiler thinks they are 
> > better. I'm told that ICC models CPU micro-architectural features in the 
> > CFG, but I don't ever expect that LLVM will do that. If we're stuck with 
> > per-function CPU feature settings, it seems like it would be nice to try to 
> > do what the user asked by default, and warn the user if we see them doing a 
> > cpuid check in a function that has been implicitly blessed with some target 
> > attributes. You could imagine doing a similar thing when large vector type 
> > variables are used: if a large vector argument or local is used, implicitly 
> > enable the appropriate target features to move vectors of that size around.
> > 
> > This idea didn't get anywhere, and the current situation has persisted.
> > 
> > ---
> > 
> > You know, maybe we should just keep clang the way it is, and just set up a 
> > warning in the backend that says "hey, I split your large vector. You 
> > probably didn't want that." And then we just continue doing what we do now. 
> > Nobody likes backend warnings, but it seems better than the current 
> > direction of the frontend knowing every detail of x86 vector extensions.
> If target attributes affect ABI, it seems really dangerous to implicitly set 
> attributes based on what intrinsics are called.
> 
> The local CPU-testing problem seems similar to the problems with local 
> `#pragma STDC FENV_ACCESS` blocks that the constrained-FP people are looking 
> into.  They both have a "this operation is normally fully optimizable, but we 
> might need to be more careful in specific functions" aspect to them.  I 
> wonder if there's a reasonable way to unify the approaches, or at least 
> benefit from lessons learned.
I agree, we wouldn't want intrinsic usage to change ABI. But, does anybody 
actually want the behavior that LLVM implements today where large vectors get 
split across registers and memory? I think most users who pass a 512 bit vector 
want it to either be passed in ZMM registers or fail to compile. Why do we even 
support passing 1024 bit vectors? Could we make that an error?

Anyway, major redesigns aside, should clang do something when large ve

[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-03 Thread John McCall via Phabricator via cfe-commits
rjmccall added inline comments.



Comment at: clang/test/CodeGen/x86_32-arguments-win32.c:77
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, 
<8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, 
<16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* 
%1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 
x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)

rnk wrote:
> craig.topper wrote:
> > What happens in the backend with inreg if 512-bit vectors aren't legal?
> LLVM splits the vector up using the largest legal vector size. As many pieces 
> as possible are passed in available XMM/YMM registers, and the rest are 
> passed in memory. MSVC, of course, assumes the user wanted the larger vector 
> size, and uses whatever vector instructions it needs to move the arguments 
> around.
> 
> Previously, I advocated for a model where calling an Intel intrinsic function 
> had the effect of implicitly marking the caller with the target attributes of 
> the intrinsic. This falls down if the user tries to write a single function 
> that conditionally branches between code that uses different instruction set 
> extensions. You can imagine the SSE2 codepath accidentally using AVX 
> instructions because the compiler thinks they are better. I'm told that ICC 
> models CPU micro-architectural features in the CFG, but I don't ever expect 
> that LLVM will do that. If we're stuck with per-function CPU feature 
> settings, it seems like it would be nice to try to do what the user asked by 
> default, and warn the user if we see them doing a cpuid check in a function 
> that has been implicitly blessed with some target attributes. You could 
> imagine doing a similar thing when large vector type variables are used: if a 
> large vector argument or local is used, implicitly enable the appropriate 
> target features to move vectors of that size around.
> 
> This idea didn't get anywhere, and the current situation has persisted.
> 
> ---
> 
> You know, maybe we should just keep clang the way it is, and just set up a 
> warning in the backend that says "hey, I split your large vector. You 
> probably didn't want that." And then we just continue doing what we do now. 
> Nobody likes backend warnings, but it seems better than the current direction 
> of the frontend knowing every detail of x86 vector extensions.
If target attributes affect ABI, it seems really dangerous to implicitly set 
attributes based on what intrinsics are called.

The local CPU-testing problem seems similar to the problems with local `#pragma 
STDC FENV_ACCESS` blocks that the constrained-FP people are looking into.  They 
both have a "this operation is normally fully optimizable, but we might need to 
be more careful in specific functions" aspect to them.  I wonder if there's a 
reasonable way to unify the approaches, or at least benefit from lessons 
learned.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72114/new/

https://reviews.llvm.org/D72114



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-03 Thread Reid Kleckner via Phabricator via cfe-commits
rnk marked 2 inline comments as done.
rnk added inline comments.



Comment at: clang/test/CodeGen/x86_32-arguments-win32.c:77
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, 
<8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, 
<16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* 
%1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 
x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)

craig.topper wrote:
> What happens in the backend with inreg if 512-bit vectors aren't legal?
LLVM splits the vector up using the largest legal vector size. As many pieces 
as possible are passed in available XMM/YMM registers, and the rest are passed 
in memory. MSVC, of course, assumes the user wanted the larger vector size, and 
uses whatever vector instructions it needs to move the arguments around.

Previously, I advocated for a model where calling an Intel intrinsic function 
had the effect of implicitly marking the caller with the target attributes of 
the intrinsic. This falls down if the user tries to write a single function 
that conditionally branches between code that uses different instruction set 
extensions. You can imagine the SSE2 codepath accidentally using AVX 
instructions because the compiler thinks they are better. I'm told that ICC 
models CPU micro-architectural features in the CFG, but I don't ever expect 
that LLVM will do that. If we're stuck with per-function CPU feature settings, 
it seems like it would be nice to try to do what the user asked by default, and 
warn the user if we see them doing a cpuid check in a function that has been 
implicitly blessed with some target attributes. You could imagine doing a 
similar thing when large vector type variables are used: if a large vector 
argument or local is used, implicitly enable the appropriate target features to 
move vectors of that size around.

This idea didn't get anywhere, and the current situation has persisted.

---

You know, maybe we should just keep clang the way it is, and just set up a 
warning in the backend that says "hey, I split your large vector. You probably 
didn't want that." And then we just continue doing what we do now. Nobody likes 
backend warnings, but it seems better than the current direction of the 
frontend knowing every detail of x86 vector extensions.



Comment at: clang/test/CodeGenCXX/inalloca-vector.cpp:71
+// CHECK-LABEL: define dso_local x86_vectorcallcc void 
@"?vectorcall_receive_vec@@Y{{[^"]*}}"
+// CHECKX-SAME: (double inreg %xmm0,
+// CHECKX-SAME: double inreg %xmm1,

erichkeane wrote:
> Are all the checks from here on out disabled for a reason?
Yes, this is case 2 in the commit message. I won't close the bug without coming 
back to this.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72114/new/

https://reviews.llvm.org/D72114



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-03 Thread John McCall via Phabricator via cfe-commits
rjmccall added inline comments.



Comment at: clang/include/clang/CodeGen/CGFunctionInfo.h:91
   bool InAllocaSRet : 1;// isInAlloca()
+  bool InAllocaIndirect : 1;// isInAlloca()
   bool IndirectByVal : 1;   // isIndirect()

Would it be better to handle `inalloca` differently, maybe as a flag rather 
than as a top-level kind?  I'm concerned about gradually duplicating a 
significant amount of the expressivity of other kinds.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72114/new/

https://reviews.llvm.org/D72114



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-03 Thread Craig Topper via Phabricator via cfe-commits
craig.topper added inline comments.



Comment at: clang/test/CodeGen/x86_32-arguments-win32.c:77
+// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, 
<8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
+// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, 
<16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* 
%1)
+// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 
x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)

What happens in the backend with inreg if 512-bit vectors aren't legal?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72114/new/

https://reviews.llvm.org/D72114



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-03 Thread Erich Keane via Phabricator via cfe-commits
erichkeane added a comment.

I think this is alright, I want @ctopper to take a look before I approve it 
though.  Additionally, do you know if this modifies the regcall calling 
convention at all?  Should it?




Comment at: clang/test/CodeGenCXX/inalloca-vector.cpp:71
+// CHECK-LABEL: define dso_local x86_vectorcallcc void 
@"?vectorcall_receive_vec@@Y{{[^"]*}}"
+// CHECKX-SAME: (double inreg %xmm0,
+// CHECKX-SAME: double inreg %xmm1,

Are all the checks from here on out disabled for a reason?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72114/new/

https://reviews.llvm.org/D72114



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-02 Thread pre-merge checks [bot] via Phabricator via cfe-commits
merge_guards_bot added a comment.

{icon question-circle color=gray} Unit tests: unknown.

{icon question-circle color=gray} clang-tidy: unknown.

{icon question-circle color=gray} clang-format: unknown.

Build artifacts 
: 
diff.json 
,
 console-log.txt 



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72114/new/

https://reviews.llvm.org/D72114



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D72114: [MS] Overhaul how clang passes overaligned args on x86_32

2020-01-02 Thread Reid Kleckner via Phabricator via cfe-commits
rnk created this revision.
rnk added reviewers: rjmccall, craig.topper, erichkeane.
Herald added a project: clang.

MSVC 2013 would refuse to pass highly aligned things (typically vectors
and aggregates) by value. Users would receive this error:

  t.cpp(11) : error C2719: 'w': formal parameter with __declspec(align('32')) 
won't be aligned
  t.cpp(11) : error C2719: 'q': formal parameter with __declspec(align('32')) 
won't be aligned

However, in MSVC 2015, this behavior was changed, and highly aligned
things are now passed indirectly. To avoid breaking backward
compatibility, objects that do not have a *required* high alignment
(i.e. double) are still passed directly, even though they are not
naturally aligned. This change implements the new behavior of passing
things indirectly.

The new behavior is:

- up to three vector parameters can be passed in [XYZ]MM0-2
- remaining arguments with required alignment greater than 4 bytes are passed 
indirectly

Previously, MSVC never passed things truly indirectly, meaning clang
would always apply the byval attribute to indirect arguments. We had to
go to the trouble of adding inalloca so that non-trivially copyable C++
types could be passed in place without copying the object
representation. When inalloca was added, we asserted that all arguments
passed indirectly must use byval. With this change, that assert no
longer holds, and I had to update inalloca to handle that case. The
implicit sret pointer parameter was already handled this way, and this
change generalizes some of that logic to arguments.

There are two cases that this change leaves unfixed:

1. objects that are non-trivially copyable *and* overaligned
2. vectorcall + inalloca + vectors

For case 1, I need to touch C++ ABI code in MicrosoftCXXABI.cpp, so I
want to do it in a follow-up.

For case 2, my fix is one line, but it will require updating IR tests to
use lots of inreg, so I wanted to separate it out.

Related to D71915  and D72110 


Fixes most of PR44395


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D72114

Files:
  clang/include/clang/CodeGen/CGFunctionInfo.h
  clang/lib/CodeGen/CGCall.cpp
  clang/lib/CodeGen/TargetInfo.cpp
  clang/test/CodeGen/x86_32-arguments-win32.c
  clang/test/CodeGenCXX/inalloca-overaligned.cpp
  clang/test/CodeGenCXX/inalloca-vector.cpp

Index: clang/test/CodeGenCXX/inalloca-vector.cpp
===
--- /dev/null
+++ clang/test/CodeGenCXX/inalloca-vector.cpp
@@ -0,0 +1,79 @@
+// RUN: %clang_cc1 -w -triple i686-pc-win32 -emit-llvm -o - %s | FileCheck %s
+
+// PR44395
+// MSVC passes up to three vectors in registers, and the rest indirectly. Check
+// that both are compatible with an inalloca prototype.
+
+struct NonTrivial {
+  NonTrivial();
+  NonTrivial(const NonTrivial &o);
+  unsigned handle;
+};
+
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+__m128 gv128;
+
+// nt, w, and q will be in the inalloca pack.
+void receive_vec_128(NonTrivial nt, __m128 x, __m128 y, __m128 z, __m128 w, __m128 q) {
+  gv128 = x + y + z + w + q;
+}
+// CHECK-LABEL: define dso_local void  @"?receive_vec_128@@YAXUNonTrivial@@T__m128@@@Z"
+// CHECK-SAME: (<4 x float> inreg %x,
+// CHECK-SAME: <4 x float> inreg %y,
+// CHECK-SAME: <4 x float> inreg %z,
+// CHECK-SAME: <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* inalloca %0)
+
+void pass_vec_128() {
+  __m128 z = {0};
+  receive_vec_128(NonTrivial(), z, z, z, z, z);
+}
+// CHECK-LABEL: define dso_local void @"?pass_vec_128@@YAXXZ"()
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 0
+// CHECK: call x86_thiscallcc %struct.NonTrivial* @"??0NonTrivial@@QAE@XZ"(%struct.NonTrivial* %{{.*}})
+
+// Store q, store temp alloca.
+// CHECK: store <4 x float> %{{[^,]*}}, <4 x float>* %{{[^,]*}}, align 16
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 1
+// CHECK: store <4 x float>* %{{[^,]*}}, <4 x float>** %{{[^,]*}}, align 4
+
+// Store w, store temp alloca.
+// CHECK: store <4 x float> %{{[^,]*}}, <4 x float>* %{{[^,]*}}, align 16
+// CHECK: getelementptr inbounds <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>, <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* %{{[^,]*}}, i32 0, i32 2
+// CHECK: store <4 x float>* %{{[^,]*}}, <4 x float>** %{{[^,]*}}, align 4
+
+// CHECK: call void @"?receive_vec_128@@YAXUNonTrivial@@T__m128@@@Z"
+// CHECK-SAME: (<4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <4 x float> inreg %{{[^,]*}},
+// CHECK-SAME: <{ %struct.NonTrivial, <4 x float>*, <4 x float>* }>* inalloca %{{[^,]*}})
+
+// w will be passed indirectly by register, and q will be passed ind