[PATCH] D94290: [clang][AArch64][SVE] Avoid going through memory for coerced VLST return values

2021-01-11 Thread Joe Ellis via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG8ea72b388734: [clang][AArch64][SVE] Avoid going through 
memory for coerced VLST return values (authored by joechrisellis).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D94290/new/

https://reviews.llvm.org/D94290

Files:
  clang/lib/CodeGen/CGCall.cpp
  clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp
  clang/test/CodeGen/attr-arm-sve-vector-bits-call.c
  clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
  clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c

Index: clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
===
--- clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
+++ clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
@@ -17,7 +17,6 @@
 // CHECK-NEXT:[[PRED_ADDR:%.*]] = alloca <vscale x 16 x i1>, align 2
 // CHECK-NEXT:[[VEC_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:[[PG:%.*]] = alloca <vscale x 16 x i1>, align 2
-// CHECK-NEXT:[[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:store <vscale x 16 x i1> [[PRED:%.*]], <vscale x 16 x i1>* [[PRED_ADDR]], align 2
 // CHECK-NEXT:store <vscale x 4 x i32> [[VEC:%.*]], <vscale x 4 x i32>* [[VEC_ADDR]], align 16
 // CHECK-NEXT:[[TMP0:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[PRED_ADDR]], align 2
@@ -35,11 +34,9 @@
 // CHECK-NEXT:[[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[CASTSCALABLESVE]], <vscale x 4 x i32> [[TMP8]])
 // CHECK-NEXT:[[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TMP10]], i64 0)
 // CHECK-NEXT:store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL]], align 16
-// CHECK-NEXT:[[TMP11:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to i8*
-// CHECK-NEXT:[[TMP12:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8*
-// CHECK-NEXT:call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 64, i1 false)
-// CHECK-NEXT:[[TMP13:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
-// CHECK-NEXT:ret <vscale x 4 x i32> [[TMP13]]
+// CHECK-NEXT:[[TMP11:%.*]] = load <16 x i32>, <16 x i32>* [[RETVAL]], align 16
+// CHECK-NEXT:[[CASTSCALABLESVE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP11]], i64 0)
+// CHECK-NEXT:ret <vscale x 4 x i32> [[CASTSCALABLESVE1]]
 //
 fixed_int32_t foo(svbool_t pred, svint32_t vec) {
   svbool_t pg = svand_z(pred, global_pred, global_pred);
@@ -50,16 +47,13 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:[[RETVAL:%.*]] = alloca <16 x i32>, align 16
 // CHECK-NEXT:[[GLOBAL_VEC_PTR:%.*]] = alloca <16 x i32>*, align 8
-// CHECK-NEXT:[[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:store <16 x i32>* @global_vec, <16 x i32>** [[GLOBAL_VEC_PTR]], align 8
 // CHECK-NEXT:[[TMP0:%.*]] = load <16 x i32>*, <16 x i32>** [[GLOBAL_VEC_PTR]], align 8
 // CHECK-NEXT:[[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 16
 // CHECK-NEXT:store <16 x i32> [[TMP1]], <16 x i32>* [[RETVAL]], align 16
-// CHECK-NEXT:[[TMP2:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to i8*
-// CHECK-NEXT:[[TMP3:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8*
-// CHECK-NEXT:call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
-// CHECK-NEXT:[[TMP4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
-// CHECK-NEXT:ret <vscale x 4 x i32> [[TMP4]]
+// CHECK-NEXT:[[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[RETVAL]], align 16
+// CHECK-NEXT:[[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT:ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
 //
 fixed_int32_t test_ptr_to_global() {
   fixed_int32_t *global_vec_ptr;
@@ -73,17 +67,14 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:[[RETVAL:%.*]] = alloca <16 x i32>, align 16
 // CHECK-NEXT:[[ARR_ADDR:%.*]] = alloca <16 x i32>*, align 8
-// CHECK-NEXT:[[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:store <16 x i32>* [[ARR:%.*]], <16 x i32>** [[ARR_ADDR]], align 8
 // CHECK-NEXT:[[TMP0:%.*]] = load <16 x i32>*, <16 x i32>** [[ARR_ADDR]], align 8
 // CHECK-NEXT:[[ARRAYIDX:%.*]] = getelementptr inbounds <16 x i32>, <16 x i32>* [[TMP0]], i64 0
 // CHECK-NEXT:[[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[ARRAYIDX]], align 16
 // CHECK-NEXT:store <16 x i32> [[TMP1]], <16 x i32>* [[RETVAL]], align 16
-// CHECK-NEXT:[[TMP2:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to i8*
-// CHECK-NEXT:[[TMP3:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8*
-// CHECK-NEXT:call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
-// CHECK-NEXT:[[TMP4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
-// CHECK-NEXT:ret <vscale x 4 x i32> [[TMP4]]
+// CHECK-NEXT:[[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[RETVAL]], align 16
+// CHECK-NEXT:[[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT:ret <vscale x 4 x i32> [[CASTSCALABLESVE]]

[PATCH] D94290: [clang][AArch64][SVE] Avoid going through memory for coerced VLST return values

2021-01-08 Thread Joe Ellis via Phabricator via cfe-commits
joechrisellis updated this revision to Diff 315339.
joechrisellis added a comment.

Address @c-rhodes's comment.

- Use `SrcTy` instead of `Src.getElementType()`.
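
Concretely, the update amounts to the following tweak in CGCall.cpp (shown
as a sketch of the diff, not a verbatim quote of the uploaded revision):

  -    if (auto *FixedSrc =
  -            dyn_cast<llvm::FixedVectorType>(Src.getElementType())) {
  +    if (auto *FixedSrc = dyn_cast<llvm::FixedVectorType>(SrcTy)) {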


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D94290/new/

https://reviews.llvm.org/D94290

Files:
  clang/lib/CodeGen/CGCall.cpp
  clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp
  clang/test/CodeGen/attr-arm-sve-vector-bits-call.c
  clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
  clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c

Index: clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
===
--- clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
+++ clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
@@ -17,7 +17,6 @@
 // CHECK-NEXT:[[PRED_ADDR:%.*]] = alloca <vscale x 16 x i1>, align 2
 // CHECK-NEXT:[[VEC_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:[[PG:%.*]] = alloca <vscale x 16 x i1>, align 2
-// CHECK-NEXT:[[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:store <vscale x 16 x i1> [[PRED:%.*]], <vscale x 16 x i1>* [[PRED_ADDR]], align 2
 // CHECK-NEXT:store <vscale x 4 x i32> [[VEC:%.*]], <vscale x 4 x i32>* [[VEC_ADDR]], align 16
 // CHECK-NEXT:[[TMP0:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[PRED_ADDR]], align 2
@@ -35,11 +34,9 @@
 // CHECK-NEXT:[[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[CASTSCALABLESVE]], <vscale x 4 x i32> [[TMP8]])
 // CHECK-NEXT:[[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TMP10]], i64 0)
 // CHECK-NEXT:store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL]], align 16
-// CHECK-NEXT:[[TMP11:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to i8*
-// CHECK-NEXT:[[TMP12:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8*
-// CHECK-NEXT:call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 64, i1 false)
-// CHECK-NEXT:[[TMP13:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
-// CHECK-NEXT:ret <vscale x 4 x i32> [[TMP13]]
+// CHECK-NEXT:[[TMP11:%.*]] = load <16 x i32>, <16 x i32>* [[RETVAL]], align 16
+// CHECK-NEXT:[[CASTSCALABLESVE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP11]], i64 0)
+// CHECK-NEXT:ret <vscale x 4 x i32> [[CASTSCALABLESVE1]]
 //
 fixed_int32_t foo(svbool_t pred, svint32_t vec) {
   svbool_t pg = svand_z(pred, global_pred, global_pred);
@@ -50,16 +47,13 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:[[RETVAL:%.*]] = alloca <16 x i32>, align 16
 // CHECK-NEXT:[[GLOBAL_VEC_PTR:%.*]] = alloca <16 x i32>*, align 8
-// CHECK-NEXT:[[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:store <16 x i32>* @global_vec, <16 x i32>** [[GLOBAL_VEC_PTR]], align 8
 // CHECK-NEXT:[[TMP0:%.*]] = load <16 x i32>*, <16 x i32>** [[GLOBAL_VEC_PTR]], align 8
 // CHECK-NEXT:[[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 16
 // CHECK-NEXT:store <16 x i32> [[TMP1]], <16 x i32>* [[RETVAL]], align 16
-// CHECK-NEXT:[[TMP2:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to i8*
-// CHECK-NEXT:[[TMP3:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8*
-// CHECK-NEXT:call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
-// CHECK-NEXT:[[TMP4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
-// CHECK-NEXT:ret <vscale x 4 x i32> [[TMP4]]
+// CHECK-NEXT:[[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[RETVAL]], align 16
+// CHECK-NEXT:[[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT:ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
 //
 fixed_int32_t test_ptr_to_global() {
   fixed_int32_t *global_vec_ptr;
@@ -73,17 +67,14 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:[[RETVAL:%.*]] = alloca <16 x i32>, align 16
 // CHECK-NEXT:[[ARR_ADDR:%.*]] = alloca <16 x i32>*, align 8
-// CHECK-NEXT:[[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:store <16 x i32>* [[ARR:%.*]], <16 x i32>** [[ARR_ADDR]], align 8
 // CHECK-NEXT:[[TMP0:%.*]] = load <16 x i32>*, <16 x i32>** [[ARR_ADDR]], align 8
 // CHECK-NEXT:[[ARRAYIDX:%.*]] = getelementptr inbounds <16 x i32>, <16 x i32>* [[TMP0]], i64 0
 // CHECK-NEXT:[[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[ARRAYIDX]], align 16
 // CHECK-NEXT:store <16 x i32> [[TMP1]], <16 x i32>* [[RETVAL]], align 16
-// CHECK-NEXT:[[TMP2:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to i8*
-// CHECK-NEXT:[[TMP3:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8*
-// CHECK-NEXT:call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
-// CHECK-NEXT:[[TMP4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
-// CHECK-NEXT:ret <vscale x 4 x i32> [[TMP4]]
+// CHECK-NEXT:[[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[RETVAL]], align 16
+// CHECK-NEXT:[[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT:ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
 //
 fixed_int32_t array_arg(fixed_int32_t 

[PATCH] D94290: [clang][AArch64][SVE] Avoid going through memory for coerced VLST return values

2021-01-08 Thread Cullen Rhodes via Phabricator via cfe-commits
c-rhodes accepted this revision.
c-rhodes added a comment.
This revision is now accepted and ready to land.

I've left one minor nit but otherwise it looks fine to me.




Comment at: clang/lib/CodeGen/CGCall.cpp:1273
+  if (auto *FixedSrc =
+          dyn_cast<llvm::FixedVectorType>(Src.getElementType())) {
+    if (ScalableDst->getElementType() == FixedSrc->getElementType()) {

nit: `s/Src.getElementType()/SrcTy`
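
For readers following the thread, a rough sketch of the surrounding logic
with the suggested rename applied (an illustration of the approach, not the
verbatim committed code; it assumes the usual CreateCoercedLoad environment
where Src is an Address, Ty is the coerced destination type, and CGF is the
CodeGenFunction):

  // If coercing a fixed-length vector (the VLST return value) to a
  // scalable vector for ABI purposes, and the element types match, build
  // an llvm.experimental.vector.insert instead of spilling to a temporary
  // alloca and reloading.
  llvm::Type *SrcTy = Src.getElementType();
  if (auto *ScalableDst = dyn_cast<llvm::ScalableVectorType>(Ty)) {
    if (auto *FixedSrc = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
      if (ScalableDst->getElementType() == FixedSrc->getElementType()) {
        auto *Load = CGF.Builder.CreateLoad(Src);             // <16 x i32> from RETVAL
        auto *UndefVec = llvm::UndefValue::get(ScalableDst);  // <vscale x 4 x i32> undef
        auto *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty);
        // Emits: call <vscale x 4 x i32>
        //   @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef,
        //                                                   <16 x i32> %load, i64 0)
        return CGF.Builder.CreateInsertVector(ScalableDst, UndefVec, Load, Zero,
                                              "castScalableSve");
      }
    }
  }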


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D94290/new/

https://reviews.llvm.org/D94290

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D94290: [clang][AArch64][SVE] Avoid going through memory for coerced VLST return values

2021-01-08 Thread Joe Ellis via Phabricator via cfe-commits
joechrisellis created this revision.
joechrisellis added reviewers: c-rhodes, bsmith, peterwaller-arm.
Herald added subscribers: NickHung, psnobl, kristof.beyls, tschuett.
Herald added a reviewer: efriedma.
joechrisellis requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

VLST (vector-length-specific type) return values are coerced to VLATs
(vector-length-agnostic types) in the function epilogue for consistency
with the VLAT ABI. Previously, this coercion was done through memory:
the fixed-length value was stored to a temporary alloca and then
reloaded as a scalable vector. It is preferable to use the
llvm.experimental.vector.insert intrinsic here, which avoids the round
trip through memory.
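
As a concrete illustration, a minimal sketch modelled on the
attr-arm-sve-vector-bits tests (the function and variable names are
illustrative, and it assumes compilation with -msve-vector-bits=512):

  #include <arm_sve.h>

  #if __ARM_FEATURE_SVE_BITS == 512
  typedef svint32_t fixed_int32_t __attribute__((arm_sve_vector_bits(512)));

  fixed_int32_t global_vec;

  // The source-level return type is the 512-bit VLST (<16 x i32> in IR),
  // but the ABI returns it in a scalable register, so clang coerces it to
  // the VLAT <vscale x 4 x i32> in the epilogue. With this patch the
  // coercion is a single llvm.experimental.vector.insert rather than a
  // store/memcpy/load round trip through a temporary alloca.
  fixed_int32_t return_global(void) { return global_vec; }
  #endif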


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D94290

Files:
  clang/lib/CodeGen/CGCall.cpp
  clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp
  clang/test/CodeGen/attr-arm-sve-vector-bits-call.c
  clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
  clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c

Index: clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
===
--- clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
+++ clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
@@ -17,7 +17,6 @@
 // CHECK-NEXT:[[PRED_ADDR:%.*]] = alloca <vscale x 16 x i1>, align 2
 // CHECK-NEXT:[[VEC_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:[[PG:%.*]] = alloca <vscale x 16 x i1>, align 2
-// CHECK-NEXT:[[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:store <vscale x 16 x i1> [[PRED:%.*]], <vscale x 16 x i1>* [[PRED_ADDR]], align 2
 // CHECK-NEXT:store <vscale x 4 x i32> [[VEC:%.*]], <vscale x 4 x i32>* [[VEC_ADDR]], align 16
 // CHECK-NEXT:[[TMP0:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[PRED_ADDR]], align 2
@@ -35,11 +34,9 @@
 // CHECK-NEXT:[[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[CASTSCALABLESVE]], <vscale x 4 x i32> [[TMP8]])
 // CHECK-NEXT:[[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TMP10]], i64 0)
 // CHECK-NEXT:store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL]], align 16
-// CHECK-NEXT:[[TMP11:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to i8*
-// CHECK-NEXT:[[TMP12:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8*
-// CHECK-NEXT:call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 64, i1 false)
-// CHECK-NEXT:[[TMP13:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
-// CHECK-NEXT:ret <vscale x 4 x i32> [[TMP13]]
+// CHECK-NEXT:[[TMP11:%.*]] = load <16 x i32>, <16 x i32>* [[RETVAL]], align 16
+// CHECK-NEXT:[[CASTSCALABLESVE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP11]], i64 0)
+// CHECK-NEXT:ret <vscale x 4 x i32> [[CASTSCALABLESVE1]]
 //
 fixed_int32_t foo(svbool_t pred, svint32_t vec) {
   svbool_t pg = svand_z(pred, global_pred, global_pred);
@@ -50,16 +47,13 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:[[RETVAL:%.*]] = alloca <16 x i32>, align 16
 // CHECK-NEXT:[[GLOBAL_VEC_PTR:%.*]] = alloca <16 x i32>*, align 8
-// CHECK-NEXT:[[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:store <16 x i32>* @global_vec, <16 x i32>** [[GLOBAL_VEC_PTR]], align 8
 // CHECK-NEXT:[[TMP0:%.*]] = load <16 x i32>*, <16 x i32>** [[GLOBAL_VEC_PTR]], align 8
 // CHECK-NEXT:[[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 16
 // CHECK-NEXT:store <16 x i32> [[TMP1]], <16 x i32>* [[RETVAL]], align 16
-// CHECK-NEXT:[[TMP2:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to i8*
-// CHECK-NEXT:[[TMP3:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8*
-// CHECK-NEXT:call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
-// CHECK-NEXT:[[TMP4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
-// CHECK-NEXT:ret <vscale x 4 x i32> [[TMP4]]
+// CHECK-NEXT:[[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[RETVAL]], align 16
+// CHECK-NEXT:[[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT:ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
 //
 fixed_int32_t test_ptr_to_global() {
   fixed_int32_t *global_vec_ptr;
@@ -73,17 +67,14 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:[[RETVAL:%.*]] = alloca <16 x i32>, align 16
 // CHECK-NEXT:[[ARR_ADDR:%.*]] = alloca <16 x i32>*, align 8
-// CHECK-NEXT:[[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:store <16 x i32>* [[ARR:%.*]], <16 x i32>** [[ARR_ADDR]], align 8
 // CHECK-NEXT:[[TMP0:%.*]] = load <16 x i32>*, <16 x i32>** [[ARR_ADDR]], align 8
 // CHECK-NEXT:[[ARRAYIDX:%.*]] = getelementptr inbounds <16 x i32>, <16 x i32>* [[TMP0]], i64 0
 // CHECK-NEXT:[[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[ARRAYIDX]], align 16
 // CHECK-NEXT:store <16 x i32> [[TMP1]], <16 x i32>* [[RETVAL]], align 16
-// CHECK-NEXT:[[TMP2:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to i8*
-// CHECK-NEXT:[[TMP3:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8*
-// CHECK-NEXT:call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
-// CHECK-NEXT:[[TMP4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]],