[PATCH] D94290: [clang][AArch64][SVE] Avoid going through memory for coerced VLST return values
This revision was automatically updated to reflect the committed changes.
Closed by commit rG8ea72b388734: [clang][AArch64][SVE] Avoid going through memory for coerced VLST return values (authored by joechrisellis).

Repository: rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D94290/new/

https://reviews.llvm.org/D94290

Files:
  clang/lib/CodeGen/CGCall.cpp
  clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp
  clang/test/CodeGen/attr-arm-sve-vector-bits-call.c
  clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
  clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c

Index: clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
===
--- clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
+++ clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
@@ -17,7 +17,6 @@
 // CHECK-NEXT:    [[PRED_ADDR:%.*]] = alloca <vscale x 16 x i1>, align 2
 // CHECK-NEXT:    [[VEC_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:    [[PG:%.*]] = alloca <vscale x 16 x i1>, align 2
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:    store <vscale x 16 x i1> [[PRED:%.*]], <vscale x 16 x i1>* [[PRED_ADDR]], align 2
 // CHECK-NEXT:    store <vscale x 4 x i32> [[VEC:%.*]], <vscale x 4 x i32>* [[VEC_ADDR]], align 16
 // CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[PRED_ADDR]], align 2
@@ -35,11 +34,9 @@
 // CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[CASTSCALABLESVE]], <vscale x 4 x i32> [[TMP8]])
 // CHECK-NEXT:    [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TMP10]], i64 0)
 // CHECK-NEXT:    store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to i8*
-// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8*
-// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 64, i1 false)
-// CHECK-NEXT:    [[TMP13:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP13]]
+// CHECK-NEXT:    [[TMP11:%.*]] = load <16 x i32>, <16 x i32>* [[RETVAL]], align 16
+// CHECK-NEXT:    [[CASTSCALABLESVE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP11]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE1]]
 //
 fixed_int32_t foo(svbool_t pred, svint32_t vec) {
   svbool_t pg = svand_z(pred, global_pred, global_pred);
@@ -50,16 +47,13 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[RETVAL:%.*]] = alloca <16 x i32>, align 16
 // CHECK-NEXT:    [[GLOBAL_VEC_PTR:%.*]] = alloca <16 x i32>*, align 8
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:    store <16 x i32>* @global_vec, <16 x i32>** [[GLOBAL_VEC_PTR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>*, <16 x i32>** [[GLOBAL_VEC_PTR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0]], align 16
 // CHECK-NEXT:    store <16 x i32> [[TMP1]], <16 x i32>* [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to i8*
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8*
-// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
-// CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[RETVAL]], align 16
+// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
 //
 fixed_int32_t test_ptr_to_global() {
   fixed_int32_t *global_vec_ptr;
@@ -73,17 +67,14 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[RETVAL:%.*]] = alloca <16 x i32>, align 16
 // CHECK-NEXT:    [[ARR_ADDR:%.*]] = alloca <16 x i32>*, align 8
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
 // CHECK-NEXT:    store <16 x i32>* [[ARR:%.*]], <16 x i32>** [[ARR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>*, <16 x i32>** [[ARR_ADDR]], align 8
 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <16 x i32>, <16 x i32>* [[TMP0]], i64 0
 // CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* [[ARRAYIDX]], align 16
 // CHECK-NEXT:    store <16 x i32> [[TMP1]], <16 x i32>* [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to i8*
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32>* [[RETVAL]] to i8*
-// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
-// CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[RETVAL]], align 16
+// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TMP2]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
[PATCH] D94290: [clang][AArch64][SVE] Avoid going through memory for coerced VLST return values
joechrisellis updated this revision to Diff 315339.
joechrisellis added a comment.

  Address @c-rhodes's comment.

  - Use `SrcTy` instead of `Src.getElementType()`.

Repository: rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D94290/new/

https://reviews.llvm.org/D94290

Files:
  clang/lib/CodeGen/CGCall.cpp
  clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp
  clang/test/CodeGen/attr-arm-sve-vector-bits-call.c
  clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
  clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
[PATCH] D94290: [clang][AArch64][SVE] Avoid going through memory for coerced VLST return values
c-rhodes accepted this revision.
c-rhodes added a comment.
This revision is now accepted and ready to land.

  I've left one minor nit, but otherwise this looks fine to me.

Comment at: clang/lib/CodeGen/CGCall.cpp:1273

  +    if (auto *FixedSrc =
  +            dyn_cast<llvm::FixedVectorType>(Src.getElementType())) {
  +      if (ScalableDst->getElementType() == FixedSrc->getElementType()) {

nit: `s/Src.getElementType()/SrcTy`

Repository: rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D94290/new/

https://reviews.llvm.org/D94290

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
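For readers following the thread, the hunk referenced at CGCall.cpp:1273 sits in CreateCoercedLoad, which materialises coerced return values in the function epilogue. The sketch below shows the rough shape of the change being reviewed; it is reconstructed from the quoted context rather than copied from the committed patch, and the use of the IRBuilder CreateInsertVector helper (instead of building the llvm.experimental.vector.insert call by hand) is an assumption.

  // Sketch only: coerce a fixed-length (VLST) return value into a scalable
  // (VLAT) return type without a stack temporary. `Src`, `SrcTy` and `Ty`
  // follow the names used in CreateCoercedLoad; the rest is illustrative.
  if (auto *ScalableDst = dyn_cast<llvm::ScalableVectorType>(Ty)) {
    if (auto *FixedSrc = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
      if (ScalableDst->getElementType() == FixedSrc->getElementType()) {
        // Load the fixed vector that was stored to the return slot...
        llvm::Value *Load = CGF.Builder.CreateLoad(Src, "load.fixed");
        // ...and insert it at index 0 of an undef scalable vector, which
        // lowers to llvm.experimental.vector.insert rather than a
        // store/memcpy/load round trip through memory.
        llvm::Value *Undef = llvm::UndefValue::get(ScalableDst);
        llvm::Value *Zero = llvm::Constant::getNullValue(CGF.Int64Ty);
        return CGF.Builder.CreateInsertVector(ScalableDst, Undef, Load, Zero,
                                              "cast.scalable");
      }
    }
  }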
[PATCH] D94290: [clang][AArch64][SVE] Avoid going through memory for coerced VLST return values
joechrisellis created this revision.
joechrisellis added reviewers: c-rhodes, bsmith, peterwaller-arm.
Herald added subscribers: NickHung, psnobl, kristof.beyls, tschuett.
Herald added a reviewer: efriedma.
joechrisellis requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

VLST return values are coerced to VLATs in the function epilog for consistency with the VLAT ABI. Previously, this coercion was done through memory. It is preferable to use the llvm.experimental.vector.insert intrinsic to avoid going through memory here.

Repository: rG LLVM Github Monorepo

https://reviews.llvm.org/D94290

Files:
  clang/lib/CodeGen/CGCall.cpp
  clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp
  clang/test/CodeGen/attr-arm-sve-vector-bits-call.c
  clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
  clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
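As a concrete illustration of the source-level pattern this change affects, the snippet below defines a function returning a VLST. It is not taken from the patch's test files; the 512-bit width and the -msve-vector-bits=512 compile option are assumptions chosen to match the <16 x i32> fixed type in the checks above.

  // fixed_int32_t is a vector-length-specific type (VLST): it has a fixed
  // size at the C level, but the AArch64 PCS still passes and returns it as
  // the scalable (VLAT) type, so Clang coerces it in the prologue/epilogue.
  // Assumes compilation with SVE enabled and -msve-vector-bits=512.
  #include <arm_sve.h>

  typedef svint32_t fixed_int32_t __attribute__((arm_sve_vector_bits(512)));

  fixed_int32_t g_vec;

  // The VLST return value is coerced to the scalable return type in the
  // epilogue; with this patch that coercion is emitted as a single
  // llvm.experimental.vector.insert instead of a stack temporary plus memcpy.
  fixed_int32_t return_global(void) { return g_vec; }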