Since r15-778-g1d1ef1c22752b3, we are compiling the following snippet:
void foo (int16_t *dst, const uint8_t *src0, const uint8_t *src1)
{
  uint8x16_t s0 = vld1q_u8 (src0);
  uint8x16_t s1 = vld1q_u8 (src1);
  uint16x8_t d0_lo = vsubl_u8 (vget_low_u8 (s0), vget_low_u8 (s1));
  uint16x8_t d0_hi = vsubl_u8 (vget_high_u8 (s0), vget_high_u8 (s1));
  vst1q_s16 (dst, vreinterpretq_s16_u16 (d0_lo));
  vst1q_s16 (dst + 8, vreinterpretq_s16_u16 (d0_hi));
}
into:
ldp d0, d29, [x1]
ldp d30, d31, [x2]
usubl v30.8h, v0.8b, v30.8b
usubl v31.8h, v29.8b, v31.8b
stp q30, q31, [x0]
ret
rather than:
ldr q31, [x1]
ldr q30, [x2]
usubl v29.8h, v31.8b, v30.8b
usubl2 v30.8h, v31.16b, v30.16b
stp q29, q30, [x0]
ret
That is, rather than keeping the two 128-bit loads and using the usubl2
instruction, which is designed to operate on the upper halves of 128-bit
vector registers, we perform four 64-bit loads and operate on 64-bit
values, which increases register pressure.
What happens here is that the aforementioned commit lowers the
vget_{low,high}_* () intrinsics to BIT_FIELD_REFs, at which point the
logic in tree-ssa-forwprop.cc::optimize_vector_load () kicks in and
breaks the vector loads down into scalar loads, as long as all uses are
through BIT_FIELD_REFs (a rough GIMPLE sketch of the lowered form
follows the list below). AFAICT, this function (or, before it existed,
the code that now comprises it) handles the following scenarios:
(1) Introduced in r10-135-ga7eb97ad269b65 in response to PR88983, this
code breaks vector loads down into smaller loads whenever the target
does not natively support loads of that width, fixing code quality
issues. This should always be a win, since the original wide loads were
not available as single instructions in the first place.
(2) Since r12-2728-g2724d1bba6b364, it also handles loads that feed
VEC_UNPACK expressions, preferring extending scalar loads over a vector
load followed by a vector unpack, which is beneficial at least on some
microarchitectures.
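For reference, after the lowering mentioned above, the loads and the
vget_low/vget_high results in foo () look roughly as follows in GIMPLE
(SSA names and MEM syntax are illustrative, not copied from an actual
dump):

  s0_1 = MEM <uint8x16_t> [(const uint8_t *)src0_2(D)];
  s1_3 = MEM <uint8x16_t> [(const uint8_t *)src1_4(D)];
  _5 = BIT_FIELD_REF <s0_1, 64, 0>;    /* vget_low_u8 (s0)  */
  _6 = BIT_FIELD_REF <s1_3, 64, 0>;    /* vget_low_u8 (s1)  */
  _7 = BIT_FIELD_REF <s0_1, 64, 64>;   /* vget_high_u8 (s0) */
  _8 = BIT_FIELD_REF <s1_3, 64, 64>;   /* vget_high_u8 (s1) */

Since every use of s0 and s1 is a BIT_FIELD_REF, optimize_vector_load ()
currently rewrites the two vector loads into four 64-bit loads of the
referenced halves, which is what produces the ldp d-register sequence
shown above.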
This patch restricts the optimization to scenarios (1) and (2)
explicitly, while adding another one on top:
(3) If any of the BIT_FIELD_REFs has scalar type, prefer scalar loads
to vector loads to reduce possible traffic between the scalar and
vector register files. IOW, only if all BIT_FIELD_REFs extract
subvectors, assume there may be other instructions operating on those
subvectors that never leave the vector register file, and do not
perform the transformation.
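As an illustration of (3), consider the following hypothetical function
(not part of the patch or the testsuite), where the only use of the
loaded vector is a scalar lane extract:

#include <arm_neon.h>

uint8_t
pick_lane (const uint8_t *src)
{
  uint8x16_t s = vld1q_u8 (src);
  /* vgetq_lane_u8 lowers to a scalar-typed BIT_FIELD_REF, so case (3)
     applies and the rewrite remains enabled.  */
  return vgetq_lane_u8 (s, 3);
}

Here rewriting the vector load into a scalar load can avoid moving the
lane from the vector register file to the scalar register file
altogether.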
To summarize, after this patch, if either (1), (2), or (3) holds,
narrow loads are preferred; otherwise the vector loads are left intact.
Bootstrapped and regtested on aarch64 and x86_64; no regressions on
SPEC2017. The code snippet above is added as an aarch64-specific test.
gcc/ChangeLog:
* tree-ssa-forwprop.cc (optimize_vector_load): Inhibit
optimization when all uses are through subvectors without
extension.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/usubl2.c: New test.
---
.../gcc.target/aarch64/simd/usubl2.c | 19 +++++++++++++++++++
gcc/tree-ssa-forwprop.cc | 9 +++++++++
2 files changed, 28 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/usubl2.c
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/usubl2.c b/gcc/testsuite/gcc.target/aarch64/simd/usubl2.c
new file mode 100644
index 00000000000..442b922e71a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/usubl2.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+void foo(int16_t *dst, const uint8_t *src0, const uint8_t *src1)
+{
+ uint8x16_t s0 = vld1q_u8 (src0);
+ uint8x16_t s1 = vld1q_u8 (src1);
+
+ uint16x8_t d0_lo = vsubl_u8 (vget_low_u8 (s0), vget_low_u8 (s1));
+ uint16x8_t d0_hi = vsubl_u8 (vget_high_u8 (s0), vget_high_u8 (s1));
+
+ vst1q_s16 (dst, vreinterpretq_s16_u16 (d0_lo));
+ vst1q_s16 (dst + 8, vreinterpretq_s16_u16 (d0_hi));
+}
+
+/* { dg-final { scan-assembler "usubl\tv\[0-9\]+\.8h,\ v\[0-9\]+\.8b,\ v\[0-9\]+\.8b" } } */
+/* { dg-final { scan-assembler "usubl2\tv\[0-9\]+\.8h,\ v\[0-9\]+\.16b,\ v\[0-9\]+\.16b" } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 9f8d4ad3b44..052d1740491 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -4245,6 +4245,8 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
use_operand_p use_p;
imm_use_iterator iter;
bool rewrite = true;
+ bool scalar_use = false;
+ bool unpack_use = false;
auto_vec<gimple *, 8> bf_stmts;
auto_vec<tree, 8> worklist;
worklist.quick_push (lhs);
@@ -4278,6 +4280,8 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
??? Support VEC_UNPACK_FLOAT_{HI,LO}_EXPR. */
&& INTEGRAL_TYPE_P (TREE_TYPE (use_rhs)))))
{
+ if (!VECTOR_TYPE_P (TREE_TYPE (gimple_assign_lhs (use_stmt))))
+ scalar_use = true;
bf_stmts.safe_push (use_stmt);
continue;
}
@@ -4287,6 +4291,7 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
|| use_code == VEC_UNPACK_LO_EXPR)
&& use_rhs == lhs)
{
+ unpack_use = true;
worklist.safe_push (gimple_assign_lhs (use_stmt));
continue;
}
@@ -4298,6 +4303,10 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
}
while (!worklist.is_empty ());
+ rewrite = rewrite && (scalar_use
+ || unpack_use
+ || !can_implement_p (mov_optab,
+ TYPE_MODE (TREE_TYPE (lhs))));
if (!rewrite)
{
gsi_next (gsi);
--
2.43.0