The call to `gen_lowpart` in `store_bit_field_1` might copy the destination
register into a new one, which may lead to wrong code generation, as the bit
insertions update the new register instead of updating `str_rtx`.
This patch copies back the new destination register into `str_rtx` when needed.
Bootstrapped/regtested on AArch64 and x86-64.
PR rtl-optimization/125988
gcc/ChangeLog:
* expmed.cc (store_bit_field_1): Copy back the new destination
register into `str_rtx` when needed.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/pr125988.c: New test.
---
gcc/expmed.cc | 22 +++++++--
gcc/testsuite/gcc.target/aarch64/pr125988.c | 51 +++++++++++++++++++++
2 files changed, 70 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/pr125988.c
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index da1b5b632876..1f4611a6ed89 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -888,9 +888,25 @@ store_bit_field_1 (rtx str_rtx, poly_uint64 bitsize,
poly_uint64 bitnum,
op0 = gen_lowpart (op0_mode.require (), op0);
}
- return store_integral_bit_field (op0, op0_mode, ibitsize, ibitnum,
- bitregion_start, bitregion_end,
- fieldmode, value, reverse, fallback_p);
+ if (!store_integral_bit_field (op0, op0_mode, ibitsize, ibitnum,
+ bitregion_start, bitregion_end,
+ fieldmode, value, reverse, fallback_p))
+ return false;
+
+ rtx op0_reg = op0;
+ rtx str_rtx_reg = str_rtx;
+ while (SUBREG_P (op0_reg))
+ op0_reg = SUBREG_REG (op0_reg);
+ while (SUBREG_P (str_rtx_reg))
+ str_rtx_reg = SUBREG_REG (str_rtx_reg);
+
+ /* If a new destination register has been generated, copy the value back
+ into str_rtx. */
+ if (REG_P (op0_reg) && REG_P (str_rtx_reg)
+ && REGNO (op0_reg) != REGNO (str_rtx_reg))
+ emit_move_insn (str_rtx_reg, op0_reg);
+
+ return true;
}
/* Subroutine of store_bit_field_1, with the same arguments, except
diff --git a/gcc/testsuite/gcc.target/aarch64/pr125988.c
b/gcc/testsuite/gcc.target/aarch64/pr125988.c
new file mode 100644
index 000000000000..3ac7be9b7b99
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr125988.c
@@ -0,0 +1,51 @@
+/* PR rtl-optimization/125988 */
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-options "-O3 -favoid-store-forwarding" } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+/* Verify that the lane inserted by vld4_lane_bf16 survives
+ avoid-store-forwarding's bit-insert rewrite. */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+typedef union
+{
+ bfloat16_t bf;
+ unsigned short u;
+} bf16_u;
+
+__attribute__((noipa)) static int
+test (const bf16_u *data, const bf16_u *overwrite)
+{
+ bfloat16x4x4_t v;
+ bf16_u t[4];
+ int i, j;
+ for (i = 0; i < 4; i++, data += 4)
+ v.val[i] = vld1_bf16 (&data->bf);
+ v = vld4_lane_bf16 (&overwrite->bf, v, 3);
+ while (--i >= 0)
+ {
+ vst1_bf16 (&t[0].bf, v.val[i]);
+ data -= 4;
+ for (j = 0; j < 4; j++)
+ if (t[j].u != (j == 3 ? overwrite[i].u : data[j].u))
+ return 1;
+ }
+ return 0;
+}
+
+int
+main (void)
+{
+ bf16_u d[16];
+ for (int i = 0; i < 16; i++)
+ d[i].u = 0x1000 + i;
+ bf16_u ov[4] = { {.u = 0xABCD}, {.u = 0x1234},
+ {.u = 0xCAFE}, {.u = 0xBEEF} };
+ if (test (d, ov))
+ abort ();
+ return 0;
+}
--
2.52.0