On Mon, Jun 29, 2026 at 4:13 PM Konstantinos Eleftheriou
<[email protected]> wrote:
>
> The call to `gen_lowpart` in `store_bit_field_1` might copy the destination
> register into a new one, which may lead to wrong code generation, as the bit
> insertions update the new register instead of updating `str_rtx`.
>
> This patch copies back the new destination register into `str_rtx` when
> needed.
>
> Bootstrapped/regtested on AArch64 and x86-64.
>
> PR rtl-optimization/125988
>
> gcc/ChangeLog:
>
> * expmed.cc (store_bit_field_1): Copy back the new destination
> register into `str_rtx` when needed.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/pr125988.c: New test.
> ---
> gcc/expmed.cc | 22 +++++++--
> gcc/testsuite/gcc.target/aarch64/pr125988.c | 51 +++++++++++++++++++++
> 2 files changed, 70 insertions(+), 3 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/pr125988.c
>
> diff --git a/gcc/expmed.cc b/gcc/expmed.cc
> index da1b5b632876..1f4611a6ed89 100644
> --- a/gcc/expmed.cc
> +++ b/gcc/expmed.cc
> @@ -888,9 +888,25 @@ store_bit_field_1 (rtx str_rtx, poly_uint64 bitsize,
> poly_uint64 bitnum,
> op0 = gen_lowpart (op0_mode.require (), op0);
> }
>
> - return store_integral_bit_field (op0, op0_mode, ibitsize, ibitnum,
> - bitregion_start, bitregion_end,
> - fieldmode, value, reverse, fallback_p);
> + if (!store_integral_bit_field (op0, op0_mode, ibitsize, ibitnum,
> + bitregion_start, bitregion_end,
> + fieldmode, value, reverse, fallback_p))
> + return false;
> +
> + rtx op0_reg = op0;
> + rtx str_rtx_reg = str_rtx;
> + while (SUBREG_P (op0_reg))
> + op0_reg = SUBREG_REG (op0_reg);
> + while (SUBREG_P (str_rtx_reg))
> + str_rtx_reg = SUBREG_REG (str_rtx_reg);
That looks definitely fishy.
You do not quote the part that does the copy, but stripping all subregs
and then copying looks wrong. It also looks this was produced by an AI?
> +
> + /* If a new destination register has been generated, copy the value back
> + into str_rtx. */
> + if (REG_P (op0_reg) && REG_P (str_rtx_reg)
> + && REGNO (op0_reg) != REGNO (str_rtx_reg))
> + emit_move_insn (str_rtx_reg, op0_reg);
> +
> + return true;
> }
>
> /* Subroutine of store_bit_field_1, with the same arguments, except
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr125988.c
> b/gcc/testsuite/gcc.target/aarch64/pr125988.c
> new file mode 100644
> index 000000000000..3ac7be9b7b99
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr125988.c
> @@ -0,0 +1,51 @@
> +/* PR rtl-optimization/125988 */
> +/* { dg-do run } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-options "-O3 -favoid-store-forwarding" } */
> +/* { dg-add-options arm_v8_2a_bf16_neon } */
> +
> +/* Verify that the lane inserted by vld4_lane_bf16 survives
> + avoid-store-forwarding's bit-insert rewrite. */
> +
> +#include <arm_neon.h>
> +
> +extern void abort (void);
> +
> +typedef union
> +{
> + bfloat16_t bf;
> + unsigned short u;
> +} bf16_u;
> +
> +__attribute__((noipa)) static int
> +test (const bf16_u *data, const bf16_u *overwrite)
> +{
> + bfloat16x4x4_t v;
> + bf16_u t[4];
> + int i, j;
> + for (i = 0; i < 4; i++, data += 4)
> + v.val[i] = vld1_bf16 (&data->bf);
> + v = vld4_lane_bf16 (&overwrite->bf, v, 3);
> + while (--i >= 0)
> + {
> + vst1_bf16 (&t[0].bf, v.val[i]);
> + data -= 4;
> + for (j = 0; j < 4; j++)
> + if (t[j].u != (j == 3 ? overwrite[i].u : data[j].u))
> + return 1;
> + }
> + return 0;
> +}
> +
> +int
> +main (void)
> +{
> + bf16_u d[16];
> + for (int i = 0; i < 16; i++)
> + d[i].u = 0x1000 + i;
> + bf16_u ov[4] = { {.u = 0xABCD}, {.u = 0x1234},
> + {.u = 0xCAFE}, {.u = 0xBEEF} };
> + if (test (d, ov))
> + abort ();
> + return 0;
> +}
> --
> 2.52.0
>