OK.
Thanks for the patch!
> The BPF backend expansion of setmem was broken, because it could elect
> to use stores of HI, SI or DI modes based on the destitnation alignement
> when the value was QI, but fail to duplicate the byte value across to
> those larger sizes. This resulted in not all bytes bytes of the
> destination actually being set to the desired value.
>
> Fix bpf_expand_setmem to ensure the desired byte value is really
> duplicated as necessary, whether it is constant or a (sub)reg:QI.
>
> Tested for bpf-unknown-none target on x86_64-linux-gnu host.
>
> Sanity checked by compiling kernel bpf-next selftests, and
> spot-checking the resulting asm for affected __builtin_memsets
> used there.
>
> PR target/122139
>
> gcc/
>
> * config/bpf/bpf.cc (bpf_expand_setmem): Duplicate byte value
> across to new mode when using larger modes for store.
>
> gcc/testsuite/
>
> * gcc.target/bpf/memset-3.c: New.
> * gcc.target/bpf/memset-4.c: New.
> ---
> gcc/config/bpf/bpf.cc | 65 +++++++++++++++++++++++--
> gcc/testsuite/gcc.target/bpf/memset-3.c | 56 +++++++++++++++++++++
> gcc/testsuite/gcc.target/bpf/memset-4.c | 24 +++++++++
> 3 files changed, 141 insertions(+), 4 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/bpf/memset-3.c
> create mode 100644 gcc/testsuite/gcc.target/bpf/memset-4.c
>
> diff --git a/gcc/config/bpf/bpf.cc b/gcc/config/bpf/bpf.cc
> index 2e7474be1ff..a28018b3367 100644
> --- a/gcc/config/bpf/bpf.cc
> +++ b/gcc/config/bpf/bpf.cc
> @@ -1427,25 +1427,82 @@ bpf_expand_setmem (rtx *operands)
> unsigned inc = GET_MODE_SIZE (mode);
> unsigned offset = 0;
>
> + /* If val is a constant, then build a new constant value duplicating
> + the byte across to the size of stores we might do.
> + e.g. if val is 0xab and we can store in 4-byte chunks, build
> + 0xabababab and use that to do the memset.
> + If val is not a constant, then by constraint it is a QImode register
> + and we similarly duplicate the byte across. */
> + rtx src;
> + if (CONST_INT_P (val))
> + {
> + unsigned HOST_WIDE_INT tmp = UINTVAL (val) & 0xff;
> + /* Need src in the proper mode. */
> + switch (mode)
> + {
> + case DImode:
> + src = gen_rtx_CONST_INT (DImode, tmp * 0x0101010101010101);
> + break;
> + case SImode:
> + src = gen_rtx_CONST_INT (SImode, tmp * 0x01010101);
> + break;
> + case HImode:
> + src = gen_rtx_CONST_INT (HImode, tmp * 0x0101);
> + break;
> + default:
> + src = val;
> + break;
> + }
> + }
> + else
> + {
> + /* VAL is a subreg:QI (reg:DI N).
> + Copy that byte to fill the whole register. */
> + src = gen_reg_rtx (mode);
> + emit_move_insn (src, gen_rtx_ZERO_EXTEND (mode, val));
> +
> + /* We can fill the whole register with copies of the byte by
> multiplying
> + by 0x010101...
> + For DImode this requires a tmp reg with lldw, but only if we will
> + actually do nonzero iterations of stxdw. */
> + if (mode < DImode || iters == 0)
> + emit_move_insn (src, gen_rtx_MULT (mode, src, GEN_INT (0x01010101)));
> + else
> + {
> + rtx tmp = gen_reg_rtx (mode);
> + emit_move_insn (tmp, GEN_INT (0x0101010101010101));
> + emit_move_insn (src, gen_rtx_MULT (mode, src, tmp));
> + }
> + }
> +
> for (unsigned int i = 0; i < iters; i++)
> {
> - emit_move_insn (adjust_address (dst, mode, offset), val);
> + emit_move_insn (adjust_address (dst, mode, offset), src);
> offset += inc;
> }
> if (remainder & 4)
> {
> - emit_move_insn (adjust_address (dst, SImode, offset), val);
> + emit_move_insn (adjust_address (dst, SImode, offset),
> + REG_P (src)
> + ? simplify_gen_subreg (SImode, src, mode, 0)
> + : src);
> offset += 4;
> remainder -= 4;
> }
> if (remainder & 2)
> {
> - emit_move_insn (adjust_address (dst, HImode, offset), val);
> + emit_move_insn (adjust_address (dst, HImode, offset),
> + REG_P (src)
> + ? simplify_gen_subreg (HImode, src, mode, 0)
> + : src);
> offset += 2;
> remainder -= 2;
> }
> if (remainder & 1)
> - emit_move_insn (adjust_address (dst, QImode, offset), val);
> + emit_move_insn (adjust_address (dst, QImode, offset),
> + REG_P (src)
> + ? simplify_gen_subreg (QImode, src, mode, 0)
> + : src);
>
> return true;
> }
> diff --git a/gcc/testsuite/gcc.target/bpf/memset-3.c
> b/gcc/testsuite/gcc.target/bpf/memset-3.c
> new file mode 100644
> index 00000000000..0b044a07a4e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/bpf/memset-3.c
> @@ -0,0 +1,56 @@
> +/* Test that inline memset expansion properly duplicates the byte value
> + across the bytes to fill. PR target/122139. */
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -masm=normal" } */
> +
> +#define SIZE 63
> +
> +unsigned char cdata[SIZE];
> +unsigned short sdata[SIZE / 2 + 1];
> +unsigned int idata[SIZE / 4 + 1];
> +unsigned long ldata[SIZE / 8 + 1];
> +
> +void
> +a (void)
> +{
> + __builtin_memset (cdata, 0x54, SIZE);
> +}
> +/* 0x54=84 */
> +/* { dg-final { scan-assembler-times "\[\t \]stb\[^\r\n\]+,84" 63 } } */
> +
> +void
> +b (void)
> +{
> + __builtin_memset (sdata, 0x7a, SIZE);
> +}
> +
> +/* 0x7a=122, 0x7a7a=31354 */
> +/* { dg-final { scan-assembler-times "\[\t \]sth\[^\r\n\]+,31354" 31 } } */
> +/* { dg-final { scan-assembler-times "\[\t \]stb\[^\r\n\]+,122" 1 } } */
> +
> +void
> +c (void)
> +{
> + __builtin_memset (idata, 0x23, SIZE);
> +}
> +
> +/* 0x23=35, 0x2323=8995, 0x23232323=589505315 */
> +/* { dg-final { scan-assembler-times "\[\t \]stw\[^\r\n\]+,589505315" 15 } }
> */
> +/* { dg-final { scan-assembler-times "\[\t \]sth\[^\r\n\]+,8995" 1 } } */
> +/* { dg-final { scan-assembler-times "\[\t \]stb\[^\r\n\]+,35" 1 } } */
> +
> +void
> +d (void)
> +{
> + __builtin_memset (ldata, 0xcb, SIZE);
> +}
> +
> +/* 0xcbcbcbcb_cbcbcbcb = -3761688987579986997,
> + 0xcbcbcbcb = -875836469
> + 0xcbcb = -13365
> + 0xcb = -53 */
> +/* { dg-final { scan-assembler-times "lddw\t%r.,-3761688987579986997"}} */
> +/* { dg-final { scan-assembler-times "stxdw" 7 } } */
> +/* { dg-final { scan-assembler-times "\[\t \]stw\[^\r\n\]+,-875836469" 1 } }
> */
> +/* { dg-final { scan-assembler-times "\[\t \]sth\[^\r\n\]+,-13365" 1 } } */
> +/* { dg-final { scan-assembler-times "\[\t \]stb\[^\r\n\]+,-53" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/bpf/memset-4.c
> b/gcc/testsuite/gcc.target/bpf/memset-4.c
> new file mode 100644
> index 00000000000..0c835c5e4d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/bpf/memset-4.c
> @@ -0,0 +1,24 @@
> +/* Test that inline memset expansion properly duplicates the byte value
> + across the bytes to fill for non-const value. PR target/122139. */
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -masm=normal" } */
> +
> +#define SIZE 63
> +
> +unsigned char cdata[SIZE];
> +unsigned short sdata[SIZE / 2 + 1];
> +unsigned int idata[SIZE / 4 + 1];
> +unsigned long ldata[SIZE / 8 + 1];
> +
> +void
> +c (unsigned char byte)
> +{
> + __builtin_memset (idata, byte, SIZE);
> +}
> +
> +/* Hard to verify for non-const value. Look for the mul by 0x01010101
> + and the proper number of stores... */
> +/* { dg-final { scan-assembler "mul32\[\t \]%r.,16843009" } } */
> +/* { dg-final { scan-assembler-times "stxw" 15 } } */
> +/* { dg-final { scan-assembler-times "stxh" 1 } } */
> +/* { dg-final { scan-assembler-times "stxb" 1 } } */