On Sun, Sep 14, 2025 at 9:14 PM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> If a single instruction can store or move the whole block of memory, use
> vector instruction and don't align destination.
>
> gcc/
>
>         PR target/121934
>         * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): If a
>         single instruction can store or move the whole block of memory,
>         use vector instruction and don't align destination.
>
> gcc/testsuite/
>
>         PR target/121934
>         * gcc.target/i386/pr121934-1a.c: New test.
>         * gcc.target/i386/pr121934-1b.c: Likewise.
>         * gcc.target/i386/pr121934-2a.c: Likewise.
>         * gcc.target/i386/pr121934-2b.c: Likewise.
>         * gcc.target/i386/pr121934-3a.c: Likewise.
>         * gcc.target/i386/pr121934-3b.c: Likewise.
>         * gcc.target/i386/pr121934-4a.c: Likewise.
>         * gcc.target/i386/pr121934-4b.c: Likewise.
>         * gcc.target/i386/pr121934-5a.c: Likewise.
>         * gcc.target/i386/pr121934-5b.c: Likewise.

OK.

Thanks,
Uros.

>
> Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
> ---
>  gcc/config/i386/i386-expand.cc              | 62 +++++++++++++--------
>  gcc/testsuite/gcc.target/i386/pr121934-1a.c | 22 ++++++++
>  gcc/testsuite/gcc.target/i386/pr121934-1b.c |  7 +++
>  gcc/testsuite/gcc.target/i386/pr121934-2a.c | 23 ++++++++
>  gcc/testsuite/gcc.target/i386/pr121934-2b.c |  7 +++
>  gcc/testsuite/gcc.target/i386/pr121934-3a.c | 23 ++++++++
>  gcc/testsuite/gcc.target/i386/pr121934-3b.c |  7 +++
>  gcc/testsuite/gcc.target/i386/pr121934-4a.c | 23 ++++++++
>  gcc/testsuite/gcc.target/i386/pr121934-4b.c |  7 +++
>  gcc/testsuite/gcc.target/i386/pr121934-5a.c | 23 ++++++++
>  gcc/testsuite/gcc.target/i386/pr121934-5b.c |  7 +++
>  11 files changed, 187 insertions(+), 24 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5b.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index dc26b3452cb..b0b9e6da946 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -9552,9 +9552,20 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
>    if (!issetmem)
>      srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
>
> +  bool aligned_dstmem = false;
> +  unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
> +  bool single_insn_p = count && count <= nunits;
> +  if (single_insn_p)
> +    {
> +      /* If it can be done with a single instruction, use vector
> +        instruction and don't align destination.  */
> +      alg = vector_loop;
> +      noalign = true;
> +      dynamic_check = -1;
> +    }
> +
>    unroll_factor = 1;
>    move_mode = word_mode;
> -  int nunits;
>    switch (alg)
>      {
>      case libcall:
> @@ -9576,7 +9587,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
>        need_zero_guard = true;
>        unroll_factor = 4;
>        /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes.  */
> -      nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
>        nunits /= GET_MODE_SIZE (word_mode);
>        if (nunits > 1)
>         {
> @@ -9629,28 +9639,32 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
>      }
>    gcc_assert (desired_align >= 1 && align >= 1);
>
> -  /* Misaligned move sequences handle both prologue and epilogue at once.
> -     Default code generation results in a smaller code for large alignments
> -     and also avoids redundant job when sizes are known precisely.  */
> -  misaligned_prologue_used
> -    = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
> -       && MAX (desired_align, epilogue_size_needed) <= 32
> -       && desired_align <= epilogue_size_needed
> -       && ((desired_align > align && !align_bytes)
> -          || (!count && epilogue_size_needed > 1)));
> -
> -  /* Destination is aligned after the misaligned prologue.  */
> -  bool aligned_dstmem = misaligned_prologue_used;
> -
> -  if (noalign && !misaligned_prologue_used)
> -    {
> -      /* Also use misaligned prologue if alignment isn't needed and
> -        destination isn't aligned.   Since alignment isn't needed,
> -        the destination after prologue won't be aligned.  */
> -      aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
> -                       <= MEM_ALIGN (dst));
> -      if (!aligned_dstmem)
> -       misaligned_prologue_used = true;
> +  if (!single_insn_p)
> +    {
> +      /* Misaligned move sequences handle both prologue and epilogue
> +        at once.  Default code generation results in a smaller code
> +        for large alignments and also avoids redundant job when sizes
> +        are known precisely.  */
> +      misaligned_prologue_used
> +       = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
> +          && MAX (desired_align, epilogue_size_needed) <= 32
> +          && desired_align <= epilogue_size_needed
> +          && ((desired_align > align && !align_bytes)
> +              || (!count && epilogue_size_needed > 1)));
> +
> +      /* Destination is aligned after the misaligned prologue.  */
> +      aligned_dstmem = misaligned_prologue_used;
> +
> +      if (noalign && !misaligned_prologue_used)
> +       {
> +         /* Also use misaligned prologue if alignment isn't needed and
> +            destination isn't aligned.   Since alignment isn't needed,
> +            the destination after prologue won't be aligned.  */
> +         aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
> +                           <= MEM_ALIGN (dst));
> +         if (!aligned_dstmem)
> +           misaligned_prologue_used = true;
> +       }
>      }
>
>    /* Do the cheap promotion to allow better CSE across the
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1a.c b/gcc/testsuite/gcc.target/i386/pr121934-1a.c
> new file mode 100644
> index 00000000000..6b6881367db
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-1a.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */
> +
> +extern int f();
> +int a, b, c, d[3];
> +void g() {
> +  int h;
> +  if (f()) {
> +    if (b)
> +    i:
> +      c > 0;
> +    a = 0;
> +    for (h = 0; h < 3; h++) {
> +      if (a != 1)
> +        __builtin_printf("0\n");
> +      d[h] = -1;
> +    }
> +    goto i;
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1b.c b/gcc/testsuite/gcc.target/i386/pr121934-1b.c
> new file mode 100644
> index 00000000000..47381ec3476
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-1b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-1a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2a.c b/gcc/testsuite/gcc.target/i386/pr121934-2a.c
> new file mode 100644
> index 00000000000..49def11aa4e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-2a.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */
> +
> +extern int f();
> +int a, b, c;
> +long long int d[3];
> +void g() {
> +  int h;
> +  if (f()) {
> +    if (b)
> +    i:
> +      c > 0;
> +    a = 0;
> +    for (h = 0; h < 3; h++) {
> +      if (a != 1)
> +        __builtin_printf("0\n");
> +      d[h] = (long long int) -1;
> +    }
> +    goto i;
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2b.c b/gcc/testsuite/gcc.target/i386/pr121934-2b.c
> new file mode 100644
> index 00000000000..1c634dfe420
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-2b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-2a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3a.c b/gcc/testsuite/gcc.target/i386/pr121934-3a.c
> new file mode 100644
> index 00000000000..0c04b69c0d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-3a.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2" } */
> +
> +extern int f();
> +int a, b, c;
> +_BitInt(128) d[3];
> +void g() {
> +  int h;
> +  if (f()) {
> +    if (b)
> +    i:
> +      c > 0;
> +    a = 0;
> +    for (h = 0; h < 3; h++) {
> +      if (a != 1)
> +        __builtin_printf("0\n");
> +      d[h] = (_BitInt(128)) -1;
> +    }
> +    goto i;
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3b.c b/gcc/testsuite/gcc.target/i386/pr121934-3b.c
> new file mode 100644
> index 00000000000..ff4b0831cea
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-3b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-3a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4a.c b/gcc/testsuite/gcc.target/i386/pr121934-4a.c
> new file mode 100644
> index 00000000000..5aa3e069cff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-4a.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256" } */
> +
> +extern int f();
> +int a, b, c;
> +_BitInt(256) d[3];
> +void g() {
> +  int h;
> +  if (f()) {
> +    if (b)
> +    i:
> +      c > 0;
> +    a = 0;
> +    for (h = 0; h < 3; h++) {
> +      if (a != 1)
> +        __builtin_printf("0\n");
> +      d[h] = (_BitInt(256)) -1;
> +    }
> +    goto i;
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4b.c b/gcc/testsuite/gcc.target/i386/pr121934-4b.c
> new file mode 100644
> index 00000000000..5f8241dcad5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-4b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-4a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5a.c b/gcc/testsuite/gcc.target/i386/pr121934-5a.c
> new file mode 100644
> index 00000000000..10be0dd4343
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-5a.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512" } */
> +
> +extern int f();
> +int a, b, c;
> +_BitInt(512) d[3];
> +void g() {
> +  int h;
> +  if (f()) {
> +    if (b)
> +    i:
> +      c > 0;
> +    a = 0;
> +    for (h = 0; h < 3; h++) {
> +      if (a != 1)
> +        __builtin_printf("0\n");
> +      d[h] = (_BitInt(512)) -1;
> +    }
> +    goto i;
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5b.c b/gcc/testsuite/gcc.target/i386/pr121934-5b.c
> new file mode 100644
> index 00000000000..6a45a8a7a8b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr121934-5b.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */
> +
> +#include "pr121934-5a.c"
> +
> +/* { dg-final { scan-assembler-not "rep stos" } } */
> +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */
> --
> 2.51.0
>

Reply via email to