On Sun, Sep 14, 2025 at 9:14 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > If a single instruction can store or move the whole block of memory, use > a vector instruction and don't align the destination. > > gcc/ > > PR target/121934 > * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): If a > single instruction can store or move the whole block of memory, > use a vector instruction and don't align the destination. > > gcc/testsuite/ > > PR target/121934 > * gcc.target/i386/pr121934-1a.c: New test. > * gcc.target/i386/pr121934-1b.c: Likewise. > * gcc.target/i386/pr121934-2a.c: Likewise. > * gcc.target/i386/pr121934-2b.c: Likewise. > * gcc.target/i386/pr121934-3a.c: Likewise. > * gcc.target/i386/pr121934-3b.c: Likewise. > * gcc.target/i386/pr121934-4a.c: Likewise. > * gcc.target/i386/pr121934-4b.c: Likewise. > * gcc.target/i386/pr121934-5a.c: Likewise. > * gcc.target/i386/pr121934-5b.c: Likewise.
OK. Thanks, Uros. > > Signed-off-by: H.J. Lu <hjl.to...@gmail.com> > --- > gcc/config/i386/i386-expand.cc | 62 +++++++++++++-------- > gcc/testsuite/gcc.target/i386/pr121934-1a.c | 22 ++++++++ > gcc/testsuite/gcc.target/i386/pr121934-1b.c | 7 +++ > gcc/testsuite/gcc.target/i386/pr121934-2a.c | 23 ++++++++ > gcc/testsuite/gcc.target/i386/pr121934-2b.c | 7 +++ > gcc/testsuite/gcc.target/i386/pr121934-3a.c | 23 ++++++++ > gcc/testsuite/gcc.target/i386/pr121934-3b.c | 7 +++ > gcc/testsuite/gcc.target/i386/pr121934-4a.c | 23 ++++++++ > gcc/testsuite/gcc.target/i386/pr121934-4b.c | 7 +++ > gcc/testsuite/gcc.target/i386/pr121934-5a.c | 23 ++++++++ > gcc/testsuite/gcc.target/i386/pr121934-5b.c | 7 +++ > 11 files changed, 187 insertions(+), 24 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1b.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2b.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3b.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4b.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5b.c > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index dc26b3452cb..b0b9e6da946 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -9552,9 +9552,20 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx > count_exp, rtx val_exp, > if (!issetmem) > srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); > > + bool aligned_dstmem = false; > + unsigned int nunits = issetmem ? 
STORE_MAX_PIECES : MOVE_MAX; > + bool single_insn_p = count && count <= nunits; > + if (single_insn_p) > + { > + /* If it can be done with a single instruction, use vector > + instruction and don't align destination. */ > + alg = vector_loop; > + noalign = true; > + dynamic_check = -1; > + } > + > unroll_factor = 1; > move_mode = word_mode; > - int nunits; > switch (alg) > { > case libcall: > @@ -9576,7 +9587,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx > count_exp, rtx val_exp, > need_zero_guard = true; > unroll_factor = 4; > /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes. */ > - nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX; > nunits /= GET_MODE_SIZE (word_mode); > if (nunits > 1) > { > @@ -9629,28 +9639,32 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx > count_exp, rtx val_exp, > } > gcc_assert (desired_align >= 1 && align >= 1); > > - /* Misaligned move sequences handle both prologue and epilogue at once. > - Default code generation results in a smaller code for large alignments > - and also avoids redundant job when sizes are known precisely. */ > - misaligned_prologue_used > - = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES > - && MAX (desired_align, epilogue_size_needed) <= 32 > - && desired_align <= epilogue_size_needed > - && ((desired_align > align && !align_bytes) > - || (!count && epilogue_size_needed > 1))); > - > - /* Destination is aligned after the misaligned prologue. */ > - bool aligned_dstmem = misaligned_prologue_used; > - > - if (noalign && !misaligned_prologue_used) > - { > - /* Also use misaligned prologue if alignment isn't needed and > - destination isn't aligned. Since alignment isn't needed, > - the destination after prologue won't be aligned. */ > - aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode) > - <= MEM_ALIGN (dst)); > - if (!aligned_dstmem) > - misaligned_prologue_used = true; > + if (!single_insn_p) > + { > + /* Misaligned move sequences handle both prologue and epilogue > + at once. 
Default code generation results in a smaller code > + for large alignments and also avoids redundant job when sizes > + are known precisely. */ > + misaligned_prologue_used > + = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES > + && MAX (desired_align, epilogue_size_needed) <= 32 > + && desired_align <= epilogue_size_needed > + && ((desired_align > align && !align_bytes) > + || (!count && epilogue_size_needed > 1))); > + > + /* Destination is aligned after the misaligned prologue. */ > + aligned_dstmem = misaligned_prologue_used; > + > + if (noalign && !misaligned_prologue_used) > + { > + /* Also use misaligned prologue if alignment isn't needed and > + destination isn't aligned. Since alignment isn't needed, > + the destination after prologue won't be aligned. */ > + aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode) > + <= MEM_ALIGN (dst)); > + if (!aligned_dstmem) > + misaligned_prologue_used = true; > + } > } > > /* Do the cheap promotion to allow better CSE across the > diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1a.c > b/gcc/testsuite/gcc.target/i386/pr121934-1a.c > new file mode 100644 > index 00000000000..6b6881367db > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr121934-1a.c > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp > -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */ > + > +extern int f(); > +int a, b, c, d[3]; > +void g() { > + int h; > + if (f()) { > + if (b) > + i: > + c > 0; > + a = 0; > + for (h = 0; h < 3; h++) { > + if (a != 1) > + __builtin_printf("0\n"); > + d[h] = -1; > + } > + goto i; > + } > +} > + > +/* { dg-final { scan-assembler-not "rep stos" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1b.c > b/gcc/testsuite/gcc.target/i386/pr121934-1b.c > new file mode 100644 > index 00000000000..47381ec3476 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr121934-1b.c > @@ -0,0 +1,7 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 
-fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp > -fno-tree-forwprop -fno-tree-pre -fno-tree-fre > -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */ > + > +#include "pr121934-1a.c" > + > +/* { dg-final { scan-assembler-not "rep stos" } } */ > +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2a.c > b/gcc/testsuite/gcc.target/i386/pr121934-2a.c > new file mode 100644 > index 00000000000..49def11aa4e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr121934-2a.c > @@ -0,0 +1,23 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp > -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */ > + > +extern int f(); > +int a, b, c; > +long long int d[3]; > +void g() { > + int h; > + if (f()) { > + if (b) > + i: > + c > 0; > + a = 0; > + for (h = 0; h < 3; h++) { > + if (a != 1) > + __builtin_printf("0\n"); > + d[h] = (long long int) -1; > + } > + goto i; > + } > +} > + > +/* { dg-final { scan-assembler-not "rep stos" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2b.c > b/gcc/testsuite/gcc.target/i386/pr121934-2b.c > new file mode 100644 > index 00000000000..1c634dfe420 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr121934-2b.c > @@ -0,0 +1,7 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp > -fno-tree-forwprop -fno-tree-pre -fno-tree-fre > -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */ > + > +#include "pr121934-2a.c" > + > +/* { dg-final { scan-assembler-not "rep stos" } } */ > +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3a.c > b/gcc/testsuite/gcc.target/i386/pr121934-3a.c > new file mode 100644 > index 00000000000..0c04b69c0d4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr121934-3a.c > @@ -0,0 +1,23 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 
-fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp > -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2" } */ > + > +extern int f(); > +int a, b, c; > +_BitInt(128) d[3]; > +void g() { > + int h; > + if (f()) { > + if (b) > + i: > + c > 0; > + a = 0; > + for (h = 0; h < 3; h++) { > + if (a != 1) > + __builtin_printf("0\n"); > + d[h] = (_BitInt(128)) -1; > + } > + goto i; > + } > +} > + > +/* { dg-final { scan-assembler-not "rep stos" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3b.c > b/gcc/testsuite/gcc.target/i386/pr121934-3b.c > new file mode 100644 > index 00000000000..ff4b0831cea > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr121934-3b.c > @@ -0,0 +1,7 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp > -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2 > -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */ > + > +#include "pr121934-3a.c" > + > +/* { dg-final { scan-assembler-not "rep stos" } } */ > +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4a.c > b/gcc/testsuite/gcc.target/i386/pr121934-4a.c > new file mode 100644 > index 00000000000..5aa3e069cff > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr121934-4a.c > @@ -0,0 +1,23 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp > -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx > -mprefer-vector-width=256" } */ > + > +extern int f(); > +int a, b, c; > +_BitInt(256) d[3]; > +void g() { > + int h; > + if (f()) { > + if (b) > + i: > + c > 0; > + a = 0; > + for (h = 0; h < 3; h++) { > + if (a != 1) > + __builtin_printf("0\n"); > + d[h] = (_BitInt(256)) -1; > + } > + goto i; > + } > +} > + > +/* { dg-final { scan-assembler-not "rep stos" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4b.c > b/gcc/testsuite/gcc.target/i386/pr121934-4b.c > new file mode 
100644 > index 00000000000..5f8241dcad5 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr121934-4b.c > @@ -0,0 +1,7 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp > -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx > -mprefer-vector-width=256 > -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */ > + > +#include "pr121934-4a.c" > + > +/* { dg-final { scan-assembler-not "rep stos" } } */ > +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5a.c > b/gcc/testsuite/gcc.target/i386/pr121934-5a.c > new file mode 100644 > index 00000000000..10be0dd4343 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr121934-5a.c > @@ -0,0 +1,23 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp > -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f > -mprefer-vector-width=512" } */ > + > +extern int f(); > +int a, b, c; > +_BitInt(512) d[3]; > +void g() { > + int h; > + if (f()) { > + if (b) > + i: > + c > 0; > + a = 0; > + for (h = 0; h < 3; h++) { > + if (a != 1) > + __builtin_printf("0\n"); > + d[h] = (_BitInt(512)) -1; > + } > + goto i; > + } > +} > + > +/* { dg-final { scan-assembler-not "rep stos" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5b.c > b/gcc/testsuite/gcc.target/i386/pr121934-5b.c > new file mode 100644 > index 00000000000..6a45a8a7a8b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr121934-5b.c > @@ -0,0 +1,7 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp > -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f > -mprefer-vector-width=512 > -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */ > + > +#include "pr121934-5a.c" > + > +/* { dg-final { scan-assembler-not "rep stos" } } */ > +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */ > 
-- > 2.51.0 >