On Tue, Jul 29, 2025 at 10:32 AM Tamar Christina <tamar.christ...@arm.com> wrote:
>
> Hi Pengfei,
>
> > -----Original Message-----
> > From: Pengfei Li <pengfei....@arm.com>
> > Sent: Monday, July 21, 2025 12:03 PM
> > To: gcc-patches@gcc.gnu.org
> > Cc: rguent...@suse.de; s...@gentoo.org; Tamar Christina
> > <tamar.christ...@arm.com>; Pengfei Li <pengfei....@arm.com>
> > Subject: [PATCH] vect: Fix insufficient alignment requirement for
> > speculative loads [PR121190]
> >
> > This patch fixes a segmentation fault that can occur in vectorized loops
> > with an early break. When GCC vectorizes such loops, it may insert a
> > versioning check to ensure that data references (DRs) with speculative
> > loads are aligned. The check normally requires DRs to be aligned to the
> > vector mode size, which prevents the generated vector load instructions
> > from crossing page boundaries.
> >
> > However, this is not sufficient when a single scalar load is vectorized
> > into multiple vector loads within the same iteration. In such cases, even
> > if none of the vector loads crosses a page boundary, loads after the
> > first one may still access memory beyond the current valid page.
> >
> > Consider the following loop as an example:
> >
> >   while (i < MAX_COMPARE) {
> >     if (*(p + i) != *(q + i))
> >       return i;
> >     i++;
> >   }
> >
> > When compiled with "-O3 -march=znver2" on x86, the vectorized loop may
> > include instructions like:
> >
> >   vmovdqa  (%rcx,%rax), %ymm0
> >   vmovdqa  32(%rcx,%rax), %ymm1
> >   vpcmpeqq (%rdx,%rax), %ymm0, %ymm0
> >   vpcmpeqq 32(%rdx,%rax), %ymm1, %ymm1
> >
> > Note that two speculative vector loads are generated for each DR (p and
> > q). The first vmovdqa and vpcmpeqq are safe due to the vector-size
> > (32-byte) alignment, but the following ones (at offset 32) may not be,
> > because they could read from the beginning of the next memory page,
> > potentially leading to segmentation faults.
> >
> > To avoid the issue, this patch increases the alignment requirement for
> > speculative loads to DR_TARGET_ALIGNMENT, which ensures that all vector
> > loads in the same vector iteration access memory within the same page.
> >
> > This patch is bootstrapped and regression-tested on x86_64-linux-gnu,
> > arm-linux-gnueabihf and aarch64-linux-gnu.
> >
> >     PR tree-optimization/121190
> >
> > gcc/ChangeLog:
> >
> >     * tree-vect-data-refs.cc (vect_enhance_data_refs_alignment):
> >     Increase alignment requirement for speculative loads.
> >
> > gcc/testsuite/ChangeLog:
> >
> >     * gcc.dg/vect/vect-early-break_52.c: Update an unsafe test.
> >     * gcc.dg/vect/vect-early-break_137-pr121190.c: New test.
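
As an aside, here is a minimal standalone sketch of the page-boundary
arithmetic described above; it is not part of the patch, and the 4 KiB page
size, 32-byte vector loads and 64-byte per-iteration footprint are
assumptions taken from the znver2 example (the requirement the patch
actually imposes is DR_TARGET_ALIGNMENT):

  /* A 32-byte-aligned base can leave the second speculative load on the
     next page; a base aligned to the full 64-byte per-iteration footprint
     cannot.  */
  #include <assert.h>
  #include <stdint.h>

  #define PAGE_SIZE  4096              /* assumed page size */
  #define VEC_BYTES  32                /* one ymm vector load */
  #define ITER_BYTES (2 * VEC_BYTES)   /* two vector loads per iteration */

  /* Return 1 if a speculative window of ITER_BYTES bytes starting at ADDR
     touches a page other than the one containing ADDR.  */
  static int
  window_leaves_page (uintptr_t addr)
  {
    return addr / PAGE_SIZE != (addr + ITER_BYTES - 1) / PAGE_SIZE;
  }

  int
  main (void)
  {
    /* A 32-byte-aligned base just below a page boundary: the first load
       stays on the page, the second starts on the next page.  */
    uintptr_t base = PAGE_SIZE - VEC_BYTES;
    assert (base % VEC_BYTES == 0 && window_leaves_page (base));

    /* Any base aligned to the whole 64-byte footprint keeps both loads on
       one page, because PAGE_SIZE is a multiple of ITER_BYTES.  */
    for (uintptr_t b = 0; b < 4 * PAGE_SIZE; b += ITER_BYTES)
      assert (!window_leaves_page (b));
    return 0;
  }
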
> > ---
> >  .../vect/vect-early-break_137-pr121190.c   | 60 +++++++++++++++++++
> >  .../gcc.dg/vect/vect-early-break_52.c       |  2 +-
> >  gcc/tree-vect-data-refs.cc                  | 15 ++++-
> >  3 files changed, 75 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-early-break_137-pr121190.c
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_137-pr121190.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_137-pr121190.c
> > new file mode 100644
> > index 00000000000..da11146c578
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_137-pr121190.c
> > @@ -0,0 +1,60 @@
> > +/* PR tree-optimization/121190 */
> > +/* { dg-do run } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-additional-options "-march=znver2" { target x86_64-*-* i?86-*-* } } */
> > +/* { dg-require-effective-target mmap } */
> > +/* { dg-require-effective-target vect_early_break } */
> > +
> > +#include <stdint.h>
> > +#include <string.h>
> > +#include <stdio.h>
> > +#include <sys/mman.h>
> > +#include <unistd.h>
> > +
> > +#define MAX_COMPARE 5000
> > +
> > +__attribute__((noipa))
> > +int diff (uint64_t *restrict p, uint64_t *restrict q)
> > +{
> > +  int i = 0;
> > +  while (i < MAX_COMPARE) {
> > +    if (*(p + i) != *(q + i))
> > +      return i;
> > +    i++;
> > +  }
> > +  return -1;
> > +}
> > +
> > +int main ()
> > +{
> > +  long pgsz = sysconf (_SC_PAGESIZE);
> > +  if (pgsz == -1) {
> > +    fprintf (stderr, "sysconf failed\n");
> > +    return 0;
> > +  }
> > +
> > +  /* Allocate 2 consecutive pages of memory and let p1 and p2 point to the
> > +     beginning of each. */
> > +  void *mem = mmap (NULL, pgsz * 2, PROT_READ | PROT_WRITE,
> > +                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> > +  if (mem == MAP_FAILED) {
> > +    fprintf (stderr, "mmap failed\n");
> > +    return 0;
> > +  }
> > +  uint64_t *p1 = (uint64_t *) mem;
> > +  uint64_t *p2 = (uint64_t *) mem + pgsz / sizeof (uint64_t);
> > +
> > +  /* Fill the first page with zeros, except for its last 64 bits. */
> > +  memset (p1, 0, pgsz);
> > +  *(p2 - 1) = -1;
> > +
> > +  /* Make the 2nd page not accessible. */
> > +  mprotect (p2, pgsz, PROT_NONE);
> > +
> > +  /* Calls to diff should not read the 2nd page. */
> > +  for (int i = 1; i <= 20; i++) {
> > +    if (diff (p2 - i, p1) != i - 1)
> > +      __builtin_abort ();
> > +  }
> > +}
> > +
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_52.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_52.c
> > index 86a632f2a82..6abfcd6580e 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_52.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_52.c
> > @@ -18,4 +18,4 @@ int main1 (short X)
> >  }
> >  }
> >
> > -/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" { target { ! "x86_64-*-* i?86-*-*" } } } } */
> > +/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" { target { ! "x86_64-*-* i?86-*-* arm*-*-*" } } } } */
> > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > index c84cd29116e..829af2fd28d 100644
> > --- a/gcc/tree-vect-data-refs.cc
> > +++ b/gcc/tree-vect-data-refs.cc
> > @@ -2964,12 +2964,25 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
> >            break;
> >          }
> >
> > +      /* Normally, we require DRs to be aligned to the vector mode size.
> > +         However, this is not sufficient when a statement involves a safe
> > +         speculative read. In such cases, a single scalar load can be
> > +         vectorized into multiple vector loads in one loop iteration.
> > +         Even if the first vector load is safe, subsequent loads might
> > +         still access an invalid memory page. We increase the alignment
> > +         requirement to prevent this. */
> > +      poly_uint64 required_align_size;
> > +      if (dr_safe_speculative_read_required (stmt_info))
> > +        required_align_size = DR_TARGET_ALIGNMENT (dr_info);
> > +      else
> > +        required_align_size = GET_MODE_SIZE (TYPE_MODE (vectype));
> > +
>
> I wonder what the intention of this code was. It seems to me that it was
> trying to disable versioning for VLA, but then also doubling up and using the
> mode as the alignment. But the cross-iteration alignment check below this one
> uses DR_TARGET_ALIGNMENT as one would expect, since the target alignment
> can be different from the vector size.
>
> So I wonder if something like this isn't more appropriate:
>
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index 75e06ff28e6..8595c76eae2 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -2972,7 +2972,8 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
>             VF is a power of two. We could relax this if we added
>             a way of enforcing a power-of-two size. */
>          unsigned HOST_WIDE_INT size;
> -        if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
> +        if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant ()
> +            || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&size))

I agree that checking DR_TARGET_ALIGNMENT is what needs to be done; I'm not
sure why we checked the size - probably historic.  But there's no need to
keep the size check, so just

diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index b3ec0b67826..8d43b99a6f4 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -2969,7 +2969,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
            VF is a power of two. We could relax this if we added
            a way of enforcing a power-of-two size. */
         unsigned HOST_WIDE_INT size;
-        if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
+        if (!DR_TARGET_ALIGNMENT (dr_info).is_constant (&size))
          {
            do_versioning = false;
            break;

is correct.  OK if that works, and sorry for the delay.

Richard.

>          {
>            do_versioning = false;
>            break;
>
> Since it seems like we should always make the alignment check based on the
> actual alignment rather than the vector size, unless I misunderstood
> something?
>
> Thanks,
> Tamar
>
> >       /* At present we don't support versioning for alignment
> >          with variable VF, since there's no guarantee that the
> >          VF is a power of two. We could relax this if we added
> >          a way of enforcing a power-of-two size. */
> >       unsigned HOST_WIDE_INT size;
> > -     if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
> > +     if (!required_align_size.is_constant (&size))
> >         {
> >           do_versioning = false;
> >           break;
> > --
> > 2.43.0
>
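
As a rough, self-contained model of the run-time check being discussed (not
GCC internals; the 64-byte value and the helper below are illustrative
assumptions): loop versioning boils down to a mask test of the form
(addr & (align - 1)) == 0, which is why the alignment has to be a
compile-time constant power of two, and per the discussion above that
constant should come from DR_TARGET_ALIGNMENT rather than from the vector
mode size.

  #include <stdint.h>
  #include <stdio.h>

  /* Illustrative constant: DR_TARGET_ALIGNMENT for the znver2 example,
     i.e. the whole 64-byte footprint of one vector iteration, rather than
     the 32-byte vector mode size.  */
  #define REQUIRED_ALIGN 64

  /* The kind of mask test a versioned loop guard performs.  */
  static int
  sufficiently_aligned (const void *addr)
  {
    return ((uintptr_t) addr & (REQUIRED_ALIGN - 1)) == 0;
  }

  int
  main (void)
  {
    uint64_t buf[32] __attribute__ ((aligned (64)));
    const uint64_t *p = buf;      /* 64-byte aligned: vector loop is safe    */
    const uint64_t *q = buf + 4;  /* only 32-byte aligned: take scalar loop  */
    printf ("p ok: %d, q ok: %d\n",
            sufficiently_aligned (p), sufficiently_aligned (q));
    return 0;
  }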