> Am 29.07.2025 um 20:37 schrieb Pengfei Li <pengfei....@arm.com>:
>
> Hi,
>
> The comment in v2 is addressed. Tested again on both trunk and gcc-15.
>
> Ok for trunk and gcc-15?
>
Ok
Richard
> Changes in v3:
> - Extract the constant VF check out.
>
> Changes in v2:
> - Remove the condition of dr_safe_speculative_read_required.
> - Add a constant VF check.
>
> Thanks,
> Pengfei
>
> -- >8 --
> This patch fixes a segmentation fault issue that can occur in vectorized
> loops with an early break. When GCC vectorizes such loops, it may insert
> a versioning check to ensure that data references (DRs) with speculative
> loads are aligned. The check normally requires DRs to be aligned to the
> vector mode size, which prevents generated vector load instructions from
> crossing page boundaries.
>
> However, this is not sufficient when a single scalar load is vectorized
> into multiple loads within the same iteration. In such cases, even if
> none of the vector loads crosses page boundaries, subsequent loads after
> the first one may still access memory beyond current valid page.
>
> Consider the following loop as an example:
>
> while (i < MAX_COMPARE) {
> if (*(p + i) != *(q + i))
> return i;
> i++;
> }
>
> When compiled with "-O3 -march=znver2" on x86, the vectorized loop may
> include instructions like:
>
> vmovdqa (%rcx,%rax), %ymm0
> vmovdqa 32(%rcx,%rax), %ymm1
> vpcmpeqq (%rdx,%rax), %ymm0, %ymm0
> vpcmpeqq 32(%rdx,%rax), %ymm1, %ymm1
>
> Note two speculative vector loads are generated for each DR (p and q).
> The first vmovdqa and vpcmpeqq are safe due to the vector size (32-byte)
> alignment, but the following ones (at offset 32) may not be safe because
> they could read from the beginning of the next memory page, potentially
> leading to segmentation faults.
>
> To avoid the issue, this patch increases the alignment requirement to
> DR_TARGET_ALIGNMENT. It ensures all vector loads in the same vector
> iteration access memory within the same page.
>
> This patch is bootstrapped and regression-tested on x86_64-linux-gnu,
> arm-linux-gnueabihf and aarch64-linux-gnu.
>
> PR tree-optimization/121190
>
> gcc/ChangeLog:
>
> * tree-vect-data-refs.cc (vect_enhance_data_refs_alignment):
> Increase alignment requirement for speculative loads.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/vect-early-break_52.c: Update an unsafe test.
> * gcc.dg/vect/vect-early-break_137.c-pr121190: New test.
> ---
> .../vect/vect-early-break_137-pr121190.c | 62 +++++++++++++++++++
> .../gcc.dg/vect/vect-early-break_52.c | 2 +-
> gcc/tree-vect-data-refs.cc | 20 ++----
> 3 files changed, 69 insertions(+), 15 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/vect/vect-early-break_137-pr121190.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_137-pr121190.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_137-pr121190.c
> new file mode 100644
> index 00000000000..e6b071c411c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_137-pr121190.c
> @@ -0,0 +1,62 @@
> +/* PR tree-optimization/121190 */
> +/* { dg-options "-O3" } */
> +/* { dg-additional-options "-march=znver2" { target x86_64-*-* i?86-*-* } }
> */
> +/* { dg-require-effective-target mmap } */
> +/* { dg-require-effective-target vect_early_break } */
> +
> +#include <stdint.h>
> +#include <string.h>
> +#include <stdio.h>
> +#include <sys/mman.h>
> +#include <unistd.h>
> +#include "tree-vect.h"
> +
> +#define MAX_COMPARE 5000
> +
> +__attribute__((noipa))
> +int diff (uint64_t *restrict p, uint64_t *restrict q)
> +{
> + int i = 0;
> + while (i < MAX_COMPARE) {
> + if (*(p + i) != *(q + i))
> + return i;
> + i++;
> + }
> + return -1;
> +}
> +
> +int main ()
> +{
> + check_vect ();
> +
> + long pgsz = sysconf (_SC_PAGESIZE);
> + if (pgsz == -1) {
> + fprintf (stderr, "sysconf failed\n");
> + return 0;
> + }
> +
> + /* Allocate 2 consecutive pages of memory and let p1 and p2 point to the
> + beginning of each. */
> + void *mem = mmap (NULL, pgsz * 2, PROT_READ | PROT_WRITE,
> + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> + if (mem == MAP_FAILED) {
> + fprintf (stderr, "mmap failed\n");
> + return 0;
> + }
> + uint64_t *p1 = (uint64_t *) mem;
> + uint64_t *p2 = (uint64_t *) mem + pgsz / sizeof (uint64_t);
> +
> + /* Fill the first page with zeros, except for its last 64 bits. */
> + memset (p1, 0, pgsz);
> + *(p2 - 1) = -1;
> +
> + /* Make the 2nd page not accessable. */
> + mprotect (p2, pgsz, PROT_NONE);
> +
> + /* Calls to diff should not read the 2nd page. */
> + for (int i = 1; i <= 20; i++) {
> + if (diff (p2 - i, p1) != i - 1)
> + __builtin_abort ();
> + }
> +}
> +
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_52.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_52.c
> index 86a632f2a82..6abfcd6580e 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_52.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_52.c
> @@ -18,4 +18,4 @@ int main1 (short X)
> }
> }
>
> -/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" {
> target { ! "x86_64-*-* i?86-*-*" } } } } */
> +/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" {
> target { ! "x86_64-*-* i?86-*-* arm*-*-*" } } } } */
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index e7919b73c25..da700cd1f3d 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -2918,12 +2918,14 @@ vect_enhance_data_refs_alignment (loop_vec_info
> loop_vinfo)
> 2) there is at least one unsupported misaligned data ref with an unknown
> misalignment, and
> 3) all misaligned data refs with a known misalignment are supported, and
> - 4) the number of runtime alignment checks is within reason. */
> + 4) the number of runtime alignment checks is within reason.
> + 5) the vectorization factor is a constant. */
>
> do_versioning
> = (optimize_loop_nest_for_speed_p (loop)
> && !loop->inner /* FORNOW */
> - && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
> + && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP)
> + && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ();
>
> if (do_versioning)
> {
> @@ -2964,17 +2966,6 @@ vect_enhance_data_refs_alignment (loop_vec_info
> loop_vinfo)
> break;
> }
>
> - /* At present we don't support versioning for alignment
> - with variable VF, since there's no guarantee that the
> - VF is a power of two. We could relax this if we added
> - a way of enforcing a power-of-two size. */
> - unsigned HOST_WIDE_INT size;
> - if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
> - {
> - do_versioning = false;
> - break;
> - }
> -
> /* Forcing alignment in the first iteration is no good if
> we don't keep it across iterations. For now, just disable
> versioning in this case.
> @@ -2993,7 +2984,8 @@ vect_enhance_data_refs_alignment (loop_vec_info
> loop_vinfo)
> Construct the mask needed for this test. For example,
> GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
> mask must be 15 = 0xf. */
> - int mask = size - 1;
> + gcc_assert (DR_TARGET_ALIGNMENT (dr_info).is_constant ());
> + int mask = DR_TARGET_ALIGNMENT (dr_info).to_constant () - 1;
>
> /* FORNOW: use the same mask to test all potentially unaligned
> references in the loop. */
> --
> 2.43.0
>