On Mon, 23 Jun 2025, Tamar Christina wrote:
> Consider the loop
>
> void f1 (int *restrict a, int n)
> {
> #pragma GCC unroll 4
> for (int i = 0; i < n; i++)
> a[i] *= 2;
> }
>
> Which today is vectorized and then unrolled 3x by the RTL unroller due to the
> use of the pragma. This is unfortunate because the pragma was intended for
> the scalar loop, but we end up with an unrolled vector loop and a longer path
> to the entry which has a low enough VF requirement to enter.
>
> This patch instead seeds the suggested_unroll_factor with the value the user
> requested and uses it to preserve the total VF that the user wanted the
> scalar loop to have.
>
> In effect it applies the unrolling inside the vector loop itself. This has
> benefits for things like reductions, as it allows us to split the accumulator
> and so the unrolled loop is more efficient. For early-break it allows the
> cbranch call to be shared between the unrolled elements, giving you more
> effective unrolling because it doesn't need the repeated cbranch which can be
> expensive.
>
> The target can then choose to create multiple epilogues to deal with the
> "rest".
>
> The example above now generates:
>
> .L4:
> ldr q31, [x2]
> add v31.4s, v31.4s, v31.4s
> str q31, [x2], 16
> cmp x2, x3
> bne .L4
>
> as V4SI maintains the requested VF, but e.g. pragma unroll 8 generates:
>
> .L4:
> ldp q30, q31, [x2]
> add v30.4s, v30.4s, v30.4s
> add v31.4s, v31.4s, v31.4s
> stp q30, q31, [x2], 32
> cmp x3, x2
> bne .L4
>
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf and x86_64-pc-linux-gnu (-m32 and -m64)
> with no issues.
>
> Ok for master?
OK.
Thanks,
Richard.
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * doc/extend.texi: Document pragma unroll interaction with vectorizer.
> * tree-vectorizer.h (LOOP_VINFO_USER_UNROLL): New.
> (class _loop_vec_info): Add user_unroll.
> * tree-vect-loop.cc (vect_analyze_loop_1): Set
> suggested_unroll_factor and retry.
> (_loop_vec_info::_loop_vec_info): Initialize user_unroll.
> (vect_transform_loop): Clear the loop->unroll value if the pragma was
> used.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/unroll-vect.c: New test.
>
> ---
> diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
> index
> 69c6512074642ece47f1f9a3d7bdde20ec800d40..7da99f77ec82b23f7a79558f3a0fa98b208f8283
> 100644
> --- a/gcc/doc/extend.texi
> +++ b/gcc/doc/extend.texi
> @@ -10382,6 +10382,11 @@ loop or a @code{#pragma GCC ivdep}, and applies only
> to the loop that follows.
> @var{n} is an integer constant expression specifying the unrolling factor.
> The values of @math{0} and @math{1} block any unrolling of the loop.
>
> +If the loop was vectorized the unroll factor specified will be used to seed the
> +vectorizer unroll factor. Whether the loop is unrolled or not will be
> +determined by target costing. The resulting vectorized loop may still be
> +unrolled more in later passes depending on the target costing.
> +
> @end table
>
> @node Thread-Local
> diff --git a/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
> b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..3cb774ba95787ebee488fbe7306299ef28e6bb35
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -march=armv8-a --param aarch64-autovec-preference=asimd-only -std=gnu99" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** f1:
> +** ...
> +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +** ...
> +*/
> +void f1 (int *restrict a, int n)
> +{
> +#pragma GCC unroll 16
> + for (int i = 0; i < n; i++)
> + a[i] *= 2;
> +}
> +
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index
> 9ac4d7e5f7a099a7039cd4186666cf64328b8ee6..44f304b6e3766d43d388599b6a80ab9e8e3123ef
> 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -1073,6 +1073,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in,
> vec_info_shared *shared)
> peeling_for_gaps (false),
> peeling_for_niter (false),
> early_breaks (false),
> + user_unroll (false),
> no_data_dependencies (false),
> has_mask_store (false),
> scalar_loop_scaling (profile_probability::uninitialized ()),
> @@ -3428,27 +3429,50 @@ vect_analyze_loop_1 (class loop *loop,
> vec_info_shared *shared,
> res ? "succeeded" : "failed",
> GET_MODE_NAME (loop_vinfo->vector_mode));
>
> - if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) && suggested_unroll_factor > 1)
> + auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
> + if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> + /* Check to see if the user wants to unroll or if the target wants to. */
> + && (suggested_unroll_factor > 1 || user_unroll > 1))
> {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_NOTE, vect_location,
> + if (suggested_unroll_factor == 1)
> + {
> + int assumed_vf = vect_vf_for_cost (loop_vinfo);
> + suggested_unroll_factor = user_unroll / assumed_vf;
> + if (suggested_unroll_factor > 1)
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location,
> + "setting unroll factor to %d based on user requested "
> + "unroll factor %d and suggested vectorization "
> + "factor: %d\n",
> + suggested_unroll_factor, user_unroll, assumed_vf);
> + }
> + }
> +
> + if (suggested_unroll_factor > 1)
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location,
> "***** Re-trying analysis for unrolling"
> " with unroll factor %d and slp %s.\n",
> suggested_unroll_factor,
> slp_done_for_suggested_uf ? "on" : "off");
> - loop_vec_info unroll_vinfo
> - = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
> - unroll_vinfo->vector_mode = vector_mode;
> - unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
> - opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
> - slp_done_for_suggested_uf);
> - if (new_res)
> - {
> - delete loop_vinfo;
> - loop_vinfo = unroll_vinfo;
> - }
> - else
> - delete unroll_vinfo;
> + loop_vec_info unroll_vinfo
> + = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
> + unroll_vinfo->vector_mode = vector_mode;
> + unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
> + opt_result new_res
> + = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
> + slp_done_for_suggested_uf);
> + if (new_res)
> + {
> + delete loop_vinfo;
> + loop_vinfo = unroll_vinfo;
> + LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
> + }
> + else
> + delete unroll_vinfo;
> + }
> }
>
> /* Remember the autodetected vector mode. */
> @@ -12041,6 +12065,13 @@ vect_transform_loop (loop_vec_info loop_vinfo,
> gimple *loop_vectorized_call)
> dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
> " variable-length vectorization factor\n");
> }
> +
> + /* When we have unrolled the loop due to a user requested value we should
> + leave it up to the RTL unroll heuristics to determine if it's still worth
> + while to unroll more. */
> + if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
> + loop->unroll = 0;
> +
> /* Free SLP instances here because otherwise stmt reference counting
> won't work. */
> slp_instance instance;
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index
> 7aa2b02b63cb88a61b88295c43d4325ee487c619..5351e1aee44fb3b9a75c13940b5d239fb48871cb
> 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -970,6 +970,10 @@ public:
> /* Main loop IV cond. */
> gcond* loop_iv_cond;
>
> + /* True if we have an unroll factor requested by the user through pragma GCC
> + unroll. */
> + bool user_unroll;
> +
> /* True if there are no loop carried data dependencies in the loop.
> If loop->safelen <= 1, then this is always true, either the loop
> didn't have any loop carried data dependencies, or the loop is being
> @@ -1094,6 +1098,7 @@ public:
> #define LOOP_VINFO_CHECK_UNEQUAL_ADDRS(L) (L)->check_unequal_addrs
> #define LOOP_VINFO_CHECK_NONZERO(L) (L)->check_nonzero
> #define LOOP_VINFO_LOWER_BOUNDS(L) (L)->lower_bounds
> +#define LOOP_VINFO_USER_UNROLL(L) (L)->user_unroll
> #define LOOP_VINFO_GROUPED_STORES(L) (L)->grouped_stores
> #define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances
> #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
>
>
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)