On Mon, 24 Nov 2025, Richard Biener wrote:

> The following records both a possibly notinbranch and an inbranch
> SIMD clone during analysis so that we can properly handle the
> late decision on loop masking.  Recording of linear-clause data
> from analysis is extended to cover linear-clause arguments from
> both clones.
> 
> This also fixes AVX512 masked loop code generation in line with
> the previous fixes.
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Does this look OK?

I have split out and pushed the last two hunks of the
vectorizable_simd_clone_call part as a correctness fix, so this now
remains as a missed-optimization fix.

Richard.

> Thanks,
> Richard.
> 
>       PR tree-optimization/122776
>       * tree-vectorizer.h (vect_simd_clone_data::clone,
>       vect_simd_clone_data::clone_inbranch): New fields for
>       the two selected clones.
>       * tree-vect-stmts.cc (vectorizable_simd_clone_call): Record
>       both a possibly notinbranch and an inbranch clone.  Delay
>       the choice between both to code generation based on
>       LOOP_VINFO_FULLY_MASKED_P.  Fix masked loop inbranch
>       code generation for the AVX512 case.
> 
>       * gcc.dg/vect/vect-simd-clone-24.c: New testcase.
>       * gcc.dg/gomp/pr110485.c: Adjust.
> ---
>  gcc/testsuite/gcc.dg/gomp/pr110485.c          |   2 +-
>  .../gcc.dg/vect/vect-simd-clone-24.c          |  22 ++++
>  gcc/tree-vect-stmts.cc                        | 102 ++++++++++--------
>  gcc/tree-vectorizer.h                         |   6 +-
>  4 files changed, 86 insertions(+), 46 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
> 
> diff --git a/gcc/testsuite/gcc.dg/gomp/pr110485.c 
> b/gcc/testsuite/gcc.dg/gomp/pr110485.c
> index ba6817a127f..5183f3f403c 100644
> --- a/gcc/testsuite/gcc.dg/gomp/pr110485.c
> +++ b/gcc/testsuite/gcc.dg/gomp/pr110485.c
> @@ -16,4 +16,4 @@ void foo (int n)
>  }
>  
>  /* { dg-final { scan-tree-dump-not "MASK_LOAD" "vect" } } */
> -/* { dg-final { scan-tree-dump "can't use a fully-masked loop because a 
> non-masked simd clone was selected." "vect" { target x86_64-*-* } } } */
> +/* { dg-final { scan-tree-dump "can't use a fully-masked loop because no 
> masked simd clone was available" "vect" { target x86_64-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c 
> b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
> new file mode 100644
> index 00000000000..081c19bf58f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
> +/* { dg-require-effective-target vect_simd_clones } */
> +/* { dg-additional-options "-fopenmp-simd --param 
> vect-partial-vector-usage=1 -fdump-tree-dce6" } */
> +/* { dg-additional-options "-mavx512f" { target avx512f } } */
> +
> +#pragma omp declare simd simdlen(16)
> +int __attribute__((const)) baz (int x);
> +
> +int a[1024];
> +
> +void foo (int n, int * __restrict b)
> +{
> +  for (int i = 0; i < n; ++i)
> +    if (baz (a[i]))
> +      b[i] = baz (b[i]);
> +}
> +
> +/* One notinbranch SIMD call, one inbranch in the main vector loop and two
> +   inbranch in the masked epilog.  */
> +/* { dg-final { scan-tree-dump-times "simdclone\.\[0-9\] \\\(\[^,\]\+\\\)" 1 
> "dce6" { target avx512f } } } */
> +/* { dg-final { scan-tree-dump-times "simdclone\.\[0-9\] 
> \\\(\[^,\]\+,\[^,\]\+\\\)" 3 "dce6" { target avx512f } } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index a18772f5928..bfed98b8af0 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -4219,9 +4219,12 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>    poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
>    unsigned group_size = SLP_TREE_LANES (slp_node);
>    unsigned int badness = 0;
> +  unsigned int badness_inbranch = 0;
>    struct cgraph_node *bestn = NULL;
> +  struct cgraph_node *bestn_inbranch = NULL;
>    if (!cost_vec)
> -    bestn = cgraph_node::get (simd_clone_info[0]);
> +    bestn = ((loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +          ? data.clone_inbranch : data.clone);
>    else
>      for (struct cgraph_node *n = node->simd_clones; n != NULL;
>        n = n->simdclone->next_clone)
> @@ -4351,14 +4354,19 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>                       SIMD_CLONE_ARG_TYPE_MASK);
>           /* Penalize using a masked SIMD clone in a non-masked loop, that is
>              not in a branch, as we'd have to construct an all-true mask.  */
> -         if (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> -           this_badness += 64;
> +         this_badness += 64;
>         }
>       if (bestn == NULL || this_badness < badness)
>         {
>           bestn = n;
>           badness = this_badness;
>         }
> +     if (n->simdclone->inbranch
> +         && (bestn_inbranch == NULL || this_badness < badness_inbranch))
> +       {
> +         bestn_inbranch = n;
> +         badness_inbranch = this_badness;
> +       }
>        }
>  
>    if (bestn == NULL)
> @@ -4394,6 +4402,17 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>                              "incompatible vector types for invariants\n");
>           return false;
>         }
> +
> +      if (!bestn_inbranch && loop_vinfo)
> +     {
> +       if (dump_enabled_p ()
> +           && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                          "can't use a fully-masked loop because no"
> +                          " masked simd clone was available.\n");
> +       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> +     }
> +
>        /* When the original call is pure or const but the SIMD ABI dictates
>        an aggregate return we will have to use a virtual definition and
>        in a loop eventually even need to add a virtual PHI.  That's
> @@ -4407,45 +4426,41 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>        so automagic virtual operand updating doesn't work.  */
>        if (gimple_vuse (stmt))
>       vinfo->any_known_not_updated_vssa = true;
> -      simd_clone_info.safe_push (bestn->decl);
> -      for (i = 0; i < bestn->simdclone->nargs; i++)
> -     {
> -       switch (bestn->simdclone->args[i].arg_type)
> -         {
> -         default:
> -           continue;
> -         case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
> -         case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
> -           {
> -             simd_clone_info.safe_grow_cleared (i * 3 + 1, true);
> -             simd_clone_info.safe_push (arginfo[i].op);
> -             tree lst = POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
> -                        ? size_type_node : TREE_TYPE (arginfo[i].op);
> -             tree ls = build_int_cst (lst, arginfo[i].linear_step);
> -             simd_clone_info.safe_push (ls);
> -             tree sll = arginfo[i].simd_lane_linear
> -                        ? boolean_true_node : boolean_false_node;
> -             simd_clone_info.safe_push (sll);
> -           }
> -           break;
> -         case SIMD_CLONE_ARG_TYPE_MASK:
> -           if (loop_vinfo
> -               && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> -             vect_record_loop_mask (loop_vinfo,
> -                                    &LOOP_VINFO_MASKS (loop_vinfo),
> -                                    ncopies_in, vectype, op);
> -           break;
> -         }
> -     }
>  
> -      if (!bestn->simdclone->inbranch && loop_vinfo)
> -     {
> -       if (dump_enabled_p ()
> -           && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> -         dump_printf_loc (MSG_NOTE, vect_location,
> -                          "can't use a fully-masked loop because a"
> -                          " non-masked simd clone was selected.\n");
> -       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> +      data.clone = bestn;
> +      data.clone_inbranch = bestn_inbranch;
> +
> +      simd_clone_info.safe_push (NULL_TREE);
> +      for (i = 0;
> +        i < (bestn_inbranch ? bestn_inbranch : bestn)->simdclone->nargs; i++)
> +     {
> +       if (loop_vinfo
> +           && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> +           && (bestn_inbranch->simdclone->args[i].arg_type
> +               == SIMD_CLONE_ARG_TYPE_MASK))
> +         vect_record_loop_mask (loop_vinfo,
> +                                &LOOP_VINFO_MASKS (loop_vinfo),
> +                                ncopies_in, vectype, op);
> +       else if ((bestn->simdclone->args[i].arg_type
> +                 == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP)
> +                || (bestn->simdclone->args[i].arg_type
> +                    == SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP)
> +                || (bestn_inbranch
> +                    && ((bestn_inbranch->simdclone->args[i].arg_type
> +                         == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP)
> +                        || (bestn_inbranch->simdclone->args[i].arg_type
> +                            == 
> SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP))))
> +         {
> +           simd_clone_info.safe_grow_cleared (i * 3 + 1, true);
> +           simd_clone_info.safe_push (arginfo[i].op);
> +           tree lst = (POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
> +                       ? size_type_node : TREE_TYPE (arginfo[i].op));
> +           tree ls = build_int_cst (lst, arginfo[i].linear_step);
> +           simd_clone_info.safe_push (ls);
> +           tree sll = (arginfo[i].simd_lane_linear
> +                       ? boolean_true_node : boolean_false_node);
> +           simd_clone_info.safe_push (sll);
> +         }
>       }
>  
>        SLP_TREE_TYPE (slp_node) = call_simd_clone_vec_info_type;
> @@ -4816,9 +4831,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>  
>         tree masktype = bestn->simdclone->args[mask_i].vector_type;
>         if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
> -         /* Guess the number of lanes represented by masktype.  */
>           callee_nelements = exact_div (bestn->simdclone->simdlen,
> -                                       bestn->simdclone->nargs - nargs);
> +                                       
> bestn->simdclone->args[i].linear_step);
>         else
>           callee_nelements = TYPE_VECTOR_SUBPARTS (masktype);
>         o = vector_unroll_factor (nunits, callee_nelements);
> @@ -4828,7 +4842,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>               {
>                 vec_loop_masks *loop_masks = &LOOP_VINFO_MASKS (loop_vinfo);
>                 mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> -                                          ncopies, masktype, j);
> +                                          ncopies_in, vectype, j);
>               }
>             else
>               mask = vect_build_all_ones_mask (vinfo, stmt_info, masktype);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 55f0bee0eb7..3b264a6102c 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -265,8 +265,12 @@ struct vect_simd_clone_data : vect_data {
>    vect_simd_clone_data () = default;
>    vect_simd_clone_data (vect_simd_clone_data &&other) = default;
>  
> +  /* Selected SIMD clone and clone for in-branch.  */
> +  cgraph_node *clone;
> +  cgraph_node *clone_inbranch;
> +
>    /* Selected SIMD clone's function info.  First vector element
> -     is SIMD clone's function decl, followed by a pair of trees (base + step)
> +     is NULL_TREE, followed by a pair of trees (base + step)
>       for linear arguments (pair of NULLs for other arguments).  */
>    auto_vec<tree> simd_clone_info;
>  };
> 

-- 
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Jochen Jaser, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to