> On 11 Oct 2025, at 13:18, Tamar Christina <[email protected]> wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> Hi Jennifer,
> 
> could you next time send these out as different emails? The multiple patches 
> as
> attachments is a bit hard to reply to the relevant bits.
> 
>> From b8e2225eb288d6805cccbbfc5a46dccc0b8d61e6 Mon Sep 17 00:00:00 2001
>> From: Jennifer Schmitz <[email protected]>
>> Date: Fri, 22 Aug 2025 07:16:48 -0700
>> Subject: [PATCH 2/2] [PR121604] aarch64: Fold svbrka/b with ptrue predicate
>> and operand to pfalse.
>> 
>> We can fold svbrka/b to pfalse, if the governing predicate and the operand
>> are both ptrue.
>> We implemented this optimization during gimple folding in
>> svbrk_unary_impl::fold.
>> 
> 
> Only BRKB can fold to pfalse here. BRKA will include the first element.
> BRKA can however fold to ptrue pN, vl1. So the BRKA folding here is incorrect.
> 
> GCC seems to miss most valid foldings of BRK[AB] though.  In case you wanted 
> some
> examples I wrote some below including the values that need to be active for it
> to fold. Not saying these are required to be fixed by you, just pointing them 
> out.
> 
> #include <arm_sve.h>
> 
> // should fold to return b,
> __attribute__ ((noipa))
> svbool_t f1m (svbool_t a, svbool_t b)
> {
>    return svbrka_b_m (b, svpfalse_b(), a);
> }
> 
> // should fold to pfalse
> __attribute__ ((noipa))
> svbool_t f1z (svbool_t a)
> {
>    return svbrka_b_z (svpfalse_b(), a);
> }
> 
> // should not fold
> __attribute__ ((noipa))
> svbool_t f2m (svbool_t a, svbool_t b)
> {
>    return svbrka_b_m (b, svptrue_b8(), a);
> }
> 
> // should not fold
> __attribute__ ((noipa))
> svbool_t f2z (svbool_t a)
> {
>    return svbrka_b_z (svptrue_b8(), a);
> }
> 
> // should fold to pfirst
> __attribute__ ((noipa))
> svbool_t f3z (svbool_t a)
> {
>    return svbrka_b_z (svptrue_b8(), svptrue_b8());
> }
> 
> // don't fold
> __attribute__ ((noipa))
> svbool_t f3m (svbool_t a, svbool_t b)
> {
>    return svbrka_b_m (a, svptrue_b8(), a);
> }
> 
> // should fold to pfirst
> __attribute__ ((noipa))
> svbool_t f4m (svbool_t a)
> {
>    return svbrka_b_m (a, svptrue_b8(), svptrue_b8 ());
> }
> 
> // should fold to ptrue
> __attribute__ ((noipa))
> svbool_t f4z ()
> {
>    return svbrka_b_z (svptrue_b8(), svpfalse());
> }
> 
> // fold to pfalse
> __attribute__ ((noipa))
> svbool_t g1z (svbool_t a)
> {
>    return svbrkb_b_z (svpfalse_b(), a);
> }
> 
> // don't fold
> __attribute__ ((noipa))
> svbool_t g2m (svbool_t a)
> {
>    return svbrkb_b_m (svpfalse_b(), svptrue_b8(), a);
> }
> 
> // don't fold
> __attribute__ ((noipa))
> svbool_t g2z (svbool_t a)
> {
>    return svbrkb_b_z (svptrue_b8(), a);
> }
> 
> // should fold to ptrue
> __attribute__ ((noipa))
> svbool_t g3m (svbool_t a)
> {
>    return svbrkb_b_m (a, svptrue_b8(), svpfalse ());
> }
> 
> // fold to pfalse
> __attribute__ ((noipa))
> svbool_t g4m (svbool_t a)
> {
>    return svbrkb_b_m (a, svptrue_b8(), svptrue_b8 ());
> }
> 
> __attribute__ ((noipa))
> void check_result (svbool_t a, svbool_t b)
> {
>  svbool_t pg = svptrue_b8();
>  svbool_t diff = svorr_b_z(pg,
>                            svand_b_z(pg, a, svnot_b_z(pg, b)),
>                            svand_b_z(pg, b, svnot_b_z(pg, a)));
>    if (svcntp_b8(pg, diff) != 0)
>      __builtin_abort ();
> }
> 
> int main ()
> {
>    svbool_t a = svptrue_pat_b16 (SV_VL4);
>    svbool_t b = svptrue_pat_b16 (SV_VL5);
> 
>    // Should fold to second argument
>    check_result (f1m (a, b), b);
>    check_result (f1m (b, svpfalse ()), svpfalse ());
> 
>    // should fold to false
>    check_result (f1z (a), svpfalse ());
>    check_result (f1z (b), svpfalse ());
> 
>    // should fold to pfirst
>    check_result (f3z (a), svptrue_pat_b16 (SV_VL1));
>    check_result (f3z (b), svptrue_pat_b16 (SV_VL1));
>    check_result (f3z (svptrue_b8 ()), svptrue_pat_b16 (SV_VL1));
>    check_result (f3z (svpfalse ()), svptrue_pat_b16 (SV_VL1));
> 
>    // should fold to pfirst
>    check_result (f4m (a), svptrue_pat_b16 (SV_VL1));
>    check_result (f4m (b), svptrue_pat_b16 (SV_VL1));
>    check_result (f4m (svptrue_b8 ()), svptrue_pat_b16 (SV_VL1));
>    check_result (f4m (svpfalse ()), svptrue_pat_b16 (SV_VL1));
> 
>    // should fold to ptrue
>    check_result (f4z (), svptrue_b8 ());
> 
>    // should fold to pfalse
>    check_result (g1z (a), svpfalse ());
>    check_result (g1z (b), svpfalse ());
>    check_result (g1z (svptrue_b8 ()), svpfalse ());
>    check_result (g1z (svpfalse ()), svpfalse ());
> 
>    // Should fold to ptrue
>    check_result (g3m (a), svptrue_b8 ());
>    check_result (g3m (b), svptrue_b8 ());
>    check_result (g3m (svptrue_b8 ()), svptrue_b8 ());
>    check_result (g3m (svpfalse ()), svptrue_b8 ());
> 
>    // should fold to pfalse
>    check_result (g4m (a), svpfalse ());
>    check_result (g4m (b), svpfalse ());
>    check_result (g4m (svptrue_b8 ()), svpfalse ());
>    check_result (g4m (svpfalse ()), svpfalse ());
> }
> 
>> the case of m predication. The reason is that if get_gp_arg_index returns the
>> “normal” position of the governing predicate (i.e. for x or z predication), 
>> we
>> still have to adjust the position in case of m predication for a few unary 
>> shapes.
>> And that brings us back to needing case distinctions for specific shapes in
>> gimple_folder::fold_pfalse, which Alex wanted to avoid (if I understood 
>> correctly).
>> What do you think?
> 
> Yeah I wanted to avoid changes to the shape class since the comments around 
> that
> area indicate that it was an intentional decision not to store the location of
> the GP in the shape.  Hence my suggestion to store this at a higher level.
> 
> However I spent some time trying to understand how the parsing works, and
> reading more into how the intrinsics framework is structured, I think we
> actually do know the location of the GP but we don't store it.
> 
> During apply_predication we do adjust the index based on where the predicate
> could be, so the shape does know where the GP is supposed to be.
> 
> So instead how about the attached approach?  I've tested it on a few of the
> cases and it gives the expected results and is a much cleaner approach I 
> think.
> 
> What do you think of going that way instead?
Hi Tamar,
Thanks for taking the time to dig into this. Your solution looks good, and I 
agree that apply_predication is the right place for the logic that sets the gp 
index. I added a few (minor) comments inline below.
Once you have committed this patch, I can follow up with the svbrka/b folding.
Thanks,
Jennifer
> 
> Thanks,
> Tamar
> <rb19899.patch>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc 
> b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> index 
> 74a3338e9556c4369a3b1e6e864acfba4ced0e3a..4e1e8ae9dc3cd003c43486a6d9ba11c5b16cab40
>  100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> @@ -21,7 +21,10 @@
>  #include "system.h"
>  #include "coretypes.h"
>  #include "tm.h"
> +#include "basic-block.h"
>  #include "tree.h"
> +#include "function.h"
> +#include "gimple.h"
>  #include "rtl.h"
>  #include "tm_p.h"
>  #include "memmodel.h"
> @@ -68,23 +71,36 @@ za_group_is_pure_overload (const function_group_info 
> &group)
>     types in ARGUMENT_TYPES.  RETURN_TYPE is the type returned by the
>     function.  */
>  static void
> -apply_predication (const function_instance &instance, tree return_type,
> +apply_predication (function_instance &instance, tree return_type,
>                  vec<tree> &argument_types)
>  {
> +  /* Initially mark the function as not being predicated.  */
> +  instance.gp_pred_index = -1;
> +
>    /* There are currently no SME ZA instructions that have both merging and
>       unpredicated forms, so for simplicity, the predicates are always 
> included
>       in the original format string.  */
>    if (instance.pred != PRED_none && instance.pred != PRED_za_m)
>      {
>        argument_types.quick_insert (0, instance.gp_type ());
> +      instance.gp_pred_index = 0;
>        /* For unary merge operations, the first argument is a vector with
>        the same type as the result.  For unary_convert_narrowt it also
>        provides the "bottom" half of active elements, and is present
>        for all types of predication.  */
>        auto nargs = argument_types.length () - 1;
>        if (instance.shape->has_merge_argument_p (instance, nargs))
> -     argument_types.quick_insert (0, return_type);
> +     {
> +       argument_types.quick_insert (0, return_type);
> +       instance.gp_pred_index = 1;
> +     }
>      }
> +
> +  /* Check if the shape has overriden the GP argument.  In this case register
> +     the types, predicate the function but don't mark it as having a GP and
> +     override what was set before.  */
> +  if (!instance.shape->has_gp_argument_p (instance))
> +    instance.gp_pred_index = -1;
>  }
>  
>  /* Parse and move past an element type in FORMAT and return it as a type
> @@ -3332,6 +3348,14 @@ struct pmov_to_vector_lane_def : public 
> overloaded_base<0>
>         but it doesn't currently have the necessary information.  */
>      return c.require_immediate_range (1, 1, bytes - 1);
>    }
> +
> +  /* This function has a predicate argument, and is a merging instruction, 
> but
> +     the predicate is not a GP, it's the inactive lanes.  */
> +  bool
> +  has_gp_argument_p (const function_instance &) const override
> +  {
> +    return false;
> +  }
>  };
>  SHAPE (pmov_to_vector_lane)
>  
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc 
> b/gcc/config/aarch64/aarch64-sve-builtins.cc
> index 
> 22d75197188d4bdcd771d9b684da5b6ea67db598..4a74ede653fbdea6d31494815baa7948f1cfc088
>  100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
> @@ -3632,24 +3632,22 @@ gimple_folder::redirect_pred_x ()
>  gimple *
>  gimple_folder::fold_pfalse ()
>  {
> -  if (pred == PRED_none)
> +  tree gp = gp_pred (call);
> +  /* If there isn't a GP then we can't do any folding as the instruction 
> isn't
> +     predicated.  */
> +  if (!gp)
>      return nullptr;
> -  tree arg0 = gimple_call_arg (call, 0);
> +
>    if (pred == PRED_m)
>      {
> -      /* Unary function shapes with _m predication are folded to the
> -      inactive vector (arg0), while other function shapes are folded
> -      to op1 (arg1).  */
> -      tree arg1 = gimple_call_arg (call, 1);
> -      if (is_pfalse (arg1))
> -     return fold_call_to (arg0);
> -      if (is_pfalse (arg0))
> -     return fold_call_to (arg1);
> +      tree val = inactive_values (call);
> +      if (is_pfalse (gp))
> +     return fold_call_to (val);
>        return nullptr;
>      }
> -  if ((pred == PRED_x || pred == PRED_z) && is_pfalse (arg0))
> +  if ((pred == PRED_x || pred == PRED_z) && is_pfalse (gp))
>      return fold_call_to (build_zero_cst (TREE_TYPE (lhs)));
> -  if (pred == PRED_implicit && is_pfalse (arg0))
> +  if (pred == PRED_implicit && is_pfalse (gp))
>      {
>        unsigned int flags = call_properties ();
>        /* Folding to lhs = {0, ...} is not appropriate for intrinsics with
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h 
> b/gcc/config/aarch64/aarch64-sve-builtins.h
> index 
> d6a58b450d6ddfd38788899b61b75c14e9472a99..4bbb0cef54ecd7979b65633c09cee79153c057c5
>  100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.h
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.h
> @@ -403,6 +403,8 @@ public:
>    bool could_trap_p () const;
>  
>    vector_type_index gp_type_index () const;
> +  tree gp_pred (gcall *) const;
> +  tree inactive_values (gcall *) const;
>    tree gp_type () const;
>  
>    unsigned int vectors_per_tuple () const;
> @@ -436,6 +438,7 @@ public:
>    group_suffix_index group_suffix_id;
>    predication_index pred;
>    fpm_mode_index fpm_mode;
> +  int gp_pred_index;
I think we could drop the “_pred” in the variable name, because that’s already 
covered by the “p” in “gp”.
The same applies to “tree gp_pred (gcall *) const;”.
> 
>  };
>  
>  class registered_function;
> @@ -801,6 +805,8 @@ public:
>    virtual bool has_merge_argument_p (const function_instance &,
>                                    unsigned int) const;
>  
> +  virtual bool has_gp_argument_p (const function_instance &) const;
> +
>    virtual bool explicit_type_suffix_p (unsigned int) const = 0;
>  
>    /* True if the group suffix is present in overloaded names.
> @@ -949,6 +955,34 @@ function_instance::gp_type () const
>    return acle_vector_types[0][gp_type_index ()];
>  }
>  
> +/* Return the tree value that should be used as the governing predicate of
> +   this function.  If none then return NULL_TREE.  */
> +inline tree
> +function_instance::gp_pred (gcall *call) const
> +{
> +  if (pred == PRED_none || gp_pred_index < 0)
> +    return NULL_TREE;
> +
> +  return gimple_call_arg (call, gp_pred_index);
> +}
> +
> +/* Return the tree value that should be used the inactive lanes should this
> +   function be a predicated function with a gp.  If none then return
> +   NULL_TREE.  */
> +inline tree
> +function_instance::inactive_values (gcall *call) const
> +{
> +  if (pred == PRED_none || gp_pred_index < 0)
> +    return NULL_TREE;
> +
> +  /* Function is unary.  */
How about “/* Function is unary with m predication.  */”? For x/z 
predication, gp_pred_index is also 0 for unary types.
> +  if (gp_pred_index == 1)
> +    return gimple_call_arg (call, 0);
> +
> +  /* Else the inactive values are the next element.  */
> +  return gimple_call_arg (call, 1);
> +}
> +
>  /* If the function operates on tuples of vectors, return the number
>     of vectors in the tuples, otherwise return 1.  */
>  inline unsigned int
> @@ -1123,6 +1157,14 @@ function_shape::has_merge_argument_p (const 
> function_instance &instance,
>    return nargs == 1 && instance.pred == PRED_m;
>  }
>  
> +/* Return true if INSTANCE (which has NARGS arguments) has an predicate 
> argument
> +   that can be used as the global predicate.  */
How about “/* Return true if INSTANCE has a predicate argument that acts as 
the governing predicate.  */”?
> +inline bool
> +function_shape::has_gp_argument_p (const function_instance &instance) const
> +{
> +  return instance.pred != PRED_none;
> +}
> +
>  /* Return the mode of the result of a call.  */
>  inline machine_mode
>  function_expander::result_mode () const


Reply via email to