On Sat, Nov 29, 2025 at 3:53 PM Robin Dapp <[email protected]> wrote:
>
> > I think this is better, but I'd make this partial_store_elidable_p only
> > and for loads replace with 'else'?  Alternatively rename it to
> > partial_load_store_all_lanes_masked_p or so?  Btw, I see we're oddly
> > rejecting any mask != -1 even when len == 0?  Likewise we don't seem to
> > treat mask == 0 the same as len == 0?
>
> Yes, I also noticed the inconsistencies.  At first I wanted to change as
> little as possible, but maybe another approach is still OK:
>
> What we're still missing to treat things more uniformly is an else operand
> for len_load.  Then we can use internal_fn_else_index for all partial loads
> and use its value.
>
> So I went ahead and did that in a separate preparation patch.
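>
> With that in place the fold can fetch the else value uniformly for every
> partial load, roughly like this (sketch only; the actual code is in the
> patch below):
>
>   int else_index = internal_fn_else_index (ifn);
>   tree else_value = gimple_call_arg (call, else_index);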
>
> The attached v3 of the elision patch goes a bit further than v1/v2 in that
> it tries to classify "all active", "all inactive", and "mixed" and also
> checks all partial loads (like gathers, lanes).  It depends on the
> preparation patch, though, because it calls internal_fn_else_index
> unconditionally.
>
> They have been bootstrapped and regtested individually as well as together on
> x86 and power10.  Regtested on riscv64, aarch64, and s390 (via qemu).

LGTM.

Thanks,
Richard.

> Regards
>  Robin
>
>
> [PATCH v3] fold: Elide MASK_LEN_LOAD/STORE with zero length [PR122635].
>
> This patch adds zero-length handling to gimple_fold_partial_store and
> gimple_fold_partial_load and unifies them into
> gimple_fold_partial_load_store.
>
> It introduces a new function partial_load_store_mask_state that
> returns
>  MASK_ALL_INACTIVE,
>  MASK_ALL_ACTIVE, or
>  MASK_UNKNOWN.
>
> This result is used to either replace a load with its else value or elide
> a store (when all inactive), turn the load/store into a regular mem ref
> (when all active), or do nothing.
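>
> For example (GIMPLE shown schematically, operand order abbreviated), a
> load whose mask/length is known to be all inactive
>
>   lhs_1 = .MASK_LEN_LOAD (ptr, align, mask, else, 0, 0);
>
> becomes
>
>   lhs_1 = else;
>
> while an all-active one is folded into a plain vector MEM_REF assignment.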
>
>         PR tree-optimization/122635
>
> gcc/ChangeLog:
>
>         * gimple-fold.cc (enum mask_load_store_state): New enum.
>         (gimple_fold_partial_load_store_mem_ref): Only fold
>         "all active" loads/stores.
>         (partial_load_store_mask_state): New function to compute mask
>         state.
>         (gimple_fold_partial_load): Remove.
>         (gimple_fold_partial_load_store): New function.
>         (gimple_fold_partial_store): Remove.
>         (gimple_fold_call): Use new function.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/sve/pfalse-store.c: Expect more elided
>         stores.
>         * gcc.target/riscv/rvv/autovec/pr122635-1.c: New test.
>         * gcc.target/riscv/rvv/autovec/pr122635-2.c: New test.
>         * gcc.target/powerpc/p9-vec-length-epil-8.c: Expect two fewer
>         lxvl.
> ---
>  gcc/gimple-fold.cc                            | 225 ++++++++++++------
>  .../gcc.target/aarch64/sve/pfalse-store.c     |   5 +-
>  .../gcc.target/powerpc/p9-vec-length-epil-8.c |   2 +-
>  .../gcc.target/riscv/rvv/autovec/pr122635-1.c |  20 ++
>  .../gcc.target/riscv/rvv/autovec/pr122635-2.c |  18 ++
>  5 files changed, 198 insertions(+), 72 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-2.c
>
> diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
> index 3fc76313622..a7b5d8352e7 100644
> --- a/gcc/gimple-fold.cc
> +++ b/gcc/gimple-fold.cc
> @@ -5757,50 +5757,112 @@ arith_overflowed_p (enum tree_code code, const_tree type,
>    return wi::min_precision (wres, sign) > TYPE_PRECISION (type);
>  }
>
> -/* If IFN_{MASK,LEN,MASK_LEN}_LOAD/STORE call CALL is unconditional,
> -   return a MEM_REF for the memory it references, otherwise return null.
> -   VECTYPE is the type of the memory vector.  MASK_P indicates it's for
> -   MASK if true, otherwise it's for LEN.  */
> +/* Mask state for partial load/store operations (mask and length).  */
> +enum mask_load_store_state {
> +  MASK_ALL_INACTIVE,  /* All lanes/elements are inactive (can be elided).  */
> +  MASK_ALL_ACTIVE,    /* All lanes/elements are active (unconditional).  */
> +  MASK_UNKNOWN
> +};
>
> -static tree
> -gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool mask_p)
> +/* Check the mask/length state of IFN_{MASK,LEN,MASK_LEN}_LOAD/STORE call CALL.
> +   Returns whether all elements are active, all inactive, or mixed.
> +   VECTYPE is the vector type of the operation.  */
> +
> +static enum mask_load_store_state
> +partial_load_store_mask_state (gcall *call, tree vectype)
>  {
> -  tree ptr = gimple_call_arg (call, 0);
> -  tree alias_align = gimple_call_arg (call, 1);
> -  if (!tree_fits_uhwi_p (alias_align))
> -    return NULL_TREE;
> +  internal_fn ifn = gimple_call_internal_fn (call);
> +  int mask_index = internal_fn_mask_index (ifn);
> +  int len_index = internal_fn_len_index (ifn);
> +
> +  /* Extract length and mask arguments up front.  */
> +  tree len = len_index != -1 ? gimple_call_arg (call, len_index) : NULL_TREE;
> +  tree bias = len ? gimple_call_arg (call, len_index + 1) : NULL_TREE;
> +  tree mask = mask_index != -1 ? gimple_call_arg (call, mask_index) : NULL_TREE;
> +
> +  poly_int64 nelts = GET_MODE_NUNITS (TYPE_MODE (vectype));
>
> -  if (mask_p)
> +  poly_widest_int wlen = -1;
> +  bool full_length_p = !len;  /* No length means full length.  */
> +
> +  /* Compute effective length.  */
> +  if (len && poly_int_tree_p (len))
>      {
> -      tree mask = gimple_call_arg (call, 2);
> -      if (!integer_all_onesp (mask))
> -       return NULL_TREE;
> +      gcc_assert (TREE_CODE (bias) == INTEGER_CST);
> +      wlen = wi::to_poly_widest (len) + wi::to_widest (bias);
> +
> +      if (known_eq (wlen, 0))
> +       return MASK_ALL_INACTIVE;
> +
> +      if (known_eq (wlen, nelts))
> +       full_length_p = true;
> +      else
> +       full_length_p = false;
>      }
> -  else
> +
> +  /* Check mask for early return cases.  */
> +  if (mask)
>      {
> -      internal_fn ifn = gimple_call_internal_fn (call);
> -      int len_index = internal_fn_len_index (ifn);
> -      tree basic_len = gimple_call_arg (call, len_index);
> -      if (!poly_int_tree_p (basic_len))
> -       return NULL_TREE;
> -      tree bias = gimple_call_arg (call, len_index + 1);
> -      gcc_assert (TREE_CODE (bias) == INTEGER_CST);
> -      /* For LEN_LOAD/LEN_STORE/MASK_LEN_LOAD/MASK_LEN_STORE,
> -        we don't fold when (bias + len) != VF.  */
> -      if (maybe_ne (wi::to_poly_widest (basic_len) + wi::to_widest (bias),
> -                   GET_MODE_NUNITS (TYPE_MODE (vectype))))
> -       return NULL_TREE;
> +      if (integer_zerop (mask))
> +       return MASK_ALL_INACTIVE;
> +
> +      if (full_length_p && integer_all_onesp (mask))
> +       return MASK_ALL_ACTIVE;
> +    }
> +  else if (full_length_p)
> +    /* No mask and full length means all active.  */
> +    return MASK_ALL_ACTIVE;
> +
> +  /* For VLA vectors, we can't do much more.  */
> +  if (!nelts.is_constant ())
> +    return MASK_UNKNOWN;
> +
> +  /* Same for VLS vectors with non-constant mask.  */
> +  if (mask && TREE_CODE (mask) != VECTOR_CST)
> +    return MASK_UNKNOWN;
>
> -      /* For MASK_LEN_{LOAD,STORE}, we should also check whether
> -         the mask is all ones mask.  */
> -      if (ifn == IFN_MASK_LEN_LOAD || ifn == IFN_MASK_LEN_STORE)
> +  /* Check VLS vector elements.  */
> +  gcc_assert (wlen.is_constant ());
> +
> +  HOST_WIDE_INT active_len = wlen.to_constant ().to_shwi ();
> +  if (active_len == -1)
> +    active_len = nelts.to_constant ();
> +
> +  /* Check if all elements in the active range match the mask.  */
> +  for (HOST_WIDE_INT i = 0; i < active_len; i++)
> +    {
> +      bool elt_active = !mask || !integer_zerop (vector_cst_elt (mask, i));
> +      if (!elt_active)
>         {
> -         tree mask = gimple_call_arg (call, internal_fn_mask_index (ifn));
> -         if (!integer_all_onesp (mask))
> -           return NULL_TREE;
> +         /* Found an inactive element.  Check if all are inactive.  */
> +         for (HOST_WIDE_INT j = 0; j < active_len; j++)
> +           if (!mask || !integer_zerop (vector_cst_elt (mask, j)))
> +             return MASK_UNKNOWN;  /* Mixed state.  */
> +         return MASK_ALL_INACTIVE;
>         }
>      }
>
> +  /* All elements in active range are active.  */
> +  return full_length_p ? MASK_ALL_ACTIVE : MASK_UNKNOWN;
> +}
> +
> +
> +/* If IFN_{MASK,LEN,MASK_LEN}_LOAD/STORE call CALL is unconditional
> +   (all lanes active), return a MEM_REF for the memory it references.
> +   Otherwise return NULL_TREE.  VECTYPE is the type of the memory vector.  */
> +
> +static tree
> +gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype)
> +{
> +  /* Only fold if all lanes are active (unconditional).  */
> +  if (partial_load_store_mask_state (call, vectype) != MASK_ALL_ACTIVE)
> +    return NULL_TREE;
> +
> +  tree ptr = gimple_call_arg (call, 0);
> +  tree alias_align = gimple_call_arg (call, 1);
> +  if (!tree_fits_uhwi_p (alias_align))
> +    return NULL_TREE;
> +
>    unsigned HOST_WIDE_INT align = tree_to_uhwi (alias_align);
>    if (TYPE_ALIGN (vectype) != align)
>      vectype = build_aligned_type (vectype, align);
> @@ -5808,41 +5870,68 @@ gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool mask_p)
>    return fold_build2 (MEM_REF, vectype, ptr, offset);
>  }
>
> -/* Try to fold IFN_{MASK,LEN}_LOAD call CALL.  Return true on success.
> -   MASK_P indicates it's for MASK if true, otherwise it's for LEN.  */
> +/* Try to fold IFN_{MASK,LEN}_LOAD/STORE call CALL.  Return true on success.  */
>
>  static bool
> -gimple_fold_partial_load (gimple_stmt_iterator *gsi, gcall *call, bool mask_p)
> +gimple_fold_partial_load_store (gimple_stmt_iterator *gsi, gcall *call)
>  {
> +  internal_fn ifn = gimple_call_internal_fn (call);
>    tree lhs = gimple_call_lhs (call);
> -  if (!lhs)
> -    return false;
> +  bool is_load = (lhs != NULL_TREE);
> +  tree vectype;
>
> -  if (tree rhs
> -      = gimple_fold_partial_load_store_mem_ref (call, TREE_TYPE (lhs), mask_p))
> +  if (is_load)
> +    vectype = TREE_TYPE (lhs);
> +  else
>      {
> -      gassign *new_stmt = gimple_build_assign (lhs, rhs);
> -      gimple_set_location (new_stmt, gimple_location (call));
> -      gimple_move_vops (new_stmt, call);
> -      gsi_replace (gsi, new_stmt, false);
> -      return true;
> +      tree rhs = gimple_call_arg (call, internal_fn_stored_value_index (ifn));
> +      vectype = TREE_TYPE (rhs);
>      }
> -  return false;
> -}
>
> -/* Try to fold IFN_{MASK,LEN}_STORE call CALL.  Return true on success.
> -   MASK_P indicates it's for MASK if true, otherwise it's for LEN.  */
> +  enum mask_load_store_state state
> +    = partial_load_store_mask_state (call, vectype);
>
> -static bool
> -gimple_fold_partial_store (gimple_stmt_iterator *gsi, gcall *call,
> -                          bool mask_p)
> -{
> -  internal_fn ifn = gimple_call_internal_fn (call);
> -  tree rhs = gimple_call_arg (call, internal_fn_stored_value_index (ifn));
> -  if (tree lhs
> -      = gimple_fold_partial_load_store_mem_ref (call, TREE_TYPE (rhs), mask_p))
> +  /* Handle all-inactive case.  */
> +  if (state == MASK_ALL_INACTIVE)
>      {
> -      gassign *new_stmt = gimple_build_assign (lhs, rhs);
> +      if (is_load)
> +       {
> +         /* Replace load with else value.  */
> +         int else_index = internal_fn_else_index (ifn);
> +         tree else_value = gimple_call_arg (call, else_index);
> +         gassign *new_stmt = gimple_build_assign (lhs, else_value);
> +         gimple_set_location (new_stmt, gimple_location (call));
> +         gsi_replace (gsi, new_stmt, false);
> +         return true;
> +       }
> +      else
> +       {
> +         /* Remove inactive store.  */
> +         unlink_stmt_vdef (call);
> +         release_defs (call);
> +         gsi_replace (gsi, gimple_build_nop (), true);
> +         return true;
> +       }
> +    }
> +
> +  /* We cannot simplify a gather/scatter or load/store lanes further.  */
> +  if (internal_gather_scatter_fn_p (ifn)
> +      || TREE_CODE (vectype) == ARRAY_TYPE)
> +    return false;
> +
> +  /* Handle all-active case - fold to regular memory operation.  */
> +  if (tree mem_ref = gimple_fold_partial_load_store_mem_ref (call, vectype))
> +    {
> +      gassign *new_stmt;
> +      if (is_load)
> +       new_stmt = gimple_build_assign (lhs, mem_ref);
> +      else
> +       {
> +         tree rhs
> +           = gimple_call_arg (call, internal_fn_stored_value_index (ifn));
> +         new_stmt = gimple_build_assign (mem_ref, rhs);
> +       }
> +
>        gimple_set_location (new_stmt, gimple_location (call));
>        gimple_move_vops (new_stmt, call);
>        gsi_replace (gsi, new_stmt, false);
> @@ -6075,19 +6164,21 @@ gimple_fold_call (gimple_stmt_iterator *gsi, bool inplace)
>           cplx_result = true;
>           uaddc_usubc = true;
>           break;
> -       case IFN_MASK_LOAD:
> -         changed |= gimple_fold_partial_load (gsi, stmt, true);
> -         break;
> -       case IFN_MASK_STORE:
> -         changed |= gimple_fold_partial_store (gsi, stmt, true);
> -         break;
>         case IFN_LEN_LOAD:
> +       case IFN_MASK_LOAD:
>         case IFN_MASK_LEN_LOAD:
> -         changed |= gimple_fold_partial_load (gsi, stmt, false);
> -         break;
> +       case IFN_MASK_GATHER_LOAD:
> +       case IFN_MASK_LEN_GATHER_LOAD:
> +       case IFN_MASK_LOAD_LANES:
> +       case IFN_MASK_LEN_LOAD_LANES:
>         case IFN_LEN_STORE:
> +       case IFN_MASK_STORE:
>         case IFN_MASK_LEN_STORE:
> -         changed |= gimple_fold_partial_store (gsi, stmt, false);
> +       case IFN_MASK_SCATTER_STORE:
> +       case IFN_MASK_LEN_SCATTER_STORE:
> +       case IFN_MASK_STORE_LANES:
> +       case IFN_MASK_LEN_STORE_LANES:
> +         changed |= gimple_fold_partial_load_store (gsi, stmt);
>           break;
>         default:
>           break;
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pfalse-store.c b/gcc/testsuite/gcc.target/aarch64/sve/pfalse-store.c
> index 1539f58c824..39db13bbcd6 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/pfalse-store.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pfalse-store.c
> @@ -46,8 +46,5 @@ ALL_DATA (st2, x2_t)
>  ALL_DATA (st3, x3_t)
>  ALL_DATA (st4, x4_t)
>
> -/* FIXME: Currently, st1/2/3/4 are not folded with a pfalse
> -   predicate, which is the reason for the 48 missing cases below. Once
> -   folding is implemented for these intrinsics, the sum should be 60.  */
> -/* { dg-final { scan-assembler-times {\t.cfi_startproc\n\tret\n} 12 } } */
> +/* { dg-final { scan-assembler-times {\t.cfi_startproc\n\tret\n} 60 } } */
>  /* { dg-final { scan-assembler-times {\t.cfi_startproc\n} 60 } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/p9-vec-length-epil-8.c b/gcc/testsuite/gcc.target/powerpc/p9-vec-length-epil-8.c
> index 34a2c8eb11b..5dff0d0ceb9 100644
> --- a/gcc/testsuite/gcc.target/powerpc/p9-vec-length-epil-8.c
> +++ b/gcc/testsuite/gcc.target/powerpc/p9-vec-length-epil-8.c
> @@ -13,5 +13,5 @@
>
>  #include "p9-vec-length-8.h"
>
> -/* { dg-final { scan-assembler-times {\mlxvl\M} 16 } } */
> +/* { dg-final { scan-assembler-times {\mlxvl\M} 14 } } */
>  /* { dg-final { scan-assembler-times {\mstxvl\M} 7 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-1.c
> new file mode 100644
> index 00000000000..0beb3d70866
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-1.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=rv64gcv_zvl256b -mabi=lp64d -mrvv-vector-bits=zvl -mno-autovec-segment" } */
> +
> +typedef struct {
> +  int a[6];
> +  float b[3];
> +} c;
> +
> +int d(c *e) {
> +  int f =0;
> +  for (; f < 3; f++) {
> +    e->a[2 * f] = e->b[f];
> +    e->a[2 * f + 1] = -e->a[2 * f];
> +    e->a[2 * f] = f + 3 * e->a[2 * f];
> +    e->a[2 * f + 1] = f + 3 * e->a[2 * f + 1];
> +  }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-assembler-not "vsetivli.*zero,0" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-2.c
> new file mode 100644
> index 00000000000..0de69b52cb0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122635-2.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=rv64gcv_zvl256b -mabi=lp64d -mrvv-vector-bits=zvl -mno-autovec-segment" } */
> +
> +typedef struct {
> +  int A[6];
> +  float b[];
> +} a;
> +
> +int b(a *a) {
> +  int b = 0;
> +  for (; b < 3; b++) {
> +    a->A[2 * b] = a->b[b] - b + a->A[2 * b];
> +    a->A[2 * b + 1] = b * a->A[2 * b + 1];
> +  }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-assembler-not "vsetivli.*zero,0" } } */
> --
> 2.51.1
>
