On Fri, Sep 19, 2025, 7:08 PM Peter0x44 <peter0...@disroot.org> wrote:

> On 2025-09-20 02:33, Andrew Pinski wrote:
> > On Fri, Sep 19, 2025, 6:22 PM Peter Damianov <peter0...@disroot.org>
> > wrote:
> >
> >> This patch implements folding of aggregate assignments (*dest = *src)
> >> by converting them to scalar MEM_REF operations when the size permits.
> >> This enables vectorization opportunities.
> >
> > I am not sure this will work with my recent changes to forwprop.  Plus
> > it might make SRA not handle them later on.
> >
> > Plus INDIRECT_REF never shows up in GIMPLE.
> >
> > IIRC it is ldist that should be handling aggregate assignments to get
> > them converted into memcpys; it is not really the vectorizer.
>
> Wouldn't converting it into memcpy then lose the stride info of the
> copying loop?
> Does that matter?
>

Stride? Huh?
The loop in the bug report is just
void
foo_memmov (pixel* p, pixel* q, int n)
{
    for (int i = 0; i != n; i++)
      *p++ = *q++;
}
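
(Here pixel is presumably a small aggregate along the lines of the
testcase's struct pixel_4, i.e. something like

struct pixel { uint8_t r, g, b, a; };

so each copy moves 4 bytes.)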

There is no stride there. This copies 4 bytes each time through the
loop, so it should be converted into just
memmove(p, q, n * sizeof(pixel));

Yes, that does lose some aliasing information, but I think that is OK.
The reason your conversion of an aggregate copy works is that it converts
the copy into an integer load/store, which ldist currently understands
(specifically, its dependence analysis does).
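
To make that concrete, here is a rough sketch (GIMPLE-style pseudocode;
the SSA names and exact printed types are illustrative, not taken from an
actual dump). The 4-byte aggregate copy

  *dest = *src;

would be lowered to an integer load/store pair along the lines of

  _1 = MEM <unsigned int> [(char *)src_3(D)];
  MEM <unsigned int> [(char *)dest_2(D)] = _1;

and dependence analysis can reason about those scalar accesses, which it
cannot do with the opaque aggregate assignment.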

Now for the vectorizer, maybe this lowering should happen in ifcvt rather
than forwprop.

Thanks,
Andrew



> >
> > Thanks,
> > Andrew
> >
> >> gcc/ChangeLog:
> >>
> >> PR tree-optimization/99504
> >> * tree-ssa-forwprop.cc (fold_aggregate_assignment): New function.
> >> Folds aggregate assignments to scalar MEM_REF operations.
> >> (pass_forwprop::execute): Call fold_aggregate_assignment for
> >> applicable assignment statements.
> >>
> >> gcc/testsuite/ChangeLog:
> >>
> >> PR tree-optimization/99504
> >> * gcc.dg/tree-ssa/forwprop-42.c: New test.  Verify that aggregate
> >> assignments of various sizes get folded to scalar MEM_REF operations.
> >>
> >> Signed-off-by: Peter Damianov <peter0...@disroot.org>
> >> ---
> >> gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c |  66 +++++++++
> >> gcc/tree-ssa-forwprop.cc                    | 140 ++++++++++++++++++++
> >> 2 files changed, 206 insertions(+)
> >> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> >>
> >> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> >> b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> >> new file mode 100644
> >> index 00000000000..7fef9821e9e
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> >> @@ -0,0 +1,66 @@
> >> +/* PR tree-optimization/99504 */
> >> +/* Test that aggregate assignments get folded to scalar MEM_REF operations */
> >> +/* { dg-do compile } */
> >> +/* { dg-options "-O2 -fdump-tree-forwprop1" } */
> >> +
> >> +#include <stdint.h>
> >> +
> >> +struct pixel_4 {
> >> +  uint8_t r, g, b, a;
> >> +};
> >> +
> >> +struct pixel_8 {
> >> +  uint16_t r, g, b, a;
> >> +};
> >> +
> >> +struct pixel_16 {
> >> +  uint32_t r, g, b, a;
> >> +};
> >> +
> >> +struct pixel_32 {
> >> +  uint64_t r, g, b, a;
> >> +};
> >> +
> >> +#ifdef __SIZEOF_INT128__
> >> +struct pixel_64 {
> >> +  __int128 r, g, b, a;
> >> +};
> >> +#endif
> >> +
> >> +void test_4_bytes(struct pixel_4 *dest, struct pixel_4 *src)
> >> +{
> >> +  *dest = *src;
> >> +}
> >> +
> >> +void test_8_bytes(struct pixel_8 *dest, struct pixel_8 *src)
> >> +{
> >> +  *dest = *src;
> >> +}
> >> +
> >> +void test_16_bytes(struct pixel_16 *dest, struct pixel_16 *src)
> >> +{
> >> +  *dest = *src;
> >> +}
> >> +
> >> +void test_32_bytes(struct pixel_32 *dest, struct pixel_32 *src)
> >> +{
> >> +  *dest = *src;
> >> +}
> >> +
> >> +#ifdef __SIZEOF_INT128__
> >> +void test_64_bytes(struct pixel_64 *dest, struct pixel_64 *src)
> >> +{
> >> +  *dest = *src;
> >> +}
> >> +#endif
> >> +
> >> +void copy_pixels(struct pixel_4 *dest, struct pixel_4 *src, int n)
> >> +{
> >> +  for (int i = 0; i < n; i++)
> >> +    dest[i] = src[i];
> >> +}
> >> +
> >> +/* { dg-final { scan-tree-dump-times "MEM\\\[" 10 "forwprop1" } } */
> >> +/* Check that we generate scalar temporaries for the folded assignments */
> >> +/* { dg-final { scan-tree-dump "_\[0-9\]+ = MEM\\\[" "forwprop1" } } */
> >> +/* { dg-final { scan-tree-dump "MEM\\\[.*\] = _\[0-9\]+" "forwprop1" } } */
> >> \ No newline at end of file
> >> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> >> index 43b1c9d696f..3ce94a737c6 100644
> >> --- a/gcc/tree-ssa-forwprop.cc
> >> +++ b/gcc/tree-ssa-forwprop.cc
> >> @@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
> >> typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
> >>
> >> static bool forward_propagate_addr_expr (tree, tree, bool);
> >> +static bool fold_aggregate_assignment (gimple_stmt_iterator *);
> >>
> >> /* Set to true if we delete dead edges during the optimization.  */
> >> static bool cfg_changed;
> >> @@ -981,6 +982,141 @@ forward_propagate_addr_expr (tree name, tree rhs, bool parent_single_use_p)
> >> }
> >>
> >> +/* Try to optimize aggregate assignments by converting them to scalar
> >> +   MEM_REF operations when profitable for vectorization.
> >> +   This applies the same folding as memcpy to aggregate assignments.  */
> >> +
> >> +static bool
> >> +fold_aggregate_assignment (gimple_stmt_iterator *gsi)
> >> +{
> >> +  gimple *stmt = gsi_stmt (*gsi);
> >> +
> >> +  if (!is_gimple_assign (stmt) || !gimple_assign_single_p (stmt))
> >> +    return false;
> >> +
> >> +  tree lhs = gimple_assign_lhs (stmt);
> >> +  tree rhs = gimple_assign_rhs1 (stmt);
> >> +
> >> +  /* Check if this is an aggregate assignment: *dest = *src
> >> +     where both sides are aggregate types (can be MEM_REF or indirection).  */
> >> +  bool lhs_is_indirect = (TREE_CODE (lhs) == INDIRECT_REF);
> >> +  bool rhs_is_indirect = (TREE_CODE (rhs) == INDIRECT_REF);
> >> +
> >> +  if ((TREE_CODE (lhs) != MEM_REF && !lhs_is_indirect)
> >> +      || (TREE_CODE (rhs) != MEM_REF && !rhs_is_indirect))
> >> +    return false;
> >> +
> >> +  tree lhs_type = TREE_TYPE (lhs);
> >> +  tree rhs_type = TREE_TYPE (rhs);
> >> +
> >> +  if (!AGGREGATE_TYPE_P (lhs_type) || !AGGREGATE_TYPE_P (rhs_type))
> >> +    return false;
> >> +
> >> +  if (!types_compatible_p (lhs_type, rhs_type))
> >> +    return false;
> >> +
> >> +  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (lhs_type)))
> >> +    return false;
> >> +
> >> +  unsigned HOST_WIDE_INT ilen = tree_to_uhwi (TYPE_SIZE_UNIT (lhs_type));
> >> +  if (!pow2p_hwi (ilen) || ilen > MOVE_MAX)
> >> +    return false;
> >> +
> >> +  tree lhs_base = TREE_OPERAND (lhs, 0);
> >> +  tree rhs_base = TREE_OPERAND (rhs, 0);
> >> +
> >> +  unsigned int lhs_align = get_pointer_alignment (lhs_base);
> >> +  unsigned int rhs_align = get_pointer_alignment (rhs_base);
> >> +
> >> +  scalar_int_mode imode;
> >> +  machine_mode mode;
> >> +  if (!int_mode_for_size (ilen * BITS_PER_UNIT, 0).exists (&imode)
> >> +      || !bitwise_mode_for_size (ilen * BITS_PER_UNIT).exists (&mode)
> >> +      || !known_eq (GET_MODE_BITSIZE (mode), ilen * BITS_PER_UNIT))
> >> +    return false;
> >> +
> >> +  if ((lhs_align < GET_MODE_ALIGNMENT (mode)
> >> +       && targetm.slow_unaligned_access (mode, lhs_align)
> >> +       && optab_handler (movmisalign_optab, mode) ==
> >> CODE_FOR_nothing)
> >> +      || (rhs_align < GET_MODE_ALIGNMENT (mode)
> >> +         && targetm.slow_unaligned_access (mode, rhs_align)
> >> +         && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing))
> >> +    return false;
> >> +
> >> +  tree type = bitwise_type_for_mode (mode);
> >> +  tree srctype = type;
> >> +  tree desttype = type;
> >> +
> >> +  if (rhs_align < GET_MODE_ALIGNMENT (mode))
> >> +    srctype = build_aligned_type (type, rhs_align);
> >> +  if (lhs_align < GET_MODE_ALIGNMENT (mode))
> >> +    desttype = build_aligned_type (type, lhs_align);
> >> +
> >> +  tree off0 = build_int_cst (build_pointer_type_for_mode (char_type_node,
> >> +                                                         ptr_mode, true), 0);
> >> +
> >> +  tree srcmem, destmem;
> >> +
> >> +  if (rhs_is_indirect)
> >> +    {
> >> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, off0);
> >> +    }
> >> +  else
> >> +    {
> >> +      tree rhs_offset = TREE_OPERAND (rhs, 1);
> >> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, rhs_offset);
> >> +    }
> >> +
> >> +  if (lhs_is_indirect)
> >> +    {
> >> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, off0);
> >> +    }
> >> +  else
> >> +    {
> >> +      tree lhs_offset = TREE_OPERAND (lhs, 1);
> >> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, lhs_offset);
> >> +    }
> >> +  gimple *new_stmt;
> >> +  if (is_gimple_reg_type (srctype))
> >> +    {
> >> +      new_stmt = gimple_build_assign (NULL_TREE, srcmem);
> >> +      tree tmp_var = make_ssa_name (srctype, new_stmt);
> >> +      gimple_assign_set_lhs (new_stmt, tmp_var);
> >> +      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
> >> +      gimple_set_location (new_stmt, gimple_location (stmt));
> >> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> >> +
> >> +      new_stmt = gimple_build_assign (destmem, tmp_var);
> >> +      gimple_move_vops (new_stmt, stmt);
> >> +      gimple_set_location (new_stmt, gimple_location (stmt));
> >> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> >> +      gsi_remove (gsi, true);
> >> +    }
> >> +  else
> >> +    {
> >> +      new_stmt = gimple_build_assign (destmem, srcmem);
> >> +      gimple_move_vops (new_stmt, stmt);
> >> +      gimple_set_location (new_stmt, gimple_location (stmt));
> >> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> >> +      gsi_remove (gsi, true);
> >> +    }
> >> +
> >> +  if (dump_file && (dump_flags & TDF_DETAILS))
> >> +    {
> >> +      fprintf (dump_file,
> >> +              "Converted aggregate assignment to scalar
> >> MEM_REF:\n");
> >> +      fprintf (dump_file, "  Original: ");
> >> +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> >> +      fprintf (dump_file, "  Size: %u bytes, Mode: %s\n",
> >> +              (unsigned)ilen, GET_MODE_NAME (mode));
> >> +    }
> >> +
> >> +  statistics_counter_event (cfun,
> >> +                           "aggregate assignment to scalar MEM_REF", 1);
> >> +
> >> +  return true;
> >> +}
> >> +
> >> +
> >> /* Helper function for simplify_gimple_switch.  Remove case labels that
> >>    have values outside the range of the new type.  */
> >>
> >> @@ -4477,6 +4613,10 @@ pass_forwprop::execute (function *fun)
> >>           if (TREE_CODE (lhs) != SSA_NAME
> >>               || has_zero_uses (lhs))
> >>             {
> >> +             if (TREE_CODE (lhs) != SSA_NAME
> >> +                 && fold_aggregate_assignment (&gsi))
> >> +               continue;
> >> +
> >>               process_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list);
> >>               gsi_next (&gsi);
> >>               continue;
> >> --
> >> 2.39.5
>
