On Fri, Sep 19, 2025, 7:08 PM Peter0x44 <peter0...@disroot.org> wrote:
> On 2025-09-20 02:33, Andrew Pinski wrote:
> > On Fri, Sep 19, 2025, 6:22 PM Peter Damianov <peter0...@disroot.org>
> > wrote:
> >
> >> This patch implements folding of aggregate assignments (*dest = *src)
> >> by converting them to scalar MEM_REF operations when the size
> >> permits. This enables vectorization opportunities.
> >
> > I am not sure this will work with my recent changes to forwprop. Plus
> > it might make sra not handle them later on.
> >
> > Plus INDIRECT_REF will never show up in gimple.
> >
> > Iirc it is ldist that should be handling aggregate assignments to get
> > them converted into memcpys. It is not the vectorizer really.
>
> Wouldn't converting it into memcpy then lose the stride info of the
> copying loop?
> Does that matter?

Stride? Huh? The loop in the bug report is just

void foo_memmov (pixel* p, pixel* q, int n)
{
  for (int i = 0; i != n; i++)
    *p++ = *q++;
}

There is no stride there. It copies 4 bytes each time through the loop,
so it should be converted into just

  memmove (p, q, n * sizeof(pixel));

Yes, that does lose some aliasing information, but I think that is ok.

The reason your conversion of an aggregate copy works is that it turns
the copy into an integer load/store, which ldist (specifically its
dependence analysis) currently understands.
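In C terms, the effect on a 4-byte struct copy is roughly the sketch
below. This is an illustration only, not the actual GIMPLE: the pass
emits MEM_REFs of a 32-bit bitwise type, and copy_one is just a made-up
name here. The memcpys stand in for the raw load/store so the C version
stays well defined; a compiler folds them to a single 32-bit load and
store.

#include <stdint.h>
#include <string.h>

typedef struct { uint8_t r, g, b, a; } pixel;

void copy_one (pixel *dest, const pixel *src)
{
  uint32_t tmp;                    /* the scalar temporary (the SSA name) */
  memcpy (&tmp, src, sizeof tmp);  /* integer load from *src */
  memcpy (dest, &tmp, sizeof tmp); /* integer store to *dest */
}

Once each iteration is a single 32-bit load and store, dependence
analysis can reason about the accesses directly.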
Now for the vectorizer, maybe this lowering should happen in ifcvt
rather than forwprop.

Thanks,
Andrew

> >
> > Thanks,
> > Andrew
> >
> >> gcc/ChangeLog:
> >>
> >>         PR tree-optimization/99504
> >>         * tree-ssa-forwprop.cc (fold_aggregate_assignment): New function.
> >>         Folds aggregate assignments to scalar MEM_REF operations.
> >>         (pass_forwprop::execute): Call fold_aggregate_assignment for
> >>         applicable assignment statements.
> >>
> >> gcc/testsuite/ChangeLog:
> >>
> >>         PR tree-optimization/99504
> >>         * gcc.dg/tree-ssa/forwprop-42.c: New test. Verify that aggregate
> >>         assignments of various sizes get folded to scalar MEM_REF
> >>         operations.
> >>
> >> Signed-off-by: Peter Damianov <peter0...@disroot.org>
> >> ---
> >>  gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c |  66 +++++++++
> >>  gcc/tree-ssa-forwprop.cc                    | 140 ++++++++++++++++++++
> >>  2 files changed, 206 insertions(+)
> >>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> >>
> >> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> >> new file mode 100644
> >> index 00000000000..7fef9821e9e
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> >> @@ -0,0 +1,66 @@
> >> +/* PR tree-optimization/99504 */
> >> +/* Test that aggregate assignments get folded to scalar MEM_REF operations.  */
> >> +/* { dg-do compile } */
> >> +/* { dg-options "-O2 -fdump-tree-forwprop1" } */
> >> +
> >> +#include <stdint.h>
> >> +
> >> +struct pixel_4 {
> >> +  uint8_t r, g, b, a;
> >> +};
> >> +
> >> +struct pixel_8 {
> >> +  uint16_t r, g, b, a;
> >> +};
> >> +
> >> +struct pixel_16 {
> >> +  uint32_t r, g, b, a;
> >> +};
> >> +
> >> +struct pixel_32 {
> >> +  uint64_t r, g, b, a;
> >> +};
> >> +
> >> +#ifdef __SIZEOF_INT128__
> >> +struct pixel_64 {
> >> +  __int128 r, g, b, a;
> >> +};
> >> +#endif
> >> +
> >> +void test_4_bytes(struct pixel_4 *dest, struct pixel_4 *src)
> >> +{
> >> +  *dest = *src;
> >> +}
> >> +
> >> +void test_8_bytes(struct pixel_8 *dest, struct pixel_8 *src)
> >> +{
> >> +  *dest = *src;
> >> +}
> >> +
> >> +void test_16_bytes(struct pixel_16 *dest, struct pixel_16 *src)
> >> +{
> >> +  *dest = *src;
> >> +}
> >> +
> >> +void test_32_bytes(struct pixel_32 *dest, struct pixel_32 *src)
> >> +{
> >> +  *dest = *src;
> >> +}
> >> +
> >> +#ifdef __SIZEOF_INT128__
> >> +void test_64_bytes(struct pixel_64 *dest, struct pixel_64 *src)
> >> +{
> >> +  *dest = *src;
> >> +}
> >> +#endif
> >> +
> >> +void copy_pixels(struct pixel_4 *dest, struct pixel_4 *src, int n)
> >> +{
> >> +  for (int i = 0; i < n; i++)
> >> +    dest[i] = src[i];
> >> +}
> >> +
> >> +/* { dg-final { scan-tree-dump-times "MEM\\\[" 10 "forwprop1" } } */
> >> +/* Check that we generate scalar temporaries for the folded assignments.  */
> >> +/* { dg-final { scan-tree-dump "_\[0-9\]+ = MEM\\\[" "forwprop1" } } */
> >> +/* { dg-final { scan-tree-dump "MEM\\\[.*\] = _\[0-9\]+" "forwprop1" } } */
> >> \ No newline at end of file
> >> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> >> index 43b1c9d696f..3ce94a737c6 100644
> >> --- a/gcc/tree-ssa-forwprop.cc
> >> +++ b/gcc/tree-ssa-forwprop.cc
> >> @@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
> >>  typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
> >>
> >>  static bool forward_propagate_addr_expr (tree, tree, bool);
> >> +static bool fold_aggregate_assignment (gimple_stmt_iterator *);
> >>
> >>  /* Set to true if we delete dead edges during the optimization.  */
> >>  static bool cfg_changed;
> >> @@ -981,6 +982,141 @@ forward_propagate_addr_expr (tree name, tree rhs, bool parent_single_use_p)
> >>  }
> >>
> >> +/* Try to optimize aggregate assignments by converting them to scalar
> >> +   MEM_REF operations when profitable for vectorization.
> >> +   This applies the same folding as memcpy to aggregate assignments.  */
> >> +
> >> +static bool
> >> +fold_aggregate_assignment (gimple_stmt_iterator *gsi)
> >> +{
> >> +  gimple *stmt = gsi_stmt (*gsi);
> >> +
> >> +  if (!is_gimple_assign (stmt) || !gimple_assign_single_p (stmt))
> >> +    return false;
> >> +
> >> +  tree lhs = gimple_assign_lhs (stmt);
> >> +  tree rhs = gimple_assign_rhs1 (stmt);
> >> +
> >> +  /* Check if this is an aggregate assignment: *dest = *src
> >> +     where both sides are aggregate types (can be MEM_REF or indirection).  */
> >> +  bool lhs_is_indirect = (TREE_CODE (lhs) == INDIRECT_REF);
> >> +  bool rhs_is_indirect = (TREE_CODE (rhs) == INDIRECT_REF);
> >> +
> >> +  if ((TREE_CODE (lhs) != MEM_REF && !lhs_is_indirect)
> >> +      || (TREE_CODE (rhs) != MEM_REF && !rhs_is_indirect))
> >> +    return false;
> >> +
> >> +  tree lhs_type = TREE_TYPE (lhs);
> >> +  tree rhs_type = TREE_TYPE (rhs);
> >> +
> >> +  if (!AGGREGATE_TYPE_P (lhs_type) || !AGGREGATE_TYPE_P (rhs_type))
> >> +    return false;
> >> +
> >> +  if (!types_compatible_p (lhs_type, rhs_type))
> >> +    return false;
> >> +
> >> +  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (lhs_type)))
> >> +    return false;
> >> +
> >> +  unsigned HOST_WIDE_INT ilen = tree_to_uhwi (TYPE_SIZE_UNIT (lhs_type));
> >> +  if (!pow2p_hwi (ilen) || ilen > MOVE_MAX)
> >> +    return false;
> >> +
> >> +  tree lhs_base = TREE_OPERAND (lhs, 0);
> >> +  tree rhs_base = TREE_OPERAND (rhs, 0);
> >> +
> >> +  unsigned int lhs_align = get_pointer_alignment (lhs_base);
> >> +  unsigned int rhs_align = get_pointer_alignment (rhs_base);
> >> +
> >> +  scalar_int_mode imode;
> >> +  machine_mode mode;
> >> +  if (!int_mode_for_size (ilen * BITS_PER_UNIT, 0).exists (&imode)
> >> +      || !bitwise_mode_for_size (ilen * BITS_PER_UNIT).exists (&mode)
> >> +      || !known_eq (GET_MODE_BITSIZE (mode), ilen * BITS_PER_UNIT))
> >> +    return false;
> >> +
> >> +  if ((lhs_align < GET_MODE_ALIGNMENT (mode)
> >> +       && targetm.slow_unaligned_access (mode, lhs_align)
> >> +       && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
> >> +      || (rhs_align < GET_MODE_ALIGNMENT (mode)
> >> +          && targetm.slow_unaligned_access (mode, rhs_align)
> >> +          && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing))
> >> +    return false;
> >> +
> >> +  tree type = bitwise_type_for_mode (mode);
> >> +  tree srctype = type;
> >> +  tree desttype = type;
> >> +
> >> +  if (rhs_align < GET_MODE_ALIGNMENT (mode))
> >> +    srctype = build_aligned_type (type, rhs_align);
> >> +  if (lhs_align < GET_MODE_ALIGNMENT (mode))
> >> +    desttype = build_aligned_type (type, lhs_align);
> >> +
> >> +  tree off0 = build_int_cst (build_pointer_type_for_mode (char_type_node,
> >> +                                                          ptr_mode, true), 0);
> >> +
> >> +  tree srcmem, destmem;
> >> +
> >> +  if (rhs_is_indirect)
> >> +    {
> >> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, off0);
> >> +    }
> >> +  else
> >> +    {
> >> +      tree rhs_offset = TREE_OPERAND (rhs, 1);
> >> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, rhs_offset);
> >> +    }
> >> +
> >> +  if (lhs_is_indirect)
> >> +    {
> >> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, off0);
> >> +    }
> >> +  else
> >> +    {
> >> +      tree lhs_offset = TREE_OPERAND (lhs, 1);
> >> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, lhs_offset);
> >> +    }
> >> +
> >> +  gimple *new_stmt;
> >> +  if (is_gimple_reg_type (srctype))
> >> +    {
> >> +      new_stmt = gimple_build_assign (NULL_TREE, srcmem);
> >> +      tree tmp_var = make_ssa_name (srctype, new_stmt);
> >> +      gimple_assign_set_lhs (new_stmt, tmp_var);
> >> +      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
> >> +      gimple_set_location (new_stmt, gimple_location (stmt));
> >> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> >> +
> >> +      new_stmt = gimple_build_assign (destmem, tmp_var);
> >> +      gimple_move_vops (new_stmt, stmt);
> >> +      gimple_set_location (new_stmt, gimple_location (stmt));
> >> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> >> +      gsi_remove (gsi, true);
> >> +    }
> >> +  else
> >> +    {
> >> +      new_stmt = gimple_build_assign (destmem, srcmem);
> >> +      gimple_move_vops (new_stmt, stmt);
> >> +      gimple_set_location (new_stmt, gimple_location (stmt));
> >> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> >> +      gsi_remove (gsi, true);
> >> +    }
> >> +
> >> +  if (dump_file && (dump_flags & TDF_DETAILS))
> >> +    {
> >> +      fprintf (dump_file,
> >> +               "Converted aggregate assignment to scalar MEM_REF:\n");
> >> +      fprintf (dump_file, "  Original: ");
> >> +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> >> +      fprintf (dump_file, "  Size: %u bytes, Mode: %s\n",
> >> +               (unsigned) ilen, GET_MODE_NAME (mode));
> >> +    }
> >> +
> >> +  statistics_counter_event (cfun, "aggregate assignment to scalar MEM_REF", 1);
> >> +
> >> +  return true;
> >> +}
> >> +
> >> +
> >>  /* Helper function for simplify_gimple_switch.  Remove case labels that
> >>     have values outside the range of the new type.  */
> >>
> >> @@ -4477,6 +4613,10 @@ pass_forwprop::execute (function *fun)
> >>           if (TREE_CODE (lhs) != SSA_NAME
> >>               || has_zero_uses (lhs))
> >>             {
> >> +             if (TREE_CODE (lhs) != SSA_NAME
> >> +                 && fold_aggregate_assignment (&gsi))
> >> +               continue;
> >> +
> >>               process_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list);
> >>               gsi_next (&gsi);
> >>               continue;
> >> --
> >> 2.39.5