BR,
Jeff (Jiufu Guo)
-----------------------------
diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index edf292cfbe9..8c36ad5df79 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -97,6 +97,502 @@ static bool defer_stack_allocation (tree, bool);
static void record_alignment_for_reg_var (unsigned int);
+extern rtx
+expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx, int);
+
+/* For light SRA in the expander for parameters and returns.  */
+namespace
+{
+
+struct access
+{
+  /* Each access on the aggregate is described by OFFSET/SIZE.  */
+ HOST_WIDE_INT offset;
+ HOST_WIDE_INT size;
+
+ bool writing;
+
+ /* The context expression of this access. */
+ tree expr;
+
+  /* The rtx for the access: link to incoming/returning register(s).  */
+ rtx rtx_val;
+};
+
+typedef struct access *access_p;
+
+/* Expr (tree) -> Scalarized value (rtx) map. */
+static hash_map<tree, rtx> *expr_rtx_vec;
+
+/* Base (tree) -> Vector (vec<access_p> *) map. */
+static hash_map<tree, auto_vec<access_p> > *base_access_vec;
+
+/* If EXPR is an interesting access to an SRA candidate, create and
+   record an access structure for it and return it; return NULL
+   otherwise.  */
+
+static struct access *
+build_access (tree expr, bool write)
+{
+ enum tree_code code = TREE_CODE (expr);
+ if (code != VAR_DECL && code != PARM_DECL && code != COMPONENT_REF
+ && code != ARRAY_REF && code != ARRAY_RANGE_REF)
+ return NULL;
+
+ HOST_WIDE_INT offset, size;
+ bool reverse;
+  tree base = get_ref_base_and_extent_hwi (expr, &offset, &size, &reverse);
+ if (!base || !DECL_P (base))
+ return NULL;
+
+ vec<access_p> *access_vec = base_access_vec->get (base);
+ if (!access_vec)
+ return NULL;
+
+ /* TODO: support reverse. */
+  if (reverse || size <= 0 || offset + size > tree_to_shwi (DECL_SIZE (base)))
+ {
+ base_access_vec->remove (base);
+ return NULL;
+ }
+
+ struct access *access = XNEWVEC (struct access, 1);
+
+ memset (access, 0, sizeof (struct access));
+ access->offset = offset;
+ access->size = size;
+ access->expr = expr;
+ access->writing = write;
+ access->rtx_val = NULL_RTX;
+
+ access_vec->safe_push (access);
+
+ return access;
+}
+
+/* Callback of walk_stmt_load_store_addr_ops visit_base used to remove
+   operands with address taken.  */
+
+static bool
+visit_base (gimple *, tree op, tree, void *)
+{
+ op = get_base_address (op);
+ if (op && DECL_P (op))
+ base_access_vec->remove (op);
+
+ return false;
+}
+
+/* Scan the function and look for interesting expressions, creating
+   access structures for them.  */
+
+static void
+collect_acccesses (void)
+{
+ basic_block bb;
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ walk_stmt_load_store_addr_ops (gsi.phi (), NULL, NULL, NULL,
+ visit_base);
+
+ for (gimple_stmt_iterator gsi =
gsi_start_nondebug_after_labels_bb (bb);
+ !gsi_end_p (gsi); gsi_next_nondebug (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ switch (gimple_code (stmt))
+ {
+ case GIMPLE_RETURN:
+ continue;
+ case GIMPLE_ASSIGN:
+ if (gimple_assign_single_p (stmt) && !gimple_clobber_p (stmt))
+ {
+ tree rhs = gimple_assign_rhs1 (stmt);
+ tree lhs = gimple_assign_lhs (stmt);
+ bool res_r = build_access (rhs, false);
+ bool res_l = build_access (lhs, true);
+ if (res_l && TREE_CODE (rhs) != CONSTRUCTOR)
+ base_access_vec->remove (get_base_address (lhs));
+
+ if (res_l || res_r)
+ continue;
+ }
+ break;
+ default:
+ break;
+ }
+
+ walk_stmt_load_store_addr_ops (stmt, NULL, visit_base, visit_base,
+ visit_base);
+ }
+ }
+}
+
+/* If VAR is a suitable candidate for SRA, record it and return true.  */
+
+static bool
+add_sra_candidate (tree var)
+{
+ tree type = TREE_TYPE (var);
+
+ if (!AGGREGATE_TYPE_P (type) || !tree_fits_shwi_p (TYPE_SIZE
(type))
+ || tree_to_shwi (TYPE_SIZE (type)) == 0 || TREE_THIS_VOLATILE
(var)
+ || TYPE_MAIN_VARIANT (type) == TYPE_MAIN_VARIANT
(va_list_type_node))
+ return false;
+ gcc_assert (COMPLETE_TYPE_P (type));
+
+ base_access_vec->get_or_insert (var);
+
+ return true;
+}
+
+/* Collect the parameters and return values whose types are suitable
+   for scalarization.  */
+
+static bool
+collect_sra_candidates (void)
+{
+ bool ret = false;
+
+ /* Collect parameters. */
+ for (tree parm = DECL_ARGUMENTS (current_function_decl); parm;
+ parm = DECL_CHAIN (parm))
+ ret |= add_sra_candidate (parm);
+
+ /* Collect VARs on returns. */
+ if (DECL_RESULT (current_function_decl))
+ {
+ edge_iterator ei;
+ edge e;
+ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+ if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
+ {
+ tree val = gimple_return_retval (r);
+	  /* To scalarize the return, the return value should only be
+	     written, except by this return stmt.  Then use 'true' (write)
+	     to create the access.  */
+ if (val && VAR_P (val))
+ ret |= add_sra_candidate (val) && build_access (val, true);
+ }
+ }
+
+ return ret;
+}
+
+/* Check if the accesses of BASE are scalarizable.
+   Currently, only parameters that are only read, and return values
+   that are only written, are scalarized.  */
+
+static bool
+with_scalariable_accesses (vec<access_p> *access_vec, bool is_parm)
+{
+ if (access_vec->is_empty ())
+ return false;
+
+ for (unsigned int j = 0; j < access_vec->length (); j++)
+ {
+ struct access *access = (*access_vec)[j];
+ /* Writing to a field of parameter. */
+ if (is_parm && access->writing)
+ return false;
+
+      /* Reading from a field of the return value.  */
+ if (!is_parm && !access->writing)
+ return false;
+ }
+
+ return true;
+}
+
+static void
+prepare_expander_sra ()
+{
+ if (optimize <= 0)
+ return;
+
+ base_access_vec = new hash_map<tree, auto_vec<access_p> >;
+ expr_rtx_vec = new hash_map<tree, rtx>;
+
+ if (collect_sra_candidates ())
+ collect_acccesses ();
+}
+
+static void
+free_expander_sra ()
+{
+ if (optimize <= 0)
+ return;
+ delete expr_rtx_vec;
+ expr_rtx_vec = 0;
+ delete base_access_vec;
+ base_access_vec = 0;
+}
+} /* namespace */
+
+namespace
+{
+/* Get the register at INDEX from a parallel REGS. */
+
+static rtx
+extract_parallel_reg (rtx regs, int index)
+{
+ rtx orig_reg = XEXP (XVECEXP (regs, 0, index), 0);
+ if (!HARD_REGISTER_P (orig_reg))
+ return orig_reg;
+
+  /* Reading from a param hard reg needs to be moved to a temp.  */
+ rtx reg = gen_reg_rtx (GET_MODE (orig_reg));
+ emit_move_insn (reg, orig_reg);
+ return reg;
+}
+
+/* Get IMODE part from REG at OFF_BITS. */
+
+static rtx
+extract_sub_reg (rtx orig_reg, int off_bits, machine_mode mode)
+{
+ scalar_int_mode imode;
+ if (!int_mode_for_mode (mode).exists (&imode))
+ return NULL_RTX;
+
+ machine_mode orig_mode = GET_MODE (orig_reg);
+ gcc_assert (GET_MODE_CLASS (orig_mode) == MODE_INT);
+
+ poly_uint64 lowpart_off = subreg_lowpart_offset (imode, orig_mode);
+ int lowpart_off_bits = lowpart_off.to_constant () * BITS_PER_UNIT;
+ int shift_bits;
+ if (lowpart_off_bits >= off_bits)
+ shift_bits = lowpart_off_bits - off_bits;
+ else
+ shift_bits = off_bits - lowpart_off_bits;
+
+ rtx reg = orig_reg;
+ if (shift_bits > 0)
+ reg = expand_shift (RSHIFT_EXPR, orig_mode, reg, shift_bits,
NULL, 1);
+
+ rtx subreg = gen_lowpart (imode, reg);
+ rtx result = gen_reg_rtx (imode);
+ emit_move_insn (result, subreg);
+
+ if (mode != imode)
+ result = gen_lowpart (mode, result);
+
+ return result;
+}
+
+/* Extract subfields from the REG at START bits to TARGET at OFF,
+ BITS parameter is the total number extract bits. */
+
+static int
+extract_fields_from_reg (rtx reg, int bits, int start, rtx *target,
+ HOST_WIDE_INT off)
+{
+ machine_mode mode_aux[] = {SImode, HImode, QImode};
+ int margins = sizeof (mode_aux) / sizeof (mode_aux[0]);
+ HOST_WIDE_INT off_bits = start;
+ rtx *p = target;
+ for (int n = 0; n < margins; n++)
+ {
+ machine_mode mode = mode_aux[n];
+ HOST_WIDE_INT bitsize = GET_MODE_BITSIZE (mode).to_constant ();
+ if (bits < bitsize)
+ continue;
+
+ rtx subreg = extract_sub_reg (reg, off_bits, mode);
+ *p++ = gen_rtx_EXPR_LIST (mode, subreg, GEN_INT (off));
+ off += bitsize / BITS_PER_UNIT;
+ off_bits += bitsize;
+ bits -= bitsize;
+ }
+
+ return p - target;
+}
+} /* namespace */
+
+/* Check if there is an SRA access for EXPR.
+   Return the corresponding scalarized rtx for the access, if any.  */
+
+rtx
+get_scalar_rtx_for_aggregate_expr (tree expr)
+{
+ if (!expr_rtx_vec)
+ return NULL_RTX;
+ rtx *val = expr_rtx_vec->get (expr);
+ return val ? *val : NULL_RTX;
+}
+
+/* Compute/Set RTX registers for those accesses on BASE. */
+
+void
+set_scalar_rtx_for_aggregate_access (tree base, rtx regs)
+{
+ if (!base_access_vec)
+ return;
+ vec<access_p> *access_vec = base_access_vec->get (base);
+ if (!access_vec)
+ return;
+ bool is_parm = TREE_CODE (base) == PARM_DECL;
+ if (!with_scalariable_accesses (access_vec, is_parm))
+ return;
+
+ /* Go through each access, compute corresponding rtx(regs or
subregs)
+ for the expression. */
+ int n = access_vec->length ();
+ int cur_access_index = 0;
+ for (; cur_access_index < n; cur_access_index++)
+ {
+ access_p acc = (*access_vec)[cur_access_index];
+ machine_mode expr_mode = TYPE_MODE (TREE_TYPE (acc->expr));
+
+      /* A non-BLKmode access wider than a word needs multiple registers;
+	 not handled yet.  */
+ if (expr_mode != BLKmode
+ && known_gt (acc->size, GET_MODE_BITSIZE (word_mode)))
+ break;
+
+ /* Compute the position of the access in the whole parallel
rtx. */
+ int start_index = -1;
+ int end_index = -1;
+ HOST_WIDE_INT left_bits = 0;
+ HOST_WIDE_INT right_bits = 0;
+ int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
+ for (; cur_index < XVECLEN (regs, 0); cur_index++)
+ {
+ rtx slot = XVECEXP (regs, 0, cur_index);
+ HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
+ machine_mode mode = GET_MODE (XEXP (slot, 0));
+ HOST_WIDE_INT size = GET_MODE_BITSIZE (mode).to_constant ();
+ if (off <= acc->offset && off + size > acc->offset)
+ {
+ start_index = cur_index;
+ left_bits = acc->offset - off;
+ }
+ if (off + size >= acc->offset + acc->size)
+ {
+ end_index = cur_index;
+ right_bits = off + size - (acc->offset + acc->size);
+ break;
+ }
+ }
+      /* Invalid access position: padding or out of bounds.  */
+ if (start_index < 0 || end_index < 0)
+ break;
+
+ /* Need a parallel for possible multi-registers. */
+ if (expr_mode == BLKmode || end_index > start_index)
+ {
+ /* More possible space for SI, HI, QI. */
+ machine_mode mode_aux[] = {SImode, HImode, QImode};
+ int margins = sizeof (mode_aux) / sizeof (mode_aux[0]);
+ int extra = (right_bits ? margins : 0) + (left_bits ? margins :
0);
+ int num_words = end_index - start_index + 1;
+ num_words -= (right_bits ? 1 : 0);
+ num_words -= (left_bits ? 1 : 0);
+ rtx *tmps = XALLOCAVEC (rtx, num_words + extra);
+
+ int pos = 0;
+ /* There are extra fields from the left part of the start reg. */
+ if (left_bits)
+ {
+ gcc_assert (!acc->writing);
+ gcc_assert ((left_bits % BITS_PER_UNIT) == 0);
+
+ rtx reg = XEXP (XVECEXP (regs, 0, start_index), 0);
+ machine_mode mode = GET_MODE (reg);
+ int reg_size = GET_MODE_BITSIZE (mode).to_constant ();
+ int bits = reg_size - left_bits;
+ pos = extract_fields_from_reg (reg, bits, left_bits, tmps, 0);
+ }
+
+ HOST_WIDE_INT start;
+ start = UINTVAL (XEXP (XVECEXP (regs, 0, start_index), 1));
+ start -= left_bits / BITS_PER_UNIT;
+ /* Extract whole registers. */
+ for (; pos < num_words; pos++)
+ {
+ int index = start_index + pos;
+ rtx reg = extract_parallel_reg (regs, index);
+ machine_mode mode = GET_MODE (reg);
+ HOST_WIDE_INT off;
+ off = UINTVAL (XEXP (XVECEXP (regs, 0, index), 1)) - start;
+ tmps[pos] = gen_rtx_EXPR_LIST (mode, reg, GEN_INT (off));
+ }
+
+ /* No more fields. */
+ if (right_bits == 0)
+ {
+ rtx reg = gen_rtx_PARALLEL (expr_mode, gen_rtvec_v (pos,
tmps));
+ acc->rtx_val = reg;
+ continue;
+ }
+
+      /* There are extra fields from part of the end register.  */
+ gcc_assert (!acc->writing);
+ gcc_assert ((right_bits % BITS_PER_UNIT) == 0);
+
+ HOST_WIDE_INT off;
+ off = UINTVAL (XEXP (XVECEXP (regs, 0, end_index), 1)) - start;
+ rtx reg = XEXP (XVECEXP (regs, 0, end_index), 0);
+ machine_mode mode = GET_MODE (reg);
+ int reg_size = GET_MODE_BITSIZE (mode).to_constant ();
+ int bits = reg_size - right_bits;
+ pos += extract_fields_from_reg (reg, bits, 0, tmps + pos, off);
+
+ /* Currently, PARALLELs with register elements for param/returns
+ are using BLKmode. */
+ acc->rtx_val = gen_rtx_PARALLEL (expr_mode, gen_rtvec_v (pos,
tmps));
+ continue;
+ }
+
+ /* Just need one reg for the correspond access. */
+ if (end_index == start_index && left_bits == 0 && right_bits ==
0)
+ {
+ rtx reg = extract_parallel_reg (regs, start_index);
+ if (GET_MODE (reg) != expr_mode)
+ reg = gen_lowpart (expr_mode, reg);
+
+ acc->rtx_val = reg;
+ continue;
+ }
+
+ /* Need to shift to extract a part reg for the access. */
+ if (!acc->writing && end_index == start_index)
+ {
+ rtx orig_reg = XEXP (XVECEXP (regs, 0, start_index), 0);
+ acc->rtx_val = extract_sub_reg (orig_reg, left_bits, expr_mode);
+ if (acc->rtx_val)
+ continue;
+ }
+
+ break;
+ }
+
+  /* If all access exprs were scalarized successfully, bind/map each
+     expr (tree) to its scalarized rtx.  */
+ if (cur_access_index == n)
+ for (int j = 0; j < n; j++)
+ {
+ access_p access = (*access_vec)[j];
+ expr_rtx_vec->put (access->expr, access->rtx_val);
+ }
+}
+
+void
+set_scalar_rtx_for_returns ()
+{
+ tree res = DECL_RESULT (current_function_decl);
+ gcc_assert (res);
+ edge_iterator ei;
+ edge e;
+ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+ if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb
(e->src)))
+ {
+ tree val = gimple_return_retval (r);
+ if (val && VAR_P (val))
+ set_scalar_rtx_for_aggregate_access (val, DECL_RTL (res));
+ }
+}
+
/* Return an expression tree corresponding to the RHS of GIMPLE
statement STMT. */
@@ -3778,7 +4274,8 @@ expand_return (tree retval)
/* If we are returning the RESULT_DECL, then the value has already
been stored into it, so we don't have to do anything special.
*/
- if (TREE_CODE (retval_rhs) == RESULT_DECL)
+ if (TREE_CODE (retval_rhs) == RESULT_DECL
+ || get_scalar_rtx_for_aggregate_expr (retval_rhs))
expand_value_return (result_rtl);
/* If the result is an aggregate that is being returned in one (or
more)
@@ -4422,6 +4919,9 @@ expand_debug_expr (tree exp)
int unsignedp = TYPE_UNSIGNED (TREE_TYPE (exp));
addr_space_t as;
scalar_int_mode op0_mode, op1_mode, addr_mode;
+ rtx x = get_scalar_rtx_for_aggregate_expr (exp);
+ if (x)
+ return NULL_RTX;/* optimized out. */
switch (TREE_CODE_CLASS (TREE_CODE (exp)))
{
@@ -6620,6 +7120,8 @@ pass_expand::execute (function *fun)
avoid_deep_ter_for_debug (gsi_stmt (gsi), 0);
}
+ prepare_expander_sra ();
+
/* Mark arrays indexed with non-constant indices with
TREE_ADDRESSABLE. */
auto_bitmap forced_stack_vars;
discover_nonconstant_array_refs (forced_stack_vars);
@@ -7052,6 +7554,7 @@ pass_expand::execute (function *fun)
loop_optimizer_finalize ();
}
+ free_expander_sra ();
timevar_pop (TV_POST_EXPAND);
return 0;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index fff09dc9951..d487fe3b53b 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -100,6 +100,7 @@ static void do_tablejump (rtx, machine_mode, rtx,
rtx, rtx,
static rtx const_vector_from_tree (tree);
static tree tree_expr_size (const_tree);
static void convert_mode_scalar (rtx, rtx, int);
+rtx get_scalar_rtx_for_aggregate_expr (tree);
/* This is run to set up which modes can be used
@@ -5623,11 +5624,12 @@ expand_assignment (tree to, tree from, bool
nontemporal)
Assignment of an array element at a constant index, and
assignment of
an array element in an unaligned packed structure field, has the
same
problem. Same for (partially) storing into a non-memory object.
*/
- if (handled_component_p (to)
- || (TREE_CODE (to) == MEM_REF
- && (REF_REVERSE_STORAGE_ORDER (to)
- || mem_ref_refers_to_non_mem_p (to)))
- || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE)
+ if (!get_scalar_rtx_for_aggregate_expr (to)
+ && (handled_component_p (to)
+ || (TREE_CODE (to) == MEM_REF
+ && (REF_REVERSE_STORAGE_ORDER (to)
+ || mem_ref_refers_to_non_mem_p (to)))
+ || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE))
{
machine_mode mode1;
poly_int64 bitsize, bitpos;
@@ -9011,6 +9013,9 @@ expand_expr_real (tree exp, rtx target,
machine_mode tmode,
ret = CONST0_RTX (tmode);
return ret ? ret : const0_rtx;
}
+ rtx x = get_scalar_rtx_for_aggregate_expr (exp);
+ if (x)
+ return x;
ret = expand_expr_real_1 (exp, target, tmode, modifier, alt_rtl,
inner_reference_p);
diff --git a/gcc/function.cc b/gcc/function.cc
index dd2c1136e07..7fe927bd36b 100644
--- a/gcc/function.cc
+++ b/gcc/function.cc
@@ -2740,6 +2740,9 @@ assign_parm_find_stack_rtl (tree parm, struct
assign_parm_data_one *data)
data->stack_parm = stack_parm;
}
+extern void set_scalar_rtx_for_aggregate_access (tree, rtx);
+extern void set_scalar_rtx_for_returns ();
+
/* A subroutine of assign_parms. Adjust DATA->ENTRY_RTL such that
it's
always valid and contiguous. */
@@ -3115,8 +3118,24 @@ assign_parm_setup_block (struct
assign_parm_data_all *all,
emit_move_insn (mem, entry_parm);
}
else
- move_block_from_reg (REGNO (entry_parm), mem,
- size_stored / UNITS_PER_WORD);
+ {
+ int regno = REGNO (entry_parm);
+ int nregs = size_stored / UNITS_PER_WORD;
+ move_block_from_reg (regno, mem, nregs);
+
+ rtx *tmps = XALLOCAVEC (rtx, nregs);
+ machine_mode mode = word_mode;
+ HOST_WIDE_INT word_size = GET_MODE_SIZE (mode).to_constant ();
+ for (int i = 0; i < nregs; i++)
+ {
+ rtx reg = gen_rtx_REG (mode, regno + i);
+ rtx off = GEN_INT (word_size * i);
+ tmps[i] = gen_rtx_EXPR_LIST (VOIDmode, reg, off);
+ }
+
+ rtx regs = gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps));
+ set_scalar_rtx_for_aggregate_access (parm, regs);
+ }
}
else if (data->stack_parm == 0 && !TYPE_EMPTY_P (data->arg.type))
{
@@ -3716,6 +3735,10 @@ assign_parms (tree fndecl)
else
set_decl_incoming_rtl (parm, data.entry_parm, false);
+ rtx incoming = DECL_INCOMING_RTL (parm);
+ if (GET_CODE (incoming) == PARALLEL)
+ set_scalar_rtx_for_aggregate_access (parm, incoming);
+
assign_parm_adjust_stack_rtl (&data);
if (assign_parm_setup_block_p (&data))
@@ -5136,6 +5159,7 @@ expand_function_start (tree subr)
{
gcc_assert (GET_CODE (hard_reg) == PARALLEL);
set_parm_rtl (res, gen_group_rtx (hard_reg));
+ set_scalar_rtx_for_returns ();
}
}
diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C
b/gcc/testsuite/g++.target/powerpc/pr102024.C
index 769585052b5..c8995cae707 100644
--- a/gcc/testsuite/g++.target/powerpc/pr102024.C
+++ b/gcc/testsuite/g++.target/powerpc/pr102024.C
@@ -5,7 +5,7 @@
// Test that a zero-width bit field in an otherwise homogeneous
aggregate
// generates a psabi warning and passes arguments in GPRs.
-// { dg-final { scan-assembler-times {\mstd\M} 4 } }
+// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } }
struct a_thing
{
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c
b/gcc/testsuite/gcc.target/powerpc/pr108073.c
new file mode 100644
index 00000000000..7dd1a4a326a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+typedef struct DF {double a[4]; short s1; short s2; short s3; short
s4; } DF;
+typedef struct SF {float a[4]; int i1; int i2; } SF;
+
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 3 {target {
has_arch_ppc64 && has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlwz\M} {target { has_arch_ppc64
&& has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlhz\M} {target { has_arch_ppc64
&& has_arch_pwr8 } } } } */
+short __attribute__ ((noipa)) foo_hi (DF a, int flag){if (flag ==
2)return a.s2+a.s3;return 0;}
+int __attribute__ ((noipa)) foo_si (SF a, int flag){if (flag ==
2)return a.i2+a.i1;return 0;}
+double __attribute__ ((noipa)) foo_df (DF arg, int flag){if (flag ==
2)return arg.a[3];else return 0.0;}
+float __attribute__ ((noipa)) foo_sf (SF arg, int flag){if (flag ==
2)return arg.a[2]; return 0;}
+float __attribute__ ((noipa)) foo_sf1 (SF arg, int flag){if (flag ==
2)return arg.a[1];return 0;}
+
+DF gdf = {{1.0,2.0,3.0,4.0}, 1, 2, 3, 4};
+SF gsf = {{1.0f,2.0f,3.0f,4.0f}, 1, 2};
+
+int main()
+{
+ if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf,
2) == 4.0
+ && foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0))
+ __builtin_abort ();
+ if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf,
1) == 0
+ && foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0))
+ __builtin_abort ();
+ return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
new file mode 100644
index 00000000000..4e1f87f7939
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
@@ -0,0 +1,6 @@
+/* PR target/65421 */
+/* { dg-options "-O2" } */
+
+typedef struct LARGE {double a[4]; int arr[32];} LARGE;
+LARGE foo (LARGE a){return a;}
+/* { dg-final { scan-assembler-times {\mmemcpy\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
new file mode 100644
index 00000000000..8a8e1a0e996
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
@@ -0,0 +1,32 @@
+/* PR target/65421 */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target powerpc_elfv2 } */
+/* { dg-require-effective-target has_arch_ppc64 } */
+
+typedef struct FLOATS
+{
+ double a[3];
+} FLOATS;
+
+/* 3 lfd after returns also optimized */
+/* FLOATS ret_arg_pt (FLOATS *a){return *a;} */
+
+/* 3 stfd */
+void st_arg (FLOATS a, FLOATS *p) {*p = a;}
+/* { dg-final { scan-assembler-times {\mstfd\M} 3 } } */
+
+/* blr */
+FLOATS ret_arg (FLOATS a) {return a;}
+
+typedef struct MIX
+{
+ double a[2];
+ long l;
+} MIX;
+
+/* std 3 param regs to return slot */
+MIX ret_arg1 (MIX a) {return a;}
+/* { dg-final { scan-assembler-times {\mstd\M} 3 } } */
+
+/* count insns */
+/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 9 } } */
>
> Richard.
>
>>
>> BR,
>> Jeff (Jiufu Guo)
>>
>>
>> Jiufu Guo via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>>
>> > Hi Martin,
>> >
>> > Jiufu Guo via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> >
>> >> Hi,
>> >>
>> >> Martin Jambor <mjam...@suse.cz> writes:
>> >>
>> >>> Hi,
>> >>>
>> >>> On Tue, May 30 2023, Richard Biener wrote:
>> >>>> On Mon, 29 May 2023, Jiufu Guo wrote:
>> >>>>
>> >>>>> Hi,
>> >>>>>
>> >>>>> Previously, I was investigating some struct parameters and returns
related
>> >>>>> PRs 69143/65421/108073.
>> >>>>>
>> >>>>> Investigating the issues case by case, and drafting patches for each of
>> >>>>> them one by one. This would help us to enhance code incrementally.
>> >>>>> While, this way, patches would interact with each other and implement
>> >>>>> different codes for similar issues (because of the different paths in
>> >>>>> gimple/rtl). We may have a common fix for those issues.
>> >>>>>
>> >>>>> We know a few other related PRs(such as meta-bug PR101926) exist. For
those
>> >>>>> PRs in different targets with different symptoms (and also different
root
>> >>>>> cause), I would expect a method could help some of them, but it may
>> >>>>> be hard to handle all of them in one fix.
>> >>>>>
>> >>>>> With investigation and check discussion for the issues, I remember a
>> >>>>> suggestion from Richard: it would be nice to perform some SRA-like
analysis
>> >>>>> for the accesses on the structs (parameter/returns).
>> >>>>> https://gcc.gnu.org/pipermail/gcc-patches/2022-November/605117.html
>> >>>>> This may be a 'fairly common method' for those issues. With this idea,
>> >>>>> I drafted a patch as below in this mail.
>> >>>>>
>> >>>>> I also thought about directly using tree-sra.cc, e.g. enhance it and
rerun it
>> >>>>> at the end of GIMPLE passes. While since some issues are introduced
inside
>> >>>>> the expander, so below patch also co-works with other parts of the
expander.
>> >>>>> And since we already have tree-sra in gimple pass, we only need to
take more
>> >>>>> care on parameter and return in this patch: other decls could be
handled
>> >>>>> well in tree-sra.
>> >>>>>
>> >>>>> The steps of this patch are:
>> >>>>> 1. Collect struct type parameters and returns, and then scan the
function to
>> >>>>> get the accesses on them. And figure out the accesses which would be
profitable
>> >>>>> to be scalarized (using registers of the parameter/return ). Now,
reading on
>> >>>>> parameter and writing on returns are checked in the current patch.
>> >>>>> 2. When/after the scalar registers are determined/expanded for the
return or
>> >>>>> parameters, compute the corresponding scalar register(s) for each
accesses of
>> >>>>> the return/parameter, and prepare the scalar RTLs for those accesses.
>> >>>>> 3. When using/expanding the accesses expression, leverage the
computed/prepared
>> >>>>> scalars directly.
>> >>>>>
>> >>>>> This patch is tested on ppc64 both LE and BE.
>> >>>>> To continue, I would ask for comments and suggestions first. And then
I would
>> >>>>> update/enhance accordingly. Thanks in advance!
>> >>>>
>> >>>> Thanks for working on this - the description above sounds exactly like
>> >>>> what should be done.
>> >>>>
>> >>>> Now - I'd like the code to re-use the access tree data structure from
>> >>>> SRA plus at least the worker creating the accesses from a stmt.
>> >>>
>> >
>> > I'm thinking about which part of the code can be re-used from
>> > ipa-sra and tree-sra.
>> > It seems there are some similar concepts between them:
>> > "access with offset/size", "collect and check candidates",
>> > "analyze accesses"...
>> >
>> > While because the purposes are different, the logic and behavior
>> > between them (ipa-sra, tree-sra, and expander-sra) are different,
>> > even for similar concepts.
>> >
>> > The same behavior and similar concept may be reusable. Below list
>> > may be part of them.
>> > *. allocate and maintain access
>> > basic access structure: offset, size, reverse
>> > *. type or expr checking
>> > *. disqualify
>> > *. scan and build expr access
>> > *. scan and walk stmts (return/assign/call/asm)
>> > *. collect candidates
>> > *. initialize/deinitialize
>> > *. access dump
>> >
>> > There are different behaviors for a similar concept.
>> > For examples:
>> > *. Access has grg/queues in tree-sra, access has nonarg in ipa-sra,
>> > and expander-sra does not check access's child/sibling yet.
>> > *. for same stmt(assign/call), different sra checks different logic.
>> > *. candidates have different checking logic: ipa-sra checks more stuff.
>> >
>> > Does this align with your thoughts? Thanks for your comments!
>> >
>> > BR,
>> > Jeff (Jiufu Guo)
>> >
>> >> Thanks Martin for your reply and thanks for your time!
>> >>
>> >>> I have had a first look at the patch but still need to look into it more
>> >>> to understand how it uses the information it gathers.
>> >>>
>> >>> My plan is to make the access-tree infrastructure of IPA-SRA more
>> >>> generic and hopefully usable even for this purpose, rather than the one
>> >>> in tree-sra.cc. But that really builds a tree of accesses, bailing out
>> >>> on any partial overlaps, for example, which may not be the right thing
>> >>> here since I don't see any tree-building here.
>> >>
>> >> Yeap, both in tree-sra and ipa-sra, there are concepts about
>> >> "access" and "scan functions/stmts". In this light-sra, these concepts
>> >> are also used. And you may notice that ipa-sra and tree-sra have more
>> >> logic than the current 'light-expand-sra'.
>> >>
>> >> Currently, the 'light-expand-sra' just takes care of a few things: reading
>> >> from parameter, writing to returns, and disabling sra if address-taken.
>> >> As you notice, now the "access" in this patch is not in a 'tree-struct',
>> >> it is just a 'flat' (or say map & vector). And overlaps between
>> >> accesses are not checked because they are all just reading (for parm).
>> >>
>> >> When we take care of more stuff: passing to call argument, occur in
>> >> memory assignment, occur in inline asm... This light-expander-sra would be
>> >> more and more like tree-sra and ipa-sra. And it would be good to leverage
>> >> more capabilities from tree-sra and ipa-sra. So, I agree that it would be
>> >> a great idea to share and reuse the same struct.
>> >>
>> >>> But I still need to
>> >>> properly read set_scalar_rtx_for_aggregate_access function in the patch,
>> >>> which I plan to do next week.
>> >>
>> >> set_scalar_rtx_for_aggregate_access is another key part of this patch.
>> >> Different from tree-sra/ipa-sra (which creates new scalars SSA for each
>> >> access), this patch invokes "set_scalar_rtx_for_aggregate_access" to
>> >> create an rtx expression for each access. Now, this part may not common
>> >> with tree-sra and ipa-sra.
>> >>
>> >> This function is invoked for each parameter if the parameter is
>> >> aggregate type and passed via registers.
>> >> For each access about this parameter, the function creates an rtx
>> >> according to the offset/size/mode of the access. The created rtx maybe:
>> >> 1. one rtx pseudo corresponds to an incoming reg,
>> >> 2. one rtx pseudo which is assigned by a part of incoming reg after
>> >> shift and mode adjust,
>> >> 3. a parallel rtx contains a few rtx pseudos corresponding to the
>> >> incoming registers.
>> >> For return, only 1 and 3 are ok.
>> >>
>> >> BR,
>> >> Jeff (Jiufu Guo)
>> >>
>> >>>
>> >>> Thanks,
>> >>>
>> >>> Martin
>> >>>
>> >>>>
>> >>>> The RTL expansion code already does a sweep over stmts in
>> >>>> discover_nonconstant_array_refs which makes sure RTL expansion doesn't
>> >>>> scalarize (aka assign non-stack) to variables which have accesses
>> >>>> that would later eventually FAIL to expand when operating on registers.
>> >>>> That's very much related to the task at hand so we should try to
>> >>>> at least merge the CFG walks of both (it produces a forced_stack_vars
>> >>>> bitmap).
>> >>>>
>> >>>> Can you work together with Martin to split out the access tree
>> >>>> data structure and share it?
>> >>>>
>> >>>> I didn't look in detail as of how you make use of the information
>> >>>> yet.
>> >>>>
>> >>>> Thanks,
>> >>>> Richard.
>> >>>>
>> >>>>>
>> >>>>> BR,
>> >>>>> Jeff (Jiufu)
>> >>>>>
>> >>>>>
>> >>>>> ---
>> >>>>> gcc/cfgexpand.cc | 567 ++++++++++++++++++-
>> >>>>> gcc/expr.cc | 15 +-
>> >>>>> gcc/function.cc | 26 +-
>> >>>>> gcc/opts.cc | 8 +-
>> >>>>> gcc/testsuite/g++.target/powerpc/pr102024.C | 2 +-
>> >>>>> gcc/testsuite/gcc.target/powerpc/pr108073.c | 29 +
>> >>>>> gcc/testsuite/gcc.target/powerpc/pr65421-1.c | 6 +
>> >>>>> gcc/testsuite/gcc.target/powerpc/pr65421-2.c | 32 ++
>> >>>>> 8 files changed, 675 insertions(+), 10 deletions(-)
>> >>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073.c
>> >>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-1.c
>> >>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-2.c
>> >>>>>
>> >>>>> diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
>> >>>>> index 85a93a547c0..95c29b6b6fe 100644
>> >>>>> --- a/gcc/cfgexpand.cc
>> >>>>> +++ b/gcc/cfgexpand.cc
>> >>>>> @@ -97,6 +97,564 @@ static bool defer_stack_allocation (tree, bool);
>> >>>>>
>> >>>>> static void record_alignment_for_reg_var (unsigned int);
>> >>>>>
>> >>>>> +/* For light SRA in expander about paramaters and returns. */
>> >>>>> +namespace {
>> >>>>> +
>> >>>>> +struct access
>> >>>>> +{
>> >>>>> + /* Each accessing on the aggragate is about OFFSET/SIZE and BASE.
*/
>> >>>>> + HOST_WIDE_INT offset;
>> >>>>> + HOST_WIDE_INT size;
>> >>>>> + tree base;
>> >>>>> + bool writing;
>> >>>>> +
>> >>>>> + /* The context expression of this access. */
>> >>>>> + tree expr;
>> >>>>> +
>> >>>>> + /* The rtx for the access: link to incoming/returning register(s).
*/
>> >>>>> + rtx rtx_val;
>> >>>>> +};
>> >>>>> +
>> >>>>> +typedef struct access *access_p;
>> >>>>> +
>> >>>>> +/* Expr (tree) -> Acess (access_p) map. */
>> >>>>> +static hash_map<tree, access_p> *expr_access_vec;
>> >>>>> +
>> >>>>> +/* Base (tree) -> Vector (vec<access_p> *) map. */
>> >>>>> +static hash_map<tree, auto_vec<access_p> > *base_access_vec;
>> >>>>> +
>> >>>>> +/* Return a vector of pointers to accesses for the variable given in
BASE or
>> >>>>> + NULL if there is none. */
>> >>>>> +
>> >>>>> +static vec<access_p> *
>> >>>>> +get_base_access_vector (tree base)
>> >>>>> +{
>> >>>>> + return base_access_vec->get (base);
>> >>>>> +}
>> >>>>> +
>> >>>>> +/* Remove DECL from candidates for SRA. */
>> >>>>> +static void
>> >>>>> +disqualify_candidate (tree decl)
>> >>>>> +{
>> >>>>> + decl = get_base_address (decl);
>> >>>>> + base_access_vec->remove (decl);
>> >>>>> +}
>> >>>>> +
>> >>>>> +/* Create and insert access for EXPR. Return created access, or NULL
if it is
>> >>>>> + not possible. */
>> >>>>> +static struct access *
>> >>>>> +create_access (tree expr, bool write)
>> >>>>> +{
>> >>>>> + poly_int64 poffset, psize, pmax_size;
>> >>>>> + bool reverse;
>> >>>>> +
>> >>>>> + tree base
>> >>>>> + = get_ref_base_and_extent (expr, &poffset, &psize, &pmax_size,
&reverse);
>> >>>>> +
>> >>>>> + if (!DECL_P (base))
>> >>>>> + return NULL;
>> >>>>> +
>> >>>>> + vec<access_p> *access_vec = get_base_access_vector (base);
>> >>>>> + if (!access_vec)
>> >>>>> + return NULL;
>> >>>>> +
>> >>>>> + /* TODO: support reverse. */
>> >>>>> + if (reverse)
>> >>>>> + {
>> >>>>> + disqualify_candidate (expr);
>> >>>>> + return NULL;
>> >>>>> + }
>> >>>>> +
>> >>>>> + HOST_WIDE_INT offset, size, max_size;
>> >>>>> + if (!poffset.is_constant (&offset) || !psize.is_constant (&size)
>> >>>>> + || !pmax_size.is_constant (&max_size))
>> >>>>> + return NULL;
>> >>>>> +
>> >>>>> + if (size != max_size || size == 0 || offset < 0 || size < 0
>> >>>>> + || offset + size > tree_to_shwi (DECL_SIZE (base)))
>> >>>>> + return NULL;
>> >>>>> +
>> >>>>> + struct access *access = XNEWVEC (struct access, 1);
>> >>>>> +
>> >>>>> + memset (access, 0, sizeof (struct access));
>> >>>>> + access->base = base;
>> >>>>> + access->offset = offset;
>> >>>>> + access->size = size;
>> >>>>> + access->expr = expr;
>> >>>>> + access->writing = write;
>> >>>>> + access->rtx_val = NULL_RTX;
>> >>>>> +
>> >>>>> + access_vec->safe_push (access);
>> >>>>> +
>> >>>>> + return access;
>> >>>>> +}
>> >>>>> +
>> >>>>> +/* Return true if VAR is a candidate for SRA. */
>> >>>>> +static bool
>> >>>>> +add_sra_candidate (tree var)
>> >>>>> +{
>> >>>>> + tree type = TREE_TYPE (var);
>> >>>>> +
>> >>>>> + if (!AGGREGATE_TYPE_P (type) || TREE_THIS_VOLATILE (var)
>> >>>>> + || !COMPLETE_TYPE_P (type) || !tree_fits_shwi_p (TYPE_SIZE
(type))
>> >>>>> + || tree_to_shwi (TYPE_SIZE (type)) == 0
>> >>>>> + || TYPE_MAIN_VARIANT (type) == TYPE_MAIN_VARIANT
(va_list_type_node))
>> >>>>> + return false;
>> >>>>> +
>> >>>>> + base_access_vec->get_or_insert (var);
>> >>>>> +
>> >>>>> + return true;
>> >>>>> +}
>> >>>>> +
>> >>>>> +/* Callback of walk_stmt_load_store_addr_ops visit_addr used to remove
>> >>>>> + operands with address taken. */
>> >>>>> +static tree
>> >>>>> +visit_addr (tree *tp, int *, void *)
>> >>>>> +{
>> >>>>> + tree op = *tp;
>> >>>>> + if (op && DECL_P (op))
>> >>>>> + disqualify_candidate (op);
>> >>>>> +
>> >>>>> + return NULL;
>> >>>>> +}
>> >>>>> +
>> >>>>> +/* Scan expression EXPR and create access structures for all accesses
to
>> >>>>> + candidates for scalarization. Return the created access or NULL
if none is
>> >>>>> + created. */
>> >>>>> +static struct access *
>> >>>>> +build_access_from_expr (tree expr, bool write)
>> >>>>> +{
>> >>>>> + if (TREE_CODE (expr) == VIEW_CONVERT_EXPR)
>> >>>>> + expr = TREE_OPERAND (expr, 0);
>> >>>>> +
>> >>>>> + if (TREE_CODE (expr) == BIT_FIELD_REF || storage_order_barrier_p
(expr)
>> >>>>> + || TREE_THIS_VOLATILE (expr))
>> >>>>> + {
>> >>>>> + disqualify_candidate (expr);
>> >>>>> + return NULL;
>> >>>>> + }
>> >>>>> +
>> >>>>> + switch (TREE_CODE (expr))
>> >>>>> + {
>> >>>>> + case MEM_REF: {
>> >>>>> + tree op = TREE_OPERAND (expr, 0);
>> >>>>> + if (TREE_CODE (op) == ADDR_EXPR)
>> >>>>> + disqualify_candidate (TREE_OPERAND (op, 0));
>> >>>>> + break;
>> >>>>> + }
>> >>>>> + case ADDR_EXPR:
>> >>>>> + case IMAGPART_EXPR:
>> >>>>> + case REALPART_EXPR:
>> >>>>> + disqualify_candidate (TREE_OPERAND (expr, 0));
>> >>>>> + break;
>> >>>>> + case VAR_DECL:
>> >>>>> + case PARM_DECL:
>> >>>>> + case RESULT_DECL:
>> >>>>> + case COMPONENT_REF:
>> >>>>> + case ARRAY_REF:
>> >>>>> + case ARRAY_RANGE_REF:
>> >>>>> + return create_access (expr, write);
>> >>>>> + break;
>> >>>>> + default:
>> >>>>> + break;
>> >>>>> + }
>> >>>>> +
>> >>>>> + return NULL;
>> >>>>> +}
>> >>>>> +
>> >>>>> +/* Scan function and look for interesting expressions and create
access
>> >>>>> + structures for them. */
>> >>>>> +static void
>> >>>>> +scan_function (void)
>> >>>>> +{
>> >>>>> + basic_block bb;
>> >>>>> +
>> >>>>> + FOR_EACH_BB_FN (bb, cfun)
>> >>>>> + {
>> >>>>> + for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
>> >>>>> + gsi_next (&gsi))
>> >>>>> + {
>> >>>>> + gphi *phi = gsi.phi ();
>> >>>>> + for (size_t i = 0; i < gimple_phi_num_args (phi); i++)
>> >>>>> + {
>> >>>>> + tree t = gimple_phi_arg_def (phi, i);
>> >>>>> + walk_tree (&t, visit_addr, NULL, NULL);
>> >>>>> + }
>> >>>>> + }
>> >>>>> +
>> >>>>> + for (gimple_stmt_iterator gsi =
gsi_start_nondebug_after_labels_bb (bb);
>> >>>>> + !gsi_end_p (gsi); gsi_next_nondebug (&gsi))
>> >>>>> + {
>> >>>>> + gimple *stmt = gsi_stmt (gsi);
>> >>>>> + switch (gimple_code (stmt))
>> >>>>> + {
>> >>>>> + case GIMPLE_RETURN: {
>> >>>>> + tree r = gimple_return_retval (as_a<greturn *> (stmt));
>> >>>>> + if (r && VAR_P (r) && r != DECL_RESULT
(current_function_decl))
>> >>>>> + build_access_from_expr (r, true);
>> >>>>> + }
>> >>>>> + break;
>> >>>>> + case GIMPLE_ASSIGN:
>> >>>>> + if (gimple_assign_single_p (stmt) && !gimple_clobber_p (stmt))
>> >>>>> + {
>> >>>>> + tree lhs = gimple_assign_lhs (stmt);
>> >>>>> + tree rhs = gimple_assign_rhs1 (stmt);
>> >>>>> + if (TREE_CODE (rhs) == CONSTRUCTOR)
>> >>>>> + disqualify_candidate (lhs);
>> >>>>> + else
>> >>>>> + {
>> >>>>> + build_access_from_expr (rhs, false);
>> >>>>> + build_access_from_expr (lhs, true);
>> >>>>> + }
>> >>>>> + }
>> >>>>> + break;
>> >>>>> + default:
>> >>>>> + walk_gimple_op (stmt, visit_addr, NULL);
>> >>>>> + break;
>> >>>>> + }
>> >>>>> + }
>> >>>>> + }
>> >>>>> +}
>> >>>>> +
>> >>>>> +/* Collect the parameter and returns with type which is suitable for
>> >>>>> + * scalarization. */
>> >>>>> +static bool
>> >>>>> +collect_light_sra_candidates (void)
>> >>>>> +{
>> >>>>> + bool ret = false;
>> >>>>> +
>> >>>>> + /* Collect parameters. */
>> >>>>> + for (tree parm = DECL_ARGUMENTS (current_function_decl); parm;
>> >>>>> + parm = DECL_CHAIN (parm))
>> >>>>> + ret |= add_sra_candidate (parm);
>> >>>>> +
>> >>>>> + /* Collect VARs on returns. */
>> >>>>> + if (DECL_RESULT (current_function_decl))
>> >>>>> + {
>> >>>>> + edge_iterator ei;
>> >>>>> + edge e;
>> >>>>> + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
>> >>>>> + if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
>> >>>>> + {
>> >>>>> + tree val = gimple_return_retval (r);
>> >>>>> + if (val && VAR_P (val))
>> >>>>> + ret |= add_sra_candidate (val);
>> >>>>> + }
>> >>>>> + }
>> >>>>> +
>> >>>>> + return ret;
>> >>>>> +}
>> >>>>> +
>> >>>>> +/* Now, only scalarize the parms only with reading
>> >>>>> + or returns only with writing. */
>> >>>>> +bool
>> >>>>> +check_access_vec (tree const &base, auto_vec<access_p> const
&access_vec,
>> >>>>> + auto_vec<tree> *unqualify_vec)
>> >>>>> +{
>> >>>>> + bool read = false;
>> >>>>> + bool write = false;
>> >>>>> + for (unsigned int j = 0; j < access_vec.length (); j++)
>> >>>>> + {
>> >>>>> + struct access *access = access_vec[j];
>> >>>>> + if (access->writing)
>> >>>>> + write = true;
>> >>>>> + else
>> >>>>> + read = true;
>> >>>>> +
>> >>>>> + if (write && read)
>> >>>>> + break;
>> >>>>> + }
>> >>>>> + if ((write && read) || (!write && !read))
>> >>>>> + unqualify_vec->safe_push (base);
>> >>>>> +
>> >>>>> + return true;
>> >>>>> +}
>> >>>>> +
>> >>>>> +/* Analyze all the accesses, remove those inprofitable candidates.
>> >>>>> + And build the expr->access map. */
>> >>>>> +static void
>> >>>>> +analyze_accesses ()
>> >>>>> +{
>> >>>>> + auto_vec<tree> unqualify_vec;
>> >>>>> + base_access_vec->traverse<auto_vec<tree> *, check_access_vec> (
>> >>>>> + &unqualify_vec);
>> >>>>> +
>> >>>>> + tree base;
>> >>>>> + unsigned i;
>> >>>>> + FOR_EACH_VEC_ELT (unqualify_vec, i, base)
>> >>>>> + disqualify_candidate (base);
>> >>>>> +}
>> >>>>> +
>> >>>>> +static void
>> >>>>> +prepare_expander_sra ()
>> >>>>> +{
>> >>>>> + if (optimize <= 0)
>> >>>>> + return;
>> >>>>> +
>> >>>>> + base_access_vec = new hash_map<tree, auto_vec<access_p> >;
>> >>>>> + expr_access_vec = new hash_map<tree, access_p>;
>> >>>>> +
>> >>>>> + if (collect_light_sra_candidates ())
>> >>>>> + {
>> >>>>> + scan_function ();
>> >>>>> + analyze_accesses ();
>> >>>>> + }
>> >>>>> +}
>> >>>>> +
>> >>>>> +static void
>> >>>>> +free_expander_sra ()
>> >>>>> +{
>> >>>>> + if (optimize <= 0 || !expr_access_vec)
>> >>>>> + return;
>> >>>>> + delete expr_access_vec;
>> >>>>> + expr_access_vec = 0;
>> >>>>> + delete base_access_vec;
>> >>>>> + base_access_vec = 0;
>> >>>>> +}
>> >>>>> +} /* namespace */
>> >>>>> +
>> >>>>> +/* Check If there is an sra access for the expr.
>> >>>>> + Return the correspond scalar sym for the access. */
>> >>>>> +rtx
>> >>>>> +get_scalar_rtx_for_aggregate_expr (tree expr)
>> >>>>> +{
>> >>>>> + if (!expr_access_vec)
>> >>>>> + return NULL_RTX;
>> >>>>> + access_p *access = expr_access_vec->get (expr);
>> >>>>> + return access ? (*access)->rtx_val : NULL_RTX;
>> >>>>> +}
>> >>>>> +
>> >>>>> +extern rtx
>> >>>>> +expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
int);
>> >>>>> +
>> >>>>> +/* Compute/Set RTX registers for those accesses on BASE. */
>> >>>>> +void
>> >>>>> +set_scalar_rtx_for_aggregate_access (tree base, rtx regs)
>> >>>>> +{
>> >>>>> + if (!base_access_vec)
>> >>>>> + return;
>> >>>>> + vec<access_p> *access_vec = get_base_access_vector (base);
>> >>>>> + if (!access_vec)
>> >>>>> + return;
>> >>>>> +
>> >>>>> + /* Go through each access, compute corresponding rtx(regs or
subregs)
>> >>>>> + for the expression. */
>> >>>>> + int n = access_vec->length ();
>> >>>>> + int cur_access_index = 0;
>> >>>>> + for (; cur_access_index < n; cur_access_index++)
>> >>>>> + {
>> >>>>> + access_p acc = (*access_vec)[cur_access_index];
>> >>>>> + machine_mode expr_mode = TYPE_MODE (TREE_TYPE (acc->expr));
>> >>>>> + /* non BLK in mult registers*/
>> >>>>> + if (expr_mode != BLKmode
>> >>>>> + && known_gt (acc->size, GET_MODE_BITSIZE (word_mode)))
>> >>>>> + break;
>> >>>>> +
>> >>>>> + int start_index = -1;
>> >>>>> + int end_index = -1;
>> >>>>> + HOST_WIDE_INT left_margin_bits = 0;
>> >>>>> + HOST_WIDE_INT right_margin_bits = 0;
>> >>>>> + int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
>> >>>>> + for (; cur_index < XVECLEN (regs, 0); cur_index++)
>> >>>>> + {
>> >>>>> + rtx slot = XVECEXP (regs, 0, cur_index);
>> >>>>> + HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
>> >>>>> + HOST_WIDE_INT size
>> >>>>> + = GET_MODE_BITSIZE (GET_MODE (XEXP (slot, 0))).to_constant ();
>> >>>>> + if (off <= acc->offset && off + size > acc->offset)
>> >>>>> + {
>> >>>>> + start_index = cur_index;
>> >>>>> + left_margin_bits = acc->offset - off;
>> >>>>> + }
>> >>>>> + if (off + size >= acc->offset + acc->size)
>> >>>>> + {
>> >>>>> + end_index = cur_index;
>> >>>>> + right_margin_bits = off + size - (acc->offset + acc->size);
>> >>>>> + break;
>> >>>>> + }
>> >>>>> + }
>> >>>>> + /* accessing pading and outof bound. */
>> >>>>> + if (start_index < 0 || end_index < 0)
>> >>>>> + break;
>> >>>>> +
>> >>>>> + /* Need a parallel for possible multi-registers. */
>> >>>>> + if (expr_mode == BLKmode || end_index > start_index)
>> >>>>> + {
>> >>>>> + /* Can not support start from middle of a register. */
>> >>>>> + if (left_margin_bits != 0)
>> >>>>> + break;
>> >>>>> +
>> >>>>> + int len = end_index - start_index + 1;
>> >>>>> + const int margin = 3; /* more space for SI, HI, QI. */
>> >>>>> + rtx *tmps = XALLOCAVEC (rtx, len + (right_margin_bits ? margin :
0));
>> >>>>> +
>> >>>>> + HOST_WIDE_INT start_off
>> >>>>> + = UINTVAL (XEXP (XVECEXP (regs, 0, start_index), 1));
>> >>>>> + int pos = 0;
>> >>>>> + for (; pos < len - (right_margin_bits ? 1 : 0); pos++)
>> >>>>> + {
>> >>>>> + int index = start_index + pos;
>> >>>>> + rtx orig_reg = XEXP (XVECEXP (regs, 0, index), 0);
>> >>>>> + machine_mode mode = GET_MODE (orig_reg);
>> >>>>> + rtx reg = NULL_RTX;
>> >>>>> + if (HARD_REGISTER_P (orig_reg))
>> >>>>> + {
>> >>>>> + /* Reading from param hard reg need to be moved to a temp.
*/
>> >>>>> + gcc_assert (!acc->writing);
>> >>>>> + reg = gen_reg_rtx (mode);
>> >>>>> + emit_move_insn (reg, orig_reg);
>> >>>>> + }
>> >>>>> + else
>> >>>>> + reg = orig_reg;
>> >>>>> +
>> >>>>> + HOST_WIDE_INT off = UINTVAL (XEXP (XVECEXP (regs, 0, index),
1));
>> >>>>> + tmps[pos]
>> >>>>> + = gen_rtx_EXPR_LIST (mode, reg, GEN_INT (off - start_off));
>> >>>>> + }
>> >>>>> +
>> >>>>> + /* There are some fields are in part of registers. */
>> >>>>> + if (right_margin_bits != 0)
>> >>>>> + {
>> >>>>> + if (acc->writing)
>> >>>>> + break;
>> >>>>> +
>> >>>>> + gcc_assert ((right_margin_bits % BITS_PER_UNIT) == 0);
>> >>>>> + HOST_WIDE_INT off_byte
>> >>>>> + = UINTVAL (XEXP (XVECEXP (regs, 0, end_index), 1)) -
start_off;
>> >>>>> + rtx orig_reg = XEXP (XVECEXP (regs, 0, end_index), 0);
>> >>>>> + machine_mode orig_mode = GET_MODE (orig_reg);
>> >>>>> + gcc_assert (GET_MODE_CLASS (orig_mode) == MODE_INT);
>> >>>>> +
>> >>>>> + machine_mode mode_aux[] = {SImode, HImode, QImode};
>> >>>>> + HOST_WIDE_INT reg_size
>> >>>>> + = GET_MODE_BITSIZE (orig_mode).to_constant ();
>> >>>>> + HOST_WIDE_INT off_bits = 0;
>> >>>>> + for (unsigned long j = 0;
>> >>>>> + j < sizeof (mode_aux) / sizeof (mode_aux[0]); j++)
>> >>>>> + {
>> >>>>> + HOST_WIDE_INT submode_bitsize
>> >>>>> + = GET_MODE_BITSIZE (mode_aux[j]).to_constant ();
>> >>>>> + if (reg_size - right_margin_bits - off_bits
>> >>>>> + >= submode_bitsize)
>> >>>>> + {
>> >>>>> + rtx reg = gen_reg_rtx (orig_mode);
>> >>>>> + emit_move_insn (reg, orig_reg);
>> >>>>> +
>> >>>>> + poly_uint64 lowpart_off
>> >>>>> + = subreg_lowpart_offset (mode_aux[j], orig_mode);
>> >>>>> + int lowpart_off_bits
>> >>>>> + = lowpart_off.to_constant () * BITS_PER_UNIT;
>> >>>>> + int shift_bits = lowpart_off_bits >= off_bits
>> >>>>> + ? (lowpart_off_bits - off_bits)
>> >>>>> + : (off_bits - lowpart_off_bits);
>> >>>>> + if (shift_bits > 0)
>> >>>>> + reg = expand_shift (RSHIFT_EXPR, orig_mode, reg,
>> >>>>> + shift_bits, NULL, 1);
>> >>>>> + rtx subreg = gen_lowpart (mode_aux[j], reg);
>> >>>>> + rtx off = GEN_INT (off_byte);
>> >>>>> + tmps[pos++]
>> >>>>> + = gen_rtx_EXPR_LIST (mode_aux[j], subreg, off);
>> >>>>> + off_byte += submode_bitsize / BITS_PER_UNIT;
>> >>>>> + off_bits += submode_bitsize;
>> >>>>> + }
>> >>>>> + }
>> >>>>> + }
>> >>>>> +
>> >>>>> + /* Currently, PARALLELs with register elements for param/returns
>> >>>>> + are using BLKmode. */
>> >>>>> + acc->rtx_val = gen_rtx_PARALLEL (TYPE_MODE (TREE_TYPE (acc->expr)),
>> >>>>> + gen_rtvec_v (pos, tmps));
>> >>>>> + continue;
>> >>>>> + }
>> >>>>> +
>> >>>>> + /* The access corresponds to one reg. */
>> >>>>> + if (end_index == start_index && left_margin_bits == 0
>> >>>>> + && right_margin_bits == 0)
>> >>>>> + {
>> >>>>> + rtx orig_reg = XEXP (XVECEXP (regs, 0, start_index), 0);
>> >>>>> + rtx reg = NULL_RTX;
>> >>>>> + if (HARD_REGISTER_P (orig_reg))
>> >>>>> + {
>> >>>>> + /* Reading from param hard reg need to be moved to a temp. */
>> >>>>> + gcc_assert (!acc->writing);
>> >>>>> + reg = gen_reg_rtx (GET_MODE (orig_reg));
>> >>>>> + emit_move_insn (reg, orig_reg);
>> >>>>> + }
>> >>>>> + else
>> >>>>> + reg = orig_reg;
>> >>>>> + if (GET_MODE (orig_reg) != expr_mode)
>> >>>>> + reg = gen_lowpart (expr_mode, reg);
>> >>>>> +
>> >>>>> + acc->rtx_val = reg;
>> >>>>> + continue;
>> >>>>> + }
>> >>>>> +
>> >>>>> + /* It is accessing a filed which is part of a register. */
>> >>>>> + scalar_int_mode imode;
>> >>>>> + if (!acc->writing && end_index == start_index
>> >>>>> + && int_mode_for_size (acc->size, 1).exists (&imode))
>> >>>>> + {
>> >>>>> + /* get and copy original register inside the param. */
>> >>>>> + rtx orig_reg = XEXP (XVECEXP (regs, 0, start_index), 0);
>> >>>>> + machine_mode mode = GET_MODE (orig_reg);
>> >>>>> + gcc_assert (GET_MODE_CLASS (mode) == MODE_INT);
>> >>>>> + rtx reg = gen_reg_rtx (mode);
>> >>>>> + emit_move_insn (reg, orig_reg);
>> >>>>> +
>> >>>>> + /* shift to expect part. */
>> >>>>> + poly_uint64 lowpart_off = subreg_lowpart_offset (imode, mode);
>> >>>>> + int lowpart_off_bits = lowpart_off.to_constant () * BITS_PER_UNIT;
>> >>>>> + int shift_bits = lowpart_off_bits >= left_margin_bits
>> >>>>> + ? (lowpart_off_bits - left_margin_bits)
>> >>>>> + : (left_margin_bits - lowpart_off_bits);
>> >>>>> + if (shift_bits > 0)
>> >>>>> + reg = expand_shift (RSHIFT_EXPR, mode, reg, shift_bits, NULL, 1);
>> >>>>> +
>> >>>>> + /* move corresond part subreg to result. */
>> >>>>> + rtx subreg = gen_lowpart (imode, reg);
>> >>>>> + rtx result = gen_reg_rtx (imode);
>> >>>>> + emit_move_insn (result, subreg);
>> >>>>> +
>> >>>>> + if (expr_mode != imode)
>> >>>>> + result = gen_lowpart (expr_mode, result);
>> >>>>> +
>> >>>>> + acc->rtx_val = result;
>> >>>>> + continue;
>> >>>>> + }
>> >>>>> +
>> >>>>> + break;
>> >>>>> + }
>> >>>>> +
>> >>>>> + /* Some access expr(s) are not scalarized. */
>> >>>>> + if (cur_access_index != n)
>> >>>>> + disqualify_candidate (base);
>> >>>>> + else
>> >>>>> + {
>> >>>>> + /* Add elements to expr->access map. */
>> >>>>> + for (int j = 0; j < n; j++)
>> >>>>> + {
>> >>>>> + access_p access = (*access_vec)[j];
>> >>>>> + expr_access_vec->put (access->expr, access);
>> >>>>> + }
>> >>>>> + }
>> >>>>> +}
>> >>>>> +
>> >>>>> +void
>> >>>>> +set_scalar_rtx_for_returns ()
>> >>>>> +{
>> >>>>> + tree res = DECL_RESULT (current_function_decl);
>> >>>>> + gcc_assert (res);
>> >>>>> + edge_iterator ei;
>> >>>>> + edge e;
>> >>>>> + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
>> >>>>> + if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
>> >>>>> + {
>> >>>>> + tree val = gimple_return_retval (r);
>> >>>>> + if (val && VAR_P (val))
>> >>>>> + set_scalar_rtx_for_aggregate_access (val, DECL_RTL (res));
>> >>>>> + }
>> >>>>> +}
>> >>>>> +
>> >>>>> /* Return an expression tree corresponding to the RHS of GIMPLE
>> >>>>> statement STMT. */
>> >>>>>
>> >>>>> @@ -3778,7 +4336,8 @@ expand_return (tree retval)
>> >>>>>
>> >>>>> /* If we are returning the RESULT_DECL, then the value has already
>> >>>>> been stored into it, so we don't have to do anything special. */
>> >>>>> - if (TREE_CODE (retval_rhs) == RESULT_DECL)
>> >>>>> + if (TREE_CODE (retval_rhs) == RESULT_DECL
>> >>>>> + || get_scalar_rtx_for_aggregate_expr (retval_rhs))
>> >>>>> expand_value_return (result_rtl);
>> >>>>>
>> >>>>> /* If the result is an aggregate that is being returned in one (or
more)
>> >>>>> @@ -4422,6 +4981,9 @@ expand_debug_expr (tree exp)
>> >>>>> int unsignedp = TYPE_UNSIGNED (TREE_TYPE (exp));
>> >>>>> addr_space_t as;
>> >>>>> scalar_int_mode op0_mode, op1_mode, addr_mode;
>> >>>>> + rtx x = get_scalar_rtx_for_aggregate_expr (exp);
>> >>>>> + if (x)
>> >>>>> + return NULL_RTX;/* optimized out. */
>> >>>>>
>> >>>>> switch (TREE_CODE_CLASS (TREE_CODE (exp)))
>> >>>>> {
>> >>>>> @@ -6630,6 +7192,8 @@ pass_expand::execute (function *fun)
>> >>>>> avoid_deep_ter_for_debug (gsi_stmt (gsi), 0);
>> >>>>> }
>> >>>>>
>> >>>>> + prepare_expander_sra ();
>> >>>>> +
>> >>>>> /* Mark arrays indexed with non-constant indices with
TREE_ADDRESSABLE. */
>> >>>>> auto_bitmap forced_stack_vars;
>> >>>>> discover_nonconstant_array_refs (forced_stack_vars);
>> >>>>> @@ -7062,6 +7626,7 @@ pass_expand::execute (function *fun)
>> >>>>> loop_optimizer_finalize ();
>> >>>>> }
>> >>>>>
>> >>>>> + free_expander_sra ();
>> >>>>> timevar_pop (TV_POST_EXPAND);
>> >>>>>
>> >>>>> return 0;
>> >>>>> diff --git a/gcc/expr.cc b/gcc/expr.cc
>> >>>>> index 56b51876f80..b970f98e689 100644
>> >>>>> --- a/gcc/expr.cc
>> >>>>> +++ b/gcc/expr.cc
>> >>>>> @@ -100,6 +100,7 @@ static void do_tablejump (rtx, machine_mode, rtx,
rtx, rtx,
>> >>>>> static rtx const_vector_from_tree (tree);
>> >>>>> static tree tree_expr_size (const_tree);
>> >>>>> static void convert_mode_scalar (rtx, rtx, int);
>> >>>>> +rtx get_scalar_rtx_for_aggregate_expr (tree);
>> >>>>>
>> >>>>>
>> >>>>> /* This is run to set up which modes can be used
>> >>>>> @@ -5623,11 +5624,12 @@ expand_assignment (tree to, tree from, bool
nontemporal)
>> >>>>> Assignment of an array element at a constant index, and
assignment of
>> >>>>> an array element in an unaligned packed structure field, has the
same
>> >>>>> problem. Same for (partially) storing into a non-memory object.
*/
>> >>>>> - if (handled_component_p (to)
>> >>>>> - || (TREE_CODE (to) == MEM_REF
>> >>>>> - && (REF_REVERSE_STORAGE_ORDER (to)
>> >>>>> - || mem_ref_refers_to_non_mem_p (to)))
>> >>>>> - || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE)
>> >>>>> + if (!get_scalar_rtx_for_aggregate_expr (to)
>> >>>>> + && (handled_component_p (to)
>> >>>>> + || (TREE_CODE (to) == MEM_REF
>> >>>>> + && (REF_REVERSE_STORAGE_ORDER (to)
>> >>>>> + || mem_ref_refers_to_non_mem_p (to)))
>> >>>>> + || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE))
>> >>>>> {
>> >>>>> machine_mode mode1;
>> >>>>> poly_int64 bitsize, bitpos;
>> >>>>> @@ -8995,6 +8997,9 @@ expand_expr_real (tree exp, rtx target,
machine_mode tmode,
>> >>>>> ret = CONST0_RTX (tmode);
>> >>>>> return ret ? ret : const0_rtx;
>> >>>>> }
>> >>>>> + rtx x = get_scalar_rtx_for_aggregate_expr (exp);
>> >>>>> + if (x)
>> >>>>> + return x;
>> >>>>>
>> >>>>> ret = expand_expr_real_1 (exp, target, tmode, modifier, alt_rtl,
>> >>>>> inner_reference_p);
>> >>>>> diff --git a/gcc/function.cc b/gcc/function.cc
>> >>>>> index 82102ed78d7..262d3f17e72 100644
>> >>>>> --- a/gcc/function.cc
>> >>>>> +++ b/gcc/function.cc
>> >>>>> @@ -2742,6 +2742,9 @@ assign_parm_find_stack_rtl (tree parm, struct
assign_parm_data_one *data)
>> >>>>> data->stack_parm = stack_parm;
>> >>>>> }
>> >>>>>
>> >>>>> +extern void
>> >>>>> +set_scalar_rtx_for_aggregate_access (tree, rtx);
>> >>>>> +
>> >>>>> /* A subroutine of assign_parms. Adjust DATA->ENTRY_RTL such that
it's
>> >>>>> always valid and contiguous. */
>> >>>>>
>> >>>>> @@ -3117,8 +3120,21 @@ assign_parm_setup_block (struct
assign_parm_data_all *all,
>> >>>>> emit_move_insn (mem, entry_parm);
>> >>>>> }
>> >>>>> else
>> >>>>> - move_block_from_reg (REGNO (entry_parm), mem,
>> >>>>> - size_stored / UNITS_PER_WORD);
>> >>>>> + {
>> >>>>> + int regno = REGNO (entry_parm);
>> >>>>> + int nregs = size_stored / UNITS_PER_WORD;
>> >>>>> + move_block_from_reg (regno, mem, nregs);
>> >>>>> +
>> >>>>> + rtx *tmps = XALLOCAVEC (rtx, nregs);
>> >>>>> + machine_mode mode = word_mode;
>> >>>>> + for (int i = 0; i < nregs; i++)
>> >>>>> + tmps[i] = gen_rtx_EXPR_LIST (
>> >>>>> + VOIDmode, gen_rtx_REG (mode, regno + i),
>> >>>>> + GEN_INT (GET_MODE_SIZE (mode).to_constant () * i));
>> >>>>> +
>> >>>>> + rtx regs = gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps));
>> >>>>> + set_scalar_rtx_for_aggregate_access (parm, regs);
>> >>>>> + }
>> >>>>> }
>> >>>>> else if (data->stack_parm == 0 && !TYPE_EMPTY_P (data->arg.type))
>> >>>>> {
>> >>>>> @@ -3718,6 +3734,10 @@ assign_parms (tree fndecl)
>> >>>>> else
>> >>>>> set_decl_incoming_rtl (parm, data.entry_parm, false);
>> >>>>>
>> >>>>> + rtx incoming = DECL_INCOMING_RTL (parm);
>> >>>>> + if (GET_CODE (incoming) == PARALLEL)
>> >>>>> + set_scalar_rtx_for_aggregate_access (parm, incoming);
>> >>>>> +
>> >>>>> assign_parm_adjust_stack_rtl (&data);
>> >>>>>
>> >>>>> if (assign_parm_setup_block_p (&data))
>> >>>>> @@ -5037,6 +5057,7 @@ stack_protect_epilogue (void)
>> >>>>> the function's parameters, which must be run at any return
statement. */
>> >>>>>
>> >>>>> bool currently_expanding_function_start;
>> >>>>> +extern void set_scalar_rtx_for_returns ();
>> >>>>> void
>> >>>>> expand_function_start (tree subr)
>> >>>>> {
>> >>>>> @@ -5138,6 +5159,7 @@ expand_function_start (tree subr)
>> >>>>> {
>> >>>>> gcc_assert (GET_CODE (hard_reg) == PARALLEL);
>> >>>>> set_parm_rtl (res, gen_group_rtx (hard_reg));
>> >>>>> + set_scalar_rtx_for_returns ();
>> >>>>> }
>> >>>>> }
>> >>>>>
>> >>>>> diff --git a/gcc/opts.cc b/gcc/opts.cc
>> >>>>> index 86b94d62b58..5e129a1cc49 100644
>> >>>>> --- a/gcc/opts.cc
>> >>>>> +++ b/gcc/opts.cc
>> >>>>> @@ -1559,6 +1559,10 @@ public:
>> >>>>> vec<const char *> m_values;
>> >>>>> };
>> >>>>>
>> >>>>> +#ifdef __GNUC__
>> >>>>> +#pragma GCC diagnostic push
>> >>>>> +#pragma GCC diagnostic ignored "-Wformat-truncation"
>> >>>>> +#endif
>> >>>>> /* Print help for a specific front-end, etc. */
>> >>>>> static void
>> >>>>> print_filtered_help (unsigned int include_flags,
>> >>>>> @@ -1913,7 +1917,9 @@ print_filtered_help (unsigned int include_flags,
>> >>>>> printf ("\n\n");
>> >>>>> }
>> >>>>> }
>> >>>>> -
>> >>>>> +#ifdef __GNUC__
>> >>>>> +#pragma GCC diagnostic pop
>> >>>>> +#endif
>> >>>>> /* Display help for a specified type of option.
>> >>>>> The options must have ALL of the INCLUDE_FLAGS set
>> >>>>> ANY of the flags in the ANY_FLAGS set
>> >>>>> diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C b/gcc/testsuite/g++.target/powerpc/pr102024.C
>> >>>>> index 769585052b5..c8995cae707 100644
>> >>>>> --- a/gcc/testsuite/g++.target/powerpc/pr102024.C
>> >>>>> +++ b/gcc/testsuite/g++.target/powerpc/pr102024.C
>> >>>>> @@ -5,7 +5,7 @@
>> >>>>> // Test that a zero-width bit field in an otherwise homogeneous aggregate
>> >>>>> // generates a psabi warning and passes arguments in GPRs.
>> >>>>>
>> >>>>> -// { dg-final { scan-assembler-times {\mstd\M} 4 } }
>> >>>>> +// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } }
>> >>>>>
>> >>>>> struct a_thing
>> >>>>> {
>> >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c b/gcc/testsuite/gcc.target/powerpc/pr108073.c
>> >>>>> new file mode 100644
>> >>>>> index 00000000000..7dd1a4a326a
>> >>>>> --- /dev/null
>> >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c
>> >>>>> @@ -0,0 +1,29 @@
>> >>>>> +/* { dg-do run } */
>> >>>>> +/* { dg-options "-O2 -save-temps" } */
>> >>>>> +
>> >>>>> +typedef struct DF {double a[4]; short s1; short s2; short s3; short s4; } DF;
>> >>>>> +typedef struct SF {float a[4]; int i1; int i2; } SF;
>> >>>>> +typedef struct SF {float a[4]; int i1; int i2; } SF;
>> >>>>> +
>> >>>>> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 3 {target { has_arch_ppc64 && has_arch_pwr8 } } } } */
>> >>>>> +/* { dg-final { scan-assembler-not {\mlwz\M} {target { has_arch_ppc64 && has_arch_pwr8 } } } } */
>> >>>>> +/* { dg-final { scan-assembler-not {\mlhz\M} {target { has_arch_ppc64 && has_arch_pwr8 } } } } */
>> >>>>> +short __attribute__ ((noipa)) foo_hi (DF a, int flag){if (flag == 2)return a.s2+a.s3;return 0;}
>> >>>>> +int __attribute__ ((noipa)) foo_si (SF a, int flag){if (flag == 2)return a.i2+a.i1;return 0;}
>> >>>>> +double __attribute__ ((noipa)) foo_df (DF arg, int flag){if (flag == 2)return arg.a[3];else return 0.0;}
>> >>>>> +float __attribute__ ((noipa)) foo_sf (SF arg, int flag){if (flag == 2)return arg.a[2]; return 0;}
>> >>>>> +float __attribute__ ((noipa)) foo_sf1 (SF arg, int flag){if (flag == 2)return arg.a[1];return 0;}
>> >>>>> +
>> >>>>> +DF gdf = {{1.0,2.0,3.0,4.0}, 1, 2, 3, 4};
>> >>>>> +SF gsf = {{1.0f,2.0f,3.0f,4.0f}, 1, 2};
>> >>>>> +
>> >>>>> +int main()
>> >>>>> +{
>> >>>>> + if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 2) == 4.0
>> >>>>> + && foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0))
>> >>>>> + __builtin_abort ();
>> >>>>> + if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 1) == 0
>> >>>>> + && foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0))
>> >>>>> + __builtin_abort ();
>> >>>>> + return 0;
>> >>>>> +}
>> >>>>> +
>> >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-1.c b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
>> >>>>> new file mode 100644
>> >>>>> index 00000000000..4e1f87f7939
>> >>>>> --- /dev/null
>> >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
>> >>>>> @@ -0,0 +1,6 @@
>> >>>>> +/* PR target/65421 */
>> >>>>> +/* { dg-options "-O2" } */
>> >>>>> +
>> >>>>> +typedef struct LARGE {double a[4]; int arr[32];} LARGE;
>> >>>>> +LARGE foo (LARGE a){return a;}
>> >>>>> +/* { dg-final { scan-assembler-times {\mmemcpy\M} 1 } } */
>> >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-2.c b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
>> >>>>> new file mode 100644
>> >>>>> index 00000000000..8a8e1a0e996
>> >>>>> --- /dev/null
>> >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
>> >>>>> @@ -0,0 +1,32 @@
>> >>>>> +/* PR target/65421 */
>> >>>>> +/* { dg-options "-O2" } */
>> >>>>> +/* { dg-require-effective-target powerpc_elfv2 } */
>> >>>>> +/* { dg-require-effective-target has_arch_ppc64 } */
>> >>>>> +
>> >>>>> +typedef struct FLOATS
>> >>>>> +{
>> >>>>> + double a[3];
>> >>>>> +} FLOATS;
>> >>>>> +
>> >>>>> +/* 3 lfd after returns also optimized */
>> >>>>> +/* FLOATS ret_arg_pt (FLOATS *a){return *a;} */
>> >>>>> +
>> >>>>> +/* 3 stfd */
>> >>>>> +void st_arg (FLOATS a, FLOATS *p) {*p = a;}
>> >>>>> +/* { dg-final { scan-assembler-times {\mstfd\M} 3 } } */
>> >>>>> +
>> >>>>> +/* blr */
>> >>>>> +FLOATS ret_arg (FLOATS a) {return a;}
>> >>>>> +
>> >>>>> +typedef struct MIX
>> >>>>> +{
>> >>>>> + double a[2];
>> >>>>> + long l;
>> >>>>> +} MIX;
>> >>>>> +
>> >>>>> +/* std 3 param regs to return slot */
>> >>>>> +MIX ret_arg1 (MIX a) {return a;}
>> >>>>> +/* { dg-final { scan-assembler-times {\mstd\M} 3 } } */
>> >>>>> +
>> >>>>> +/* count insns */
>> >>>>> +/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 9 } } */
>> >>>>>
>> >>>>
>> >>>> --
>> >>>> Richard Biener <rguent...@suse.de>
>> >>>> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
>> >>>> Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
>> >>>> HRB 36809 (AG Nuernberg)
>>