Hi,

I have just updated the patch; please review this version.

Compare with previous patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-August/627287.html
This version:
* Supports bitfield access from one register.
* Allows return scalar registers to be cleaned via constructor.

Bootstrapped and regtested on x86_64-redhat-linux, and
powerpc64{,le}-linux-gnu.

Is it ok for trunk?


        PR target/65421
        PR target/69143

gcc/ChangeLog:

        * cfgexpand.cc (extract_bit_field): Extern declare.
        (struct access): New class.
        (struct expand_sra): New class.
        (expand_sra::build_access): New member function.
        (expand_sra::visit_base): Likewise.
        (expand_sra::analyze_default_stmt): Likewise.
        (expand_sra::analyze_assign): Likewise.
        (expand_sra::add_sra_candidate): Likewise.
        (expand_sra::collect_sra_candidates): Likewise.
        (expand_sra::valid_scalariable_accesses): Likewise.
        (expand_sra::prepare_expander_sra): Likewise.
        (expand_sra::expand_sra): Class constructor.
        (expand_sra::~expand_sra): Class destructor.
        (expand_sra::get_scalarized_rtx): New member function.
        (extract_one_reg): New function.
        (extract_bitfield): New function.
        (expand_sra::scalarize_access): New member function.
        (expand_sra::scalarize_accesses): New member function.
        (get_scalar_rtx_for_aggregate_expr): New function.
        (set_scalar_rtx_for_aggregate_access): New function.
        (set_scalar_rtx_for_returns): New function.
        (expand_return): Call get_scalar_rtx_for_aggregate_expr.
        (expand_debug_expr): Call get_scalar_rtx_for_aggregate_expr.
        (pass_expand::execute): Update to use the expand_sra.
        * expr.cc (get_scalar_rtx_for_aggregate_expr): Extern declare.
        (expand_assignment): Call get_scalar_rtx_for_aggregate_expr.
        (expand_expr_real): Call get_scalar_rtx_for_aggregate_expr.
        * function.cc (set_scalar_rtx_for_aggregate_access):  Extern declare.
        (set_scalar_rtx_for_returns): Extern declare.
        (assign_parm_setup_block): Call set_scalar_rtx_for_aggregate_access.
        (assign_parms): Call set_scalar_rtx_for_aggregate_access. 
        (expand_function_start): Call set_scalar_rtx_for_returns.
        * tree-sra.h (struct base_access): New class.
        (struct default_analyzer): New class.
        (scan_function): New function template.

gcc/testsuite/ChangeLog:

        * g++.target/powerpc/pr102024.C: Updated.
        * gcc.target/powerpc/pr108073.c: New test.
        * gcc.target/powerpc/pr65421-1.c: New test.
        * gcc.target/powerpc/pr65421-2.c: New test.

---
 gcc/cfgexpand.cc                             | 474 ++++++++++++++++++-
 gcc/expr.cc                                  |  29 +-
 gcc/function.cc                              |  28 +-
 gcc/tree-sra.h                               |  77 +++
 gcc/testsuite/g++.target/powerpc/pr102024.C  |   2 +-
 gcc/testsuite/gcc.target/powerpc/pr108073.c  |  29 ++
 gcc/testsuite/gcc.target/powerpc/pr65421-1.c |   6 +
 gcc/testsuite/gcc.target/powerpc/pr65421-2.c |  32 ++
 8 files changed, 668 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-2.c

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index 
edf292cfbe95ac2711faee7769e839cb4edb0dd3..385b6c781aa2805e7ca40293a0ae84f87e23e0b6
 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -74,6 +74,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "output.h"
 #include "builtins.h"
 #include "opts.h"
+#include "tree-sra.h"
 
 /* Some systems use __main in a way incompatible with its use in gcc, in these
    cases use the macros NAME__MAIN to give a quoted symbol and SYMBOL__MAIN to
@@ -97,6 +98,468 @@ static bool defer_stack_allocation (tree, bool);
 
 static void record_alignment_for_reg_var (unsigned int);
 
+extern rtx extract_bit_field (rtx, poly_uint64, poly_uint64, int, rtx,
+                             machine_mode, machine_mode, bool, rtx *);
+
+/* For light SRA in the expander for parameters and returns.  */
+struct access : public base_access
+{
+  /* The rtx for the access: link to incoming/returning register(s).  */
+  rtx rtx_val;
+};
+
+typedef struct access *access_p;
+
+struct expand_sra : public default_analyzer
+{
+  expand_sra ();
+  ~expand_sra ();
+
+  /* Now use default APIs, no actions for
+     pre_analyze_stmt, analyze_return.  */
+
+  /* Override analyze_default_stmt.  */
+  void analyze_default_stmt (gimple *);
+
+  /* Override analyze phi, call, asm.  */
+  void analyze_phi (gphi *stmt) { analyze_default_stmt (stmt); };
+  void analyze_call (gcall *stmt) { analyze_default_stmt (stmt); };
+  void analyze_asm (gasm *stmt) { analyze_default_stmt (stmt); };
+  /* Override analyze_assign.  */
+  void analyze_assign (gassign *);
+
+  /* Compute the scalar rtx(s) for all accesses of BASE from a parallel REGS. */
+  bool scalarize_accesses (tree base, rtx regs);
+  /* Return the scalarized rtx for EXPR.  */
+  rtx get_scalarized_rtx (tree expr);
+
+private:
+  void prepare_expander_sra (void);
+
+  /* Return true if VAR is a candidate for SRA.  */
+  bool add_sra_candidate (tree var);
+
+  /* Collect the parameter and returns with type which is suitable for
+     scalarization.  */
+  bool collect_sra_candidates (void);
+
+  /* Return true if EXPR has interesting access to the sra candidates,
+     and created access, return false otherwise.  */
+  access_p build_access (tree expr, bool write);
+
+  /* Check if the accesses of BASE are scalarizable.
+     Now support only parms that are only read, or returns that are
+     only written.  */
+  bool valid_scalariable_accesses (vec<access_p> *access_vec, bool is_parm);
+
+  /* Compute the scalar rtx for one access ACC from a parallel REGS. */
+  bool scalarize_access (access_p acc, rtx regs);
+
+  /* Callback of walk_stmt_load_store_addr_ops, used to remove
+     unscalarizable accesses.  */
+  static bool visit_base (gimple *, tree op, tree, void *data);
+
+  /* Expr (tree) -> Scalarized value (rtx) map.  */
+  hash_map<tree, rtx> *expr_rtx_vec;
+
+  /* Base (tree) -> Vector (vec<access_p> *) map.  */
+  hash_map<tree, auto_vec<access_p> > *base_access_vec;
+};
+
+access_p
+expand_sra::build_access (tree expr, bool write)
+{
+  enum tree_code code = TREE_CODE (expr);
+  if (code != VAR_DECL && code != PARM_DECL && code != COMPONENT_REF
+      && code != ARRAY_REF && code != ARRAY_RANGE_REF)
+    return NULL;
+
+  HOST_WIDE_INT offset, size;
+  bool reverse;
+  tree base = get_ref_base_and_extent_hwi (expr, &offset, &size, &reverse);
+  if (!base || !DECL_P (base))
+    return NULL;
+  if (storage_order_barrier_p (expr) || TREE_THIS_VOLATILE (expr))
+    {
+      base_access_vec->remove (base);
+      return NULL;
+    }
+
+  vec<access_p> *access_vec = base_access_vec->get (base);
+  if (!access_vec)
+    return NULL;
+
+  /* TODO: support reverse. */
+  if (reverse || size <= 0 || offset + size > tree_to_shwi (DECL_SIZE (base)))
+    {
+      base_access_vec->remove (base);
+      return NULL;
+    }
+
+  struct access *access = XNEWVEC (struct access, 1);
+
+  memset (access, 0, sizeof (struct access));
+  access->offset = offset;
+  access->size = size;
+  access->expr = expr;
+  access->write = write;
+  access->rtx_val = NULL_RTX;
+
+  access_vec->safe_push (access);
+
+  return access;
+}
+
+bool
+expand_sra::visit_base (gimple *, tree op, tree, void *data)
+{
+  op = get_base_address (op);
+  if (op && DECL_P (op))
+    {
+      expand_sra *p = (expand_sra *) data;
+      p->base_access_vec->remove (op);
+    }
+  return false;
+}
+
+void
+expand_sra::analyze_default_stmt (gimple *stmt)
+{
+  if (base_access_vec && !base_access_vec->is_empty ())
+    walk_stmt_load_store_addr_ops (stmt, this, visit_base, visit_base,
+                                  visit_base);
+}
+
+void
+expand_sra::analyze_assign (gassign *stmt)
+{
+  if (!base_access_vec || base_access_vec->is_empty ())
+    return;
+
+  if (gimple_assign_single_p (stmt) && !gimple_clobber_p (stmt))
+    {
+      tree rhs = gimple_assign_rhs1 (stmt);
+      tree lhs = gimple_assign_lhs (stmt);
+      bool res_r = build_access (rhs, false);
+      bool res_l = build_access (lhs, true);
+
+      if (res_l || res_r)
+       return;
+    }
+
+  analyze_default_stmt (stmt);
+}
+
+/* Return true if VAR is a candidate for SRA.  */
+
+bool
+expand_sra::add_sra_candidate (tree var)
+{
+  tree type = TREE_TYPE (var);
+
+  if (!AGGREGATE_TYPE_P (type) || !tree_fits_shwi_p (TYPE_SIZE (type))
+      || tree_to_shwi (TYPE_SIZE (type)) == 0 || TREE_THIS_VOLATILE (var)
+      || is_va_list_type (type))
+    return false;
+  gcc_assert (COMPLETE_TYPE_P (type));
+
+  base_access_vec->get_or_insert (var);
+
+  return true;
+}
+
+bool
+expand_sra::collect_sra_candidates (void)
+{
+  bool ret = false;
+
+  /* Collect parameters.  */
+  for (tree parm = DECL_ARGUMENTS (current_function_decl); parm;
+       parm = DECL_CHAIN (parm))
+    ret |= add_sra_candidate (parm);
+
+  /* Collect VARs on returns.  */
+  if (DECL_RESULT (current_function_decl))
+    {
+      edge_iterator ei;
+      edge e;
+      FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+       if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
+         {
+           tree val = gimple_return_retval (r);
+           /* To scalarize the return, the return value should only be
+              written, except by this return stmt.
+              Then use 'true(write)' to create the access. */
+           if (val && VAR_P (val))
+             ret |= add_sra_candidate (val) && build_access (val, true);
+         }
+    }
+
+  return ret;
+}
+
+bool
+expand_sra::valid_scalariable_accesses (vec<access_p> *access_vec, bool 
is_parm)
+{
+  if (access_vec->is_empty ())
+    return false;
+
+  for (unsigned int j = 0; j < access_vec->length (); j++)
+    {
+      struct access *access = (*access_vec)[j];
+      if (is_parm && access->write)
+       return false;
+
+      if (!is_parm && !access->write)
+       return false;
+    }
+
+  return true;
+}
+
+void
+expand_sra::prepare_expander_sra ()
+{
+  if (optimize <= 0)
+    return;
+
+  base_access_vec = new hash_map<tree, auto_vec<access_p> >;
+  expr_rtx_vec = new hash_map<tree, rtx>;
+
+  collect_sra_candidates ();
+}
+
+expand_sra::expand_sra () : expr_rtx_vec (NULL), base_access_vec (NULL)
+{
+  prepare_expander_sra ();
+}
+
+expand_sra::~expand_sra ()
+{
+  if (optimize <= 0)
+    return;
+  delete expr_rtx_vec;
+  expr_rtx_vec = NULL;
+  delete base_access_vec;
+  base_access_vec = NULL;
+}
+
+rtx
+expand_sra::get_scalarized_rtx (tree expr)
+{
+  if (!expr_rtx_vec)
+    return NULL_RTX;
+  rtx *val = expr_rtx_vec->get (expr);
+  return val ? *val : NULL_RTX;
+}
+
+/* Get the register at INDEX from a parallel REGS.  */
+
+static rtx
+extract_one_reg (rtx regs, int index)
+{
+  rtx orig_reg = XEXP (XVECEXP (regs, 0, index), 0);
+  if (!HARD_REGISTER_P (orig_reg))
+    return orig_reg;
+
+  /* Reading from a param hard reg needs to be moved to a temp.  */
+  rtx reg = gen_reg_rtx (GET_MODE (orig_reg));
+  emit_move_insn (reg, orig_reg);
+  return reg;
+}
+
+/* Extract a bitfield of SIZE bits from REG as MODE, starting at POS. */
+
+static rtx
+extract_bitfield (rtx reg, int size, int pos, machine_mode tmode, bool 
unsignedp)
+{
+  scalar_int_mode imode;
+  if (!int_mode_for_mode (tmode).exists (&imode))
+    return NULL_RTX;
+
+  machine_mode mode = GET_MODE (reg);
+  bool reverse = false;
+  rtx bfld = extract_bit_field (reg, size, pos, unsignedp, NULL_RTX, mode,
+                               imode, reverse, NULL);
+  mode = GET_MODE (bfld);
+  if (mode != imode)
+    bfld = gen_lowpart (imode, bfld);
+  rtx result = gen_reg_rtx (imode);
+  emit_move_insn (result, bfld);
+
+  if (tmode != imode)
+    result = gen_lowpart (tmode, result);
+
+  return result;
+}
+
+bool
+expand_sra::scalarize_access (access_p acc, rtx regs)
+{
+  machine_mode expr_mode = TYPE_MODE (TREE_TYPE (acc->expr));
+
+  /* Mode that needs multiple registers. */
+  if (expr_mode != BLKmode
+      && known_gt (acc->size, GET_MODE_BITSIZE (word_mode)))
+    return false;
+
+  /* Compute the position of the access in the whole parallel rtx.  */
+  int start_index = -1;
+  int end_index = -1;
+  HOST_WIDE_INT left_bits = 0;
+  HOST_WIDE_INT right_bits = 0;
+  int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
+  for (; cur_index < XVECLEN (regs, 0); cur_index++)
+    {
+      rtx slot = XVECEXP (regs, 0, cur_index);
+      HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
+      machine_mode mode = GET_MODE (XEXP (slot, 0));
+      HOST_WIDE_INT size = GET_MODE_BITSIZE (mode).to_constant ();
+      if (off <= acc->offset && off + size > acc->offset)
+       {
+         start_index = cur_index;
+         left_bits = acc->offset - off;
+       }
+      if (off + size >= acc->offset + acc->size)
+       {
+         end_index = cur_index;
+         right_bits = off + size - (acc->offset + acc->size);
+         break;
+       }
+    }
+  /* Invalid access position: padding or out of bounds.  */
+  if (start_index < 0 || end_index < 0)
+    return false;
+
+  /* Need multiple registers in a parallel for the access.  */
+  if (expr_mode == BLKmode || end_index > start_index)
+    {
+      if (left_bits || right_bits)
+       return false;
+
+      int num_words = end_index - start_index + 1;
+      rtx *tmps = XALLOCAVEC (rtx, num_words);
+
+      int pos = 0;
+      HOST_WIDE_INT start;
+      start = UINTVAL (XEXP (XVECEXP (regs, 0, start_index), 1));
+      /* Extract whole registers.  */
+      for (; pos < num_words; pos++)
+       {
+         int index = start_index + pos;
+         rtx reg = extract_one_reg (regs, index);
+         machine_mode mode = GET_MODE (reg);
+         HOST_WIDE_INT off;
+         off = UINTVAL (XEXP (XVECEXP (regs, 0, index), 1)) - start;
+         tmps[pos] = gen_rtx_EXPR_LIST (mode, reg, GEN_INT (off));
+       }
+
+      rtx reg = gen_rtx_PARALLEL (expr_mode, gen_rtvec_v (pos, tmps));
+      acc->rtx_val = reg;
+      return true;
+    }
+
+  /* Just need one reg for the access.  */
+  if (end_index == start_index && left_bits == 0 && right_bits == 0)
+    {
+      rtx reg = extract_one_reg (regs, start_index);
+      if (GET_MODE (reg) != expr_mode)
+       reg = gen_lowpart (expr_mode, reg);
+
+      acc->rtx_val = reg;
+      return true;
+    }
+
+  /* Need to extract part of a reg for the access.  */
+  if (!acc->write && end_index == start_index)
+    {
+      rtx reg = XEXP (XVECEXP (regs, 0, start_index), 0);
+      bool sgn = TYPE_UNSIGNED (TREE_TYPE (acc->expr));
+      acc->rtx_val
+       = extract_bitfield (reg, acc->size, left_bits, expr_mode, sgn);
+      if (acc->rtx_val)
+       return true;
+    }
+
+  return false;
+}
+
+bool
+expand_sra::scalarize_accesses (tree base, rtx regs)
+{
+  if (!base_access_vec)
+    return false;
+  vec<access_p> *access_vec = base_access_vec->get (base);
+  if (!access_vec)
+    return false;
+  bool is_parm = TREE_CODE (base) == PARM_DECL;
+  if (!valid_scalariable_accesses (access_vec, is_parm))
+    return false;
+
+  /* Go through each access, compute corresponding rtx(regs or subregs)
+     for the expression.  */
+  int n = access_vec->length ();
+  int cur_access_index = 0;
+  for (; cur_access_index < n; cur_access_index++)
+    if (!scalarize_access ((*access_vec)[cur_access_index], regs))
+      break;
+
+  /* Avoid un-scalarized accesses. */
+  if (cur_access_index != n)
+    {
+      base_access_vec->remove (base);
+      return false;
+    }
+
+  /* Bind/map expr(tree) to scalarized rtx.  */
+  for (int j = 0; j < n; j++)
+    {
+      access_p access = (*access_vec)[j];
+      expr_rtx_vec->put (access->expr, access->rtx_val);
+    }
+
+  return true;
+}
+
+static expand_sra *current_sra = NULL;
+
+/* Check if there is an SRA access for the expr.
+   Return the corresponding scalar sym for the access. */
+
+rtx
+get_scalar_rtx_for_aggregate_expr (tree expr)
+{
+  return current_sra ? current_sra->get_scalarized_rtx (expr) : NULL_RTX;
+}
+
+/* Compute/Set RTX registers for those accesses on BASE.  */
+
+void
+set_scalar_rtx_for_aggregate_access (tree base, rtx regs)
+{
+  if (!current_sra)
+    return;
+  current_sra->scalarize_accesses (base, regs);
+}
+
+void
+set_scalar_rtx_for_returns ()
+{
+  if (!current_sra)
+    return;
+
+  tree res = DECL_RESULT (current_function_decl);
+  gcc_assert (res);
+  edge_iterator ei;
+  edge e;
+  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+    if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
+      {
+       tree val = gimple_return_retval (r);
+       if (val && VAR_P (val))
+         current_sra->scalarize_accesses (val, DECL_RTL (res));
+      }
+}
+
 /* Return an expression tree corresponding to the RHS of GIMPLE
    statement STMT.  */
 
@@ -3778,7 +4241,8 @@ expand_return (tree retval)
 
   /* If we are returning the RESULT_DECL, then the value has already
      been stored into it, so we don't have to do anything special.  */
-  if (TREE_CODE (retval_rhs) == RESULT_DECL)
+  if (TREE_CODE (retval_rhs) == RESULT_DECL
+      || get_scalar_rtx_for_aggregate_expr (retval_rhs))
     expand_value_return (result_rtl);
 
   /* If the result is an aggregate that is being returned in one (or more)
@@ -4422,6 +4886,9 @@ expand_debug_expr (tree exp)
   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (exp));
   addr_space_t as;
   scalar_int_mode op0_mode, op1_mode, addr_mode;
+  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
+  if (x)
+    return NULL_RTX;/* optimized out.  */
 
   switch (TREE_CODE_CLASS (TREE_CODE (exp)))
     {
@@ -6624,6 +7091,9 @@ pass_expand::execute (function *fun)
   auto_bitmap forced_stack_vars;
   discover_nonconstant_array_refs (forced_stack_vars);
 
+  current_sra = new expand_sra;
+  scan_function (cfun, *current_sra);
+
   /* Make sure all values used by the optimization passes have sane
      defaults.  */
   reg_renumber = 0;
@@ -7052,6 +7522,8 @@ pass_expand::execute (function *fun)
       loop_optimizer_finalize ();
     }
 
+  delete current_sra;
+  current_sra = NULL;
   timevar_pop (TV_POST_EXPAND);
 
   return 0;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 
174f8acb269ab5450fc799516471d5a2bd9b9efa..57b037040d6d8e8c98b2befcb556221c0c5604c4
 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -100,6 +100,7 @@ static void do_tablejump (rtx, machine_mode, rtx, rtx, rtx,
 static rtx const_vector_from_tree (tree);
 static tree tree_expr_size (const_tree);
 static void convert_mode_scalar (rtx, rtx, int);
+rtx get_scalar_rtx_for_aggregate_expr (tree);
 
 
 /* This is run to set up which modes can be used
@@ -5618,11 +5619,12 @@ expand_assignment (tree to, tree from, bool nontemporal)
      Assignment of an array element at a constant index, and assignment of
      an array element in an unaligned packed structure field, has the same
      problem.  Same for (partially) storing into a non-memory object.  */
-  if (handled_component_p (to)
-      || (TREE_CODE (to) == MEM_REF
-         && (REF_REVERSE_STORAGE_ORDER (to)
-             || mem_ref_refers_to_non_mem_p (to)))
-      || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE)
+  if (!get_scalar_rtx_for_aggregate_expr (to)
+      && (handled_component_p (to)
+         || (TREE_CODE (to) == MEM_REF
+             && (REF_REVERSE_STORAGE_ORDER (to)
+                 || mem_ref_refers_to_non_mem_p (to)))
+         || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE))
     {
       machine_mode mode1;
       poly_int64 bitsize, bitpos;
@@ -8912,6 +8914,20 @@ expand_constructor (tree exp, rtx target, enum 
expand_modifier modifier,
       && ! mostly_zeros_p (exp))
     return NULL_RTX;
 
+  if (target && GET_CODE (target) == PARALLEL && all_zeros_p (exp))
+    {
+      int length = XVECLEN (target, 0);
+      int start = XEXP (XVECEXP (target, 0, 0), 0) ? 0 : 1;
+      for (int i = start; i < length; i++)
+       {
+         rtx dst = XEXP (XVECEXP (target, 0, i), 0);
+         rtx zero = CONST0_RTX (GET_MODE (dst));
+         gcc_assert (zero);
+         emit_move_insn (dst, zero);
+       }
+      return target;
+    }
+
   /* Handle calls that pass values in multiple non-contiguous
      locations.  The Irix 6 ABI has examples of this.  */
   if (target == 0 || ! safe_from_p (target, exp, 1)
@@ -9006,6 +9022,9 @@ expand_expr_real (tree exp, rtx target, machine_mode 
tmode,
       ret = CONST0_RTX (tmode);
       return ret ? ret : const0_rtx;
     }
+  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
+  if (x)
+    return x;
 
   ret = expand_expr_real_1 (exp, target, tmode, modifier, alt_rtl,
                            inner_reference_p);
diff --git a/gcc/function.cc b/gcc/function.cc
index 
dd2c1136e0725f55673f28e0eeaf4c91ad18e93f..7fe927bd36beac11466ca9fca12800892b57f0be
 100644
--- a/gcc/function.cc
+++ b/gcc/function.cc
@@ -2740,6 +2740,9 @@ assign_parm_find_stack_rtl (tree parm, struct 
assign_parm_data_one *data)
   data->stack_parm = stack_parm;
 }
 
+extern void set_scalar_rtx_for_aggregate_access (tree, rtx);
+extern void set_scalar_rtx_for_returns ();
+
 /* A subroutine of assign_parms.  Adjust DATA->ENTRY_RTL such that it's
    always valid and contiguous.  */
 
@@ -3115,8 +3118,24 @@ assign_parm_setup_block (struct assign_parm_data_all 
*all,
          emit_move_insn (mem, entry_parm);
        }
       else
-       move_block_from_reg (REGNO (entry_parm), mem,
-                            size_stored / UNITS_PER_WORD);
+       {
+         int regno = REGNO (entry_parm);
+         int nregs = size_stored / UNITS_PER_WORD;
+         move_block_from_reg (regno, mem, nregs);
+
+         rtx *tmps = XALLOCAVEC (rtx, nregs);
+         machine_mode mode = word_mode;
+         HOST_WIDE_INT word_size = GET_MODE_SIZE (mode).to_constant ();
+         for (int i = 0; i < nregs; i++)
+           {
+             rtx reg = gen_rtx_REG (mode, regno + i);
+             rtx off = GEN_INT (word_size * i);
+             tmps[i] = gen_rtx_EXPR_LIST (VOIDmode, reg, off);
+           }
+
+         rtx regs = gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps));
+         set_scalar_rtx_for_aggregate_access (parm, regs);
+       }
     }
   else if (data->stack_parm == 0 && !TYPE_EMPTY_P (data->arg.type))
     {
@@ -3716,6 +3735,10 @@ assign_parms (tree fndecl)
       else
        set_decl_incoming_rtl (parm, data.entry_parm, false);
 
+      rtx incoming = DECL_INCOMING_RTL (parm);
+      if (GET_CODE (incoming) == PARALLEL)
+       set_scalar_rtx_for_aggregate_access (parm, incoming);
+
       assign_parm_adjust_stack_rtl (&data);
 
       if (assign_parm_setup_block_p (&data))
@@ -5136,6 +5159,7 @@ expand_function_start (tree subr)
            {
              gcc_assert (GET_CODE (hard_reg) == PARALLEL);
              set_parm_rtl (res, gen_group_rtx (hard_reg));
+             set_scalar_rtx_for_returns ();
            }
        }
 
diff --git a/gcc/tree-sra.h b/gcc/tree-sra.h
index 
f20266c46226f7840299a768cb575f6f92b54207..bd0396e672b30f7ef66c305d8d131e91639039d7
 100644
--- a/gcc/tree-sra.h
+++ b/gcc/tree-sra.h
@@ -19,6 +19,83 @@ You should have received a copy of the GNU General Public 
License
 along with GCC; see the file COPYING3.  If not see
 <http://www.gnu.org/licenses/>.  */
 
+struct base_access
+{
+  /* Values returned by get_ref_base_and_extent, indicates the
+     OFFSET, SIZE and BASE of the access.  */
+  HOST_WIDE_INT offset;
+  HOST_WIDE_INT size;
+  tree base;
+
+  /* The context expression of this access.  */
+  tree expr;
+
+  /* Indicates this is a write access.  */
+  bool write : 1;
+
+  /* Indicates if this access is made in reverse storage order.  */
+  bool reverse : 1;
+};
+
+/* Default template for scan_function.  */
+
+struct default_analyzer
+{
+  /* Template analyze functions.  */
+  void analyze_phi (gphi *){};
+  void pre_analyze_stmt (gimple *){};
+  void analyze_return (greturn *){};
+  void analyze_assign (gassign *){};
+  void analyze_call (gcall *){};
+  void analyze_asm (gasm *){};
+  void analyze_default_stmt (gimple *){};
+};
+
+/* Scan function and look for interesting expressions.  */
+
+template <typename analyzer>
+void
+scan_function (struct function *fun, analyzer &a)
+{
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, fun)
+    {
+      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
+          gsi_next (&gsi))
+       a.analyze_phi (gsi.phi ());
+
+      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+          gsi_next (&gsi))
+       {
+         gimple *stmt = gsi_stmt (gsi);
+         a.pre_analyze_stmt (stmt);
+
+         switch (gimple_code (stmt))
+           {
+           case GIMPLE_RETURN:
+             a.analyze_return (as_a<greturn *> (stmt));
+             break;
+
+           case GIMPLE_ASSIGN:
+             a.analyze_assign (as_a<gassign *> (stmt));
+             break;
+
+           case GIMPLE_CALL:
+             a.analyze_call (as_a<gcall *> (stmt));
+             break;
+
+           case GIMPLE_ASM:
+             a.analyze_asm (as_a<gasm *> (stmt));
+             break;
+
+           default:
+             a.analyze_default_stmt (stmt);
+             break;
+           }
+       }
+    }
+}
+
 bool type_internals_preclude_sra_p (tree type, const char **msg);
 
 /* Return true iff TYPE is stdarg va_list type (which early SRA and IPA-SRA
diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C 
b/gcc/testsuite/g++.target/powerpc/pr102024.C
index 
769585052b507ad971868795f861106230c976e3..c8995cae707bb6e2e849275b823d2ba14d24a966
 100644
--- a/gcc/testsuite/g++.target/powerpc/pr102024.C
+++ b/gcc/testsuite/g++.target/powerpc/pr102024.C
@@ -5,7 +5,7 @@
 // Test that a zero-width bit field in an otherwise homogeneous aggregate
 // generates a psabi warning and passes arguments in GPRs.
 
-// { dg-final { scan-assembler-times {\mstd\M} 4 } }
+// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } }
 
 struct a_thing
 {
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c 
b/gcc/testsuite/gcc.target/powerpc/pr108073.c
new file mode 100644
index 
0000000000000000000000000000000000000000..7dd1a4a326a181e0f35c9418af20a9bebabdfe4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+typedef struct DF {double a[4]; short s1; short s2; short s3; short s4; } DF;
+typedef struct SF {float a[4]; int i1; int i2; } SF;
+
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 3 {target { has_arch_ppc64 
&& has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlwz\M} {target { has_arch_ppc64 && 
has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlhz\M} {target { has_arch_ppc64 && 
has_arch_pwr8 } } } } */
+short  __attribute__ ((noipa)) foo_hi (DF a, int flag){if (flag == 2)return 
a.s2+a.s3;return 0;}
+int  __attribute__ ((noipa)) foo_si (SF a, int flag){if (flag == 2)return 
a.i2+a.i1;return 0;}
+double __attribute__ ((noipa)) foo_df (DF arg, int flag){if (flag == 2)return 
arg.a[3];else return 0.0;}
+float  __attribute__ ((noipa)) foo_sf (SF arg, int flag){if (flag == 2)return 
arg.a[2]; return 0;}
+float  __attribute__ ((noipa)) foo_sf1 (SF arg, int flag){if (flag == 2)return 
arg.a[1];return 0;}
+
+DF gdf = {{1.0,2.0,3.0,4.0}, 1, 2, 3, 4};
+SF gsf = {{1.0f,2.0f,3.0f,4.0f}, 1, 2};
+
+int main()
+{
+  if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 2) == 4.0
+       && foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0))
+    __builtin_abort ();
+  if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 1) == 0
+       && foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0))
+    __builtin_abort ();
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-1.c 
b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
new file mode 100644
index 
0000000000000000000000000000000000000000..4e1f87f7939cbf1423772023ee392fc5200b6708
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
@@ -0,0 +1,6 @@
+/* PR target/65421 */
+/* { dg-options "-O2" } */
+
+typedef struct LARGE {double a[4]; int arr[32];} LARGE;
+LARGE foo (LARGE a){return a;}
+/* { dg-final { scan-assembler-times {\mmemcpy\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-2.c 
b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..8a8e1a0e9962317ba2c0942af8891b3c51f4d3a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
@@ -0,0 +1,32 @@
+/* PR target/65421 */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target powerpc_elfv2 } */
+/* { dg-require-effective-target has_arch_ppc64 } */
+
+typedef struct FLOATS
+{
+  double a[3];
+} FLOATS;
+
+/* 3 lfd after returns also optimized */
+/* FLOATS ret_arg_pt (FLOATS *a){return *a;} */
+
+/* 3 stfd */
+void st_arg (FLOATS a, FLOATS *p) {*p = a;}
+/* { dg-final { scan-assembler-times {\mstfd\M} 3 } } */
+
+/* blr */
+FLOATS ret_arg (FLOATS a) {return a;}
+
+typedef struct MIX
+{
+  double a[2];
+  long l;
+} MIX;
+
+/* std 3 param regs to return slot */
+MIX ret_arg1 (MIX a) {return a;}
+/* { dg-final { scan-assembler-times {\mstd\M} 3 } } */
+
+/* count insns */
+/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 9 } } */
-- 
2.25.1



BR,
Jeff (Jiufu Guo)


Jiufu Guo <guoji...@linux.ibm.com> writes:

> Hi,
>
> There are a few PRs about the issues on the struct parameters and
> returns, like PRs 69143/65421/108073.
>
> we could consider introducing a light SRA in the expander to
> handle those parameters and returns in aggregate type, if they
> are passed through registers.  For access to the fields of
> the parameters or returns, the corresponding scalar registers
> can be used.
>
> As discussed:
> https://gcc.gnu.org/pipermail/gcc-patches/2023-May/619884.html
>
> This is an initial patch for the light-expander-sra.
>
> Bootstrapped and regtested on x86_64-redhat-linux, and
> powerpc64{,le}-linux-gnu.
>
> Is it ok for trunk?
>
>
> BR,
> Jeff (Jiufu Guo)
>
>
>       PR target/65421
>       PR target/69143
>
> gcc/ChangeLog:
>
>       * cfgexpand.cc (expand_shift): Extern declare.
>       (struct access): New class.
>       (struct expand_sra): New class.
>       (expand_sra::build_access): New member function.
>       (expand_sra::visit_base): Likewise.
>       (expand_sra::analyze_default_stmt): Likewise.
>       (expand_sra::analyze_assign): Likewise.
>       (expand_sra::add_sra_candidate): Likewise.
>       (expand_sra::collect_sra_candidates): Likewise.
>       (expand_sra::valid_scalariable_accesses): Likewise.
>       (expand_sra::prepare_expander_sra): Likewise.
>       (expand_sra::expand_sra): Class constructor.
>       (expand_sra::~expand_sra): Class destructor.
>       (expand_sra::get_scalarized_rtx): New member function.
>       (extract_one_reg): New function.
>       (extract_sub_reg): New function.
>       (expand_sra::scalarize_access): New member function.
>       (expand_sra::scalarize_accesses): New member function.
>       (get_scalar_rtx_for_aggregate_expr): New function.
>       (set_scalar_rtx_for_aggregate_access): New function.
>       (set_scalar_rtx_for_returns): New function.
>       (expand_return): Call get_scalar_rtx_for_aggregate_expr.
>       (expand_debug_expr): Call get_scalar_rtx_for_aggregate_expr.
>       (pass_expand::execute): Update to use the expand_sra.
>       * expr.cc (get_scalar_rtx_for_aggregate_expr): Extern declare.
>       (expand_assignment): Call get_scalar_rtx_for_aggregate_expr.
>       (expand_expr_real): Call get_scalar_rtx_for_aggregate_expr.
>       * function.cc (set_scalar_rtx_for_aggregate_access):  Extern declare.
>       (set_scalar_rtx_for_returns): Extern declare.
>       (assign_parm_setup_block): Call set_scalar_rtx_for_aggregate_access.
>       (assign_parms): Call set_scalar_rtx_for_aggregate_access. 
>       (expand_function_start): Call set_scalar_rtx_for_returns.
>       * tree-sra.h (struct base_access): New class.
>       (struct default_analyzer): New class.
>       (scan_function): New function template.
>
> gcc/testsuite/ChangeLog:
>
>       * g++.target/powerpc/pr102024.C: Updated.
>       * gcc.target/powerpc/pr108073.c: New test.
>       * gcc.target/powerpc/pr65421-1.c: New test.
>       * gcc.target/powerpc/pr65421-2.c: New test.
>
> ---
>  gcc/cfgexpand.cc                             | 478 ++++++++++++++++++-
>  gcc/expr.cc                                  |  15 +-
>  gcc/function.cc                              |  28 +-
>  gcc/tree-sra.h                               |  80 +++-
>  gcc/testsuite/g++.target/powerpc/pr102024.C  |   2 +-
>  gcc/testsuite/gcc.target/powerpc/pr108073.c  |  29 ++
>  gcc/testsuite/gcc.target/powerpc/pr65421-1.c |   6 +
>  gcc/testsuite/gcc.target/powerpc/pr65421-2.c |  32 ++
>  8 files changed, 660 insertions(+), 10 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-1.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-2.c
>
> diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
> index 
> edf292cfbe95ac2711faee7769e839cb4edb0dd3..21a09ebac96bbcddc67da73c42f470c6d5f60e6c
>  100644
> --- a/gcc/cfgexpand.cc
> +++ b/gcc/cfgexpand.cc
> @@ -74,6 +74,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "output.h"
>  #include "builtins.h"
>  #include "opts.h"
> +#include "tree-sra.h"
>  
>  /* Some systems use __main in a way incompatible with its use in gcc, in 
> these
>     cases use the macros NAME__MAIN to give a quoted symbol and SYMBOL__MAIN 
> to
> @@ -97,6 +98,472 @@ static bool defer_stack_allocation (tree, bool);
>  
>  static void record_alignment_for_reg_var (unsigned int);
>  
> +extern rtx
> +expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx, int);
> +
> +/* For light SRA in expander about parameters and returns.  */
> +struct access : public base_access
> +{
> +  /* The rtx for the access: link to incoming/returning register(s).  */
> +  rtx rtx_val;
> +};
> +
> +typedef struct access *access_p;
> +
> +struct expand_sra : public default_analyzer
> +{
> +  expand_sra ();
> +  ~expand_sra ();
> +
> +  /* Now use default APIs, no actions for
> +     pre_analyze_stmt, analyze_return.  */
> +
> +  /* overwrite analyze_default_stmt.  */
> +  void analyze_default_stmt (gimple *);
> +
> +  /* overwrite analyze phi,call,asm .  */
> +  void analyze_phi (gphi *stmt) { analyze_default_stmt (stmt); };
> +  void analyze_call (gcall *stmt) { analyze_default_stmt (stmt); };
> +  void analyze_asm (gasm *stmt) { analyze_default_stmt (stmt); };
> +  /* overwrite analyze_assign.  */
> +  void analyze_assign (gassign *);
> +
> +  /* Compute the scalar rtx(s) for all accesses of BASE from a parallel
> +     REGS.  */
> +  bool scalarize_accesses (tree base, rtx regs);
> +  /* Return the scalarized rtx for EXPR.  */
> +  rtx get_scalarized_rtx (tree expr);
> +
> +private:
> +  void prepare_expander_sra (void);
> +
> +  /* Return true if VAR is a candidate for SRA.  */
> +  bool add_sra_candidate (tree var);
> +
> +  /* Collect the parameters and returns whose types are suitable for
> +     scalarization.  */
> +  bool collect_sra_candidates (void);
> +
> +  /* Return true if EXPR has an interesting access to the SRA candidates
> +     and an access was created; return false otherwise.  */
> +  access_p build_access (tree expr, bool write);
> +
> +  /* Check if the accesses of BASE are scalarizable.
> +     Now support only parms with reads or returns with writes.  */
> +  bool valid_scalariable_accesses (vec<access_p> *access_vec, bool is_parm);
> +
> +  /* Compute the scalar rtx for one access ACC from a parallel REGS.  */
> +  bool scalarize_access (access_p acc, rtx regs);
> +
> +  /* Callback of walk_stmt_load_store_addr_ops, used to remove
> +     unscalarizable accesses.  */
> +  static bool visit_base (gimple *, tree op, tree, void *data);
> +
> +  /* Expr (tree) -> Scalarized value (rtx) map.  */
> +  hash_map<tree, rtx> *expr_rtx_vec;
> +
> +  /* Base (tree) -> Vector (vec<access_p> *) map.  */
> +  hash_map<tree, auto_vec<access_p> > *base_access_vec;
> +};
> +
> +access_p
> +expand_sra::build_access (tree expr, bool write)
> +{
> +  enum tree_code code = TREE_CODE (expr);
> +  if (code != VAR_DECL && code != PARM_DECL && code != COMPONENT_REF
> +      && code != ARRAY_REF && code != ARRAY_RANGE_REF)
> +    return NULL;
> +
> +  HOST_WIDE_INT offset, size;
> +  bool reverse;
> +  tree base = get_ref_base_and_extent_hwi (expr, &offset, &size, &reverse);
> +  if (!base || !DECL_P (base))
> +    return NULL;
> +  if (storage_order_barrier_p (expr) || TREE_THIS_VOLATILE (expr))
> +    {
> +      base_access_vec->remove (base);
> +      return NULL;
> +    }
> +
> +  vec<access_p> *access_vec = base_access_vec->get (base);
> +  if (!access_vec)
> +    return NULL;
> +
> +  /* TODO: support reverse. */
> +  if (reverse || size <= 0 || offset + size > tree_to_shwi (DECL_SIZE 
> (base)))
> +    {
> +      base_access_vec->remove (base);
> +      return NULL;
> +    }
> +
> +  struct access *access = XNEWVEC (struct access, 1);
> +
> +  memset (access, 0, sizeof (struct access));
> +  access->offset = offset;
> +  access->size = size;
> +  access->expr = expr;
> +  access->write = write;
> +  access->rtx_val = NULL_RTX;
> +
> +  access_vec->safe_push (access);
> +
> +  return access;
> +}
> +
> +bool
> +expand_sra::visit_base (gimple *, tree op, tree, void *data)
> +{
> +  op = get_base_address (op);
> +  if (op && DECL_P (op))
> +    {
> +      expand_sra *p = (expand_sra *) data;
> +      p->base_access_vec->remove (op);
> +    }
> +  return false;
> +}
> +
> +void
> +expand_sra::analyze_default_stmt (gimple *stmt)
> +{
> +  if (base_access_vec && !base_access_vec->is_empty ())
> +    walk_stmt_load_store_addr_ops (stmt, this, visit_base, visit_base,
> +                                visit_base);
> +}
> +
> +void
> +expand_sra::analyze_assign (gassign *stmt)
> +{
> +  if (!base_access_vec || base_access_vec->is_empty ())
> +    return;
> +
> +  if (gimple_assign_single_p (stmt) && !gimple_clobber_p (stmt))
> +    {
> +      tree rhs = gimple_assign_rhs1 (stmt);
> +      tree lhs = gimple_assign_lhs (stmt);
> +      bool res_r = build_access (rhs, false);
> +      bool res_l = build_access (lhs, true);
> +      if (res_l && TREE_CODE (rhs) == CONSTRUCTOR)
> +     base_access_vec->remove (get_base_address (lhs));
> +
> +      if (res_l || res_r)
> +     return;
> +    }
> +
> +  analyze_default_stmt (stmt);
> +}
> +
> +/* Return true if VAR is a candidate for SRA.  */
> +
> +bool
> +expand_sra::add_sra_candidate (tree var)
> +{
> +  tree type = TREE_TYPE (var);
> +
> +  if (!AGGREGATE_TYPE_P (type) || !tree_fits_shwi_p (TYPE_SIZE (type))
> +      || tree_to_shwi (TYPE_SIZE (type)) == 0 || TREE_THIS_VOLATILE (var)
> +      || is_va_list_type (type))
> +    return false;
> +  gcc_assert (COMPLETE_TYPE_P (type));
> +
> +  base_access_vec->get_or_insert (var);
> +
> +  return true;
> +}
> +
> +bool
> +expand_sra::collect_sra_candidates (void)
> +{
> +  bool ret = false;
> +
> +  /* Collect parameters.  */
> +  for (tree parm = DECL_ARGUMENTS (current_function_decl); parm;
> +       parm = DECL_CHAIN (parm))
> +    ret |= add_sra_candidate (parm);
> +
> +  /* Collect VARs on returns.  */
> +  if (DECL_RESULT (current_function_decl))
> +    {
> +      edge_iterator ei;
> +      edge e;
> +      FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
> +     if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
> +       {
> +         tree val = gimple_return_retval (r);
> +         /* To scalarize the return, the return value should only be
> +            written, except by this return stmt.
> +            Then use 'true(write)' to create the access.  */
> +         if (val && VAR_P (val))
> +           ret |= add_sra_candidate (val) && build_access (val, true);
> +       }
> +    }
> +
> +  return ret;
> +}
> +
> +bool
> +expand_sra::valid_scalariable_accesses (vec<access_p> *access_vec, bool 
> is_parm)
> +{
> +  if (access_vec->is_empty ())
> +    return false;
> +
> +  for (unsigned int j = 0; j < access_vec->length (); j++)
> +    {
> +      struct access *access = (*access_vec)[j];
> +      if (is_parm && access->write)
> +     return false;
> +
> +      if (!is_parm && !access->write)
> +     return false;
> +    }
> +
> +  return true;
> +}
> +
> +void
> +expand_sra::prepare_expander_sra ()
> +{
> +  if (optimize <= 0)
> +    return;
> +
> +  base_access_vec = new hash_map<tree, auto_vec<access_p> >;
> +  expr_rtx_vec = new hash_map<tree, rtx>;
> +
> +  collect_sra_candidates ();
> +}
> +
> +expand_sra::expand_sra () : expr_rtx_vec (NULL), base_access_vec (NULL)
> +{
> +  prepare_expander_sra ();
> +}
> +
> +expand_sra::~expand_sra ()
> +{
> +  if (optimize <= 0)
> +    return;
> +  delete expr_rtx_vec;
> +  expr_rtx_vec = NULL;
> +  delete base_access_vec;
> +  base_access_vec = NULL;
> +}
> +
> +rtx
> +expand_sra::get_scalarized_rtx (tree expr)
> +{
> +  if (!expr_rtx_vec)
> +    return NULL_RTX;
> +  rtx *val = expr_rtx_vec->get (expr);
> +  return val ? *val : NULL_RTX;
> +}
> +
> +/* Get the register at INDEX from a parallel REGS.  */
> +
> +static rtx
> +extract_one_reg (rtx regs, int index)
> +{
> +  rtx orig_reg = XEXP (XVECEXP (regs, 0, index), 0);
> +  if (!HARD_REGISTER_P (orig_reg))
> +    return orig_reg;
> +
> +  /* Reading from a param hard reg needs to be moved to a temp.  */
> +  rtx reg = gen_reg_rtx (GET_MODE (orig_reg));
> +  emit_move_insn (reg, orig_reg);
> +  return reg;
> +}
> +
> +/* Get IMODE part from REG at OFF_BITS.  */
> +
> +static rtx
> +extract_sub_reg (rtx orig_reg, int off_bits, machine_mode mode)
> +{
> +  scalar_int_mode imode;
> +  if (!int_mode_for_mode (mode).exists (&imode))
> +    return NULL_RTX;
> +
> +  machine_mode orig_mode = GET_MODE (orig_reg);
> +  gcc_assert (GET_MODE_CLASS (orig_mode) == MODE_INT);
> +
> +  poly_uint64 lowpart_off = subreg_lowpart_offset (imode, orig_mode);
> +  int lowpart_off_bits = lowpart_off.to_constant () * BITS_PER_UNIT;
> +  int shift_bits;
> +  if (lowpart_off_bits >= off_bits)
> +    shift_bits = lowpart_off_bits - off_bits;
> +  else
> +    shift_bits = off_bits - lowpart_off_bits;
> +
> +  rtx reg = orig_reg;
> +  if (shift_bits > 0)
> +    reg = expand_shift (RSHIFT_EXPR, orig_mode, reg, shift_bits, NULL, 1);
> +
> +  rtx subreg = gen_lowpart (imode, reg);
> +  rtx result = gen_reg_rtx (imode);
> +  emit_move_insn (result, subreg);
> +
> +  if (mode != imode)
> +    result = gen_lowpart (mode, result);
> +
> +  return result;
> +}
> +
> +bool
> +expand_sra::scalarize_access (access_p acc, rtx regs)
> +{
> +  machine_mode expr_mode = TYPE_MODE (TREE_TYPE (acc->expr));
> +
> +  /* Mode that needs multiple registers.  */
> +  if (expr_mode != BLKmode
> +      && known_gt (acc->size, GET_MODE_BITSIZE (word_mode)))
> +    return false;
> +
> +  /* Compute the position of the access in the whole parallel rtx.  */
> +  int start_index = -1;
> +  int end_index = -1;
> +  HOST_WIDE_INT left_bits = 0;
> +  HOST_WIDE_INT right_bits = 0;
> +  int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
> +  for (; cur_index < XVECLEN (regs, 0); cur_index++)
> +    {
> +      rtx slot = XVECEXP (regs, 0, cur_index);
> +      HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
> +      machine_mode mode = GET_MODE (XEXP (slot, 0));
> +      HOST_WIDE_INT size = GET_MODE_BITSIZE (mode).to_constant ();
> +      if (off <= acc->offset && off + size > acc->offset)
> +     {
> +       start_index = cur_index;
> +       left_bits = acc->offset - off;
> +     }
> +      if (off + size >= acc->offset + acc->size)
> +     {
> +       end_index = cur_index;
> +       right_bits = off + size - (acc->offset + acc->size);
> +       break;
> +     }
> +    }
> +  /* Invalid access position: padding or out of bounds.  */
> +  if (start_index < 0 || end_index < 0)
> +    return false;
> +
> +  /* Need multi-registers in a parallel for the access.  */
> +  if (expr_mode == BLKmode || end_index > start_index)
> +    {
> +      if (left_bits || right_bits)
> +     return false;
> +
> +      int num_words = end_index - start_index + 1;
> +      rtx *tmps = XALLOCAVEC (rtx, num_words);
> +
> +      int pos = 0;
> +      HOST_WIDE_INT start;
> +      start = UINTVAL (XEXP (XVECEXP (regs, 0, start_index), 1));
> +      /* Extract whole registers.  */
> +      for (; pos < num_words; pos++)
> +     {
> +       int index = start_index + pos;
> +       rtx reg = extract_one_reg (regs, index);
> +       machine_mode mode = GET_MODE (reg);
> +       HOST_WIDE_INT off;
> +       off = UINTVAL (XEXP (XVECEXP (regs, 0, index), 1)) - start;
> +       tmps[pos] = gen_rtx_EXPR_LIST (mode, reg, GEN_INT (off));
> +     }
> +
> +      rtx reg = gen_rtx_PARALLEL (expr_mode, gen_rtvec_v (pos, tmps));
> +      acc->rtx_val = reg;
> +      return true;
> +    }
> +
> +  /* Just need one reg for the access.  */
> +  if (end_index == start_index && left_bits == 0 && right_bits == 0)
> +    {
> +      rtx reg = extract_one_reg (regs, start_index);
> +      if (GET_MODE (reg) != expr_mode)
> +     reg = gen_lowpart (expr_mode, reg);
> +
> +      acc->rtx_val = reg;
> +      return true;
> +    }
> +
> +  /* Need to extract part reg for the access.  */
> +  if (!acc->write && end_index == start_index
> +      && (acc->size % BITS_PER_UNIT) == 0)
> +    {
> +      rtx orig_reg = XEXP (XVECEXP (regs, 0, start_index), 0);
> +      acc->rtx_val = extract_sub_reg (orig_reg, left_bits, expr_mode);
> +      if (acc->rtx_val)
> +     return true;
> +    }
> +
> +  return false;
> +}
> +
> +bool
> +expand_sra::scalarize_accesses (tree base, rtx regs)
> +{
> +  if (!base_access_vec)
> +    return false;
> +  vec<access_p> *access_vec = base_access_vec->get (base);
> +  if (!access_vec)
> +    return false;
> +  bool is_parm = TREE_CODE (base) == PARM_DECL;
> +  if (!valid_scalariable_accesses (access_vec, is_parm))
> +    return false;
> +
> +  /* Go through each access, compute corresponding rtx(regs or subregs)
> +     for the expression.  */
> +  int n = access_vec->length ();
> +  int cur_access_index = 0;
> +  for (; cur_access_index < n; cur_access_index++)
> +    if (!scalarize_access ((*access_vec)[cur_access_index], regs))
> +      break;
> +
> +  /* Bind/map expr (tree) to scalarized rtx if all accesses scalarized.  */
> +  if (cur_access_index == n)
> +    for (int j = 0; j < n; j++)
> +      {
> +     access_p access = (*access_vec)[j];
> +     expr_rtx_vec->put (access->expr, access->rtx_val);
> +      }
> +
> +  return true;
> +}
> +
> +static expand_sra *current_sra = NULL;
> +
> +/* Check if there is an SRA access for the expr.
> +   Return the corresponding scalar sym for the access.  */
> +
> +rtx
> +get_scalar_rtx_for_aggregate_expr (tree expr)
> +{
> +  return current_sra ? current_sra->get_scalarized_rtx (expr) : NULL_RTX;
> +}
> +
> +/* Compute/Set RTX registers for those accesses on BASE.  */
> +
> +void
> +set_scalar_rtx_for_aggregate_access (tree base, rtx regs)
> +{
> +  if (!current_sra)
> +    return;
> +  current_sra->scalarize_accesses (base, regs);
> +}
> +
> +void
> +set_scalar_rtx_for_returns ()
> +{
> +  if (!current_sra)
> +    return;
> +
> +  tree res = DECL_RESULT (current_function_decl);
> +  gcc_assert (res);
> +  edge_iterator ei;
> +  edge e;
> +  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
> +    if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
> +      {
> +     tree val = gimple_return_retval (r);
> +     if (val && VAR_P (val))
> +       current_sra->scalarize_accesses (val, DECL_RTL (res));
> +      }
> +}
> +
>  /* Return an expression tree corresponding to the RHS of GIMPLE
>     statement STMT.  */
>  
> @@ -3778,7 +4245,8 @@ expand_return (tree retval)
>  
>    /* If we are returning the RESULT_DECL, then the value has already
>       been stored into it, so we don't have to do anything special.  */
> -  if (TREE_CODE (retval_rhs) == RESULT_DECL)
> +  if (TREE_CODE (retval_rhs) == RESULT_DECL
> +      || get_scalar_rtx_for_aggregate_expr (retval_rhs))
>      expand_value_return (result_rtl);
>  
>    /* If the result is an aggregate that is being returned in one (or more)
> @@ -4422,6 +4890,9 @@ expand_debug_expr (tree exp)
>    int unsignedp = TYPE_UNSIGNED (TREE_TYPE (exp));
>    addr_space_t as;
>    scalar_int_mode op0_mode, op1_mode, addr_mode;
> +  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
> +  if (x)
> +    return NULL_RTX;/* optimized out.  */
>  
>    switch (TREE_CODE_CLASS (TREE_CODE (exp)))
>      {
> @@ -6624,6 +7095,9 @@ pass_expand::execute (function *fun)
>    auto_bitmap forced_stack_vars;
>    discover_nonconstant_array_refs (forced_stack_vars);
>  
> +  current_sra = new expand_sra;
> +  scan_function (cfun, *current_sra);
> +
>    /* Make sure all values used by the optimization passes have sane
>       defaults.  */
>    reg_renumber = 0;
> @@ -7052,6 +7526,8 @@ pass_expand::execute (function *fun)
>        loop_optimizer_finalize ();
>      }
>  
> +  delete current_sra;
> +  current_sra = NULL;
>    timevar_pop (TV_POST_EXPAND);
>  
>    return 0;
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index 
> 174f8acb269ab5450fc799516471d5a2bd9b9efa..53b48aba790d4dd8ade326a2b33a0c7ec3fffc47
>  100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -100,6 +100,7 @@ static void do_tablejump (rtx, machine_mode, rtx, rtx, 
> rtx,
>  static rtx const_vector_from_tree (tree);
>  static tree tree_expr_size (const_tree);
>  static void convert_mode_scalar (rtx, rtx, int);
> +rtx get_scalar_rtx_for_aggregate_expr (tree);
>  
>  
>  /* This is run to set up which modes can be used
> @@ -5618,11 +5619,12 @@ expand_assignment (tree to, tree from, bool 
> nontemporal)
>       Assignment of an array element at a constant index, and assignment of
>       an array element in an unaligned packed structure field, has the same
>       problem.  Same for (partially) storing into a non-memory object.  */
> -  if (handled_component_p (to)
> -      || (TREE_CODE (to) == MEM_REF
> -       && (REF_REVERSE_STORAGE_ORDER (to)
> -           || mem_ref_refers_to_non_mem_p (to)))
> -      || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE)
> +  if (!get_scalar_rtx_for_aggregate_expr (to)
> +      && (handled_component_p (to)
> +       || (TREE_CODE (to) == MEM_REF
> +           && (REF_REVERSE_STORAGE_ORDER (to)
> +               || mem_ref_refers_to_non_mem_p (to)))
> +       || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE))
>      {
>        machine_mode mode1;
>        poly_int64 bitsize, bitpos;
> @@ -9006,6 +9008,9 @@ expand_expr_real (tree exp, rtx target, machine_mode 
> tmode,
>        ret = CONST0_RTX (tmode);
>        return ret ? ret : const0_rtx;
>      }
> +  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
> +  if (x)
> +    return x;
>  
>    ret = expand_expr_real_1 (exp, target, tmode, modifier, alt_rtl,
>                           inner_reference_p);
> diff --git a/gcc/function.cc b/gcc/function.cc
> index 
> dd2c1136e0725f55673f28e0eeaf4c91ad18e93f..7fe927bd36beac11466ca9fca12800892b57f0be
>  100644
> --- a/gcc/function.cc
> +++ b/gcc/function.cc
> @@ -2740,6 +2740,9 @@ assign_parm_find_stack_rtl (tree parm, struct 
> assign_parm_data_one *data)
>    data->stack_parm = stack_parm;
>  }
>  
> +extern void set_scalar_rtx_for_aggregate_access (tree, rtx);
> +extern void set_scalar_rtx_for_returns ();
> +
>  /* A subroutine of assign_parms.  Adjust DATA->ENTRY_RTL such that it's
>     always valid and contiguous.  */
>  
> @@ -3115,8 +3118,24 @@ assign_parm_setup_block (struct assign_parm_data_all 
> *all,
>         emit_move_insn (mem, entry_parm);
>       }
>        else
> -     move_block_from_reg (REGNO (entry_parm), mem,
> -                          size_stored / UNITS_PER_WORD);
> +     {
> +       int regno = REGNO (entry_parm);
> +       int nregs = size_stored / UNITS_PER_WORD;
> +       move_block_from_reg (regno, mem, nregs);
> +
> +       rtx *tmps = XALLOCAVEC (rtx, nregs);
> +       machine_mode mode = word_mode;
> +       HOST_WIDE_INT word_size = GET_MODE_SIZE (mode).to_constant ();
> +       for (int i = 0; i < nregs; i++)
> +         {
> +           rtx reg = gen_rtx_REG (mode, regno + i);
> +           rtx off = GEN_INT (word_size * i);
> +           tmps[i] = gen_rtx_EXPR_LIST (VOIDmode, reg, off);
> +         }
> +
> +       rtx regs = gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps));
> +       set_scalar_rtx_for_aggregate_access (parm, regs);
> +     }
>      }
>    else if (data->stack_parm == 0 && !TYPE_EMPTY_P (data->arg.type))
>      {
> @@ -3716,6 +3735,10 @@ assign_parms (tree fndecl)
>        else
>       set_decl_incoming_rtl (parm, data.entry_parm, false);
>  
> +      rtx incoming = DECL_INCOMING_RTL (parm);
> +      if (GET_CODE (incoming) == PARALLEL)
> +     set_scalar_rtx_for_aggregate_access (parm, incoming);
> +
>        assign_parm_adjust_stack_rtl (&data);
>  
>        if (assign_parm_setup_block_p (&data))
> @@ -5136,6 +5159,7 @@ expand_function_start (tree subr)
>           {
>             gcc_assert (GET_CODE (hard_reg) == PARALLEL);
>             set_parm_rtl (res, gen_group_rtx (hard_reg));
> +           set_scalar_rtx_for_returns ();
>           }
>       }
>  
> diff --git a/gcc/tree-sra.h b/gcc/tree-sra.h
> index 
> f20266c46226f7840299a768cb575f6f92b54207..7af87bccf1b43badbc3f8a4c51a87c84d5020b9e
>  100644
> --- a/gcc/tree-sra.h
> +++ b/gcc/tree-sra.h
> @@ -19,7 +19,85 @@ You should have received a copy of the GNU General Public 
> License
>  along with GCC; see the file COPYING3.  If not see
>  <http://www.gnu.org/licenses/>.  */
>  
> -bool type_internals_preclude_sra_p (tree type, const char **msg);
> +struct base_access
> +{
> +  /* Values returned by get_ref_base_and_extent, indicates the
> +     OFFSET, SIZE and BASE of the access.  */
> +  HOST_WIDE_INT offset;
> +  HOST_WIDE_INT size;
> +  tree base;
> +
> +  /* The context expression of this access.  */
> +  tree expr;
> +
> +  /* Indicates this is a write access.  */
> +  bool write : 1;
> +
> +  /* Indicates if this access is made in reverse storage order.  */
> +  bool reverse : 1;
> +};
> +
> +/* Default template for sra_scan_function.  */
> +
> +struct default_analyzer
> +{
> +  /* Template analyze functions.  */
> +  void analyze_phi (gphi *){};
> +  void pre_analyze_stmt (gimple *){};
> +  void analyze_return (greturn *){};
> +  void analyze_assign (gassign *){};
> +  void analyze_call (gcall *){};
> +  void analyze_asm (gasm *){};
> +  void analyze_default_stmt (gimple *){};
> +};
> +
> +/* Scan function and look for interesting expressions.  */
> +
> +template <typename analyzer>
> +void
> +scan_function (struct function *fun, analyzer &a)
> +{
> +  basic_block bb;
> +  FOR_EACH_BB_FN (bb, fun)
> +    {
> +      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
> +        gsi_next (&gsi))
> +     a.analyze_phi (gsi.phi ());
> +
> +      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
> +        gsi_next (&gsi))
> +     {
> +       gimple *stmt = gsi_stmt (gsi);
> +       a.pre_analyze_stmt (stmt);
> +
> +       switch (gimple_code (stmt))
> +         {
> +         case GIMPLE_RETURN:
> +           a.analyze_return (as_a<greturn *> (stmt));
> +           break;
> +
> +         case GIMPLE_ASSIGN:
> +           a.analyze_assign (as_a<gassign *> (stmt));
> +           break;
> +
> +         case GIMPLE_CALL:
> +           a.analyze_call (as_a<gcall *> (stmt));
> +           break;
> +
> +         case GIMPLE_ASM:
> +           a.analyze_asm (as_a<gasm *> (stmt));
> +           break;
> +
> +         default:
> +           a.analyze_default_stmt (stmt);
> +           break;
> +         }
> +     }
> +    }
> +}
> +
> +bool
> +type_internals_preclude_sra_p (tree type, const char **msg);
>  
>  /* Return true iff TYPE is stdarg va_list type (which early SRA and IPA-SRA
>     should leave alone).  */
> diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C 
> b/gcc/testsuite/g++.target/powerpc/pr102024.C
> index 
> 769585052b507ad971868795f861106230c976e3..c8995cae707bb6e2e849275b823d2ba14d24a966
>  100644
> --- a/gcc/testsuite/g++.target/powerpc/pr102024.C
> +++ b/gcc/testsuite/g++.target/powerpc/pr102024.C
> @@ -5,7 +5,7 @@
>  // Test that a zero-width bit field in an otherwise homogeneous aggregate
>  // generates a psabi warning and passes arguments in GPRs.
>  
> -// { dg-final { scan-assembler-times {\mstd\M} 4 } }
> +// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } }
>  
>  struct a_thing
>  {
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c 
> b/gcc/testsuite/gcc.target/powerpc/pr108073.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..7dd1a4a326a181e0f35c9418af20a9bebabdfe4b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c
> @@ -0,0 +1,29 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -save-temps" } */
> +
> +typedef struct DF {double a[4]; short s1; short s2; short s3; short s4; } DF;
> +typedef struct SF {float a[4]; int i1; int i2; } SF;
> +
> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 3 {target { has_arch_ppc64 
> && has_arch_pwr8 } } } } */
> +/* { dg-final { scan-assembler-not {\mlwz\M} {target { has_arch_ppc64 && 
> has_arch_pwr8 } } } } */
> +/* { dg-final { scan-assembler-not {\mlhz\M} {target { has_arch_ppc64 && 
> has_arch_pwr8 } } } } */
> +short  __attribute__ ((noipa)) foo_hi (DF a, int flag){if (flag == 2)return 
> a.s2+a.s3;return 0;}
> +int  __attribute__ ((noipa)) foo_si (SF a, int flag){if (flag == 2)return 
> a.i2+a.i1;return 0;}
> +double __attribute__ ((noipa)) foo_df (DF arg, int flag){if (flag == 
> 2)return arg.a[3];else return 0.0;}
> +float  __attribute__ ((noipa)) foo_sf (SF arg, int flag){if (flag == 
> 2)return arg.a[2]; return 0;}
> +float  __attribute__ ((noipa)) foo_sf1 (SF arg, int flag){if (flag == 
> 2)return arg.a[1];return 0;}
> +
> +DF gdf = {{1.0,2.0,3.0,4.0}, 1, 2, 3, 4};
> +SF gsf = {{1.0f,2.0f,3.0f,4.0f}, 1, 2};
> +
> +int main()
> +{
> +  if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 2) == 
> 4.0
> +     && foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0))
> +    __builtin_abort ();
> +  if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 1) == 0
> +     && foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0))
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-1.c 
> b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..4e1f87f7939cbf1423772023ee392fc5200b6708
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
> @@ -0,0 +1,6 @@
> +/* PR target/65421 */
> +/* { dg-options "-O2" } */
> +
> +typedef struct LARGE {double a[4]; int arr[32];} LARGE;
> +LARGE foo (LARGE a){return a;}
> +/* { dg-final { scan-assembler-times {\mmemcpy\M} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-2.c 
> b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..8a8e1a0e9962317ba2c0942af8891b3c51f4d3a4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
> @@ -0,0 +1,32 @@
> +/* PR target/65421 */
> +/* { dg-options "-O2" } */
> +/* { dg-require-effective-target powerpc_elfv2 } */
> +/* { dg-require-effective-target has_arch_ppc64 } */
> +
> +typedef struct FLOATS
> +{
> +  double a[3];
> +} FLOATS;
> +
> +/* 3 lfd after returns also optimized */
> +/* FLOATS ret_arg_pt (FLOATS *a){return *a;} */
> +
> +/* 3 stfd */
> +void st_arg (FLOATS a, FLOATS *p) {*p = a;}
> +/* { dg-final { scan-assembler-times {\mstfd\M} 3 } } */
> +
> +/* blr */
> +FLOATS ret_arg (FLOATS a) {return a;}
> +
> +typedef struct MIX
> +{
> +  double a[2];
> +  long l;
> +} MIX;
> +
> +/* std 3 param regs to return slot */
> +MIX ret_arg1 (MIX a) {return a;}
> +/* { dg-final { scan-assembler-times {\mstd\M} 3 } } */
> +
> +/* count insns */
> +/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 9 } } */

Reply via email to