This is the final piece which makes use of all the previous regrename
patches, and transforms this loop (which occurs in a popular embedded
benchmark):
                sploop  2
.L4:
                ldh     .d1t1   *A6++[1], A7
                ldh     .d1t1   *++A5[1], A8
                nop     4
                mpy     .m1     A8, A7, A19
                nop     1
                shr     .s1     A19, A9, A20
                spkernel        4, 0

into this:
                sploop  1
.L4:
                ldh     .d2t2   *B5++[1], B6
        ||      ldh     .d1t1   *++A5[1], A8
                nop     4
                mpy     .m1x    A8, B6, A19
                nop     1
                shr     .s1     A19, A9, A20
                spkernel        8, 0

In the original loop, D1 and T1 are reserved twice, while there is no
reservation for D2 and T2: there is an imbalance which limits the
initiation interval.  By shifting some of the registers to the other
side of the machine, the balance is restored, the two loads can issue in
the same cycle, and the initiation interval reaches the optimum value of 1.

This code is still quite limited and only a first step - there are more
transformations we could do.  Even so, a 100% speedup on some loops
doesn't seem so bad.

Will commit once the 1/2 regrename patch is approved.


Bernd

        * config/c6x/c6x.md (attr "op_pattern"): New.
        (load_sdata_pic, mov<mode>_insn for QIHIM and SISFVM): Set it.
        * config/c6x/c6x-mult.md.in (mulhi3_VARIANT_, mulhisi3_insn_VARIANT_):
        Likewise.
        * config/c6x/c6x-mult.md: Regenerate.
        * config/c6x/c6x.c: Include "regrename.h".
        (unit_req_table): New typedef.
        (unit_reqs): Use it for the declaration.
        (unit_req_imbalance, get_unit_operand_masks, try_rename_operands,
        reshuffle_units): New static functions.
        (count_unit_reqs): New arg reqs.  All callers changed.  Use
        get_unit_reqs, and don't merge here.
        (res_mii): New arg reqs.  All callers changed.  Rewrite to use a loop
        using unit_req_factor.
        (hwloop_optimize): Call reshuffle_units.  Call merge_unit_reqs after
        count_unit_reqs.
        (c6x_reorg): Add reg notes problem, and call df_analyze.
        * Makefile.in ($(out_object_file)): Depend on regrename.h.

Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in     (revision 179379)
+++ gcc/Makefile.in     (working copy)
@@ -3535,7 +3536,8 @@ $(out_object_file): $(out_file) $(CONFIG
    output.h $(INSN_ATTR_H) $(SYSTEM_H) toplev.h $(DIAGNOSTIC_CORE_H) \
    $(TARGET_H) $(LIBFUNCS_H) $(TARGET_DEF_H) $(FUNCTION_H) $(SCHED_INT_H) \
    $(TM_P_H) $(EXPR_H) langhooks.h $(GGC_H) $(OPTABS_H) $(REAL_H) \
-   tm-constrs.h $(GIMPLE_H) $(DF_H) cselib.h $(COMMON_TARGET_H) hw-doloop.h
+   tm-constrs.h $(GIMPLE_H) $(DF_H) cselib.h $(COMMON_TARGET_H) hw-doloop.h \
+   regrename.h
        $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) \
                $(out_file) $(OUTPUT_OPTION)
 
Index: gcc/config/c6x/c6x.md
===================================================================
--- gcc/config/c6x/c6x.md       (revision 179393)
+++ gcc/config/c6x/c6x.md       (working copy)
@@ -166,6 +166,17 @@ (define_attr "cross"
   "n,y"
   (const_string "n"))
 
+;; This describes the relationship between operands and register files.
+;; For example, "sxs" means that operands 0 and 2 determine the side of
+;; the machine, and operand 1 can optionally use the cross path.  "dt" and
+;; "td" are used to describe stores and loads, respectively.
+;; Used for register renaming in loops for improving modulo scheduling.
+(define_attr "op_pattern"
+  "unknown,dt,td,sx,sxs,ssx"
+  (cond [(eq_attr "type" "load") (const_string "td")
+        (eq_attr "type" "store") (const_string "dt")]
+       (const_string "unknown")))
+
 (define_attr "has_shadow"
   "n,y"
   (const_string "n"))
@@ -567,6 +578,7 @@ (define_insn "load_sdata_pic"
    %|%.\\tadda%D2\\t%$\\t%1, %2, %0"
   [(set_attr "units" "d")
    (set_attr "cross" "y,n")
+   (set_attr "op_pattern" "unknown")
    (set_attr "predicable" "no")])
 
 ;; Move instruction patterns
@@ -599,6 +611,7 @@ (define_insn "mov<mode>_insn"
   [(set_attr "type" "*,*,*,*,*,*,load,load,load,load,store,store,store,store")
   (set_attr "units62" "dls,dls,ls,ls,s,s,d_addr,d_addr,d_addr,d_addr,d_addr,d_addr,d_addr,d_addr")
   (set_attr "units64" "dls,dls,ls,ls,dl,s,d_addr,d_addr,d_addr,d_addr,d_addr,d_addr,d_addr,d_addr")
+   (set_attr "op_pattern" "sx,sx,sx,sx,*,*,*,*,*,*,*,*,*,*")
    (set_attr "addr_regfile" "*,*,*,*,*,*,a,b,b,a,a,b,b,a")
    (set_attr "dest_regfile" "*,*,*,*,*,*,a,a,b,b,a,a,b,b")
    (set_attr "cross" "n,n,y,y,n,n,n,y,n,y,n,y,n,y")])
@@ -631,6 +644,7 @@ (define_insn "mov<mode>_insn"
  [(set_attr "type" "*,*,*,*,*,*,*,*,*,load,load,load,load,store,store,store,store")
   (set_attr "units62" "dls,dls,ls,ls,s,s,d,d,*,d_addr,d_addr,d_addr,d_addr,d_addr,d_addr,d_addr,d_addr")
   (set_attr "units64" "dls,dls,ls,ls,dl,s,d,d,*,d_addr,d_addr,d_addr,d_addr,d_addr,d_addr,d_addr,d_addr")
+   (set_attr "op_pattern" "sx,sx,sx,sx,*,*,*,*,*,*,*,*,*,*,*,*,*")
    (set_attr "addr_regfile" "*,*,*,*,*,*,*,*,*,a,b,b,a,a,b,b,a")
    (set_attr "dest_regfile" "*,*,*,*,*,*,*,*,*,a,a,b,b,a,a,b,b")
    (set_attr "cross" "n,n,y,y,n,n,y,n,*,n,y,n,y,n,y,n,y")
@@ -855,7 +869,7 @@ (define_mode_attr ext_shift [(QI "24") (
 
 (define_insn "<ext_name><mode>si2"
  [(set (match_operand:SI 0 "register_operand" "=a,b,a,?a, b,?b")
-       (any_ext: SI (match_operand:QIHIM 1 "nonimmediate_operand" "a,b,Q, R, R, Q")))]
+       (any_ext:SI (match_operand:QIHIM 1 "nonimmediate_operand" "a,b,Q, R, R, Q")))]
   ""
  "@
   %|%.\\text<u>\\t%$\\t%1, <ext_shift>, <ext_shift>, %0
Index: gcc/config/c6x/c6x-mult.md
===================================================================
--- gcc/config/c6x/c6x-mult.md  (revision 179379)
+++ gcc/config/c6x/c6x-mult.md  (working copy)
@@ -81,6 +81,7 @@ (define_insn "mulhi3"
   "%|%.\\tmpy\\t%$\\t%2, %1, %0"
   [(set_attr "type" "mpy2")
    (set_attr "units" "m")
+   (set_attr "op_pattern" "sxs")
    (set_attr "cross" "n,n,y,y")])
 
 (define_insn "mulhisi3_const"
@@ -104,6 +105,7 @@ (define_insn "*mulhisi3_insn"
   "%|%.\\tmpy\\t%$\\t%1, %2, %0"
   [(set_attr "type" "mpy2")
    (set_attr "units" "m")
+   (set_attr "op_pattern" "ssx")
    (set_attr "cross" "n,n,y,y")])
 
 (define_insn "mulhisi3_lh"
@@ -500,6 +502,7 @@ (define_insn "mulhi3_real"
   "%|%.\\tmpy\\t%$\\t%2, %1, %k0"
   [(set_attr "type" "mpy2")
    (set_attr "units" "m")
+   (set_attr "op_pattern" "sxs")
    (set_attr "cross" "n,n,y,y")])
 
 (define_insn "mulhisi3_const_real"
@@ -523,6 +526,7 @@ (define_insn "*mulhisi3_insn_real"
   "%|%.\\tmpy\\t%$\\t%1, %2, %k0"
   [(set_attr "type" "mpy2")
    (set_attr "units" "m")
+   (set_attr "op_pattern" "ssx")
    (set_attr "cross" "n,n,y,y")])
 
 (define_insn "mulhisi3_lh_real"
Index: gcc/config/c6x/c6x.c
===================================================================
--- gcc/config/c6x/c6x.c        (revision 179393)
+++ gcc/config/c6x/c6x.c        (working copy)
@@ -51,6 +51,7 @@
 #include "debug.h"
 #include "opts.h"
 #include "hw-doloop.h"
+#include "regrename.h"
 
 /* Table of supported architecture variants.  */
 typedef struct
@@ -3312,6 +3313,25 @@ merge_unit_reqs (unit_req_table reqs)
     }
 }
 
+/* Measure how unevenly the unit requirements in REQS are spread over the
+   two sides of the machine, and return that measure.  As an example, when
+   D1 is used twice and D2 is not used at all, and there is no other
+   imbalance, the result should be 1.  */
+static int
+unit_req_imbalance (unit_req_table reqs)
+{
+  int req, total = 0;
+
+  for (req = 0; req < UNIT_REQ_MAX; req++)
+    {
+      int scale = unit_req_factor (req);
+      int gap = abs (reqs[0][req] - reqs[1][req]);
+
+      total += (gap + scale - 1) / scale / 2;
+    }
+  return total;
+}
+
 /* Return the resource-constrained minimum iteration interval given the
    data in the REQS table.  This must have been processed with
    merge_unit_reqs already.  */
@@ -3329,6 +3349,241 @@ res_mii (unit_req_table reqs)
 
   return worst;
 }
+
+/* Examine INSN, and store in PMASK1 and PMASK2 bitmasks that represent
+   the operands that are involved in the (up to) two reservations, as
+   found by get_unit_reqs.  Bit N of a mask stands for operand N.  Return
+   true if we did this successfully, false if we couldn't identify what to
+   do with INSN; in that case the masks are not written.  */
+static bool
+get_unit_operand_masks (rtx insn, unsigned int *pmask1, unsigned int *pmask2)
+{
+  enum attr_op_pattern op_pat;
+
+  if (recog_memoized (insn) < 0)
+    return false;
+  if (GET_CODE (PATTERN (insn)) == COND_EXEC)
+    return false;
+  /* Fill in recog_data, whose operand count is checked below.  */
+  extract_insn (insn);
+  op_pat = get_attr_op_pattern (insn);
+  if (op_pat == OP_PATTERN_DT)
+    {
+      gcc_assert (recog_data.n_operands == 2);
+      *pmask1 = 1 << 0;
+      *pmask2 = 1 << 1;
+      return true;
+    }
+  else if (op_pat == OP_PATTERN_TD)
+    {
+      gcc_assert (recog_data.n_operands == 2);
+      *pmask1 = 1 << 1;
+      *pmask2 = 1 << 0;
+      return true;
+    }
+  else if (op_pat == OP_PATTERN_SXS)
+    {
+      gcc_assert (recog_data.n_operands == 3);
+      *pmask1 = (1 << 0) | (1 << 2);
+      *pmask2 = 1 << 1;
+      return true;
+    }
+  else if (op_pat == OP_PATTERN_SX)
+    {
+      gcc_assert (recog_data.n_operands == 2);
+      *pmask1 = 1 << 0;
+      *pmask2 = 1 << 1;
+      return true;
+    }
+  else if (op_pat == OP_PATTERN_SSX)
+    {
+      gcc_assert (recog_data.n_operands == 3);
+      *pmask1 = (1 << 0) | (1 << 1);
+      *pmask2 = 1 << 2;
+      return true;
+    }
+  return false;
+}
+
+/* Try to replace a register in INSN, which has corresponding rename info
+   from regrename_analyze in INFO.  OP_MASK and ORIG_SIDE provide information
+   about the operands that must be renamed and the side they are on.
+   REQS is the table of unit reservations in the loop between HEAD and TAIL.
+   We recompute this information locally after our transformation; the
+   renaming is undone if the imbalance got worse, else REQS is updated.  */
+static void
+try_rename_operands (rtx head, rtx tail, unit_req_table reqs, rtx insn,
+                    insn_rr_info *info, unsigned int op_mask, int orig_side)
+{
+  enum reg_class super_class = orig_side == 0 ? B_REGS : A_REGS;
+  HARD_REG_SET unavailable;
+  du_head_p this_head;
+  struct du_chain *chain;
+  int i, best_reg, old_reg;
+  unsigned tmp_mask;
+  VEC (du_head_p, heap) *involved_chains = NULL;
+  unit_req_table new_reqs;
+
+  for (i = 0, tmp_mask = op_mask; tmp_mask; i++)
+    {
+      du_head_p op_chain;
+      if ((tmp_mask & (1 << i)) == 0)
+       continue;
+      if (info->op_info[i].n_chains != 1)
+       goto out_fail;
+      op_chain = regrename_chain_from_id (info->op_info[i].heads[0]->id);
+      VEC_safe_push (du_head_p, heap, involved_chains, op_chain);
+      tmp_mask &= ~(1 << i);
+    }
+
+  /* Give up if more than one operand is involved.  */
+  if (VEC_length (du_head_p, involved_chains) > 1)
+    goto out_fail;
+
+  this_head = VEC_index (du_head_p, involved_chains, 0);
+  if (this_head->cannot_rename)
+    goto out_fail;
+
+  for (chain = this_head->first; chain; chain = chain->next_use)
+    {
+      unsigned int mask1, mask2, mask_changed;
+      int count, side1, side2, req1, req2;
+      insn_rr_info *this_rr = VEC_index (insn_rr_info, insn_rr,
+                                        INSN_UID (chain->insn));
+
+      count = get_unit_reqs (chain->insn, &req1, &side1, &req2, &side2);
+
+      if (count == 0)
+       goto out_fail;
+
+      if (!get_unit_operand_masks (chain->insn, &mask1, &mask2))
+       goto out_fail;
+
+      extract_insn (chain->insn);
+      /* Find the operands of this insn that are part of the chain.  */
+      mask_changed = 0;
+      for (i = 0; i < recog_data.n_operands; i++)
+       {
+         int j;
+         int n_this_op = this_rr->op_info[i].n_chains;
+         for (j = 0; j < n_this_op; j++)
+           {
+             du_head_p other = this_rr->op_info[i].heads[j];
+             if (regrename_chain_from_id (other->id) == this_head)
+               break;
+           }
+         if (j == n_this_op)
+           continue;
+
+         if (n_this_op != 1)
+           goto out_fail;
+         mask_changed |= 1 << i;
+       }
+      gcc_assert (mask_changed != 0);
+      if (mask_changed != mask1 && mask_changed != mask2)
+       goto out_fail;
+    }
+
+  /* If we get here, we can do the renaming.  */
+  COMPL_HARD_REG_SET (unavailable, reg_class_contents[(int) super_class]);
+
+  old_reg = this_head->regno;
+  best_reg = find_best_rename_reg (this_head, super_class, &unavailable,
+                                  old_reg);
+
+  regrename_do_replace (this_head, best_reg);
+
+  count_unit_reqs (new_reqs, head, PREV_INSN (tail));
+  merge_unit_reqs (new_reqs);
+  if (dump_file)
+    {
+      fprintf (dump_file, "reshuffle for insn %d, op_mask %x, "
+              "original side %d, new reg %d\n",
+              INSN_UID (insn), op_mask, orig_side, best_reg);
+      fprintf (dump_file, "  imbalance %d -> %d\n",
+              unit_req_imbalance (reqs), unit_req_imbalance (new_reqs));
+    }
+  if (unit_req_imbalance (new_reqs) > unit_req_imbalance (reqs))
+    regrename_do_replace (this_head, old_reg);
+  else
+    memcpy (reqs, new_reqs, sizeof (unit_req_table));
+
+ out_fail:
+  VEC_free (du_head_p, heap, involved_chains);
+}
+
+/* Find insns in LOOP which would, if shifted to the other side
+   of the machine, reduce an imbalance in the unit reservations.  Register
+   rename information is computed for LOOP and the blocks leading into it,
+   and try_rename_operands is called for each insn whose unit is used much
+   more on its own side than on the other.  */
+static void
+reshuffle_units (basic_block loop)
+{
+  rtx head = BB_HEAD (loop);
+  rtx tail = BB_END (loop);
+  rtx insn;
+  unit_req_table reqs;
+  edge e;
+  edge_iterator ei;
+  bitmap_head bbs;
+
+  count_unit_reqs (reqs, head, PREV_INSN (tail));
+  merge_unit_reqs (reqs);
+
+  regrename_init (true);
+
+  bitmap_initialize (&bbs, &bitmap_default_obstack);
+
+  /* Analyze LOOP together with its predecessor blocks.  */
+  FOR_EACH_EDGE (e, ei, loop->preds)
+    bitmap_set_bit (&bbs, e->src->index);
+  bitmap_set_bit (&bbs, loop->index);
+  regrename_analyze (&bbs);
+
+  for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
+    {
+      enum attr_units units;
+      int count, side1, side2, req1, req2;
+      unsigned int mask1, mask2;
+      insn_rr_info *info;
+
+      if (!NONDEBUG_INSN_P (insn))
+       continue;
+
+      count = get_unit_reqs (insn, &req1, &side1, &req2, &side2);
+
+      if (count == 0)
+       continue;
+
+      if (!get_unit_operand_masks (insn, &mask1, &mask2))
+       continue;
+
+      info = VEC_index (insn_rr_info, insn_rr, INSN_UID (insn));
+      if (info->op_info == NULL)
+       continue;
+
+      /* Only try renaming if the unit is reserved more than once on this
+        side, and more than twice as often as on the other side.  */
+      if (reqs[side1][req1] > 1
+         && reqs[side1][req1] > 2 * reqs[side1 ^ 1][req1])
+       {
+         try_rename_operands (head, tail, reqs, insn, info, mask1, side1);
+       }
+
+      units = get_attr_units (insn);
+      if (units == UNITS_D_ADDR)
+       {
+         gcc_assert (count == 2);
+         if (reqs[side2][req2] > 1
+             && reqs[side2][req2] > 2 * reqs[side2 ^ 1][req2])
+           {
+             try_rename_operands (head, tail, reqs, insn, info, mask2, side2);
+           }
+       }
+    }
+  regrename_finish ();
+}
 
 /* Backend scheduling state.  */
 typedef struct c6x_sched_context
@@ -5263,6 +5518,8 @@ hwloop_optimize (hwloop_info loop)
   if (entry_edge == NULL)
     return false;
 
+  reshuffle_units (loop->head);
+
   schedule_ebbs_init ();
   schedule_ebb (BB_HEAD (loop->tail), loop->loop_end, true);
   schedule_ebbs_finish ();
@@ -5632,11 +5889,14 @@ c6x_reorg (void)
   compute_bb_for_insn ();
 
   df_clear_flags (DF_LR_RUN_DCE);
+  df_note_add_problem ();
 
   /* If optimizing, we'll have split before scheduling.  */
   if (optimize == 0)
     split_all_insns ();
 
+  df_analyze ();
+
   if (c6x_flag_schedule_insns2)
     {
       int sz = get_max_uid () * 3 / 2 + 1;
Index: gcc/config/c6x/c6x-mult.md.in
===================================================================
--- gcc/config/c6x/c6x-mult.md.in       (revision 179379)
+++ gcc/config/c6x/c6x-mult.md.in       (working copy)
@@ -79,6 +79,7 @@ (define_insn "mulhi3_VARIANT_"
   "%|%.\\tmpy\\t%$\\t%2, %1, %_MODk_0"
   [(set_attr "type" "mpy2")
    (set_attr "units" "m")
+   (set_attr "op_pattern" "sxs")
    (set_attr "cross" "n,n,y,y")])
 
 (define_insn "mulhisi3_const_VARIANT_"
@@ -102,6 +103,7 @@ (define_insn "*mulhisi3_insn_VARIANT_"
   "%|%.\\tmpy\\t%$\\t%1, %2, %_MODk_0"
   [(set_attr "type" "mpy2")
    (set_attr "units" "m")
+   (set_attr "op_pattern" "ssx")
    (set_attr "cross" "n,n,y,y")])
 
 (define_insn "mulhisi3_lh_VARIANT_"

Reply via email to