Hi,
As you may know, we have a loop-unroll pass in RTL which was introduced a few
years ago and has been working for a long time. Currently, this unroller reuses
the pseudos from the original loop body, so those pseudos are written multiple
times. It would be a good idea to create new pseudos for the duplicated
instructions during loop unrolling. This would relax register dependences and
thereby provide more freedom/opportunity for other optimizations, such as
scheduling and RA. For example:
Original loop:
```
26: L26:
17: NOTE_INSN_BASIC_BLOCK 4
18: r130:SF=[r125:DI+r122:DI]
19: r131:SF=[r126:DI+r122:DI]
20: r129:SF=r130:SF*r131:SF
21: r132:DF=float_extend(r129:SF)
22: r121:DF=r121:DF+r132:DF
23: r133:SI=r123:DI#0-0x1
24: r123:DI=zero_extend(r133:SI)
25: r122:DI=r122:DI+0x4
27: r134:CC=cmp(r123:DI,0)
28: pc={(r134:CC!=0)?L49:pc}
; pc falls through to BB 5
49: L49:
48: NOTE_INSN_BASIC_BLOCK 8
; pc falls through to BB 4
```
It was unrolled to:
```
26: L26:
17: NOTE_INSN_BASIC_BLOCK 4
18: r130:SF=[r125:DI+r122:DI]
19: r131:SF=[r126:DI+r122:DI]
20: r129:SF=r130:SF*r131:SF
21: r132:DF=float_extend(r129:SF)
93: r145:DF=r121:DF+r132:DF
23: r133:SI=r123:DI#0-0x1
95: r148:DI=zero_extend(r133:SI)
91: r140:DI=r122:DI+0x4
25: r122:DI=r140:DI
97: r134:CC=cmp(r148:DI,0)
87: NOTE_INSN_BASIC_BLOCK 16
77: r130:SF=[r125:DI+r122:DI]
78: r131:SF=[r126:DI+r122:DI]
79: r129:SF=r130:SF*r131:SF
80: r132:DF=float_extend(r129:SF)
81: r121:DF=r121:DF+r132:DF
82: r133:SI=r123:DI#0-0x1
83: r123:DI=zero_extend(r133:SI)
84: r122:DI=r140:DI+0x4
85: r134:CC=cmp(r123:DI,0)
86: pc={(r134:CC!=0)?L90:pc}
```
We can see that the original loop uses r130, r131, r129, r132, r121, ...
The unrolled sequence in BB 16 still uses the same pseudos.
We could instead use new pseudos for the duplicated body, like below:
```
26: L26:
17: NOTE_INSN_BASIC_BLOCK 4
18: r130:SF=[r125:DI+r122:DI]
19: r131:SF=[r126:DI+r122:DI]
20: r129:SF=r130:SF*r131:SF
21: r132:DF=float_extend(r129:SF)
93: r145:DF=r121:DF+r132:DF
23: r133:SI=r123:DI#0-0x1
95: r148:DI=zero_extend(r133:SI)
91: r140:DI=r122:DI+0x4
25: r122:DI=r140:DI
97: r134:CC=cmp(r148:DI,0)
87: NOTE_INSN_BASIC_BLOCK 16
77: r141:SF=[r125:DI+r122:DI]
78: r142:SF=[r126:DI+r122:DI]
79: r143:SF=r141:SF*r142:SF
80: r144:DF=float_extend(r143:SF)
81: r146:DF=r145:DF+r144:DF
82: r147:SI=r148:DI#0-0x1
83: r149:DI=zero_extend(r147:SI)
84: r122:DI=r140:DI+0x4
85: r150:CC=cmp(r149:DI,0)
98: r130:SF=r141:SF
99: r131:SF=r142:SF
100: r129:SF=r143:SF
101: r132:DF=r144:DF
102: r121:DF=r146:DF
103: r133:SI=r147:SI
104: r123:DI=r149:DI
105: r134:CC=r150:CC
86: pc={(r150:CC!=0)?L90:pc}
```
In BB 16, new pseudos r141, r142, r143, ... are used.
For this, I drafted a prototype patch, shown below.
This patch only supports simple loops: one exit edge with one major basic block.
With this patch, we can see that the generated asm code uses fewer instructions
on PowerPC.
Comments are welcome, thanks!
BR.
Jiufu Guo.
---
gcc/common.opt | 4 +
gcc/loop-unroll.c | 309 +++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 310 insertions(+), 3 deletions(-)
diff --git a/gcc/common.opt b/gcc/common.opt
index fa9da505fc2..e298ce09c82 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2526,6 +2526,10 @@ fvariable-expansion-in-unroller
Common Report Var(flag_variable_expansion_in_unroller) Optimization
Apply variable expansion when loops are unrolled.
+fassign-new-operand-in-unroller
+Common Report Var(flag_assign_new_operand_in_unroller) Init(1) Optimization
+Create new pseudos for duplicated instructions when loops are unrolled.
+
fstack-check=
Common Report RejectNegative Joined Optimization
-fstack-check=[no|generic|specific] Insert stack checking code into the
program.
diff --git a/gcc/loop-unroll.c b/gcc/loop-unroll.c
index 693c7768868..54a48bd9fd7 100644
--- a/gcc/loop-unroll.c
+++ b/gcc/loop-unroll.c
@@ -94,6 +94,17 @@ struct var_to_expand
var_expansions[REUSE_EXPANSION - 1]. */
};
+struct def_to_split
+{
+ rtx_insn *insn; /* The insn in which the assignment occurs. */
+ rtx reg; /* The def/set pseudo. */
+ vec<rtx> defs; /* The copies of the defs that it is split into. */
+ struct def_to_split *next; /* Next entry in walking order. */
+ int count; /* Number of DEFS. */
+ int use_before_def; /* The pseudo is used before being re-defined. */
+ rtx_insn *last_insn;
+};
+
/* Hashtable helper for iv_to_split. */
struct iv_split_hasher : free_ptr_hash <iv_to_split>
@@ -143,6 +154,30 @@ var_expand_hasher::equal (const var_to_expand *i1, const
var_to_expand *i2)
return i1->insn == i2->insn;
}
+/* Hashtable helper for def_to_split. */
+
+struct def_split_hasher : free_ptr_hash<def_to_split>
+{
+ static inline hashval_t hash (const def_to_split *);
+ static inline bool equal (const def_to_split *, const def_to_split *);
+};
+
+/* Return a hash for the def_to_split I. */
+
+inline hashval_t
+def_split_hasher::hash (const def_to_split *i)
+{
+ return (hashval_t) INSN_UID (i->insn);
+}
+
+/* Return true if I1 and I2 refer to the same instruction. */
+
+inline bool
+def_split_hasher::equal (const def_to_split *i1, const def_to_split *i2)
+{
+ return i1->insn == i2->insn;
+}
+
/* Information about optimization applied in
the unrolled loop. */
@@ -156,10 +191,17 @@ struct opt_info
insns with accumulators to expand. */
struct var_to_expand *var_to_expand_head; /* The first var to expand. */
struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the
list. */
+
+ hash_table<def_split_hasher> *insns_def_split; /* A hashtable of insns to
+ split def. */
+ struct def_to_split *def_to_split_head; /* The def to split. */
+ struct def_to_split **def_to_split_tail; /* Pointer to the tail of the list.
*/
+
unsigned first_new_block; /* The first basic block that was
duplicated. */
basic_block loop_exit; /* The loop exit basic block. */
basic_block loop_preheader; /* The loop preheader basic block. */
+
};
static void decide_unroll_stupid (class loop *, int);
@@ -183,6 +225,7 @@ static void combine_var_copies_in_loop_exit (struct
var_to_expand *,
basic_block);
static rtx get_expansion (struct var_to_expand *);
+static struct def_to_split *analyze_insn_to_split_def (class loop *, rtx_insn
*);
/* Emit a message summarizing the unroll that will be
performed for LOOP, along with the loop's location LOCUS, if
appropriate given the dump or -fopt-info settings. */
@@ -898,7 +941,8 @@ unroll_loop_runtime_iterations (class loop *loop)
struct opt_info *opt_info = NULL;
bool ok;
- if (flag_split_ivs_in_unroller
+ if (flag_assign_new_operand_in_unroller
+ || flag_split_ivs_in_unroller
|| flag_variable_expansion_in_unroller)
opt_info = analyze_insns_in_loop (loop);
@@ -1564,6 +1608,71 @@ analyze_iv_to_split_insn (rtx_insn *insn)
return ivts;
}
+static struct def_to_split *
+analyze_insn_to_split_def (class loop *loop, rtx_insn *insn)
+{
+ rtx set, dest, src;
+ rtx_insn *ins;
+ struct def_to_split *dts;
+
+ /* TODO: to be relaxed */
+ if (loop->num_nodes != 2)
+ return NULL;
+
+ FOR_BB_INSNS (loop->latch, ins)
+ if (INSN_P (ins))
+ return NULL;
+
+ set = single_set (insn);
+ if (!set)
+ return NULL;
+
+ dest = SET_DEST (set);
+ src = SET_SRC (set);
+
+ if (!REG_P (dest)) // && !(GET_CODE (dest) == SUBREG && REG_P (SUBREG_REG
+ // (dest))))
+ return NULL;
+
+ if (REGNO (dest) < FIRST_PSEUDO_REGISTER)
+ return NULL;
+
+ int use_before_set = 0;
+ basic_block bb = BLOCK_FOR_INSN (insn);
+ rtx set1;
+ for (ins = BB_HEAD (bb); ins != insn; ins = NEXT_INSN (ins))
+ if (rtx_referenced_p (dest, ins))
+ {
+ set1 = single_set (ins);
+ if (set1 && rtx_equal_p (SET_DEST (set1), dest))
+ return NULL;
+ use_before_set = 1;
+ }
+
+ if (!use_before_set && rtx_referenced_p (dest, src))
+ use_before_set = 1;
+
+ if (dump_file)
+ {
+ fprintf (dump_file,
+ "\n;; Split assignment %s: ",
+ use_before_set ? "use leads def" : "");
+ print_rtl (dump_file, dest);
+ fprintf (dump_file, "\n");
+ }
+
+ /* Record the accumulator to expand. */
+ dts = XNEW (struct def_to_split);
+ dts->insn = insn;
+ dts->reg = copy_rtx (dest);
+ dts->defs.create (1);
+ dts->next = NULL;
+ dts->count = 0;
+ dts->use_before_def = use_before_set;
+ dts->last_insn = NULL;
+ return dts;
+}
+
/* Determines which of insns in LOOP can be optimized.
Return a OPT_INFO struct with the relevant hash tables filled
with all insns to be optimized. The FIRST_NEW_BLOCK field
@@ -1578,6 +1687,7 @@ analyze_insns_in_loop (class loop *loop)
rtx_insn *insn;
struct iv_to_split *ivts = NULL;
struct var_to_expand *ves = NULL;
+ struct def_to_split *dts= NULL;
iv_to_split **slot1;
var_to_expand **slot2;
vec<edge> edges = get_loop_exit_edges (loop);
@@ -1618,6 +1728,15 @@ analyze_insns_in_loop (class loop *loop)
opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
}
+ if (flag_assign_new_operand_in_unroller && can_apply
+ && LPT_UNROLL_RUNTIME == loop->lpt_decision.decision)
+ {
+ opt_info->insns_def_split
+ = new hash_table<def_split_hasher> (5 * loop->num_nodes);
+ opt_info->def_to_split_head = NULL;
+ opt_info->def_to_split_tail = &opt_info->def_to_split_head;
+ }
+
for (i = 0; i < loop->num_nodes; i++)
{
bb = body[i];
@@ -1653,6 +1772,20 @@ analyze_insns_in_loop (class loop *loop)
*opt_info->var_to_expand_tail = ves;
opt_info->var_to_expand_tail = &ves->next;
}
+
+ dts = NULL;
+ if (opt_info->insns_def_split && !ves && !ivts)
+ dts = analyze_insn_to_split_def (loop, insn);
+
+ if (dts)
+ {
+ struct def_to_split **slot
+ = opt_info->insns_def_split->find_slot (dts, INSERT);
+ gcc_assert (*slot == NULL);
+ *slot = dts;
+ *opt_info->def_to_split_tail = dts;
+ opt_info->def_to_split_tail = &dts->next;
+ }
}
}
@@ -1840,6 +1973,145 @@ expand_var_during_unrolling (struct var_to_expand *ve,
rtx_insn *insn)
}
}
+/* Replace INSN with a new duplicate insn. */
+static rtx_insn *
+replace_with_new_insn (rtx_insn *insn)
+{
+ rtx_insn *seq;
+ start_sequence ();
+ seq = duplicate_insn_chain (insn, insn);
+ gcc_assert (seq == get_insns ());
+ end_sequence ();
+ emit_insn_before (seq, insn);
+ delete_insn (insn);
+ return seq;
+}
+
+/* The insns in the original body cannot be updated directly; duplicate
+ * new ones so that they can be updated. */
+
+static void
+prepare_to_update_orignal_body (struct def_to_split *head)
+{
+ struct def_to_split *dts;
+ rtx_insn *insn;
+ vec<rtx_insn *> new_insn_list;
+
+ /* For all those defines which is used before define, replace with new one
for
+ * update. */
+ new_insn_list.create (1);
+ for (dts = head; dts; dts = dts->next)
+ if (dts->use_before_def)
+ {
+ dts->insn = replace_with_new_insn (dts->insn);
+ new_insn_list.safe_push (dts->insn);
+ }
+
+ /* For all those insn which is using those defines, replace with new one for
+ * update. */
+ for (dts = head; dts; dts = dts->next)
+ if (dts->use_before_def)
+ for (insn = NEXT_INSN (dts->insn);
+ insn != NEXT_INSN (BB_END (BLOCK_FOR_INSN (dts->insn)));
+ insn = NEXT_INSN (insn))
+ if (rtx_referenced_p (dts->reg, insn) && !new_insn_list.contains (insn))
+ {
+ insn = replace_with_new_insn (insn);
+ new_insn_list.safe_push (insn);
+ }
+
+ new_insn_list.release ();
+}
+
+/* Replace occurrences of OLD_REG with NEW_REG between START and END. */
+static void
+replace_regs (rtx_insn *start, rtx_insn *end, rtx old_reg, rtx new_reg)
+{
+ rtx_insn *insn;
+ for (insn = start; insn != NEXT_INSN (end); insn = NEXT_INSN (insn))
+ if (rtx_referenced_p (old_reg, insn))
+ validate_replace_rtx_group (old_reg, new_reg, insn);
+
+ apply_change_group ();
+}
+
+/* Update the pseudos in the original blocks, and assign them back at the
+ * last block. */
+static void
+do_additional_update (struct def_to_split *head)
+{
+ rtx_insn *insn;
+ rtx new_reg;
+ basic_block bb;
+ struct def_to_split *dts;
+
+ for (dts = head; dts; dts = dts->next)
+ {
+ if (dts->use_before_def)
+ {
+ gcc_assert (dts->last_insn);
+ gcc_assert (dts->count > 1);
+
+ /* Update the insns in original body. */
+ bb = BLOCK_FOR_INSN (dts->insn);
+ new_reg = dts->defs[0];
+ SET_DEST (single_set (dts->insn)) = new_reg;
+
+ replace_regs (NEXT_INSN (dts->insn), BB_END (bb), dts->reg, new_reg);
+ }
+
+ /* In the last BB, assign back to the original reg. */
+ start_sequence ();
+ emit_move_insn (dts->reg, SET_DEST (single_set (dts->last_insn)));
+ insn = get_insns ();
+ end_sequence ();
+ emit_insn_after (insn, dts->last_insn);
+ }
+}
+
+/* Replace DTS->REG with a new split reg in INSN's block. If DTS->REG is
+ * referenced before being set, the uses preceding INSN are replaced with
+ * the previous define/set. */
+static void
+split_def_during_unrolling (struct def_to_split *dts, rtx_insn *insn)
+{
+ rtx new_reg, new_reg0, set;
+ basic_block bb = BLOCK_FOR_INSN (insn);
+
+ if (dts->use_before_def)
+ {
+ if (dts->count == 0)
+ {
+ dts->defs.safe_push (gen_reg_rtx (GET_MODE (dts->reg)));
+ dts->count++;
+ }
+
+ /* Replace uses of the defined reg with new_reg0 in the insns that
+ * precede the define, including the use within the define itself. */
+ new_reg0 = dts->defs[dts->count - 1];
+ replace_regs (BB_HEAD (bb), insn, dts->reg, new_reg0);
+
+ /* Replace the defined reg with a new reg, for the define and the uses
+ * that follow it. */
+ set = single_set (insn);
+ gcc_assert (set);
+ new_reg = gen_reg_rtx (GET_MODE (dts->reg));
+ SET_DEST (single_set (insn)) = new_reg;
+
+ replace_regs (NEXT_INSN (insn), BB_END (bb), dts->reg, new_reg);
+ }
+ else
+ {
+ new_reg = gen_reg_rtx (GET_MODE (dts->reg));
+ replace_regs (insn, BB_END (bb), dts->reg, new_reg);
+ }
+
+ /* Record the insn in the last block. */
+ dts->last_insn = insn;
+ dts->defs.safe_push (new_reg);
+ dts->count++;
+}
+
/* Initialize the variable expansions in loop preheader. PLACE is the
loop-preheader basic block where the initialization of the
expansions should take place. The expansions are initialized with
@@ -2011,6 +2283,7 @@ apply_opt_in_copies (struct opt_info *opt_info,
rtx_insn *insn, *orig_insn, *next;
struct iv_to_split ivts_templ, *ivts;
struct var_to_expand ve_templ, *ves;
+ struct def_to_split dts_templ, *dts;
/* Sanity check -- we need to put initialization in the original loop
body. */
@@ -2051,8 +2324,9 @@ apply_opt_in_copies (struct opt_info *opt_info,
ivts_templ.insn = orig_insn;
ve_templ.insn = orig_insn;
+ dts_templ.insn = orig_insn;
- /* Apply splitting iv optimization. */
+ /* Apply splitting iv optimization. */
if (opt_info->insns_to_split)
{
maybe_strip_eq_note_for_split_iv (opt_info, insn);
@@ -2081,6 +2355,19 @@ apply_opt_in_copies (struct opt_info *opt_info,
expand_var_during_unrolling (ves, insn);
}
}
+
+ if (unrolling && opt_info->insns_def_split)
+ {
+ dts = (struct def_to_split *) opt_info->insns_def_split->find (
+ &dts_templ);
+ if (dts)
+ {
+ gcc_assert (GET_CODE (PATTERN (insn))
+ == GET_CODE (PATTERN (orig_insn)));
+ split_def_during_unrolling (dts, insn);
+ }
+ }
+
orig_insn = NEXT_INSN (orig_insn);
}
}
@@ -2135,9 +2422,14 @@ apply_opt_in_copies (struct opt_info *opt_info,
continue;
}
}
-
}
}
+
+ if (unrolling && opt_info->insns_def_split)
+ {
+ prepare_to_update_orignal_body (opt_info->def_to_split_head);
+ do_additional_update (opt_info->def_to_split_head);
+ }
}
/* Release OPT_INFO. */
@@ -2156,5 +2448,16 @@ free_opt_info (struct opt_info *opt_info)
delete opt_info->insns_with_var_to_expand;
opt_info->insns_with_var_to_expand = NULL;
}
+
+ if (opt_info->insns_def_split)
+ {
+ struct def_to_split *dts;
+
+ for (dts = opt_info->def_to_split_head; dts; dts = dts->next)
+ dts->defs.release ();
+ delete opt_info->insns_def_split;
+ opt_info->insns_def_split = NULL;
+ }
+
free (opt_info);
}
--
2.17.1