Re: [PATCH] ira: Fix go_through_subreg offset calculation [PR115281]

2024-05-30 Thread Vladimir Makarov



On 5/30/24 03:59, Richard Sandiford wrote:


Tested on aarch64-linux-gnu & x86_64-linux-gnu.  OK to install?

Yes.  Thank you, Richard.


gcc/
PR rtl-optimization/115281
* ira-conflicts.cc (go_through_subreg): Use the natural size of
the inner mode rather than the outer mode.

gcc/testsuite/
PR rtl-optimization/115281
* gfortran.dg/pr115281.f90: New test.




[pushed][PR115013][LRA]: Modify register starvation recognition

2024-05-13 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115013

Successfully tested and bootstrapped on x86-64.
commit 44430ef3d8ba75692efff5f6969d5610134566d3
Author: Vladimir N. Makarov 
Date:   Mon May 13 10:12:11 2024 -0400

[PR115013][LRA]: Modify register starvation recognition

  My recent patch to recognize reg starvation resulted in a few GCC test
failures.  The following patch fixes this by using a more accurate
starvation calculation and ignoring small reg classes.

gcc/ChangeLog:

PR rtl-optimization/115013
* lra-constraints.cc (process_alt_operands): Update all_used_nregs
only for winreg.  Ignore reg starvation for small reg classes.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index e945a4da451..92b343fa99a 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -2674,8 +2674,9 @@ process_alt_operands (int only_alternative)
 	  if (early_clobber_p
 		  || curr_static_id->operand[nop].type != OP_OUT)
 		{
-		  all_used_nregs
-		+= ira_reg_class_min_nregs[this_alternative][mode];
+		  if (winreg)
+		all_used_nregs
+		  += ira_reg_class_min_nregs[this_alternative][mode];
 		  all_this_alternative
 		= (reg_class_subunion
 		   [all_this_alternative][this_alternative]);
@@ -3250,6 +3251,7 @@ process_alt_operands (int only_alternative)
 	  overall += LRA_MAX_REJECT;
 	}
   if (all_this_alternative != NO_REGS
+	  && !SMALL_REGISTER_CLASS_P (all_this_alternative)
 	  && all_used_nregs != 0 && all_reload_nregs != 0
 	  && (all_used_nregs + all_reload_nregs + 1
 	  >= ira_class_hard_regs_num[all_this_alternative]))
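
As a standalone reading aid, the demotion condition in the hunk above can be
written as a small predicate.  The sketch below is illustrative only: the
names are invented, it is not GCC code, and the numbers in the comment are
just an example.

  /* Illustrative sketch of the register-starvation test above (invented
     names, not GCC code).  class_size plays the role of
     ira_class_hard_regs_num[all_this_alternative].  */
  #include <stdbool.h>

  static bool
  alternative_starved_p (int class_size, bool small_class_p,
                         int used_nregs, int reload_nregs)
  {
    /* Small (likely spilled) classes are now ignored, as in the patch.  */
    if (small_class_p)
      return false;
    if (used_nregs == 0 || reload_nregs == 0)
      return false;
    /* Example: with 6 allocatable general regs (i686), used_nregs = 4 and
       reload_nregs = 2 give 4 + 2 + 1 >= 6, so the alternative is made
       costly (overall += LRA_MAX_REJECT).  */
    return used_nregs + reload_nregs + 1 >= class_size;
  }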


[pushed][PR114942][LRA]: Don't reuse input reload reg of inout early clobber operand

2024-05-10 Thread Vladimir Makarov

The following patch solves

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114942

The patch was successfully bootstrapped and tested on x86-64, ppc64le, 
aarch64.
commit 9585317f0715699197b1313bbf939c6ea3c1ace6
Author: Vladimir N. Makarov 
Date:   Fri May 10 09:15:50 2024 -0400

[PR114942][LRA]: Don't reuse input reload reg of inout early clobber operand

  The insn in question has the same reg in an inout operand and an input
operand.  The inout operand is an early clobber.  LRA reused the input
reload reg of the inout operand for the input operand, which is wrong.  It
would be a valid decision if the inout operand were not an early clobber
one.  The patch rejects the reuse for the PR test case.
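
The situation can be pictured with a tiny inline-asm sketch.  This is not
the test case from the PR (that is gcc.target/i386/pr114942.c); it is just
an illustration, with an invented function name, of an inout early-clobber
operand and an input operand naming the same value.

  /* Illustration only, not the PR test case.  Operand 0 is an inout
     early-clobber operand ("+&r") and operand 1 is an input ("r"); both
     refer to the same value 'v'.  Because of the early clobber, operand 0
     is written before operand 1 is read, so the two operands must end up
     in different registers; reusing the input reload register of the
     inout operand for the input operand would produce wrong code.  */
  long
  early_clobber_demo (long v)
  {
    asm ("# out/in %0, in %1" : "+&r" (v) : "r" (v));
    return v;
  }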

gcc/ChangeLog:

PR target/114942
* lra-constraints.cc (struct input_reload): Add new member early_clobber_p.
(get_reload_reg): Add new arg early_clobber_p, don't reuse input
reload with true early_clobber_p member value, use the arg for new
element of curr_insn_input_reloads.
(match_reload): Assign false to early_clobber_p member.
(process_addr_reg, simplify_operand_subreg, curr_insn_transform):
Adjust get_reload_reg calls.

gcc/testsuite/ChangeLog:

PR target/114942
* gcc.target/i386/pr114942.c: New.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 5b78fd0b7e5..e945a4da451 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -599,6 +599,8 @@ struct input_reload
 {
   /* True for input reload of matched operands.  */
   bool match_p;
+  /* True for input reload of inout earlyclobber operand.  */
+  bool early_clobber_p;
   /* Reloaded value.  */
   rtx input;
   /* Reload pseudo used.  */
@@ -649,13 +651,15 @@ canonicalize_reload_addr (rtx addr)
 /* Create a new pseudo using MODE, RCLASS, EXCLUDE_START_HARD_REGS, ORIGINAL or
reuse an existing reload pseudo.  Don't reuse an existing reload pseudo if
IN_SUBREG_P is true and the reused pseudo should be wrapped up in a SUBREG.
+   EARLY_CLOBBER_P is true for input reload of inout early clobber operand.
The result pseudo is returned through RESULT_REG.  Return TRUE if we created
a new pseudo, FALSE if we reused an existing reload pseudo.  Use TITLE to
describe new registers for debug purposes.  */
 static bool
 get_reload_reg (enum op_type type, machine_mode mode, rtx original,
 		enum reg_class rclass, HARD_REG_SET *exclude_start_hard_regs,
-		bool in_subreg_p, const char *title, rtx *result_reg)
+		bool in_subreg_p, bool early_clobber_p,
+		const char *title, rtx *result_reg)
 {
   int i, regno;
   enum reg_class new_class;
@@ -703,6 +707,7 @@ get_reload_reg (enum op_type type, machine_mode mode, rtx original,
 for (i = 0; i < curr_insn_input_reloads_num; i++)
   {
 	if (! curr_insn_input_reloads[i].match_p
+	&& ! curr_insn_input_reloads[i].early_clobber_p
 	&& rtx_equal_p (curr_insn_input_reloads[i].input, original)
 	&& in_class_p (curr_insn_input_reloads[i].reg, rclass, &new_class))
 	  {
@@ -750,6 +755,8 @@ get_reload_reg (enum op_type type, machine_mode mode, rtx original,
   lra_assert (curr_insn_input_reloads_num < LRA_MAX_INSN_RELOADS);
   curr_insn_input_reloads[curr_insn_input_reloads_num].input = original;
   curr_insn_input_reloads[curr_insn_input_reloads_num].match_p = false;
+  curr_insn_input_reloads[curr_insn_input_reloads_num].early_clobber_p
+= early_clobber_p;
   curr_insn_input_reloads[curr_insn_input_reloads_num++].reg = *result_reg;
   return true;
 }
@@ -1189,6 +1196,7 @@ match_reload (signed char out, signed char *ins, signed char *outs,
   lra_assert (curr_insn_input_reloads_num < LRA_MAX_INSN_RELOADS);
   curr_insn_input_reloads[curr_insn_input_reloads_num].input = in_rtx;
   curr_insn_input_reloads[curr_insn_input_reloads_num].match_p = true;
+  curr_insn_input_reloads[curr_insn_input_reloads_num].early_clobber_p = false;
   curr_insn_input_reloads[curr_insn_input_reloads_num++].reg = new_in_reg;
   for (i = 0; (in = ins[i]) >= 0; i++)
 if (GET_MODE (*curr_id->operand_loc[in]) == VOIDmode
@@ -1577,7 +1585,7 @@ process_addr_reg (rtx *loc, bool check_only_p, rtx_insn **before, rtx_insn **aft
 	  reg = *loc;
 	  if (get_reload_reg (after == NULL ? OP_IN : OP_INOUT,
 			  mode, reg, cl, NULL,
-			  subreg_p, "address", &new_reg))
+			  subreg_p, false, "address", &new_reg))
 	before_p = true;
 	}
   else if (new_class != NO_REGS && rclass != new_class)
@@ -1733,7 +1741,7 @@ simplify_operand_subreg (int nop, machine_mode reg_mode)
 	= (enum reg_class) targetm.preferred_reload_class (reg, ALL_REGS);
 	  if (get_reload_reg (curr_static_id->operand[nop].type, innermode,
 			  reg, rclass, NULL,
-			  true, "slow/invalid mem", &new_reg))
+			  true, false, "slow/invalid mem", &new_reg))
 	{
 	  bool insert_before, insert_after;
 	  bitmap_set_bit (&lra_subreg_reload_pseudos, REGNO (new_reg));
@@ 

Re: [pushed][PR114810][LRA]: Recognize alternatives with lack of available registers for insn and demote them.

2024-05-09 Thread Vladimir Makarov


On 5/8/24 23:25, Li, Pan2 wrote:


Hi Vladimir,

It looks like this patch results in some ICEs in rvv.exp of the RISC-V
backend; feel free to ping me if more information is needed to reproduce.


= Summary of gcc testsuite =
                       | # of unexpected case / # of unique unexpected case
                       |      gcc |  g++ | gfortran |
 rv64gcv/lp64d/ medlow | 1061 /69 | 0 /0 |        - |

make: *** [Makefile:1096: report-gcc-newlib] Error 1

Just picking one, imm_loop_invariant-10.c, as below.

/home/pli/gcc/111/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/imm_loop_invariant-10.c:20:1: 
error: unrecognizable insn:


(insn 265 0 0 (parallel [
            (set (reg:RVVMF8QI 309 [239])
                (unspec:RVVMF8QI [
                        (reg:SI 0 zero)
                    ] UNSPEC_VUNDEF))
            (clobber (scratch:SI))
        ]) -1
     (nil))


Thank you for reporting this.  Could you file a PR for it?  I guess 
fixing this might take some time.




[pushed][PR114810][LRA]: Recognize alternatives with lack of available registers for insn and demote them.

2024-05-08 Thread Vladimir Makarov

The following patch is a fix for PR114810 from the LRA side.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114810

The patch was successfully bootstrapped and tested on x86_64, aarch64, 
ppc64le.


commit dc859c1fcb6f3ad95022fb078c040907ef361e4c
Author: Vladimir N. Makarov 
Date:   Wed May 8 10:39:04 2024 -0400

[PR114810][LRA]: Recognize alternatives with lack of available registers for insn and demote them.

  PR114810 was fixed in a machine-dependent way.  This patch is a fix of
the PR on the LRA side.  LRA chose an alternative with constraints `,r,ro`
on i686 when all operands are of DImode and there are only 6 available
general regs.  The patch recognizes such a case and significantly
increases the alternative cost.  It does not reject the alternative
completely.  So the fix is safe, but it might not work for all
potentially possible cases of register lack, as register classes can
have any relations including subsets and intersections.
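
A rough feel for the numbers (illustrative only; invented function name,
not the test case from the PR): on i686 each DImode value needs two of the
six allocatable general registers, so an alternative whose operands are all
DImode general-register operands exhausts the class almost immediately.

  /* Illustration only -- not the PR114810 test case.  Compiled with -m32,
     each 'long long' (DImode) value below occupies two of the six
     allocatable general registers, so keeping three of them live already
     uses all six, and any additional reload register trips the
     used + reload + 1 >= 6 starvation test added by this patch.  */
  long long
  dimode_pressure (long long a, long long b, long long c)
  {
    long long t = a + b;     /* several DImode values live at once */
    return t ^ (b - c);
  }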

gcc/ChangeLog:

PR target/114810
* lra-constraints.cc (process_alt_operands): Calculate union reg
class for the alternative, peak matched regs and required reload
regs.  Recognize alternatives with lack of available registers and
make them costly.  Add debug print about this case.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 10e3d4e4097..5b78fd0b7e5 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -2127,6 +2127,8 @@ process_alt_operands (int only_alternative)
   /* Numbers of operands which are early clobber registers.  */
   int early_clobbered_nops[MAX_RECOG_OPERANDS];
   enum reg_class curr_alt[MAX_RECOG_OPERANDS];
+  enum reg_class all_this_alternative;
+  int all_used_nregs, all_reload_nregs;
   HARD_REG_SET curr_alt_set[MAX_RECOG_OPERANDS];
   HARD_REG_SET curr_alt_exclude_start_hard_regs[MAX_RECOG_OPERANDS];
   bool curr_alt_match_win[MAX_RECOG_OPERANDS];
@@ -2229,7 +2231,8 @@ process_alt_operands (int only_alternative)
   curr_alt_out_sp_reload_p = false;
   curr_reuse_alt_p = true;
   curr_alt_class_change_p = false;
-  
+  all_this_alternative = NO_REGS;
+  all_used_nregs = all_reload_nregs = 0;
   for (nop = 0; nop < n_operands; nop++)
 	{
 	  const char *p;
@@ -2660,6 +2663,15 @@ process_alt_operands (int only_alternative)
 	  /* Record which operands fit this alternative.  */
 	  if (win)
 	{
+	  if (early_clobber_p
+		  || curr_static_id->operand[nop].type != OP_OUT)
+		{
+		  all_used_nregs
+		+= ira_reg_class_min_nregs[this_alternative][mode];
+		  all_this_alternative
+		= (reg_class_subunion
+		   [all_this_alternative][this_alternative]);
+		}
 	  this_alternative_win = true;
 	  if (class_change_p)
 		{
@@ -2781,7 +2793,19 @@ process_alt_operands (int only_alternative)
 		   & ~((ira_prohibited_class_mode_regs
 			[this_alternative][mode])
 			   | lra_no_alloc_regs));
-		  if (hard_reg_set_empty_p (available_regs))
+		  if (!hard_reg_set_empty_p (available_regs))
+		{
+		  if (early_clobber_p
+			  || curr_static_id->operand[nop].type != OP_OUT)
+			{
+			  all_reload_nregs
+			+= ira_reg_class_min_nregs[this_alternative][mode];
+			  all_this_alternative
+			= (reg_class_subunion
+			   [all_this_alternative][this_alternative]);
+			}
+		}
+		  else
 		{
 		  /* There are no hard regs holding a value of given
 			 mode.  */
@@ -3217,6 +3241,21 @@ process_alt_operands (int only_alternative)
 		 "Cycle danger: overall += LRA_MAX_REJECT\n");
 	  overall += LRA_MAX_REJECT;
 	}
+  if (all_this_alternative != NO_REGS
+	  && all_used_nregs != 0 && all_reload_nregs != 0
+	  && (all_used_nregs + all_reload_nregs + 1
+	  >= ira_class_hard_regs_num[all_this_alternative]))
+	{
+	  if (lra_dump_file != NULL)
+	fprintf
+	  (lra_dump_file,
+	   "Register starvation: overall += LRA_MAX_REJECT"
+	   "(class=%s,avail=%d,used=%d,reload=%d)\n",
+	   reg_class_names[all_this_alternative],
+	   ira_class_hard_regs_num[all_this_alternative],
+	   all_used_nregs, all_reload_nregs);
+	  overall += LRA_MAX_REJECT;
+	}
   ok_p = true;
   curr_alt_dont_inherit_ops_num = 0;
   for (nop = 0; nop < early_clobbered_regs_num; nop++)


Re: [PATCH 4/4] lra: Apply DF_LIVE_SUBREG data

2024-05-08 Thread Vladimir Makarov



On 5/7/24 23:01, Lehua Ding wrote:

Hi Vladimir,

I'll send V3 patches based on these comments. Note that these four
patches only support subreg liveness tracking and apply to the IRA and LRA
passes. Therefore, no performance changes are expected before we support
subreg coalesce. There will be new patches later to complete the
subreg coalesce functionality. Support for subreg coalesce requires
support for subreg copies, i.e. modifying the logic for conflict detection.



Thank you for your clarification that the current batch of patches does
not change the performance.  I hope the next batch of patches will be
added to the devel/subreg-coalesce branch too, for easier evaluation.
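
For readers new to the topic, the kind of code that subreg liveness
tracking is about can be sketched in a few lines.  This is an invented
example, not taken from the patch series.

  /* Invented example, not from the patch series.  On a 64-bit target the
     128-bit argument 'x' occupies two hard registers.  Only the low half
     (subreg) of 'x' is ever read, so the high half is dead; liveness
     tracked at subreg granularity lets the register allocator reuse the
     register holding that dead high half.  */
  typedef unsigned long long u64;

  u64
  low_half_sum (unsigned __int128 x, u64 y)
  {
    u64 lo = (u64) x;   /* reads only the low subreg of x */
    return lo + y;      /* the high subreg of x is dead here */
  }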





Re: [PATCH 4/4] lra: Apply DF_LIVE_SUBREG data

2024-05-01 Thread Vladimir Makarov


On 2/3/24 05:50, Lehua Ding wrote:

This patch applies DF_LIVE_SUBREG to the LRA pass. More changes were made
to LRA than to IRA since LRA modifies the DF data directly.
The main changes are centered on the lra-lives.cc file.

gcc/ChangeLog:

* lra-coalesce.cc (update_live_info): Extend to DF_LIVE_SUBREG.
(lra_coalesce): Ditto.
* lra-constraints.cc (update_ebb_live_info): Ditto.
(get_live_on_other_edges): Ditto.
(inherit_in_ebb): Ditto.
(lra_inheritance): Ditto.
(fix_bb_live_info): Ditto.
(remove_inheritance_pseudos): Ditto.
* lra-int.h (GCC_LRA_INT_H): include subreg-live-range.h
(struct lra_insn_reg): Add op filed to record the corresponding rtx.
* lra-lives.cc (class bb_data_pseudos): Extend the bb_data_pseudos to
include new partial_def/use and range_def/use fileds for DF_LIVE_SUBREG
problem.

Typo "fileds".

(need_track_subreg_p): checking is the regno need to be tracked.
(make_hard_regno_live): switch to live_subreg filed.

The same typo.

(make_hard_regno_dead): Ditto.
(mark_regno_live): Support record subreg liveness.
(mark_regno_dead): Ditto.
(live_trans_fun): Adjust transfer function to support subreg liveness.
(live_con_fun_0): Adjust Confluence function to support subreg liveness.
(live_con_fun_n): Ditto.
(initiate_live_solver): Ditto.
(finish_live_solver): Ditto.
(process_bb_lives): Ditto.
(lra_create_live_ranges_1): Dump subreg liveness.
* lra-remat.cc (dump_candidates_and_remat_bb_data): Switch to
DF_LIVE_SUBREG df data.
(calculate_livein_cands): Ditto.
(do_remat): Ditto.
* lra-spills.cc (spill_pseudos): Ditto.
* lra.cc (new_insn_reg): New argument op.
(add_regs_to_insn_regno_info): Add new argument op.


The patch is ok for me with some minor requests:

You missed the log entry for collect_non_operand_hard_regs.  The log entry
for lra_create_live_ranges_1 is not complete (at least, it should be
"Ditto. ...").


Also, you changed the signatures of the functions update_live_info,
fix_bb_live_info, mark_regno_live, mark_regno_dead, and new_insn_reg but
did not update the function comments.  Outdated comments are even worse
than absent comments.  Please fix them.


Also some variable naming could be improved but it is up to you.

So now you just need approval for the rest of the patches to commit your
work, but they are not my area of responsibility.


It is difficult to predict for patches of this size how they will work for
other targets.  I tested your patches on aarch64 and ppc64le.  They seem
to work right, but please be prepared to switch them off (it is easy) if
the patches create some issues for other targets, of course only until the
issues are fixed.


And thank you for your contribution.  Improving GCC performance these
days is a challenging task, as so many people are working on GCC, but you
found such an opportunity and, most importantly, implemented it.




Re: [PATCH 3/4] ira: Apply DF_LIVE_SUBREG data

2024-05-01 Thread Vladimir Makarov



On 2/3/24 05:50, Lehua Ding wrote:

This patch simply replaces df_get_live_in with df_get_subreg_live_in
and df_get_live_out with df_get_subreg_live_out.

gcc/ChangeLog:

* ira-build.cc (create_bb_allocnos): Switch to DF_LIVE_SUBREG df data.
(create_loop_allocnos): Ditto.
* ira-color.cc (ira_loop_edge_freq): Ditto.
* ira-emit.cc (generate_edge_moves): Ditto.
(add_ranges_and_copies): Ditto.
* ira-lives.cc (process_out_of_region_eh_regs): Ditto.
(add_conflict_from_region_landing_pads): Ditto.
(process_bb_node_lives): Ditto.
* ira.cc (find_moveable_pseudos): Ditto.
(interesting_dest_for_shprep_1): Ditto.
(allocate_initial_values): Ditto.
(ira): Ditto.


This patch is ok for me.

  gcc/ira-build.cc |  7 ---
  gcc/ira-color.cc |  8 
  gcc/ira-emit.cc  | 12 ++--
  gcc/ira-lives.cc |  7 ---
  gcc/ira.cc   | 19 ---
  5 files changed, 30 insertions(+), 23 deletions(-)




Fwd: [PATCH V2 0/4] Add DF_LIVE_SUBREG data and apply to IRA and LRA

2024-05-01 Thread Vladimir Makarov


I am resending this message as the previous one had a wrong response
email address, "gcc-pat...@gcc.gnu.org".


 Forwarded Message 
Subject: 	Re: [PATCH V2 0/4] Add DF_LIVE_SUBREG data and apply to IRA 
and LRA

Date:   Wed, 1 May 2024 08:35:27 -0400
From:   Vladimir Makarov 
To: 	Lehua Ding , gcc-pat...@gcc.gnu.org, 
richard.sandif...@arm.com

CC: juzhe.zh...@rivai.ai, shuo.c...@rivai.ai, jin@rivai.ai




On 4/24/24 06:01, Lehua Ding wrote:

Hi Vladimir and Richard,

These patches add a new data flow problem, DF_LIVE_SUBREG, which tracks
subreg liveness, and then apply it to the IRA and LRA passes (enabled via
-O3 or -ftrack-subreg-liveness). These patches are for GCC 15, and the
code is pushed to the devel/subreg-coalesce branch. In addition, my
colleague Shuo Chen will also be involved in some of the remaining work;
thank you for your support.

Thank you for creating the branch.  It helped me a lot.

These patches are separated from the subreg-coalesce patches submitted
a few months ago. I refactored the code according to the comments. The
next patches will support subreg coalesce based on them. Here are some
data about the build time of SPEC INT 2017 (x86-64 target):

Thank you for refactoring patches.

                         baseline   baseline (+track-subreg-liveness)
specint2017 build time :    1892s                                1883s

An interesting and unexpected improvement.

Regarding build times, I've run it a few times, and they all seem to take
less time. Since the difference is small, it's possible that it's just a
change in the environment. But an improvement is theoretically possible,
since supporting subreg liveness could have reduced the number of live
regs.

For memory usage, I tried PR 69609 under valgrind; peak memory size grew
from 2003910656 to 2003947520, a very small increase.


I'll soon finish the code review of the IRA and LRA changes and send it
today or tomorrow.


But in brief, I have no objections to the patches, just some minor
requests to improve them.


Re: [PATCH] regalloc: Ignore '^' in early costing [PR114766]

2024-04-30 Thread Vladimir Makarov



On 4/29/24 08:59, Wilco Dijkstra wrote:

According to the documentation, '^' should only have an effect during
reload.  However, ira-costs.cc treats it in the same way as '?' during
early costing.  As a result, using '^' can accidentally disable valid
alternatives and cause significant regressions (see PR114741).  Avoid this
by ignoring '^' during costing.

Passes bootstrap and regress, OK for commit?

gcc:
 PR rtl-optimization/114766
 * ira-costs.cc (record_reg_classes): Ignore '^' during costing.

Sorry, I cannot accept this patch.  This constraint is used by other
targets (as far as I know, rs6000, s390, sh, gcn).  I suspect changing the
semantics can affect those targets in some undesirable way.


That said, I understand that aarch64 needs the new semantics.  So I
think we can add a new hint with the required semantics.  There are still
a few undefined characters left for a new hint: '~', '-', '/', and '.'
(maybe I missed some).


I propose to use '~' for the new hint.  So besides changing the code, the
documentation for '^' needs to be clarified and documentation for the code
'~' should be added.




diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index 
c86c5a16563aeefac9d4fa72839bee8d95409f4b..04d2f21b023f3456ba6f8c16c2418d7313965b2f
 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -771,10 +771,6 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
  c = *++p;
  break;
  
-		case '^':
-		  alt_cost += 2;
-		  break;
-
 		case '?':
 		  alt_cost += 2;
 		  break;







[pushed][PR114415][scheduler]: Fixing wrong code generation

2024-04-04 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114415

The patch was successfully tested and bootstrapped on x86_64, ppc64le, 
aarch64.


commit fe305ca39930afc301cdd1f1143d540d1bfa2a48
Author: Vladimir N. Makarov 
Date:   Thu Apr 4 16:04:04 2024 -0400

[PR114415][scheduler]: Fixing wrong code generation

  For the test case, the insn scheduler (working for live range
shrinkage) moves insns modifying stack memory before an insn reserving
the stack memory.  Comments in the patch contain more details about
the problem and its solution.
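
The shape of the problem can be sketched with a small function
(illustrative only, with invented names; the committed test is
gcc.target/i386/pr114415.c): its compiled form contains both kinds of
insns the description mentions, a stack-pointer adjustment that reserves
the frame and stores that initialize the just-reserved area.

  /* Illustration only -- the real test is gcc.target/i386/pr114415.c and
     the names here are invented.  The prologue of this function reserves
     stack space (e.g. a 'sub $N, %rsp' style insn) and the loop stores
     into that reserved area.  If the scheduler were free to move such
     stores above the stack-pointer adjustment, they could write memory
     that is not yet reserved, which is the hazard the new
     anti-dependence prevents.  */
  extern void use_buffer (int *);

  void
  fill_buffer (void)
  {
    int buf[64];                 /* forces a stack allocation */
    for (int i = 0; i < 64; i++)
      buf[i] = i;                /* stores into the reserved area */
    use_buffer (buf);
  }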

gcc/ChangeLog:

PR rtl-optimization/114415
* sched-deps.cc (add_insn_mem_dependence): Add memory check for mem argument.
(sched_analyze_1): Treat stack pointer modification as memory read.
(sched_analyze_2, sched_analyze_insn): Add memory guard for processing pending_read_mems.
* sched-int.h (deps_desc): Add comment to pending_read_mems.

gcc/testsuite/ChangeLog:

PR rtl-optimization/114415
* gcc.target/i386/pr114415.c: New test.

diff --git a/gcc/sched-deps.cc b/gcc/sched-deps.cc
index 5034e664e5e..4c668245049 100644
--- a/gcc/sched-deps.cc
+++ b/gcc/sched-deps.cc
@@ -1735,7 +1735,7 @@ add_insn_mem_dependence (class deps_desc *deps, bool read_p,
   insn_node = alloc_INSN_LIST (insn, *insn_list);
   *insn_list = insn_node;
 
-  if (sched_deps_info->use_cselib)
+  if (sched_deps_info->use_cselib && MEM_P (mem))
 {
   mem = shallow_copy_rtx (mem);
   XEXP (mem, 0) = cselib_subst_to_values_from_insn (XEXP (mem, 0),
@@ -2458,6 +2458,25 @@ sched_analyze_1 (class deps_desc *deps, rtx x, rtx_insn *insn)
 			   FIRST_STACK_REG);
 	}
 #endif
+  if (!deps->readonly && regno == STACK_POINTER_REGNUM)
+	{
+	  /* Please see PR114415.  We have insn modifying memory on the stack
+	 and not addressed by stack pointer and we have insn reserving the
+	 stack space.  If we move the insn modifying memory before insn
+	 reserving the stack space, we can change memory out of the red
+	 zone.  Even worse, some optimizations (e.g. peephole) can add
+	 insns using temporary stack slots before insn reserving the stack
+	 space but after the insn modifying memory.  This will corrupt the
+	 modified memory.  Therefore we treat insn changing the stack as
+	 reading unknown memory.  This will create anti-dependence.  We
+	 don't need to treat the insn as writing memory because GCC by
+	 itself does not generate code reading undefined stack memory.  */
+	  if ((deps->pending_read_list_length + deps->pending_write_list_length)
+	  >= param_max_pending_list_length
+	  && !DEBUG_INSN_P (insn))
+	flush_pending_lists (deps, insn, true, true);
+	  add_insn_mem_dependence (deps, true, insn, dest);
+	}
 }
   else if (MEM_P (dest))
 {
@@ -2498,10 +2517,11 @@ sched_analyze_1 (class deps_desc *deps, rtx x, rtx_insn *insn)
 	  pending_mem = deps->pending_read_mems;
 	  while (pending)
 	{
-	  if (anti_dependence (pending_mem->element (), t)
-		  && ! sched_insns_conditions_mutex_p (insn, pending->insn ()))
-		note_mem_dep (t, pending_mem->element (), pending->insn (),
-			  DEP_ANTI);
+	  rtx mem = pending_mem->element ();
+	  if (REG_P (mem)
+		  || (anti_dependence (mem, t)
+		  && ! sched_insns_conditions_mutex_p (insn, pending->insn ())))
+		note_mem_dep (t, mem, pending->insn (), DEP_ANTI);
 
 	  pending = pending->next ();
 	  pending_mem = pending_mem->next ();
@@ -2637,12 +2657,10 @@ sched_analyze_2 (class deps_desc *deps, rtx x, rtx_insn *insn)
 	pending_mem = deps->pending_read_mems;
 	while (pending)
 	  {
-		if (read_dependence (pending_mem->element (), t)
-		&& ! sched_insns_conditions_mutex_p (insn,
-			 pending->insn ()))
-		  note_mem_dep (t, pending_mem->element (),
-pending->insn (),
-DEP_ANTI);
+		rtx mem = pending_mem->element ();
+		if (MEM_P (mem) && read_dependence (mem, t)
+		&& ! sched_insns_conditions_mutex_p (insn, pending->insn ()))
+		  note_mem_dep (t, mem, pending->insn (), DEP_ANTI);
 
 		pending = pending->next ();
 		pending_mem = pending_mem->next ();
@@ -3026,8 +3044,7 @@ sched_analyze_insn (class deps_desc *deps, rtx x, rtx_insn *insn)
 	  while (pending)
 	{
 	  if (! sched_insns_conditions_mutex_p (insn, pending->insn ()))
-		add_dependence (insn, pending->insn (),
-REG_DEP_OUTPUT);
+		add_dependence (insn, pending->insn (),	REG_DEP_OUTPUT);
 	  pending = pending->next ();
 	  pending_mem = pending_mem->next ();
 	}
@@ -3036,10 +3053,10 @@ sched_analyze_insn (class deps_desc *deps, rtx x, rtx_insn *insn)
 	  pending_mem = deps->pending_read_mems;
 	  while (pending)
 	{
-	  if (MEM_VOLATILE_P (pending_mem->element ())
+	  rtx mem = pending_mem->element ();
+	  if (MEM_P (mem) && MEM_VOLATILE_P (mem)
 		  && ! sched_insns_conditions_mutex_p 

[pushed][PR99829][LRA]: Fixing LRA ICE on arm

2024-03-19 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99829

The patch was successfully bootstrapped and tested on x86-64, ppc64le, 
and aarch64.


commit 9c91f8a88b2db50c8faf70786d3cef27b39ac9fc
Author: Vladimir N. Makarov 
Date:   Tue Mar 19 16:57:11 2024 -0400

[PR99829][LRA]: Fixing LRA ICE on arm

  LRA removed an insn setting an equivalence to memory whose output was
reloaded.  This resulted in writing an uninitialized value to the memory,
which triggered an assert in the LRA code checking the final generated
code.  This patch fixes the problem.  A comment in the patch contains more
details about the problem and its solution.

gcc/ChangeLog:

PR target/99829
* lra-constraints.cc (lra_constraints): Prevent removing insn
with reverse equivalence to memory if the memory was reloaded.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 0ae81c1ff9c..10e3d4e4097 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -5213,7 +5213,7 @@ lra_constraints (bool first_p)
   bool changed_p;
   int i, hard_regno, new_insns_num;
   unsigned int min_len, new_min_len, uid;
-  rtx set, x, reg, dest_reg;
+  rtx set, x, reg, nosubreg_dest;
   rtx_insn *original_insn;
   basic_block last_bb;
   bitmap_iterator bi;
@@ -5377,14 +5377,14 @@ lra_constraints (bool first_p)
 	{
 	  if ((set = single_set (curr_insn)) != NULL_RTX)
 	{
-	  dest_reg = SET_DEST (set);
+	  nosubreg_dest = SET_DEST (set);
 	  /* The equivalence pseudo could be set up as SUBREG in a
 		 case when it is a call restore insn in a mode
 		 different from the pseudo mode.  */
-	  if (GET_CODE (dest_reg) == SUBREG)
-		dest_reg = SUBREG_REG (dest_reg);
-	  if ((REG_P (dest_reg)
-		   && (x = get_equiv (dest_reg)) != dest_reg
+	  if (GET_CODE (nosubreg_dest) == SUBREG)
+		nosubreg_dest = SUBREG_REG (nosubreg_dest);
+	  if ((REG_P (nosubreg_dest)
+		   && (x = get_equiv (nosubreg_dest)) != nosubreg_dest
 		   /* Remove insns which set up a pseudo whose value
 		  cannot be changed.  Such insns might be not in
 		  init_insns because we don't update equiv data
@@ -5403,11 +5403,21 @@ lra_constraints (bool first_p)
 			  up the equivalence.  */
 		   || in_list_p (curr_insn,
  ira_reg_equiv
- [REGNO (dest_reg)].init_insns)))
+ [REGNO (nosubreg_dest)].init_insns)))
 		  || (((x = get_equiv (SET_SRC (set))) != SET_SRC (set))
 		  && in_list_p (curr_insn,
 ira_reg_equiv
-[REGNO (SET_SRC (set))].init_insns)))
+[REGNO (SET_SRC (set))].init_insns)
+		  /* This is a reverse equivalence to memory (see ira.cc)
+			 in store insn.  We can reload all the destination and
+			 have an output reload which is a store to memory.  If
+			 we just remove the insn, we will have the output
+			 reload storing an undefined value to the memory.
+			 Check that we did not reload the memory to prevent a
+			 wrong code generation.  We could implement using the
+			 equivalence still in such case but doing this is not
+			 worth the efforts as such case is very rare.  */
+		  && MEM_P (nosubreg_dest)))
 		{
 		  /* This is equiv init insn of pseudo which did not get a
 		 hard register -- remove the insn.	*/


[pushed][PR113790][LRA]: Fixing LRA ICE on riscv64

2024-03-08 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113790

The patch was successfully bootstrapped and tested on x86-64,ppc64le, 
and aarch64.


commit cebbaa2a84586a7345837f74a53b7a0263bf29ee
Author: Vladimir N. Makarov 
Date:   Fri Mar 8 14:48:33 2024 -0500

[PR113790][LRA]: Fixing LRA ICE on riscv64

  LRA failed to consider all insn alternatives when a non-reload pseudo
did not get a hard register.  This resulted in a failure to generate
code by LRA.  The patch fixes this problem.

gcc/ChangeLog:

PR target/113790
* lra-assigns.cc (assign_by_spills): Set up all_spilled_pseudos
for non-reload pseudo too.

diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
index d1b2b35ffc9..7dfa6f70941 100644
--- a/gcc/lra-assigns.cc
+++ b/gcc/lra-assigns.cc
@@ -1430,13 +1430,19 @@ assign_by_spills (void)
 	hard_regno = spill_for (regno, &all_spilled_pseudos, iter == 1);
 	  if (hard_regno < 0)
 	{
-	  if (reload_p) {
-		/* Put unassigned reload pseudo first in the
-		   array.  */
-		regno2 = sorted_pseudos[nfails];
-		sorted_pseudos[nfails++] = regno;
-		sorted_pseudos[i] = regno2;
-	  }
+	  if (reload_p)
+		{
+		  /* Put unassigned reload pseudo first in the array.  */
+		  regno2 = sorted_pseudos[nfails];
+		  sorted_pseudos[nfails++] = regno;
+		  sorted_pseudos[i] = regno2;
+		}
+	  else
+		{
+		  /* Consider all alternatives on the next constraint
+		 subpass.  */
+		  bitmap_set_bit (&all_spilled_pseudos, regno);
+		}
 	}
 	  else
 	{


Re: [PATCH 0/4] Add DF_LIVE_SUBREG data and apply to IRA and LRA

2024-02-06 Thread Vladimir Makarov



On 2/5/24 11:10, Jeff Law wrote:



On 2/5/24 00:01, Lehua Ding wrote:
For SPEC INT 2017, when using upstream GCC (without these patches), I get
a coredump when training the peak case, so no data yet. The cause of the
coredump still needs to be investigated.


Typo: SPEC INT 2017 -> SPEC FP 2017.
Also, there is some bad news: the score of specint 2017 (with these
patches) dropped, which is a bit strange, and I need to locate the cause.
Just a note.  I doubt this will get much traction from a review 
standpoint until gcc-14 is basically out the door.


My recommendation is to continue development, bugfixing, cleanup, etc 
between now and then.  Consider creating a branch for the work in the 
upstream repo.



Thank you for posting this work.  The compilation time improvement is a 
surprise for me and very encouraging.


I agree with Jeff's recommendation to create a branch, as most probably
some people (at least me :) would like to try this on their own set of
benchmarks.


I am planning to review the RA part of these patches at the
beginning of April.  Still, when I have spare time I'll look at the
patches and could give some feedback even earlier.




[pushed][PR113526][LRA]: Fixing asm-flag-1.c failure on ARM

2024-01-25 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113526

The patch was successfully bootstrapped and tested on x86-64, ppc64le, 
and aarch64.


commit 5c617df410602d0e51d61c84d1ae7e9b3f51efa4
Author: Vladimir N. Makarov 
Date:   Thu Jan 25 14:41:17 2024 -0500

[PR113526][LRA]: Fixing asm-flag-1.c failure on ARM

My recent patch for PR113356 results in a failure of the asm-flag-1.c test
on arm.  After the patch LRA treats asm operand pseudos as general regs.
There are too many such operands and LRA cannot assign hard regs to all
operand pseudos.  Actually we should not assign hard regs to these
operand pseudos at all.  The following patch fixes this.

gcc/ChangeLog:

PR target/113526
* lra-constraints.cc (curr_insn_transform): Change class even for
spilled pseudo successfully matched with NO_REGS.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 3379b88ff22..0ae81c1ff9c 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -4498,10 +4498,10 @@ curr_insn_transform (bool check_only_p)
 		 registers for other pseudos referenced in the insn.  The most
 		 common case of this is a scratch register which will be
 		 transformed to scratch back at the end of LRA.  */
-	  && lra_get_regno_hard_regno (regno) >= 0
 	  && bitmap_single_bit_set_p (&lra_reg_info[regno].insn_bitmap))
 	{
-	  lra_change_class (regno, NO_REGS, "  Change to", true);
+	  if (lra_get_allocno_class (regno) != NO_REGS)
+		lra_change_class (regno, NO_REGS, "  Change to", true);
 	  reg_renumber[regno] = -1;
 	}
 	  /* We can do an optional reload.  If the pseudo got a hard


Re: [PATCH v3 1/8] sched-deps.cc (find_modifiable_mems): Avoid exponential behavior

2024-01-15 Thread Vladimir Makarov



On 1/15/24 07:56, Maxim Kuvyrkov wrote:

Hi Vladimir,
Hi Jeff,

Richard and Alexander have reviewed this patch and [I assume] have no 
further comments.  OK to merge?



I trust Richard and Alexander, therefore I did not do an additional review
of the patches and have no comments.  Richard's or Alexander's
approval is enough for committing the patches.





[pushed][PR113354][LRA]: Fixing LRA failure on building MIPS GCC

2024-01-15 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113354

The patch was tested on building MIPS target.

The patch was successfully tested and bootstrapped on x86-64, ppc64le, 
aarch64.


commit 5f662bce28618ea5417f68a17d5c2d34b052ecb2
Author: Vladimir N. Makarov 
Date:   Mon Jan 15 10:19:39 2024 -0500

[PR113354][LRA]: Fixing LRA failure on building MIPS GCC

My recent patch for PR112918 triggered a hidden bug in LRA on MIPS.  A
pseudo is matched to a register constraint and assigned to a hard
register at the first constraint sub-pass, but later it is matched to
the X constraint.  Keeping this pseudo in the register (MD0) prevents
using the same register for another pseudo in the insn, and this results
in an LRA failure.  The patch fixes this by spilling the pseudo at the
constraint subpass when the chosen alternative constraint does not
require a hard register anymore.
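
As background for the 'X' constraint mentioned above (an illustrative
aside with an invented function name, not the MIPS pattern from the PR):
'X' accepts any operand whatsoever, so an operand matched against it needs
no hard register at all, which is why keeping the old assignment just
wastes a register the other operands may need.

  /* Illustration only -- not the MIPS insn from PR113354.  The "X"
     constraint accepts any operand, so the compiler may leave 'a'
     wherever it already lives (register, memory, or constant) and no
     hard register has to be reserved for it.  */
  void
  x_constraint_demo (int a)
  {
    asm volatile ("# operand %0 matched by constraint X" :: "X" (a));
  }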

gcc/ChangeLog:

PR middle-end/113354
* lra-constraints.cc (curr_insn_transform): Spill pseudo only used
in the insn if the corresponding operand does not require hard
register anymore.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index dc41bc3d6c6..3379b88ff22 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -4491,23 +4491,18 @@ curr_insn_transform (bool check_only_p)
 	{
 	  if (goal_alt[i] == NO_REGS
 	  && REG_P (op)
-	  /* When we assign NO_REGS it means that we will not
-		 assign a hard register to the scratch pseudo by
-		 assigment pass and the scratch pseudo will be
-		 spilled.  Spilled scratch pseudos are transformed
-		 back to scratches at the LRA end.  */
-	  && ira_former_scratch_operand_p (curr_insn, i)
-	  && ira_former_scratch_p (REGNO (op)))
+	  && (regno = REGNO (op)) >= FIRST_PSEUDO_REGISTER
+	  /* We assigned a hard register to the pseudo in the past but now
+		 decided to spill it for the insn.  If the pseudo is used only
+		 in this insn, it is better to spill it here as we free hard
+		 registers for other pseudos referenced in the insn.  The most
+		 common case of this is a scratch register which will be
+		 transformed to scratch back at the end of LRA.  */
+	  && lra_get_regno_hard_regno (regno) >= 0
+	  && bitmap_single_bit_set_p (&lra_reg_info[regno].insn_bitmap))
 	{
-	  int regno = REGNO (op);
 	  lra_change_class (regno, NO_REGS, "  Change to", true);
-	  if (lra_get_regno_hard_regno (regno) >= 0)
-		/* We don't have to mark all insn affected by the
-		   spilled pseudo as there is only one such insn, the
-		   current one.  */
-		reg_renumber[regno] = -1;
-	  lra_assert (bitmap_single_bit_set_p
-			  (&lra_reg_info[REGNO (op)].insn_bitmap));
+	  reg_renumber[regno] = -1;
 	}
 	  /* We can do an optional reload.  If the pseudo got a hard
 	 reg, we might improve the code through inheritance.  If


[pushed][PR112918][LRA]: Fixing IRA ICE on m68k

2024-01-11 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112918

The patch was successfully bootstrapped and tested on x86_64, aarch64, 
ppc64le
commit 902a5931a1fbb04c65b48ca8b0f3827f6ff3b43e
Author: Vladimir N. Makarov 
Date:   Thu Jan 11 08:46:26 2024 -0500

[PR112918][LRA]: Fixing IRA ICE on m68k

Some GCC tests on the m68k port fail in LRA with `maximum number of
generated reload insns per insn achieved`.  The problem is that for a
subreg reload LRA cannot narrow the reg class further, from ALL_REGS to
GENERAL_REGS and then to data regs or address regs.  The patch permits
narrowing the reg class for reload insns if this results in successful
matching of a reg operand.  This is the second version of the patch to
fix the PR.  This version adds matching both with and without narrowing
the reg class, preferring a match without narrowing classes.

gcc/ChangeLog:

PR rtl-optimization/112918
* lra-constraints.cc (SMALL_REGISTER_CLASS_P): Move before in_class_p.
(in_class_p): Restrict condition for narrowing class in case of
allow_all_reload_class_changes_p.
(process_alt_operands): Try to match operand without and with
narrowing reg class.  Discourage narrowing the class.  Finish insn
matching only if there is no class narrowing.
(curr_insn_transform): Pass true to in_class_p for reg operand win.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index da7e1748d75..6132cd9844a 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -261,6 +261,13 @@ enough_allocatable_hard_regs_p (enum reg_class reg_class,
   return false;
 }
 
+/* True if C is a non-empty register class that has too few registers
+   to be safely used as a reload target class.	*/
+#define SMALL_REGISTER_CLASS_P(C)		\
+  (ira_class_hard_regs_num [(C)] == 1		\
+   || (ira_class_hard_regs_num [(C)] >= 1	\
+   && targetm.class_likely_spilled_p (C)))
+
 /* Return true if REG satisfies (or will satisfy) reg class constraint
CL.  Use elimination first if REG is a hard register.  If REG is a
reload pseudo created by this constraints pass, assume that it will
@@ -318,7 +325,11 @@ in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class,
   common_class = ira_reg_class_subset[rclass][cl];
   if (new_class != NULL)
 	*new_class = common_class;
-  return enough_allocatable_hard_regs_p (common_class, reg_mode);
+  return (enough_allocatable_hard_regs_p (common_class, reg_mode)
+	  /* Do not permit reload insn operand matching (new_class == NULL
+		 case) if the new class is too small.  */
+	  && (new_class != NULL || common_class == rclass
+		  || !SMALL_REGISTER_CLASS_P (common_class)));
 }
 }
 
@@ -923,13 +934,6 @@ operands_match_p (rtx x, rtx y, int y_hard_regno)
&& GET_MODE_SIZE (MODE).is_constant ()	\
&& !targetm.cannot_force_const_mem (MODE, X))
 
-/* True if C is a non-empty register class that has too few registers
-   to be safely used as a reload target class.	*/
-#define SMALL_REGISTER_CLASS_P(C)		\
-  (ira_class_hard_regs_num [(C)] == 1		\
-   || (ira_class_hard_regs_num [(C)] >= 1	\
-   && targetm.class_likely_spilled_p (C)))
-
 /* If REG is a reload pseudo, try to make its class satisfying CL.  */
 static void
 narrow_reload_pseudo_class (rtx reg, enum reg_class cl)
@@ -2137,6 +2141,7 @@ process_alt_operands (int only_alternative)
   /* True if output stack pointer reload should be generated for the current
  alternative.  */
   bool curr_alt_out_sp_reload_p;
+  bool curr_alt_class_change_p;
   rtx op;
   /* The register when the operand is a subreg of register, otherwise the
  operand itself.  */
@@ -2223,6 +2228,7 @@ process_alt_operands (int only_alternative)
   early_clobbered_regs_num = 0;
   curr_alt_out_sp_reload_p = false;
   curr_reuse_alt_p = true;
+  curr_alt_class_change_p = false;
   
   for (nop = 0; nop < n_operands; nop++)
 	{
@@ -2247,6 +2253,7 @@ process_alt_operands (int only_alternative)
 	  bool scratch_p;
 	  machine_mode mode;
 	  enum constraint_num cn;
+	  bool class_change_p = false;
 
 	  opalt_num = nalt * n_operands + nop;
 	  if (curr_static_id->operand_alternative[opalt_num].anything_ok)
@@ -2630,9 +2637,16 @@ process_alt_operands (int only_alternative)
    (this_alternative_exclude_start_hard_regs,
 hard_regno[nop]
 			win = true;
-		  else if (hard_regno[nop] < 0
-			   && in_class_p (op, this_alternative, NULL))
-			win = true;
+		  else if (hard_regno[nop] < 0)
+			{
+			  if (in_class_p (op, this_alternative, NULL))
+			win = true;
+			  else if (in_class_p (op, this_alternative, NULL, true))
+			{
+			  class_change_p = true;
+			  win = true;
+			}
+			}
 		}
 		  break;
 		}
@@ -2647,6 +2661,15 @@ process_alt_operands (int only_alternative)
 	  if (win)
 	{
 	  this_alternative_win = 

[pushed][PR112918][LRA]: Fixing IRA ICE on m68k

2023-12-18 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112918

The patch was successfully bootstrapped and tested on x86-64, aarch64, 
and ppc64.


The patch affects a sensitive part of LRA, so I will monitor that the
commit does not create serious failures on other targets.  If that
happens, I will probably revert the patch.


commit 989e67f827b74b76e58abe137ce12d948af2290c
Author: Vladimir N. Makarov 
Date:   Mon Dec 18 17:12:23 2023 -0500

[PR112918][LRA]: Fixing IRA ICE on m68k

Some GCC tests on the m68k port fail in LRA with `maximum number of
generated reload insns per insn achieved`.  The problem is that for a
subreg reload LRA cannot narrow the reg class further, from ALL_REGS to
GENERAL_REGS and then to data regs or address regs.  The patch permits
narrowing the reg class for reload insns if this results in successful
matching of a reg operand.

gcc/ChangeLog:

PR rtl-optimization/112918
* lra-constraints.cc (SMALL_REGISTER_CLASS_P): Move before in_class_p.
(in_class_p): Restrict condition for narrowing class in case of
allow_all_reload_class_changes_p.
(process_alt_operands): Pass true for
allow_all_reload_class_changes_p in calls of in_class_p.
(curr_insn_transform): Ditto for reg operand win.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index da7e1748d75..05479ab98dd 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -261,6 +261,13 @@ enough_allocatable_hard_regs_p (enum reg_class reg_class,
   return false;
 }
 
+/* True if C is a non-empty register class that has too few registers
+   to be safely used as a reload target class.	*/
+#define SMALL_REGISTER_CLASS_P(C)		\
+  (ira_class_hard_regs_num [(C)] == 1		\
+   || (ira_class_hard_regs_num [(C)] >= 1	\
+   && targetm.class_likely_spilled_p (C)))
+
 /* Return true if REG satisfies (or will satisfy) reg class constraint
CL.  Use elimination first if REG is a hard register.  If REG is a
reload pseudo created by this constraints pass, assume that it will
@@ -318,7 +325,11 @@ in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class,
   common_class = ira_reg_class_subset[rclass][cl];
   if (new_class != NULL)
 	*new_class = common_class;
-  return enough_allocatable_hard_regs_p (common_class, reg_mode);
+  return (enough_allocatable_hard_regs_p (common_class, reg_mode)
+	  /* Do not permit reload insn operand matching (new_class == NULL
+		 case) if the new class is too small.  */
+	  && (new_class != NULL || common_class == rclass
+		  || !SMALL_REGISTER_CLASS_P (common_class)));
 }
 }
 
@@ -923,13 +934,6 @@ operands_match_p (rtx x, rtx y, int y_hard_regno)
&& GET_MODE_SIZE (MODE).is_constant ()	\
&& !targetm.cannot_force_const_mem (MODE, X))
 
-/* True if C is a non-empty register class that has too few registers
-   to be safely used as a reload target class.	*/
-#define SMALL_REGISTER_CLASS_P(C)		\
-  (ira_class_hard_regs_num [(C)] == 1		\
-   || (ira_class_hard_regs_num [(C)] >= 1	\
-   && targetm.class_likely_spilled_p (C)))
-
 /* If REG is a reload pseudo, try to make its class satisfying CL.  */
 static void
 narrow_reload_pseudo_class (rtx reg, enum reg_class cl)
@@ -2631,7 +2635,7 @@ process_alt_operands (int only_alternative)
 hard_regno[nop]
 			win = true;
 		  else if (hard_regno[nop] < 0
-			   && in_class_p (op, this_alternative, NULL))
+			   && in_class_p (op, this_alternative, NULL, true))
 			win = true;
 		}
 		  break;
@@ -2675,7 +2679,7 @@ process_alt_operands (int only_alternative)
 			  reject++;
 			}
 		  if (in_class_p (operand_reg[nop],
-  this_costly_alternative, NULL))
+  this_costly_alternative, NULL, true))
 			{
 			  if (lra_dump_file != NULL)
 			fprintf
@@ -4388,7 +4392,7 @@ curr_insn_transform (bool check_only_p)
 
 	if (REG_P (reg) && (regno = REGNO (reg)) >= FIRST_PSEUDO_REGISTER)
 	  {
-	bool ok_p = in_class_p (reg, goal_alt[i], &new_class);
+	bool ok_p = in_class_p (reg, goal_alt[i], &new_class, true);
 
 	if (new_class != NO_REGS && get_reg_class (regno) != new_class)
 	  {


Re: [PATCH 1/2] emit-rtl, lra: Move lra's emit_inc to emit-rtl.cc

2023-12-14 Thread Vladimir Makarov



On 12/13/23 16:00, Alex Coplan wrote:

Hi,

In PR112906 we ICE because we try to use force_reg to reload an
auto-increment address, but force_reg can't do this.

With the aim of fixing the PR by supporting reloading arbitrary
addresses in pre-RA splitters, this patch generalizes
lra-constraints.cc:emit_inc and makes it available to the rest of the
compiler by moving the generalized version to emit-rtl.cc.

We observe that the separate IN parameter to LRA's emit_inc is
redundant, since the function is static and is only (statically) called
once in lra-constraints.cc, with in == value.  As such, we drop the IN
parameter and simplify the code accordingly.

The function was initially adopted from reload1.cc:inc_for_reload.

We wrap the emit_inc code in a virtual class to allow LRA to override
how reload pseudos are created, thereby preserving the existing LRA
behaviour as much as possible.

We then add a second (higher-level) routine to emit-rtl.cc,
force_reload_address, which can reload arbitrary addresses.  This uses
the generalized emit_inc code to handle the RTX_AUTOINC case.  The
second patch in this series uses force_reload_address to fix PR112906.

Since we intend to call address_reload_context::emit_autoinc from within
splitters, and the code lifted from LRA calls recog, we have to avoid
clobbering recog_data.  We do this by introducing a new RAII class for
saving/restoring recog_data on the stack.

Bootstrapped/regtested on aarch64-linux-gnu, bootstrapped on
x86_64-linux-gnu, OK for trunk?

OK for me.  Thank you.

gcc/ChangeLog:

PR target/112906
* emit-rtl.cc (address_reload_context::emit_autoinc): New.
(force_reload_address): New.
* emit-rtl.h (struct address_reload_context): Declare.
(force_reload_address): Declare.
* lra-constraints.cc (class lra_autoinc_reload_context): New.
(emit_inc): Drop IN parameter, invoke
code moved to emit-rtl.cc:address_reload_context::emit_autoinc.
(curr_insn_transform): Drop redundant IN parameter in call to
emit_inc.
* recog.h (class recog_data_saver): New.




[pushed][PR112875][LRA]: Fix an assert in lra elimination code

2023-12-08 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112875

The patch was successfully tested and bootstrapped on x86-64 and ppc64le.

commit 48cb51827c9eb991b92014a3f59d31eb237ce03f
Author: Vladimir N. Makarov 
Date:   Fri Dec 8 15:37:42 2023 -0500

[PR112875][LRA]: Fix an assert in lra elimination code

The PR112875 test ran into a wrong assert (gcc_unreachable) during
elimination in a debug insn.  The insn seems OK, so I changed the
assertion.  To be more accurate, I made it the same as the analogous
reload pass code.

gcc/ChangeLog:

PR rtl-optimization/112875
* lra-eliminations.cc (lra_eliminate_regs_1): Change an assert.
Add ASM_OPERANDS case.

gcc/testsuite/ChangeLog:

PR rtl-optimization/112875
* gcc.target/i386/pr112875.c: New test.

diff --git a/gcc/lra-eliminations.cc b/gcc/lra-eliminations.cc
index f3b75e08390..cf229b402da 100644
--- a/gcc/lra-eliminations.cc
+++ b/gcc/lra-eliminations.cc
@@ -666,6 +666,10 @@ lra_eliminate_regs_1 (rtx_insn *insn, rtx x, machine_mode mem_mode,
   return x;
 
 case CLOBBER:
+case ASM_OPERANDS:
+  gcc_assert (insn && DEBUG_INSN_P (insn));
+  break;
+
 case SET:
   gcc_unreachable ();
 
diff --git a/gcc/testsuite/gcc.target/i386/pr112875.c b/gcc/testsuite/gcc.target/i386/pr112875.c
new file mode 100644
index 000..b704404b248
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112875.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-Oz -frounding-math -fno-dce -fno-trapping-math -fno-tree-dce -fno-tree-dse -g" } */
+long a, f;
+int b, c, d, g, h, i, j;
+char e;
+void k(long, int l, char t) {
+  char m = b, n = g, o = 0;
+  int p, q, r = h;
+  long s = g;
+  if (f) {
+q = t + (float)16777217;
+o = ~0;
+  }
+  if (e) {
+d = g + a;
+if (d % (a % l)) {
+  p = d;
+  n = b;
+}
+if (l) {
+  i = b;
+  r = a;
+  p = h;
+}
+if (s)
+  s = q;
+c = f;
+e += t;
+a = p;
+  }
+  j = r % n;
+  s += g / 0xc000 + !o;
+}


Re: [PATCH] v2: Add IntegerRange for -param=min-nondebug-insn-uid= and fix vector growing in LRA and vec [PR112411]

2023-12-08 Thread Vladimir Makarov



On 12/7/23 03:39, Jakub Jelinek wrote:

On Thu, Dec 07, 2023 at 09:36:22AM +0100, Jakub Jelinek wrote:

So, one way to fix the LRA issue would be just to use
   lra_insn_recog_data_len = index * 3U / 2;
   if (lra_insn_recog_data_len <= index)
 lra_insn_recog_data_len = index + 1;
basically do what vec.cc does.  I thought we can do better for
both vec.cc and LRA on 64-bit hosts even without growing the allocated
counters, but now that I look at it again, perhaps we can't.
The above overflows already with original alloc or lra_insn_recog_data_len
0x55555556, where 0x55555555 * 3U / 2 is still 0x7fffffff
and so representable in the 32-bit, but 0x55555556 * 3U / 2 is
1.  I thought (and the patch implements it) that we could use
alloc * (size_t) 3 / 2 so that on 64-bit hosts it wouldn't overflow
that quickly, but 0x55555556 * (size_t) 3 / 2 there is 0x80000001
which is still ok in unsigned, but given that vec.h then stores the
counter into unsigned m_alloc:31; bit-field, it is too much.

The patch below is what I've actually bootstrapped/regtested on
x86_64-linux and i686-linux, but given the above I think I should drop
the vec.cc hunk and change (size_t) 3 in the LRA hunk to 3U.

Here is the so far untested adjusted patch, which does the computation
just in unsigned int rather than size_t, because doing it in size_t
wouldn't improve things.
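
The overflow under discussion is easy to reproduce standalone.  The sketch
below uses invented names (it is not the GCC code) and the constants
worked through above.

  /* Standalone illustration (invented names, not GCC code) of the 32-bit
     overflow in the "grow by 1.5x" formula and of the guarded variant the
     patch uses.  */
  #include <stdio.h>

  static unsigned int
  grow (unsigned int index)
  {
    unsigned int len = index * 3U / 2;   /* may wrap around */
    if (len <= index)                    /* fall back on overflow */
      len = index + 1;
    return len;
  }

  int
  main (void)
  {
    /* 0x55555555 * 3 = 0xffffffff, so the result is still 0x7fffffff.  */
    printf ("%#x -> %#x\n", 0x55555555u, 0x55555555u * 3U / 2);
    /* 0x55555556 * 3 wraps to 2, so the naive formula yields 1.  */
    printf ("%#x -> %#x\n", 0x55555556u, 0x55555556u * 3U / 2);
    /* The guarded version grows to index + 1 instead.  */
    printf ("%#x -> %#x\n", 0x55555556u, grow (0x55555556u));
    return 0;
  }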

2023-12-07  Jakub Jelinek  

PR middle-end/112411
* params.opt (-param=min-nondebug-insn-uid=): Add
IntegerRange(0, 1073741824).
* lra.cc (check_and_expand_insn_recog_data): Use 3U rather than 3
in * 3 / 2 computation and if the result is smaller or equal to
index, use index + 1.

* gcc.dg/params/blocksort-part.c: Add dg-skip-if for
--param min-nondebug-insn-uid=1073741824.

Jakub, if you are still waiting for an approval, the LRA change is OK for
me with the given max param.


Thank you for fixing this.





Re: [PATCH] lra: Updates of biggest mode for hard regs [PR112278]

2023-12-04 Thread Vladimir Makarov



On 12/3/23 05:13, Richard Sandiford wrote:

[Gah.  In my head I'd sent this a few weeks ago, but it turns out
  that I hadn't even got to the stage of writing the changelog...]

LRA keeps track of the biggest mode for both hard registers and
pseudos.  The updates assume that the modes are ordered, i.e. that
we can tell whether one is no bigger than the other at compile time.

That is (or at least seemed to be) a reasonable restriction for pseudos.
But it isn't necessarily so for hard registers, since the uses of hard
registers can be logically distinct.  The testcase is an example of this.

The biggest mode of hard registers is also special for other reasons.
As the existing comment says:

   /* A reg can have a biggest_mode of VOIDmode if it was only ever seen as
  part of a multi-word register.  In that case, just use the reg_rtx
  mode.  Do the same also if the biggest mode was larger than a register
  or we can not compare the modes.  Otherwise, limit the size to that of
  the biggest access in the function or to the natural mode at least.  */

This patch applies the same approach to the updates.

Tested on aarch64-linux-gnu (with and without SVE) and on x86_64-linux-gnu.
OK to install?


Sure.  Thank you for fixing this, Richard.




[pushed][PR112445][LRA]: Fix "unable to find a register to spill" error

2023-12-01 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112445

The patch was successfully bootstrapped and tested on x86-64, aarch64, 
ppc64le.
commit 1390bf52c17a71834a1766c0222e4f8a74efb162
Author: Vladimir N. Makarov 
Date:   Fri Dec 1 11:46:37 2023 -0500

[PR112445][LRA]: Fix "unable to find a register to spill" error

PR112445 is a very complicated bug arising from the interaction of the
constraint subpass, inheritance, and hard reg live range splitting.
It is hard to debug this PR only from the standard LRA logs.  Therefore I
added dumping of all func insns at the end of the complicated sub-passes
(constraint, inheritance, undoing inheritance, hard reg live range
splitting, and rematerialization).  As such output can be quite big,
it is switched on only at level 7 of the -fira-verbose value.  The reason
for the bug is a skipped live-range split of a hard reg (dx) on the
1st live range splitting subpass.  Splitting is done for reload
pseudos around an original insn and its reload insns, but the subpass
did not recognize such an insn pattern because the previous inheritance
and undoing-inheritance subpasses extended the reload pseudo live range
a bit.  Although we undid the inheritance in question, the resulting code
was a bit different from the code before the corresponding inheritance
pass.  The following fixes the bug by restoring the exact code before the
inheritance.

gcc/ChangeLog:

PR target/112445
* lra.h (lra): Add one more arg.
* lra-int.h (lra_verbose, lra_dump_insns): New externals.
(lra_dump_insns_if_possible): Ditto.
* lra.cc (lra_dump_insns): Dump all insns.
(lra_dump_insns_if_possible):  Dump all insns for lra_verbose >= 7.
(lra_verbose): New global.
(lra): Add new arg.  Setup lra_verbose from its value.
* lra-assigns.cc (lra_split_hard_reg_for): Dump insns if rtl
was changed.
* lra-remat.cc (lra_remat): Dump insns if rtl was changed.
* lra-constraints.cc (lra_inheritance): Dump insns.
(lra_constraints, lra_undo_inheritance): Dump insns if rtl
was changed.
(remove_inheritance_pseudos): Use restore reg if it is set up.
* ira.cc: (lra): Pass internal_flag_ira_verbose.

gcc/testsuite/ChangeLog:

PR target/112445
* gcc.target/i386/pr112445.c: New test.

diff --git a/gcc/ira.cc b/gcc/ira.cc
index d7530f01380..b5c4c0e4af7 100644
--- a/gcc/ira.cc
+++ b/gcc/ira.cc
@@ -5970,7 +5970,7 @@ do_reload (void)
 
   ira_destroy ();
 
-  lra (ira_dump_file);
+  lra (ira_dump_file, internal_flag_ira_verbose);
   /* ???!!! Move it before lra () when we use ira_reg_equiv in
 	 LRA.  */
   vec_free (reg_equivs);
diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
index d2ebcfd5056..7aa210e986f 100644
--- a/gcc/lra-assigns.cc
+++ b/gcc/lra-assigns.cc
@@ -1835,6 +1835,7 @@ lra_split_hard_reg_for (void)
   if (spill_p)
 {
      bitmap_clear (&failed_reload_pseudos);
+  lra_dump_insns_if_possible ("changed func after splitting hard regs");
   return true;
 }
   bitmap_clear (&failed_reload_pseudos);
diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 9b6a2af5b75..177c765ca13 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -5537,6 +5537,8 @@ lra_constraints (bool first_p)
 	  lra_assert (df_regs_ever_live_p (hard_regno + j));
 	  }
 }
+  if (changed_p)
+lra_dump_insns_if_possible ("changed func after local");
   return changed_p;
 }
 
@@ -7277,7 +7279,7 @@ lra_inheritance (void)
   bitmap_release (_invariant_regs);
   bitmap_release (_only_regs);
   free (usage_insns);
-
+  lra_dump_insns_if_possible ("func after inheritance");
   timevar_pop (TV_LRA_INHERITANCE);
 }
 
@@ -7477,13 +7479,16 @@ remove_inheritance_pseudos (bitmap remove_pseudos)
 			   == get_regno (lra_reg_info[prev_sregno].restore_rtx
 		  && ! bitmap_bit_p (remove_pseudos, prev_sregno))
 		{
+		  int restore_regno = get_regno (lra_reg_info[sregno].restore_rtx);
+		  if (restore_regno < 0)
+			restore_regno = prev_sregno;
 		  lra_assert (GET_MODE (SET_SRC (prev_set))
-  == GET_MODE (regno_reg_rtx[sregno]));
+  == GET_MODE (regno_reg_rtx[restore_regno]));
 		  /* Although we have a single set, the insn can
 			 contain more one sregno register occurrence
 			 as a source.  Change all occurrences.  */
 		  lra_substitute_pseudo_within_insn (curr_insn, sregno,
-			 SET_SRC (prev_set),
+			 regno_reg_rtx[restore_regno],
 			 false);
 		  /* As we are finishing with processing the insn
 			 here, check the destination too as it might
@@ -7745,5 +7750,7 @@ lra_undo_inheritance (void)
   EXECUTE_IF_SET_IN_BITMAP (_split_regs, 0, regno, bi)
 lra_reg_info[regno].restore_rtx = NULL_RTX;
   change_p = undo_optional_reloads () || change_p;
+  if 

Re: [PATCH v3 2/8] Unify implementations of print_hard_reg_set()

2023-11-22 Thread Vladimir Makarov



On 11/22/23 06:14, Maxim Kuvyrkov wrote:

We currently have 3 implementations of print_hard_reg_set()
(all with the same name!) in ira-color.cc, ira-conflicts.cc, and
sel-sched-dump.cc.  This patch generalizes implementation in
ira-color.cc, and uses it in all other places.  The declaration
is added to hard-reg-set.h.

The motivation for this patch is the [upcoming] need for
print_hard_reg_set() in sched-deps.cc.

gcc/ChangeLog:

* hard-reg-set.h (print_hard_reg_set): Declare.
* ira-color.cc (print_hard_reg_set): Generalize a bit.
(debug_hard_reg_set, print_hard_regs_subforest,)
(setup_allocno_available_regs_num): Update.
* ira-conflicts.cc (print_hard_reg_set): Remove.
(print_allocno_conflicts): Use global print_hard_reg_set().
* sel-sched-dump.cc (print_hard_reg_set): Remove.
(dump_hard_reg_set): Use global print_hard_reg_set().
* sel-sched-dump.h (dump_hard_reg_set): Mark as DEBUG_FUNCTION.


OK for me.  Thank you for consolidation of the print code, Maxim.




[pushed] [PR112610] [IRA]: Fix using undefined dump file in IRA code during insn scheduling

2023-11-22 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112610

The patch was successfully tested and bootstrapped on x86-64.

commit 95f61de95bbcc2e4fb7020e27698140abea23788
Author: Vladimir N. Makarov 
Date:   Wed Nov 22 09:01:02 2023 -0500

[IRA]: Fix using undefined dump file in IRA code during insn scheduling

Part of the IRA code is used for register-pressure-sensitive insn
scheduling and live range shrinkage.  Numerous changes to IRA resulted
in this code using both the dump file passed by the scheduler and the
internal IRA dump file (in called functions), which can be undefined or
already freed by the scheduler while compiling previous functions.  The
patch fixes this problem.  To reproduce the error, valgrind should be
used and GCC should be compiled with valgrind annotations.  Therefore
the patch does not contain a test case.

gcc/ChangeLog:

PR rtl-optimization/112610
* ira-costs.cc: (find_costs_and_classes): Remove arg.
Use ira_dump_file for printing.
(print_allocno_costs, print_pseudo_costs): Ditto.
(ira_costs): Adjust call of find_costs_and_classes.
(ira_set_pseudo_classes): Set up and restore ira_dump_file.

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index e0528e76a64..c3efd295e54 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -1662,16 +1662,16 @@ scan_one_insn (rtx_insn *insn)
 
 
 
-/* Print allocnos costs to file F.  */
+/* Print allocnos costs to the dump file.  */
 static void
-print_allocno_costs (FILE *f)
+print_allocno_costs (void)
 {
   int k;
   ira_allocno_t a;
   ira_allocno_iterator ai;
 
   ira_assert (allocno_p);
-  fprintf (f, "\n");
+  fprintf (ira_dump_file, "\n");
   FOR_EACH_ALLOCNO (a, ai)
 {
   int i, rclass;
@@ -1681,32 +1681,34 @@ print_allocno_costs (FILE *f)
   enum reg_class *cost_classes = cost_classes_ptr->classes;
 
   i = ALLOCNO_NUM (a);
-  fprintf (f, "  a%d(r%d,", i, regno);
+  fprintf (ira_dump_file, "  a%d(r%d,", i, regno);
   if ((bb = ALLOCNO_LOOP_TREE_NODE (a)->bb) != NULL)
-	fprintf (f, "b%d", bb->index);
+	fprintf (ira_dump_file, "b%d", bb->index);
   else
-	fprintf (f, "l%d", ALLOCNO_LOOP_TREE_NODE (a)->loop_num);
-  fprintf (f, ") costs:");
+	fprintf (ira_dump_file, "l%d", ALLOCNO_LOOP_TREE_NODE (a)->loop_num);
+  fprintf (ira_dump_file, ") costs:");
   for (k = 0; k < cost_classes_ptr->num; k++)
 	{
 	  rclass = cost_classes[k];
-	  fprintf (f, " %s:%d", reg_class_names[rclass],
+	  fprintf (ira_dump_file, " %s:%d", reg_class_names[rclass],
 		   COSTS (costs, i)->cost[k]);
 	  if (flag_ira_region == IRA_REGION_ALL
 	  || flag_ira_region == IRA_REGION_MIXED)
-	fprintf (f, ",%d", COSTS (total_allocno_costs, i)->cost[k]);
+	fprintf (ira_dump_file, ",%d",
+		 COSTS (total_allocno_costs, i)->cost[k]);
 	}
-  fprintf (f, " MEM:%i", COSTS (costs, i)->mem_cost);
+  fprintf (ira_dump_file, " MEM:%i", COSTS (costs, i)->mem_cost);
   if (flag_ira_region == IRA_REGION_ALL
 	  || flag_ira_region == IRA_REGION_MIXED)
-	fprintf (f, ",%d", COSTS (total_allocno_costs, i)->mem_cost);
-  fprintf (f, "\n");
+	fprintf (ira_dump_file, ",%d",
+		 COSTS (total_allocno_costs, i)->mem_cost);
+  fprintf (ira_dump_file, "\n");
 }
 }
 
-/* Print pseudo costs to file F.  */
+/* Print pseudo costs to the dump file.  */
 static void
-print_pseudo_costs (FILE *f)
+print_pseudo_costs (void)
 {
   int regno, k;
   int rclass;
@@ -1714,21 +1716,21 @@ print_pseudo_costs (FILE *f)
   enum reg_class *cost_classes;
 
   ira_assert (! allocno_p);
-  fprintf (f, "\n");
+  fprintf (ira_dump_file, "\n");
   for (regno = max_reg_num () - 1; regno >= FIRST_PSEUDO_REGISTER; regno--)
 {
   if (REG_N_REFS (regno) <= 0)
 	continue;
   cost_classes_ptr = regno_cost_classes[regno];
   cost_classes = cost_classes_ptr->classes;
-  fprintf (f, "  r%d costs:", regno);
+  fprintf (ira_dump_file, "  r%d costs:", regno);
   for (k = 0; k < cost_classes_ptr->num; k++)
 	{
 	  rclass = cost_classes[k];
-	  fprintf (f, " %s:%d", reg_class_names[rclass],
+	  fprintf (ira_dump_file, " %s:%d", reg_class_names[rclass],
 		   COSTS (costs, regno)->cost[k]);
 	}
-  fprintf (f, " MEM:%i\n", COSTS (costs, regno)->mem_cost);
+  fprintf (ira_dump_file, " MEM:%i\n", COSTS (costs, regno)->mem_cost);
 }
 }
 
@@ -1939,7 +1941,7 @@ calculate_equiv_gains (void)
and their best costs.  Set up preferred, alternative and allocno
classes for pseudos.  */
 static void
-find_costs_and_classes (FILE *dump_file)
+find_costs_and_classes (void)
 {
   int i, k, start, max_cost_classes_num;
   int pass;
@@ -1991,8 +1993,8 @@ find_costs_and_classes (FILE *dump_file)
  classes to guide the selection.  */
   for (pass = start; pass <= flag_expensive_optimizations; pass++)
 {
-  if ((!allocno_p || internal_flag_ira_verbose > 0) && dump_file)
-	fprintf 

Re: [PATCH V3 4/7] ira: Support subreg copy

2023-11-17 Thread Vladimir Makarov



On 11/16/23 21:06, Lehua Ding wrote:

Hi Vladimir,

Thank you so much for your review. Based on your comments, I feel like
there are a lot of issues, especially the long compile time issue. So
I'm going to reorganize and refactor the patches so that as many of
them as possible can be reviewed separately. This way there will be
fewer patches needed to support subreg in the end. I plan to split the
work into four separate patches as below. What do you think?



I can wait for the new version of the patches.  The only issue is the
stage1 deadline.

In my opinion, I'd recommend working on the patches more and starting
their submission right before the GCC-14 release (somewhere in April).


You need a lot of testing for the patches: major targets (x86-64,
aarch64, ppc64), some big-endian targets, and a 32-bit target.  Knowing
how even small changes in RA can affect many targets, e.g. GCC testsuite
results (there are a lot of different target tests which expect a
particular output), it is better to do this on a stabilized GCC, and
stage3 is the best time for this.  In any case I'll approve the patches
only if you have successful bootstraps and no GCC testsuite regressions
on x86-64, ppc64le/be, aarch64, and i686.


You also have a lot of compile-time performance issues which you need
to address.  So I guess you would be overwhelmed by new PRs from
different targets after committing the patches if you do this now.  You
will have more time and work under less pressure if you commit these
patches in April.


Your changes are massive and touch a critical part of GCC, so it is
better to do all of this on a public git branch so that people can try
it and test their targets.


But it is up to you to decide when to submit the patches.  Still,
besides approval of your patches, you need successful testing.  If new
testsuite failures occur after submitting the patches and they are not
fixed within a short period of time, the patches should be reverted.



 1. live_subreg problem
2. conflict_hard_regs check refactoring
3. use object instead of allocno to create copies
4. support subreg coalesce
   4.1 ira: Apply live_subreg data to ira
   4.2 lra: Apply live_subreg data to lra
   4.3 ira: Support subreg liveness track
   4.4 lra: Support subreg liveness track

So for the two patches about LRA, maybe you can stop reviewing them and
wait for the revised patches.




Sure.  So far I have only had a quick glance at them.



Re: [PATCH V3 5/7] ira: Add all nregs >= 2 pseudos to tracke subreg list

2023-11-16 Thread Vladimir Makarov



On 11/12/23 07:08, Lehua Ding wrote:

This patch relaxes the subreg tracking capability to all subreg registers.
The patch is ok for me once the general issues I mentioned in my first
email and the issue given below are fixed.

gcc/ChangeLog:

* ira-build.cc (get_reg_unit_size): New.
(has_same_nregs): New.
(ira_set_allocno_class): Adjust.


...

+
+/* Return true if TARGET_CLASS_MAX_NREGS and TARGET_HARD_REGNO_NREGS results is
+   same. It should be noted that some targets may not implement these two very
+   uniformly, and need to be debugged step by step. For example, in V3x1DI mode
+   in AArch64, TARGET_CLASS_MAX_NREGS returns 2 but TARGET_HARD_REGNO_NREGS
+   returns 3. They are in conflict and need to be repaired in the Hook of
+   AArch64.  */
+static bool
+has_same_nregs (ira_allocno_t a)
+{
+  for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+if (REGNO_REG_CLASS (i) != NO_REGS
+   && reg_class_subset_p (REGNO_REG_CLASS (i), ALLOCNO_CLASS (a))
+   && ALLOCNO_NREGS (a) != hard_regno_nregs (i, ALLOCNO_MODE (a)))
+  return false;
+  return true;
+}
+


It is better to fix the problem at its source.  But sometimes it is
hard to do this for all targets.  RA already has analogous code.  So it
is ok for me.  The only thing is that it is too expensive to do this for
each allocno.  You should implement some cache (class, mode)->result.
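
As a self-contained sketch of that caching idea (the table sizes and the
compute function below are placeholders, not GCC's real values):

#include <cstdint>

enum { N_REG_CLASSES = 32, N_MODES = 130 };   /* placeholder sizes */

/* 0 = not computed yet, 1 = computed false, 2 = computed true.  */
static uint8_t same_nregs_cache[N_REG_CLASSES][N_MODES];

/* Stand-in for the hard-register loop done by has_same_nregs.  */
static bool
compute_same_nregs (int rclass, int mode)
{
  return (rclass ^ mode) % 3 != 0;
}

/* Memoized wrapper: the expensive walk runs once per (class, mode).  */
bool
cached_same_nregs (int rclass, int mode)
{
  uint8_t &slot = same_nregs_cache[rclass][mode];
  if (slot == 0)
    slot = compute_same_nregs (rclass, mode) ? 2 : 1;
  return slot == 2;
}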





Re: [PATCH V3 4/7] ira: Support subreg copy

2023-11-16 Thread Vladimir Makarov



On 11/12/23 07:08, Lehua Ding wrote:

This patch changes the previous way of creating copies: they are now
created between objects instead of between allocnos.

gcc/ChangeLog:

* ira-build.cc (find_allocno_copy): Removed.
(find_object): New.
(ira_create_copy): Adjust.
(add_allocno_copy_to_list): Adjust.
(swap_allocno_copy_ends_if_necessary): Adjust.
(ira_add_allocno_copy): Adjust.
(print_copy): Adjust.
(print_allocno_copies): Adjust.
(ira_flattening): Adjust.
* ira-color.cc (INCLUDE_VECTOR): Include vector.
(struct allocno_color_data): Adjust.
(struct allocno_hard_regs_subnode): Adjust.
(form_allocno_hard_regs_nodes_forest): Adjust.
(update_left_conflict_sizes_p): Adjust.
(struct update_cost_queue_elem): Adjust.
(queue_update_cost): Adjust.
(get_next_update_cost): Adjust.
(update_costs_from_allocno): Adjust.
(update_conflict_hard_regno_costs): Adjust.
(assign_hard_reg): Adjust.
(objects_conflict_by_live_ranges_p): New.
(allocno_thread_conflict_p): Adjust.
(object_thread_conflict_p): Ditto.
(merge_threads): Ditto.
(form_threads_from_copies): Ditto.
(form_threads_from_bucket): Ditto.
(form_threads_from_colorable_allocno): Ditto.
(init_allocno_threads): Ditto.
(add_allocno_to_bucket): Ditto.
(delete_allocno_from_bucket): Ditto.
(allocno_copy_cost_saving): Ditto.
(color_allocnos): Ditto.
(color_pass): Ditto.
(update_curr_costs): Ditto.
(coalesce_allocnos): Ditto.
(ira_reuse_stack_slot): Ditto.
(ira_initiate_assign): Ditto.
(ira_finish_assign): Ditto.
* ira-conflicts.cc (allocnos_conflict_for_copy_p): Ditto.
(REG_SUBREG_P): Ditto.
(subreg_move_p): New.
(regs_non_conflict_for_copy_p): New.
(subreg_reg_align_and_times_p): New.
(process_regs_for_copy): Ditto.
(add_insn_allocno_copies): Ditto.
(propagate_copies): Ditto.
* ira-emit.cc (add_range_and_copies_from_move_list): Ditto.
* ira-int.h (struct ira_allocno_copy): Ditto.
(ira_add_allocno_copy): Ditto.
(find_object): Exported.
(subreg_move_p): Exported.
* ira.cc (print_redundant_copies): Exported.

---
  gcc/ira-build.cc | 154 +++-
  gcc/ira-color.cc | 541 +++
  gcc/ira-conflicts.cc | 173 +++---
  gcc/ira-emit.cc  |  10 +-
  gcc/ira-int.h|  10 +-
  gcc/ira.cc   |   5 +-
  6 files changed, 646 insertions(+), 247 deletions(-)
The patch is mostly ok for me except that it has the same issues I
mentioned in my 1st email.  Not updating the comments of functions whose
interface changed, e.g. the argument types and names of
find_allocno_copy, is particularly bad.  It makes the comments confusing
and wrong.  Also, using just "adjust" in changelog entries is too brief.
You should at least mention that the function signature has changed.

diff --git a/gcc/ira-build.cc b/gcc/ira-build.cc
index a32693e69e4..13f0f7336ed 100644
--- a/gcc/ira-build.cc
+++ b/gcc/ira-build.cc

diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
index 8aed25144b9..099312bcdb3 100644
--- a/gcc/ira-color.cc
+++ b/gcc/ira-color.cc
@@ -20,6 +20,7 @@ along with GCC; see the file COPYING3.  If not see
  


  
-  ira_allocno_t next_thread_allocno;

+  ira_object_t *next_thread_objects;
+  /* The allocno all thread shared.  */
+  ira_allocno_t first_thread_allocno;
+  /* The offset start relative to the first_thread_allocno.  */
+  int first_thread_offset;
+  /* All allocnos belong to the thread.  */
+  bitmap thread_allocnos;


It is better to use bitmap_head instead of bitmap.  It avoids a
separate allocation of the bitmap_head behind the bitmap pointer.  There
are many places where bitmap_head would be a better choice than bitmap
in your patches (it is especially profitable if there is a significant
probability of the bitmap being empty).
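
A minimal sketch of the suggested change, using GCC's bitmap API (the
structure and function names here are invented, and the obstack choice
is only illustrative):

struct thread_data
{
  bitmap_head thread_allocnos;   /* was: bitmap thread_allocnos; */
};

static void
thread_data_init (struct thread_data *td)
{
  /* Only the set elements are obstack-allocated; an empty set costs
     nothing beyond the embedded head.  */
  bitmap_initialize (&td->thread_allocnos, &bitmap_default_obstack);
}

static void
thread_data_release (struct thread_data *td)
{
  bitmap_release (&td->thread_allocnos);
}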


Of course the patch can be committed when all the patches are approved
and fixed.




Re: [PATCH V3 3/7] ira: Support subreg live range track

2023-11-14 Thread Vladimir Makarov



On 11/12/23 07:08, Lehua Ding wrote:

gcc/ChangeLog:

* hard-reg-set.h (struct HARD_REG_SET): New shift operator.
* ira-build.cc (ira_create_object): Adjust.
(find_object): New.
(find_object_anyway): New.
(ira_create_allocno): Adjust.
(get_range): New.
(ira_copy_allocno_objects): New.
(merge_hard_reg_conflicts): Adjust copy.
(create_cap_allocno): Adjust.
(find_subreg_p): New.
(add_subregs): New.
(create_insn_allocnos): Collect subreg.
(create_bb_allocnos): Ditto.
(move_allocno_live_ranges): Adjust.
(copy_allocno_live_ranges): Adjust.
(setup_min_max_allocno_live_range_point): Adjust.
* ira-color.cc (INCLUDE_MAP): include map.
(setup_left_conflict_sizes_p): Adjust conflict size.
(setup_profitable_hard_regs): Adjust.
(get_conflict_and_start_profitable_regs): Adjust.
(check_hard_reg_p): Adjust conflict check.
(assign_hard_reg): Adjust.
(push_allocno_to_stack): Adjust conflict size.
(improve_allocation): Adjust.
* ira-conflicts.cc (record_object_conflict): Simplify.
(build_object_conflicts): Adjust.
(build_conflicts): Adjust.
(print_allocno_conflicts): Adjust.
* ira-emit.cc (modify_move_list): Adjust.
* ira-int.h (struct ira_object): Adjust struct.
(struct ira_allocno): Adjust struct.
(ALLOCNO_NUM_OBJECTS): New accessor.
(ALLOCNO_UNIT_SIZE): Ditto.
(ALLOCNO_TRACK_SUBREG_P): Ditto.
(ALLOCNO_NREGS): Ditto.
(OBJECT_SUBWORD): Ditto.
(OBJECT_INDEX): Ditto.
(OBJECT_START): Ditto.
(OBJECT_NREGS): Ditto.
(find_object): Exported.
(find_object_anyway): Ditto.
(ira_copy_allocno_objects): Ditto.
(has_subreg_object_p): Ditto.
(get_full_object): Ditto.
* ira-lives.cc (INCLUDE_VECTOR): Include vector.
(add_onflict_hard_regs): New.
(add_onflict_hard_reg): New.
(make_hard_regno_dead): Adjust.
(make_object_live): Adjust.
(update_allocno_pressure_excess_length): Adjust.
(make_object_dead): Adjust.
(mark_pseudo_regno_live): Adjust.
(add_subreg_point): New.
(mark_pseudo_object_live): Adjust.
(mark_pseudo_regno_subword_live): Adjust.
(mark_pseudo_regno_subreg_live): Adjust.
(mark_pseudo_regno_subregs_live): Adjust.
(mark_pseudo_reg_live): Adjust.
(mark_pseudo_regno_dead): Adjust.
(mark_pseudo_object_dead): Adjust.
(mark_pseudo_regno_subword_dead): Adjust.
(mark_pseudo_regno_subreg_dead): Adjust.
(mark_pseudo_reg_dead): Adjust.
(process_single_reg_class_operands): Adjust.
(process_out_of_region_eh_regs): Adjust.
(add_conflict_from_region_landing_pads): Adjust.
(process_bb_node_lives): Adjust.
(class subreg_live_item): New class.
(create_subregs_live_ranges): New function.
(ira_create_allocno_live_ranges): Adjust.
* ira.cc (check_allocation): Adjust.


Again, the ChangeLog is too ambiguous.  Adjust to what?  For example,
you should write which members are added to a structure instead of just
"adjust structure".


The general issues with your patches which I wrote about in my 1st
email apply here too.  In particular I'd mention the typos in the
function names add_onflict_hard_reg[s].


There are too many changes that should be made, so I'd like to see a
new version of the patch.



---
  gcc/hard-reg-set.h   |  33 +++
  gcc/ira-build.cc | 235 +---
  gcc/ira-color.cc | 302 +-
  gcc/ira-conflicts.cc |  48 ++---
  gcc/ira-emit.cc  |   2 +-
  gcc/ira-int.h|  57 -
  gcc/ira-lives.cc | 500 ---
  gcc/ira.cc   |  52 ++---
  8 files changed, 907 insertions(+), 322 deletions(-)

--- a/gcc/ira-color.cc
+++ b/gcc/ira-color.cc
@@ -19,6 +19,7 @@ along with GCC; see the file COPYING3.  If not see
  .  */
  
  #include "config.h"

+#define INCLUDE_MAP
  #include "system.h"
  #include "coretypes.h"
  #include "backend.h"
@@ -852,18 +853,17 @@ setup_left_conflict_sizes_p (ira_allocno_t a)
node_preorder_num = node->preorder_num;
node_set = node->hard_regs->set;
node_check_tick++;
+  /* Collect conflict objects.  */
+  std::map allocno_conflict_regs;
for (k = 0; k < nobj; k++)
  {
ira_object_t obj = ALLOCNO_OBJECT (a, k);
ira_object_t conflict_obj;
ira_object_conflict_iterator oci;
-
+
FOR_EACH_OBJECT_CONFLICT (obj, conflict_obj, oci)
{
- int size;
- ira_allocno_t conflict_a = OBJECT_ALLOCNO (conflict_obj);
- allocno_hard_regs_node_t conflict_node, temp_node;
- HARD_REG_SET conflict_node_set;
+ ira_allocno_t conflict_a = OBJECT_ALLOCNO (conflict_obj);
  

Re: [PATCH V3 2/7] ira: Switch to live_subreg data

2023-11-14 Thread Vladimir Makarov



On 11/12/23 07:08, Lehua Ding wrote:

This patch switches the use of live_reg data to live_subreg data.

gcc/ChangeLog:

* ira-build.cc (create_bb_allocnos): Switch.
Switch to what?  Although someone can figure it out from the patch
itself, you should write it in the changelog entry.

(create_loop_allocnos): Ditto.
* ira-color.cc (ira_loop_edge_freq): Ditto.
* ira-emit.cc (generate_edge_moves): Ditto.
(add_ranges_and_copies): Ditto.
* ira-lives.cc (process_out_of_region_eh_regs): Ditto.
(add_conflict_from_region_landing_pads): Ditto.
(process_bb_node_lives): Ditto.
* ira.cc (find_moveable_pseudos): Ditto.
(interesting_dest_for_shprep_1): Ditto.
(allocate_initial_values): Ditto.
(ira): Ditto.

Besides the general issues about all your patches that I mentioned in
my first email, the patch is ok with me (of course only after the 1st
patch "df: Add DF_LIVE_SUBREG problem" is approved and committed).





Re: [PATCH V3 1/7] df: Add DF_LIVE_SUBREG problem

2023-11-14 Thread Vladimir Makarov



On 11/14/23 12:18, Vladimir Makarov wrote:


On 11/14/23 03:38, Lehua Ding wrote:



This is perfectly fine: the code inside the live_subreg problem has a
branch that goes through logic similar to live_reg if it finds no
subreg inside the program. Also, when the optimization level is less
than 2, it doesn't track subregs. By the way, could you offer certain
programs where RA has a big impact on compilation time? Or any
suggestions about it?


I've analyzed the effect of your patches on -O2 compilation time for an
old version of combine.c.  The total GCC compilation time increased by
about 3%.  I used an x86_64 release-mode compiler.  Here are my more
detailed findings:


RA compile time increased by 43%.

54% of this increase is due to the df_analyze time increase and 38% is
due to the overall ira_color increase (assign_hard_reg execution time
increased 50 times, but even such a big increase is only 1/3 of the
overall ira_color increase).


Sorry, due to different inlining of assign_hard_reg I reported wrong
numbers for this function (for the version w/o patches only the
assignment on the region border was measured); the compilation times
for this function are basically the same.
The rest (about 10%) of the overall RA increase is mostly an LRA
increase due to lra_create_live_ranges.


Seeing where the 6% GCC compilation time increase on SPEC2017 is spent
would be more interesting, but it needs a lot of time for analysis.







Re: [PATCH V3 1/7] df: Add DF_LIVE_SUBREG problem

2023-11-14 Thread Vladimir Makarov



On 11/14/23 03:38, Lehua Ding wrote:



This is perfectly fine: the code inside the live_subreg problem has a
branch that goes through logic similar to live_reg if it finds no
subreg inside the program. Also, when the optimization level is less
than 2, it doesn't track subregs. By the way, could you offer certain
programs where RA has a big impact on compilation time? Or any
suggestions about it?


I've analyzed the effect of your patches on -O2 compilation time for an
old version of combine.c.  The total GCC compilation time increased by
about 3%.  I used an x86_64 release-mode compiler.  Here are my more
detailed findings:


RA compile time increased by 43%.

54% of this increase is due to the df_analyze time increase and 38% is
due to the overall ira_color increase (assign_hard_reg execution time
increased 50 times, but even such a big increase is only 1/3 of the
overall ira_color increase).


The rest (about 10%) of the overall RA increase is mostly an LRA
increase due to lra_create_live_ranges.


Seeing where the 6% GCC compilation time increase on SPEC2017 is spent
would be more interesting, but it needs a lot of time for analysis.





Re: [PATCH V3 1/7] df: Add DF_LIVE_SUBREG problem

2023-11-14 Thread Vladimir Makarov



On 11/14/23 04:03, Richard Biener wrote:


I suggest you farm bugzilla for the compile-time-hog / memory-hog testcases.
I do have a set of "large" testcases.  Scanning results points at
PRs 36262, 37448, 39326, 69609 all having RA in the 20% area at
-O0 -g.

It's also a good idea to take say cc1files (set of preprocessed sources
that produce GCCs cc1) and look at the overall impact of compile-time
and memory-usage of a change on those which are representative
for "normal" TUs as opposed to the PRs above which often are
large machine-generated TUs (an important area where GCC usually
shines, at least at -O1).

RA is an expensive optimization pass in any compiler, even if the
fastest algorithms are used.


The most illustrative PR for this is 108500, where RA at -O0 spent 90%
(200s) of the compilation time.  But that is nothing in comparison with
LLVM's "fast" RA algorithm, where LLVM-14 spent almost 100%, or 41500s
(200 times more than GCC), at -O0.


LLVM's greedy RA is even worse: I stopped LLVM after 120 hours at -O1,
whereas GCC spent 30 minutes at -O1.  In contrast to LLVM, GCC's RA
also solves the code selection task.


IMHO GCC is the better-scaling compiler and the better compiler for big
TUs and functions.  When I worked on CRuby, I saw interesting results of
GCC vs LLVM.  Clang-15 with -O3 produced Ruby basic interpreter code
that was 70% slower (on a simple Ruby test) than GCC-12 with -O3.  Also,
Clang spends 20 times more time compiling the major Ruby interpreter
file vm.c, which has a huge main interpreter function (315s for Clang vs
15s for GCC).






Re: [PATCH 0/5] Add support for operand-specific alignment requirements

2023-11-13 Thread Vladimir Makarov



On 11/12/23 09:52, Richard Sandiford wrote:

SME has various instructions that require aligned register tuples.
However, the associated tuple modes are already widely used and do
not need to be aligned in other contexts.  It therefore isn't
appropriate to force alignment in TARGET_HARD_REGNO_MODE_OK.

There are also strided loads and stores that require:

- (regno & 0x8) == 0 for 2-register tuples
- (regno & 0xc) == 0 for 4-register tuples

Although the requirements for strided loads and stores could be
enforced by C++ conditions on the insn, it's convenient to handle
them in the same way as alignment.

This series of patches therefore adds a way for register constraints
to specify which start registers are valid and which aren't.  Most of
the details are in the covering note to the first patch.

This is clearly changing a performance-sensitive part of the compiler.
I've tried to ensure that the overhead is only small for targets that
use the new feature.  Almost all of the new code gets optimised away
on targets that don't use the feature.

Richard Sandiford (5):
   Add register filter operand to define_register_constraint
   recog: Handle register filters
   lra: Handle register filters
   ira: Handle register filters
   Add an aligned_register_operand predicate

  gcc/common.md  |  28 
  gcc/doc/md.texi|  41 +++-
  gcc/doc/tm.texi|   3 +-
  gcc/doc/tm.texi.in |   3 +-
  gcc/genconfig.cc   |   2 +
  gcc/genpreds.cc| 146 -
  gcc/gensupport.cc  |  48 +-
  gcc/gensupport.h   |   3 +
  gcc/ira-build.cc   |   8 +++
  gcc/ira-color.cc   |  10 +++
  gcc/ira-int.h  |  14 
  gcc/ira-lives.cc   |  61 +
  gcc/lra-constraints.cc |  13 +++-
  gcc/recog.cc   |  14 +++-
  gcc/recog.h|  24 ++-
  gcc/reginfo.cc |   5 ++
  gcc/rtl.def|   6 +-
  gcc/target-globals.cc  |   6 +-
  gcc/target-globals.h   |   3 +
  19 files changed, 421 insertions(+), 17 deletions(-)

Collecting all occurrence constraints for IRA might result in worse
allocation (when a pseudo is spilled because of this) in comparison with
using a wider hard reg set and generating reload insns for the pseudo
occurrences requiring stricter constraints.  Regional RA mitigates this
issue.  In any case the IRA changes are an improvement in comparison
with using only hard_regno_mode_ok.  Using smaller constraints in
certain cases for pseudos spilled after using the biggest constraint is
just an idea for further RA improvement for targets using the filters.
The only question is whether it is worth implementing.


All IRA/LRA/reginfo patches are OK for me.  IMHO the other changes are
straightforward enough not to need somebody else to review them.


Thank you, Richard.




Re: [PATCH V3 1/7] df: Add DF_LIVE_SUBREG problem

2023-11-13 Thread Vladimir Makarov



On 11/12/23 07:08, Lehua Ding wrote:

This patch adds a live_subreg problem to extend the original live_reg
problem to track the liveness of subregs. We only try to track pseudo
registers whose mode size is a multiple of the natural size and which
end up having a small portion accessed through subregs. Together with
the live_reg problem, the live_subreg problem will have the following
output: full_in/out means the entire pseudo is live in/out,
partial_in/out means subregs of the pseudo are live in/out, and
range_in/out indicates which part of the pseudo is live. all_in/out is
the union of full_in/out and partial_in/out:
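
Conceptually (a sketch only, assuming GCC's bitmap_head type; this is
not the patch's actual data layout), the per-basic-block sets described
above amount to something like:

struct subreg_live_bb_info
{
  bitmap_head full_in, full_out;        /* whole pseudo live in/out     */
  bitmap_head partial_in, partial_out;  /* some subreg of pseudo live   */
  /* range_in/out: for each pseudo in partial_in/out, which part of the
     pseudo (e.g. a byte or word range) is live; the representation is
     left to the patch.  all_in/out = full_in/out U partial_in/out.  */
};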

I am not a maintainer or reviewer of the data-flow analysis framework
and cannot approve this patch except for the changes in regs.h.  Richard
Sandiford or Jeff Law, as global reviewers, probably can do this.


As for the regs.h changes, they are ok for me after fixing the general
issues I mentioned in my previous email (two spaces after sentence ends
in the comments).


I think all this code is the major compile-time and memory consumer in
the whole set of patches.  DF analysis is slow by itself even when only
efficient data structures such as bitmaps are used, but you are
introducing an even slower data structure, maps (I believe a
better-performing data structure can be used instead).  In the very
first version of LRA I used DFA, but it made LRA so slow that I had to
introduce my own data structures, which are faster in the case of
massive RTL changes in LRA.  The same problem exists for using generic
C++ standard library containers such as vectors and maps in critical
code.  It is hard to get the needed performance when the exact
implementation can vary or is not what you need, e.g. vector initial
capacity, growth policy, etc.  But again, the performance issues can be
addressed later.





Re: [PATCH V3 0/7] ira/lra: Support subreg coalesce

2023-11-13 Thread Vladimir Makarov



On 11/12/23 07:08, Lehua Ding wrote:

V3 Changes:
   1. fix three ICE.
   2. rebase

Hi,

These patches try to support the subreg coalescing feature in the
register allocation passes (ira and lra).

I've started reviewing the v3 patches and here is my initial general
criticism of them:


  * Absence of comments for some functions, e.g. for `HARD_REG_SET 
operator>> (unsigned int shift_amount) const`.


  * Adding significant functionality to existing functions is not 
reflected in the function comment, e.g. in ira_set_allocno_class.


  * A lot of typos, e.g. `pesudo` or `reprensent`.  I think you need to
check the spelling of your comments (I myself do spell checking in Emacs
with the ispell-region command).


  * Grammar mistakes, e.g. `Flag means need track subreg live range for
the allocno`.  I understand English is not your native language (nor
mine).  In case of doubt I'd recommend checking the grammar with ChatGPT
(using a "Proofread: " prompt).


  * Some local variables use upper-case letters (e.g. `int A`), which
should be reserved for macros or enums according to the GNU coding
standard (https://www.gnu.org/prep/standards/standards.html).


  * Sometimes you put only one space at the end of a sentence.  Please
see the GNU coding standard and the GCC coding conventions
(https://gcc.gnu.org/codingconventions.html).


  * There is no uniformity in your code, e.g. sometimes you use 'i++',
sometimes `++i` or `i += 1`.  Although uniformity is not strictly
necessary, it makes a better impression of the patches.



I also did not find which targets you used for testing.  I am asking
because I see new testsuite failures (apx-spill_to_egprs-1.c) even on
x86-64.  It might be nothing, as the test expects specific code
generation.


Also, besides testing major targets, I'd recommend testing at least one
big-endian target (I'd recommend ppc64 BE; gcc110.fsfrance.org could be
used for this).  Plenty of RA issues occur because BE targets are not
tested.





Re: [PATCH 0/7] ira/lra: Support subreg coalesce

2023-11-13 Thread Vladimir Makarov



On 11/12/23 07:01, Lehua Ding wrote:
Thanks for the specint performance data. I'll do my best to get the
compile time and memory issues fixed. I'm very curious to know whether
the approach used to solve the subreg coalescing problem makes sense to
you.


If it works, it is ok for me.  There is always room for an optimization
even if it decreases compilation speed considerably.  We just need to
keep the same speed for optimization levels <= 2.  We can put really
expensive optimizations under -O3 or -Ofast.


Although the first thing I would try myself is to do the subreg
liveness analysis only locally (inside BBs).  The majority of cases I
saw where subreg RA could be improved were local (inside a BB).  With
such an approach, we would probably have only a minor compiler speed
slowdown and could enable the optimization by default.




[pushed][PR112337][IRA]: Check autoinc and memory address after temporary equivalence substitution

2023-11-10 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112337

The patch was successfully bootstrapped and tested on x86-64, ppc64le,
and aarch64.
commit b3d1d30eeed67c78e223c146a464d2fdd1dde894
Author: Vladimir N. Makarov 
Date:   Fri Nov 10 11:14:46 2023 -0500

[IRA]: Check autoinc and memory address after temporary equivalence substitution

My previous RA patches taking register equivalence into account do a
temporary register equivalence substitution to find out whether the
equivalence can be consumed by insns.  The insn with the substitution is
checked for validity using target-dependent code.  This code expects
that autoinc operations work on a register, but this register can be
substituted by equivalent memory.  The patch fixes this problem.  The
patch also adds a check that the substitution can be consumed in a
memory address too.

gcc/ChangeLog:

PR target/112337
* ira-costs.cc: (validate_autoinc_and_mem_addr_p): New function.
(equiv_can_be_consumed_p): Use it.

gcc/testsuite/ChangeLog:

PR target/112337
* gcc.target/arm/pr112337.c: New.

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index 50f80779025..e0528e76a64 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -1758,13 +1758,46 @@ process_bb_node_for_costs (ira_loop_tree_node_t loop_tree_node)
 process_bb_for_costs (bb);
 }
 
+/* Return true if all autoinc rtx in X change only a register and memory is
+   valid.  */
+static bool
+validate_autoinc_and_mem_addr_p (rtx x)
+{
+  enum rtx_code code = GET_CODE (x);
+  if (GET_RTX_CLASS (code) == RTX_AUTOINC)
+return REG_P (XEXP (x, 0));
+  const char *fmt = GET_RTX_FORMAT (code);
+  for (int i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
+if (fmt[i] == 'e')
+  {
+	if (!validate_autoinc_and_mem_addr_p (XEXP (x, i)))
+	  return false;
+  }
+else if (fmt[i] == 'E')
+  {
+	for (int j = 0; j < XVECLEN (x, i); j++)
+	  if (!validate_autoinc_and_mem_addr_p (XVECEXP (x, i, j)))
+	return false;
+  }
+  /* Check memory after checking autoinc to guarantee that autoinc is already
+ valid for machine-dependent code checking memory address.  */
+  return (!MEM_P (x)
+	  || memory_address_addr_space_p (GET_MODE (x), XEXP (x, 0),
+	  MEM_ADDR_SPACE (x)));
+}
+
 /* Check that reg REGNO can be changed by TO in INSN.  Return true in case the
result insn would be valid one.  */
 static bool
 equiv_can_be_consumed_p (int regno, rtx to, rtx_insn *insn)
 {
   validate_replace_src_group (regno_reg_rtx[regno], to, insn);
-  bool res = verify_changes (0);
+  /* We can change register to equivalent memory in autoinc rtl.  Some code
+ including verify_changes assumes that autoinc contains only a register.
+ So check this first.  */
+  bool res = validate_autoinc_and_mem_addr_p (PATTERN (insn));
+  if (res)
+res = verify_changes (0);
   cancel_changes (0);
   return res;
 }
diff --git a/gcc/testsuite/gcc.target/arm/pr112337.c b/gcc/testsuite/gcc.target/arm/pr112337.c
new file mode 100644
index 000..5dacf0aa4f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr112337.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard" } */
+
+#pragma GCC arm "arm_mve_types.h"
+int32x4_t h(void *p) { return __builtin_mve_vldrwq_sv4si(p); }
+void g(int32x4_t);
+void f(int, int, int, short, int *p) {
+  int *bias = p;
+  for (;;) {
+int32x4_t d = h(bias);
+bias += 4;
+g(d);
+  }
+}


Re: [PATCH 0/7] ira/lra: Support subreg coalesce

2023-11-09 Thread Vladimir Makarov



On 11/7/23 22:47, Lehua Ding wrote:


Lehua Ding (7):
   ira: Refactor the handling of register conflicts to make it more
 general
   ira: Add live_subreg problem and apply to ira pass
   ira: Support subreg live range track
   ira: Support subreg copy
   ira: Add all nregs >= 2 pseudos to tracke subreg list
   lra: Apply live_subreg df_problem to lra pass
   lra: Support subreg live range track and conflict detect

Thank you very much for addressing subreg RA.  It is a big piece of
work.  I wanted to address this a long time ago but had no time to do
it myself.


I tried to evaluate your patches with a release-mode GCC on x86-64
(i7-9700k).  I used -O3 for the SPEC2017 compilation.


Here are the results:

              baseline vs baseline+patches
specint2017:  8.51 vs 8.58 (+0.8%)
specfp2017:   21.1 vs 21.1 (+0%)
compile time: 2426.41s vs 2580.58s (+6.4%)

Spec2017 average code size change: -0.07%

Improving specint by 0.8% is impressive for me.

Unfortunately, it is achieved by decreasing compilation speed by 6.4%
(although on a smaller benchmark I saw only a 3% slowdown).  I don't
know how yet, but we should mitigate this speed degradation.  Maybe we
can find a hot spot in the new code (though I think it is not the linear
search pointed out by Richard Biener, as the object vectors most
probably contain 1-2 elements) and that spot can be improved, or we
could enable this only for -O3/-Ofast, or the code could be made
function- or target-dependent.


I also found that GCC consumes more memory with the patches.  Maybe
that can be improved too (although I am not sure about this).


I'll start to review the patches next week.  I don't expect that I'll
find something serious enough to reject the patches, but again we should
work on mitigating the compilation speed problem.  We can file a new PR
for this and resolve the problem during the release cycle.





[pushed] [IRA]: Fixing conflict calculation from region landing pads.

2023-11-09 Thread Vladimir Makarov

This is one more patch for

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110215

The patch was successfully tested and bootstrapped on x86-64, aarch64, 
ppc64le.


commit df14f1c0582cd6742a37abf3a97f4c4bf0caf864
Author: Vladimir N. Makarov 
Date:   Thu Nov 9 08:51:15 2023 -0500

[IRA]: Fixing conflict calculation from region landing pads.

The following patch fixes conflict calculation from exception landing
pads.  The previous patch processed only one newly created landing pad.
Besides being wrong, it also resulted in large memory consumption by IRA.

gcc/ChangeLog:

PR rtl-optimization/110215
* ira-lives.cc: (add_conflict_from_region_landing_pads): New
function.
(process_bb_node_lives): Use it.

diff --git a/gcc/ira-lives.cc b/gcc/ira-lives.cc
index bc8493856a4..81af5c06460 100644
--- a/gcc/ira-lives.cc
+++ b/gcc/ira-lives.cc
@@ -1214,6 +1214,32 @@ process_out_of_region_eh_regs (basic_block bb)
 
 #endif
 
+/* Add conflicts for object OBJ from REGION landing pads using CALLEE_ABI.  */
+static void
+add_conflict_from_region_landing_pads (eh_region region, ira_object_t obj,
+   function_abi callee_abi)
+{
+  ira_allocno_t a = OBJECT_ALLOCNO (obj);
+  rtx_code_label *landing_label;
+  basic_block landing_bb;
+
+  for (eh_landing_pad lp = region->landing_pads; lp ; lp = lp->next_lp)
+{
+  if ((landing_label = lp->landing_pad) != NULL
+	  && (landing_bb = BLOCK_FOR_INSN (landing_label)) != NULL
+	  && (region->type != ERT_CLEANUP
+	  || bitmap_bit_p (df_get_live_in (landing_bb),
+			   ALLOCNO_REGNO (a
+	{
+	  HARD_REG_SET new_conflict_regs
+	= callee_abi.mode_clobbers (ALLOCNO_MODE (a));
+	  OBJECT_CONFLICT_HARD_REGS (obj) |= new_conflict_regs;
+	  OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= new_conflict_regs;
+	  return;
+	}
+}
+}
+
 /* Process insns of the basic block given by its LOOP_TREE_NODE to
update allocno live ranges, allocno hard register conflicts,
intersected calls, and register pressure info for allocnos for the
@@ -1385,23 +1411,9 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node)
 		  SET_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj));
 		}
 		  eh_region r;
-		  eh_landing_pad lp;
-		  rtx_code_label *landing_label;
-		  basic_block landing_bb;
 		  if (can_throw_internal (insn)
-		  && (r = get_eh_region_from_rtx (insn)) != NULL
-		  && (lp = gen_eh_landing_pad (r)) != NULL
-		  && (landing_label = lp->landing_pad) != NULL
-		  && (landing_bb = BLOCK_FOR_INSN (landing_label)) != NULL
-		  && (r->type != ERT_CLEANUP
-			  || bitmap_bit_p (df_get_live_in (landing_bb),
-	   ALLOCNO_REGNO (a
-		{
-		  HARD_REG_SET new_conflict_regs
-			= callee_abi.mode_clobbers (ALLOCNO_MODE (a));
-		  OBJECT_CONFLICT_HARD_REGS (obj) |= new_conflict_regs;
-		  OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= new_conflict_regs;
-		}
+		  && (r = get_eh_region_from_rtx (insn)) != NULL)
+		add_conflict_from_region_landing_pads (r, obj, callee_abi);
 		  if (sparseset_bit_p (allocnos_processed, num))
 		continue;
 		  sparseset_set_bit (allocnos_processed, num);


Re: [RFC] Make genautomata.cc output reflect insn-attr.h expectation:

2023-11-01 Thread Vladimir Makarov



On 10/31/23 18:51, Edwin Lu wrote:

genattr.cc currently generates insn-attr.h with the following structure:

#if CPU_UNITS_QUERY
extern int get_cpu_unit_code (const char *);
extern int cpu_unit_reservation_p (state_t, int);
#endif
extern bool insn_has_dfa_reservation_p (rtx_insn *);

however genautomata.cc generates insn-automata.cc with the following structure:
#if CPU_UNITS_QUERY
int get_cpu_unit_code (const char * ) { ... }
int cpu_unit_reservation_p (state_t, int) { ... }
bool insn_has_dfa_reservation_p (rtx_insn *) { ... }
#endif

I'm not sure if insn_has_dfa_reservation_p is supposed to be a part of the
CPU_UNITS_QUERY conditional group or not. For consistency, I would like to
move it outside of the group.


No, it should not be considered a part of the cpu unit query group.
That function just says whether the given insn has any cpu reservation
at all.


The two other functions say whether a state is still reserving a
particular cpu unit.  Using these 2 functions requires a lot of memory
for their implementation and prevents further DFA minimization.  The
functions should be used mostly for VLIW CPUs, when we need this
information to generate machine insns (e.g., the ia64 VLIW insn
template).



This would move insn_has_dfa_reservation_p out of the #if CPU_UNITS_QUERY
conditional inside of insn-automata.cc. This would allow us to see if the
scheduler is trying to schedule an insn with a type which is not associated
with a cpu unit or insn reservation through the TARGET_SCHED_VARIABLE_ISSUE
hook.

If there is a reason for insn_has_dfa_reservation_p being within the
conditional, please let me know!


It seems to be a typo.

The patch is ok for me.  Thank you for finding this out.


gcc/Changelog:

* genautomata.cc (write_automata): move endif

Signed-off-by: Edwin Lu 
---
  gcc/genautomata.cc | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/genautomata.cc b/gcc/genautomata.cc
index 72f01686d6b..9dda25e5ba2 100644
--- a/gcc/genautomata.cc
+++ b/gcc/genautomata.cc
@@ -9503,9 +9503,9 @@ write_automata (void)
fprintf (output_file, "\n#if %s\n\n", CPU_UNITS_QUERY_MACRO_NAME);
output_get_cpu_unit_code_func ();
output_cpu_unit_reservation_p ();
-  output_insn_has_dfa_reservation_p ();
fprintf (output_file, "\n#endif /* #if %s */\n\n",
   CPU_UNITS_QUERY_MACRO_NAME);
+  output_insn_has_dfa_reservation_p ();
output_dfa_clean_insn_cache_func ();
output_dfa_start_func ();
output_dfa_finish_func ();
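
As an illustration of the use case described above (a purely
hypothetical sketch: the hook name and the diagnostic are invented; only
insn_has_dfa_reservation_p, recog_memoized and NONDEBUG_INSN_P are
existing GCC functions), a target's variable_issue hook could then do:

static int
mytarget_sched_variable_issue (FILE *dump, int verbose, rtx_insn *insn,
                               int more)
{
  /* Flag scheduled insns whose type has no DFA reservation.  */
  if (dump && verbose > 0
      && NONDEBUG_INSN_P (insn)
      && recog_memoized (insn) >= 0
      && !insn_has_dfa_reservation_p (insn))
    fprintf (dump, ";; insn %d has no DFA reservation\n", INSN_UID (insn));
  return more - 1;
}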




[pushed][PR111917][RA]: Fixing LRA cycling for multi-reg variable containing a fixed reg

2023-10-31 Thread Vladimir Makarov

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111971

Successfully bootstrapped and tested on x86-64, aarch64, ppc64le.

commit df111406b4ea1fe2890e94d51655e571cf260d29
Author: Vladimir N. Makarov 
Date:   Tue Oct 31 10:54:43 2023 -0400

[RA]: Fixing LRA cycling for multi-reg variable containing a fixed reg

PR111971 test case uses a multi-reg variable containing a fixed reg.  LRA
rejects such multi-reg because of this when matching the constraint for
an asm insn.  The rejection results in LRA cycling.  The patch fixes this issue.

gcc/ChangeLog:

PR rtl-optimization/111971
* lra-constraints.cc: (process_alt_operands): Don't check start
hard regs for regs originated from register variables.

gcc/testsuite/ChangeLog:

PR rtl-optimization/111971
* gcc.target/powerpc/pr111971.c: New test.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index d10a2a3dc51..0607c8be7cb 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -2609,12 +2609,15 @@ process_alt_operands (int only_alternative)
 		  winreg = true;
 		  if (REG_P (op))
 		{
+		  tree decl;
 		  if (hard_regno[nop] >= 0
 			  && in_hard_reg_set_p (this_alternative_set,
 		mode, hard_regno[nop])
-			  && !TEST_HARD_REG_BIT
-			  (this_alternative_exclude_start_hard_regs,
-			   hard_regno[nop]))
+			  && ((REG_ATTRS (op) && (decl = REG_EXPR (op)) != NULL
+			   && VAR_P (decl) && DECL_HARD_REGISTER (decl))
+			  || !(TEST_HARD_REG_BIT
+   (this_alternative_exclude_start_hard_regs,
+hard_regno[nop]
 			win = true;
 		  else if (hard_regno[nop] < 0
 			   && in_class_p (op, this_alternative, NULL))
diff --git a/gcc/testsuite/gcc.target/powerpc/pr111971.c b/gcc/testsuite/gcc.target/powerpc/pr111971.c
new file mode 100644
index 000..7f058bd4820
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr111971.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+void
+foo (unsigned long long *a)
+{
+  register long long d asm ("r0") = 0x24;
+  long long n;
+  asm ("mr %0, %1" : "=r"(n) : "r"(d));
+  *a++ = n;
+}


[pushed] [RA]: Fixing i686 bootstrap failure because of pushing the equivalence patch

2023-10-27 Thread Vladimir Makarov
The following patch fixes an i686 bootstrap failure caused by my recent
patch:


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112107

commit 7361b49d7fa3624cb3f1f825a22607d9d31986e5
Author: Vladimir N. Makarov 
Date:   Fri Oct 27 14:50:40 2023 -0400

[RA]: Fixing i686 bootstrap failure because of pushing the equivalence patch

With my recent patch improving cost calculation for pseudos with
equivalences, GCC may generate different code with and without debug
info, and as a result bootstrap fails on i686.  The patch fixes this
bug.

gcc/ChangeLog:

PR rtl-optimization/112107
* ira-costs.cc: (calculate_equiv_gains): Use NONDEBUG_INSN_P
instead of INSN_P.

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index c4086807076..50f80779025 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -1871,7 +1871,8 @@ calculate_equiv_gains (void)
 	= ira_bb_nodes[bb->index].parent->regno_allocno_map;
   FOR_BB_INSNS (bb, insn)
 	{
-	  if (!INSN_P (insn) || !get_equiv_regno (PATTERN (insn), regno, subreg)
+	  if (!NONDEBUG_INSN_P (insn)
+	  || !get_equiv_regno (PATTERN (insn), regno, subreg)
 	  || !bitmap_bit_p (_pseudos, regno))
 	continue;
 	  rtx subst = ira_reg_equiv[regno].memory;


[pushed] [RA]: Add cost calculation for reg equivalence invariants

2023-10-27 Thread Vladimir Makarov
The following patch fixes one aarch64 GCC test failure resulting from
my previous patch dealing with reg equivalences.


The patch was successfully bootstrapped and tested on x86-64, aarch64, 
ppc64le.


commit 9b03e1d20c00dca215b787a5e959db473325b660
Author: Vladimir N. Makarov 
Date:   Fri Oct 27 08:28:24 2023 -0400

[RA]: Add cost calculation for reg equivalence invariants

My recent patch improving cost calculation for pseudos with
equivalences resulted in a failure of gcc.target/arm/eliminate.c on
aarch64.  This patch fixes that failure.

gcc/ChangeLog:

* ira-costs.cc: (get_equiv_regno, calculate_equiv_gains):
Process reg equivalence invariants.

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index a59d45a6e24..c4086807076 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -1784,6 +1784,7 @@ get_equiv_regno (rtx x, int , rtx )
 }
   if (REG_P (x)
   && (ira_reg_equiv[REGNO (x)].memory != NULL
+	  || ira_reg_equiv[REGNO (x)].invariant != NULL
 	  || ira_reg_equiv[REGNO (x)].constant != NULL))
 {
   regno = REGNO (x);
@@ -1826,6 +1827,7 @@ calculate_equiv_gains (void)
   for (regno = max_reg_num () - 1; regno >= FIRST_PSEUDO_REGISTER; regno--)
 if (ira_reg_equiv[regno].init_insns != NULL
 	&& (ira_reg_equiv[regno].memory != NULL
+	|| ira_reg_equiv[regno].invariant != NULL
 	|| (ira_reg_equiv[regno].constant != NULL
 		/* Ignore complicated constants which probably will be placed
 		   in memory:  */
@@ -1876,6 +1878,8 @@ calculate_equiv_gains (void)
 
 	  if (subst == NULL)
 	subst = ira_reg_equiv[regno].constant;
+	  if (subst == NULL)
+	subst = ira_reg_equiv[regno].invariant;
 	  ira_assert (subst != NULL);
 	  mode = PSEUDO_REGNO_MODE (regno);
 	  ira_init_register_move_cost_if_necessary (mode);


Re: [pushed] [RA]: Modify cost calculation for dealing with pseudo equivalences

2023-10-27 Thread Vladimir Makarov



On 10/27/23 09:56, Christophe Lyon wrote:

Hi Vladimir,

On Thu, 26 Oct 2023 at 16:00, Vladimir Makarov  wrote:

This is the second attempt to improve RA cost calculation for pseudos
with equivalences.  The patch explanation is in the log message.

The patch was successfully bootstrapped and tested on x86-64, aarch64,
and ppc64le.  The patch was also benchmarked on x86-64 spec2017.
specfp2017 performance did not change, and specint2017 improved by 0.3%.


As reported by our CI, this patch causes a regression on arm:
FAIL: gcc.target/arm/eliminate.c scan-assembler-times r0,[\\t ]*sp 3


For this testcase, we used to generate:
 str lr, [sp, #-4]!
 sub sp, sp, #12
 add r0, sp, #4
 bl  bar
 add r0, sp, #4
 bl  bar
 add r0, sp, #4
 bl  bar
 add sp, sp, #12
 ldr lr, [sp], #4
 bx  lr

After your patch, we generate:
 push{r4, lr}
 sub sp, sp, #8
 add r4, sp, #4
 mov r0, r4
 bl  bar
 mov r0, r4
 bl  bar
 mov r0, r4
 bl  bar
 add sp, sp, #8
 pop {r4, lr}
 bx  lr

which uses 1 more register and 1 more instruction.

Shall I file a bugzilla report for this?

I started to work on this right after I got the message (yesterday).  I
already have a patch and am going to commit it within an hour.  So there
is no need to file a PR.




[pushed] [RA]: Modify cost calculation for dealing with pseudo equivalences

2023-10-26 Thread Vladimir Makarov
This is the second attempt to improve RA cost calculation for pseudos 
with equivalences.  The patch explanation is in the log message.


The patch was successfully bootstrapped and tested on x86-64, aarch64, 
and ppc64le.  The patch was also benchmarked on x86-64 spec2017.  
specfp2017 performance did not change, and specint2017 improved by 0.3%.


commit f55cdce3f8dd8503e080e35be59c5f5390f6d95e
Author: Vladimir N. Makarov 
Date:   Thu Oct 26 09:50:40 2023 -0400

[RA]: Modify cost calculation for dealing with equivalences

RISCV target developers reported that pseudos with equivalences used in
a loop can be spilled.  Simple changes to the heuristics of cost
calculation for pseudos with equivalences, or even ignoring
equivalences, resulted in numerous testsuite failures on different
targets or worse spec2017 performance.  This patch implements a more
sophisticated cost calculation for pseudos with equivalences.  The patch
does not change RA behaviour for targets still using the old reload pass
instead of LRA.  The patch solves the reported problem and improves
x86-64 specint2017 a bit (specfp2017 performance stays the same).  The
patch takes into account how the equivalence will be used: whether it
will be integrated into the user insns or require an input reload insn.
It requires an additional pass over insns.  To compensate for the RA
slowdown, the patch removes a pass over insns in the reload pass that
IRA used before.  This also decouples IRA from reload more and will help
to remove the reload pass in the future, if that ever happens.

gcc/ChangeLog:

* dwarf2out.cc (reg_loc_descriptor): Use lra_eliminate_regs when
LRA is used.
* ira-costs.cc: Include regset.h.
(equiv_can_be_consumed_p, get_equiv_regno, calculate_equiv_gains):
New functions.
(find_costs_and_classes): Call calculate_equiv_gains and redefine
mem_cost of pseudos with equivs when LRA is used.
* var-tracking.cc: Include ira.h and lra.h.
(vt_initialize): Use lra_eliminate_regs when LRA is used.

diff --git a/gcc/dwarf2out.cc b/gcc/dwarf2out.cc
index 0ea73bf782e..1e0cec66c5e 100644
--- a/gcc/dwarf2out.cc
+++ b/gcc/dwarf2out.cc
@@ -14311,7 +14311,9 @@ reg_loc_descriptor (rtx rtl, enum var_init_status initialized)
  argument pointer and soft frame pointer rtx's.
  Use DW_OP_fbreg offset DW_OP_stack_value in this case.  */
   if ((rtl == arg_pointer_rtx || rtl == frame_pointer_rtx)
-  && eliminate_regs (rtl, VOIDmode, NULL_RTX) != rtl)
+  && (ira_use_lra_p
+	  ? lra_eliminate_regs (rtl, VOIDmode, NULL_RTX)
+	  : eliminate_regs (rtl, VOIDmode, NULL_RTX)) != rtl)
 {
   dw_loc_descr_ref result = NULL;
 
diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index d9e700e8947..a59d45a6e24 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -30,6 +30,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tm_p.h"
 #include "insn-config.h"
 #include "regs.h"
+#include "regset.h"
 #include "ira.h"
 #include "ira-int.h"
 #include "addresses.h"
@@ -1757,6 +1758,145 @@ process_bb_node_for_costs (ira_loop_tree_node_t loop_tree_node)
 process_bb_for_costs (bb);
 }
 
+/* Check that reg REGNO can be changed by TO in INSN.  Return true in case the
+   result insn would be valid one.  */
+static bool
+equiv_can_be_consumed_p (int regno, rtx to, rtx_insn *insn)
+{
+  validate_replace_src_group (regno_reg_rtx[regno], to, insn);
+  bool res = verify_changes (0);
+  cancel_changes (0);
+  return res;
+}
+
+/* Return true if X contains a pseudo with equivalence.  In this case also
+   return the pseudo through parameter REG.  If the pseudo is a part of subreg,
+   return the subreg through parameter SUBREG.  */
+
+static bool
+get_equiv_regno (rtx x, int , rtx )
+{
+  subreg = NULL_RTX;
+  if (GET_CODE (x) == SUBREG)
+{
+  subreg = x;
+  x = SUBREG_REG (x);
+}
+  if (REG_P (x)
+  && (ira_reg_equiv[REGNO (x)].memory != NULL
+	  || ira_reg_equiv[REGNO (x)].constant != NULL))
+{
+  regno = REGNO (x);
+  return true;
+}
+  RTX_CODE code = GET_CODE (x);
+  const char *fmt = GET_RTX_FORMAT (code);
+
+  for (int i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
+if (fmt[i] == 'e')
+  {
+	if (get_equiv_regno (XEXP (x, i), regno, subreg))
+	  return true;
+  }
+else if (fmt[i] == 'E')
+  {
+	for (int j = 0; j < XVECLEN (x, i); j++)
+	  if (get_equiv_regno (XVECEXP (x, i, j), regno, subreg))
+	return true;
+  }
+  return false;
+}
+
+/* A pass through the current function insns.  Calculate costs of using
+   equivalences for pseudos and store them in regno_equiv_gains.  */
+
+static void
+calculate_equiv_gains (void)
+{
+  basic_block bb;
+  int regno, freq, cost;
+  rtx subreg;
+  rtx_insn *insn;
+  machine_mode mode;
+  enum reg_class rclass;
+  bitmap_head equiv_pseudos;
+
+  ira_assert (allocno_p);
+  bitmap_initialize 

Re: [Backport RFA] lra: Avoid unfolded plus-0

2023-10-18 Thread Vladimir Makarov



On 10/18/23 09:37, Richard Sandiford wrote:

Vlad, is it OK if I backport the patch below to fix
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111528 ?  Jakub has
given a conditional OK on irc.


Ok.  It should be safe.  I don't expect any issues because of this.



Re: [PATCH] ira: Scale save/restore costs of callee save registers with block frequency

2023-10-05 Thread Vladimir Makarov



On 10/3/23 10:07, Surya Kumari Jangala wrote:

ira: Scale save/restore costs of callee save registers with block frequency

In assign_hard_reg(), when computing the costs of the hard registers, the
cost of saving/restoring a callee-save hard register in prolog/epilog is
taken into consideration. However, this cost is not scaled with the entry
block frequency. Without scaling, the cost of saving/restoring is quite
small and this can result in a callee-save register being chosen by
assign_hard_reg() even though there are free caller-save registers
available. Assigning a callee save register to a pseudo that is live
in the entire function and across a call will cause shrink wrap to fail.


Thank you for addressing this part of the code.  Sometimes changes that
look obvious have unpredicted results.  I remember experimenting with
different heuristics for this code a long time ago, when the 32-bit x86
target was the major one, and this was the best variant I found.  Since
a lot has changed since then, I decided to benchmark your change.


This change increases x86-64 spec2017 code size by 0.67% on average.
The increase is very stable across the 20 spec2017 benchmarks; only the
code for bwaves is smaller (by 0.01%).  The specfp2017 performance is
the same.  There is one positive impact: specint2017 improved by 0.6%
(8.59 vs 8.54), mainly because of improvements in xalancbmk (2.5%) and
exchange (5%).


So I propose making this change only when we are not optimizing for
code size.  Also, please be prepared for possible testsuite failures on
other targets: some targets are overconstrained by tests expecting
specific generated code.



2023-10-03  Surya Kumari Jangala  

gcc/
PR rtl-optimization/111673
* ira-color.cc (assign_hard_reg): Scale save/restore costs of
callee save registers with block frequency.

gcc/testsuite/
PR rtl-optimization/111673
* gcc.target/powerpc/pr111673.c: New test.
---

diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
index f2e8ea34152..eb20c52310d 100644
--- a/gcc/ira-color.cc
+++ b/gcc/ira-color.cc
@@ -2175,7 +2175,8 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
add_cost = ((ira_memory_move_cost[mode][rclass][0]
 + ira_memory_move_cost[mode][rclass][1])
* saved_nregs / hard_regno_nregs (hard_regno,
- mode) - 1);
+ mode) - 1)
+   * REG_FREQ_FROM_BB (ENTRY_BLOCK_PTR_FOR_FN (cfun));
cost += add_cost;
full_cost += add_cost;
  }
diff --git a/gcc/testsuite/gcc.target/powerpc/pr111673.c b/gcc/testsuite/gcc.target/powerpc/pr111673.c
new file mode 100644
index 000..e0c0f85460a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr111673.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O2 -fdump-rtl-pro_and_epilogue" } */
+
+/* Verify there is an early return without the prolog and shrink-wrap
+   the function. */
+
+int f (int);
+int
+advance (int dz)
+{
+  if (dz > 0)
+return (dz + dz) * dz;
+  else
+return dz * f (dz);
+}
+
+/* { dg-final { scan-rtl-dump-times "Performing shrink-wrapping" 1 "pro_and_epilogue" } } */





reverting patch to improve equiv cost calculation

2023-09-28 Thread Vladimir Makarov
I've got a lot of complaints about my recent patch to improve equiv cost 
calculation.  So I am reverting the patch.
commit 8552dcd8e4448c02fe230662093756b75dd94399
Author: Vladimir N. Makarov 
Date:   Thu Sep 28 11:53:51 2023 -0400

Revert "[RA]: Improve cost calculation of pseudos with equivalences"

This reverts commit 3c834d85f2ec42c60995c2b678196a06cb744959.

Although the patch improves x86-64 specfp2007, it also results in
performance and code size regression on different targets and
new GCC testsuite failures on tests expecting a specific output.

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index 8c93ace5094..d9e700e8947 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -1947,8 +1947,15 @@ find_costs_and_classes (FILE *dump_file)
 	}
 	  if (i >= first_moveable_pseudo && i < last_moveable_pseudo)
 	i_mem_cost = 0;
-	  else
-	i_mem_cost -= equiv_savings;
+	  else if (equiv_savings < 0)
+	i_mem_cost = -equiv_savings;
+	  else if (equiv_savings > 0)
+	{
+	  i_mem_cost = 0;
+	  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
+		i_costs[k] += equiv_savings;
+	}
+
 	  best_cost = (1 << (HOST_BITS_PER_INT - 2)) - 1;
 	  best = ALL_REGS;
 	  alt_class = NO_REGS;


[pushed][RA]: Add flag for checking IRA in progress

2023-09-28 Thread Vladimir Makarov
I've pushed the following patch. The explanation is in the commit message.  
The patch was successfully bootstrapped on x86-64.
commit 0c8ecbcd3cf7d7187d2017ad02b663a57123b417
Author: Vladimir N. Makarov 
Date:   Thu Sep 28 09:41:18 2023 -0400

[RA]: Add flag for checking IRA in progress

RISCV target developers need a flag to prevent creating
insns in IRA which can not be split after RA as they will need a
temporary reg.  The patch introduces such flag.

gcc/ChangeLog:

* rtl.h (lra_in_progress): Change type to bool.
(ira_in_progress): Add new extern.
* ira.cc (ira_in_progress): New global.
(pass_ira::execute): Set up ira_in_progress.
* lra.cc: (lra_in_progress): Change type to bool and initialize.
(lra): Use bool values for lra_in_progress.
* lra-eliminations.cc (init_elim_table): Ditto.

diff --git a/gcc/ira.cc b/gcc/ira.cc
index 0b0d460689d..d7530f01380 100644
--- a/gcc/ira.cc
+++ b/gcc/ira.cc
@@ -5542,6 +5542,9 @@ bool ira_conflicts_p;
 /* Saved between IRA and reload.  */
 static int saved_flag_ira_share_spill_slots;
 
+/* Set to true while in IRA.  */
+bool ira_in_progress = false;
+
 /* This is the main entry of IRA.  */
 static void
 ira (FILE *f)
@@ -6110,7 +6113,9 @@ public:
 }
   unsigned int execute (function *) final override
 {
+  ira_in_progress = true;
   ira (dump_file);
+  ira_in_progress = false;
   return 0;
 }
 
diff --git a/gcc/lra-eliminations.cc b/gcc/lra-eliminations.cc
index 4daaff1a124..9ff4774cf5d 100644
--- a/gcc/lra-eliminations.cc
+++ b/gcc/lra-eliminations.cc
@@ -1294,14 +1294,14 @@ init_elim_table (void)
  will cause, e.g., gen_rtx_REG (Pmode, STACK_POINTER_REGNUM) to
  equal stack_pointer_rtx.  We depend on this. Threfore we switch
  off that we are in LRA temporarily.  */
-  lra_in_progress = 0;
+  lra_in_progress = false;
   for (ep = reg_eliminate; ep < &reg_eliminate[NUM_ELIMINABLE_REGS]; ep++)
 {
   ep->from_rtx = gen_rtx_REG (Pmode, ep->from);
   ep->to_rtx = gen_rtx_REG (Pmode, ep->to);
   eliminable_reg_rtx[ep->from] = ep->from_rtx;
 }
-  lra_in_progress = 1;
+  lra_in_progress = true;
 }
 
 /* Function for initialization of elimination once per function.  It
diff --git a/gcc/lra.cc b/gcc/lra.cc
index 361f84fdacb..bcc00ff7d6b 100644
--- a/gcc/lra.cc
+++ b/gcc/lra.cc
@@ -2262,8 +2262,8 @@ update_inc_notes (void)
   }
 }
 
-/* Set to 1 while in lra.  */
-int lra_in_progress;
+/* Set to true while in LRA.  */
+bool lra_in_progress = false;
 
 /* Start of pseudo regnos before the LRA.  */
 int lra_new_regno_start;
@@ -2360,7 +2360,7 @@ lra (FILE *f)
   if (flag_checking)
 check_rtl (false);
 
-  lra_in_progress = 1;
+  lra_in_progress = true;
 
   lra_live_range_iter = lra_coalesce_iter = lra_constraint_iter = 0;
   lra_assignment_iter = lra_assignment_iter_after_spill = 0;
@@ -2552,7 +2552,7 @@ lra (FILE *f)
   ira_restore_scratches (lra_dump_file);
   lra_eliminate (true, false);
   lra_final_code_change ();
-  lra_in_progress = 0;
+  lra_in_progress = false;
   if (live_p)
 lra_clear_live_ranges ();
   lra_live_ranges_finish ();
diff --git a/gcc/rtl.h b/gcc/rtl.h
index 102ad9b57a6..8e59cd5d156 100644
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -4108,8 +4108,11 @@ extern int epilogue_completed;
 
 extern int reload_in_progress;
 
-/* Set to 1 while in lra.  */
-extern int lra_in_progress;
+/* Set to true while in IRA.  */
+extern bool ira_in_progress;
+
+/* Set to true while in LRA.  */
+extern bool lra_in_progress;
 
 /* This macro indicates whether you may create a new
pseudo-register.  */


[pushed] [PR111497][LRA]: Copy substituted equivalence

2023-09-25 Thread Vladimir Makarov

The following patch solves

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111497

The patch was successfully tested and bootstrapped on x86-64 and aarch64.
commit 3c23defed384cf17518ad6c817d94463a445d21b
Author: Vladimir N. Makarov 
Date:   Mon Sep 25 16:19:50 2023 -0400

[PR111497][LRA]: Copy substituted equivalence

When we substitute the equivalence and it becomes shared, we can fail
to correctly update reg info used by LRA.  This can result in wrong
code generation, e.g. because of incorrect live analysis.  It can also
result in compiler crash as the pseudo survives RA.  This is what
exactly happened for the PR.  This patch solves this problem by
unsharing substituted equivalences.

gcc/ChangeLog:

PR middle-end/111497
* lra-constraints.cc (lra_constraints): Copy substituted
equivalence.
* lra.cc (lra): Change comment for calling unshare_all_rtl_again.

gcc/testsuite/ChangeLog:

PR middle-end/111497
* g++.target/i386/pr111497.C: new test.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 3aaa4906999..76a1393ab23 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -5424,6 +5424,11 @@ lra_constraints (bool first_p)
 	   loc_equivalence_callback, curr_insn);
 	  if (old != *curr_id->operand_loc[0])
 		{
+		  /* If we substitute pseudo by shared equivalence, we can fail
+		 to update LRA reg info and this can result in many
+		 unexpected consequences.  So keep rtl unshared:  */
+		  *curr_id->operand_loc[0]
+		= copy_rtx (*curr_id->operand_loc[0]);
 		  lra_update_insn_regno_info (curr_insn);
 		  changed_p = true;
 		}
diff --git a/gcc/lra.cc b/gcc/lra.cc
index 563aff10b96..361f84fdacb 100644
--- a/gcc/lra.cc
+++ b/gcc/lra.cc
@@ -2579,9 +2579,8 @@ lra (FILE *f)
   if (inserted_p)
 commit_edge_insertions ();
 
-  /* Replacing pseudos with their memory equivalents might have
- created shared rtx.  Subsequent passes would get confused
- by this, so unshare everything here.  */
+  /* Subsequent passes expect that rtl is unshared, so unshare everything
+ here.  */
   unshare_all_rtl_again (get_insns ());
 
   if (flag_checking)
diff --git a/gcc/testsuite/g++.target/i386/pr111497.C b/gcc/testsuite/g++.target/i386/pr111497.C
new file mode 100644
index 000..a645bb95907
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr111497.C
@@ -0,0 +1,22 @@
+// { dg-do compile { target ia32 } }
+// { dg-options "-march=i686 -mtune=generic -fPIC -O2 -g" }
+
+class A;
+struct B { const char *b1; int b2; };
+struct C : B { C (const char *x, int y) { b1 = x; b2 = y; } };
+struct D : C { D (B x) : C (x.b1, x.b2) {} };
+struct E { E (A *); };
+struct F : E { D f1, f2, f3, f4, f5, f6; F (A *, const B &, const B &, const B &); };
+struct G : F { G (A *, const B &, const B &, const B &); };
+struct H { int h; };
+struct I { H i; };
+struct J { I *j; };
+struct A : J {};
+inline F::F (A *x, const B &y, const B &z, const B &w)
+  : E(x), f1(y), f2(z), f3(w), f4(y), f5(z), f6(w) {}
+G::G (A *x, const B &y, const B &z, const B &w) : F(x, y, z, w)
+{
+  H *h = &x->j->i;
+  if (h)
+h->h++;
+}


Re: [PATCH 02/13] [APX EGPR] middle-end: Add index_reg_class with insn argument.

2023-09-22 Thread Vladimir Makarov



On 9/22/23 06:56, Hongyu Wang wrote:

Like base_reg_class, INDEX_REG_CLASS also does not support backend insn.
Add index_reg_class with insn argument for lra/reload usage.

gcc/ChangeLog:

* addresses.h (index_reg_class): New wrapper function like
base_reg_class.
* doc/tm.texi: Document INSN_INDEX_REG_CLASS.
* doc/tm.texi.in: Ditto.
* lra-constraints.cc (index_part_to_reg): Pass index_class.
(process_address_1): Calls index_reg_class with curr_insn and
replace INDEX_REG_CLASS with its return value index_cl.
* reload.cc (find_reloads_address): Likewise.
(find_reloads_address_1): Likewise.


The patch is ok for me to commit to the trunk.  Thank you.

So all changes to the RA have been reviewed.  You just need approval for 
the rest of the patches from an x86-64 maintainer.




Re: [PATCH 01/13] [APX EGPR] middle-end: Add insn argument to base_reg_class

2023-09-22 Thread Vladimir Makarov



On 9/22/23 06:56, Hongyu Wang wrote:

From: Kong Lingling 

Current reload infrastructure does not support selective base_reg_class
for backend insn. Add new macros with insn parameters to base_reg_class
for lra/reload usage.

gcc/ChangeLog:

* addresses.h (base_reg_class): Add insn argument and new macro
INSN_BASE_REG_CLASS.
(regno_ok_for_base_p_1): Add insn argument and new macro
REGNO_OK_FOR_INSN_BASE_P.
(regno_ok_for_base_p): Add insn argument and parse to ok_for_base_p_1.
* doc/tm.texi: Document INSN_BASE_REG_CLASS and
REGNO_OK_FOR_INSN_BASE_P.
* doc/tm.texi.in: Ditto.
* lra-constraints.cc (process_address_1): Pass insn to
base_reg_class.
(curr_insn_transform): Ditto.
* reload.cc (find_reloads): Ditto.
(find_reloads_address): Ditto.
(find_reloads_address_1): Ditto.
(find_reloads_subreg_address): Ditto.
* reload1.cc (maybe_fix_stack_asms): Ditto.


The patch is ok for committing to the trunk.  Thank you.

It would be nice to add to the documentation that INSN_BASE_REG_CLASS, 
INSN_INDEX_REG_CLASS, and REGNO_OK_FOR_INSN_BASE_P, if defined, have 
priority over the older corresponding macros, as is already documented for 
REGNO_MODE_CODE_OK_FOR_BASE_P relative to REGNO_OK_FOR_BASE_P. But this 
small issue can be addressed later.





Re: [PATCH] ira: Consider save/restore costs of callee-save registers [PR110071]

2023-09-18 Thread Vladimir Makarov via Gcc-patches



On 9/15/23 10:48, Vladimir Makarov wrote:


On 9/14/23 06:45, Surya Kumari Jangala wrote:

ira: Consider save/restore costs of callee-save registers [PR110071]

In improve_allocation() routine, IRA checks for each allocno if spilling
any conflicting allocnos can improve the allocation of this allocno.
This routine computes the cost improvement for usage of each profitable
hard register for a given allocno. The existing code in
improve_allocation() does not consider the save/restore costs of callee
save registers while computing the cost improvement.

This can result in a callee save register being assigned to a pseudo
that is live in the entire function and across a call, overriding a
non-callee save register assigned to the pseudo by graph coloring. So
the entry basic block requires a prolog, thereby causing shrink wrap to
fail.


Yes, that can be a problem.  The general idea is ok for me, and common 
sense tells me that the performance should be better, but I would like 
to benchmark the patch on x86-64 spec2017 first.  Real applications 
have high register pressure, and the results might not be what we expect.  
So I'll do it, report the results, and give my approval if there is no 
big performance degradation.  I think the results will be ready on 
Monday.



I've benchmarked the patch on x86-64.  The specint2017 rate changed from 
8.54 to 8.51 and the specfp2017 rate changed from 21.1 to 21.2.  This is 
probably within the range of measurement error.


So the patch is ok for me to commit.  Thank you for working on the issue.




Re: [PATCH] ira: Consider save/restore costs of callee-save registers [PR110071]

2023-09-15 Thread Vladimir Makarov via Gcc-patches



On 9/14/23 06:45, Surya Kumari Jangala wrote:

ira: Consider save/restore costs of callee-save registers [PR110071]

In improve_allocation() routine, IRA checks for each allocno if spilling
any conflicting allocnos can improve the allocation of this allocno.
This routine computes the cost improvement for usage of each profitable
hard register for a given allocno. The existing code in
improve_allocation() does not consider the save/restore costs of callee
save registers while computing the cost improvement.

This can result in a callee save register being assigned to a pseudo
that is live in the entire function and across a call, overriding a
non-callee save register assigned to the pseudo by graph coloring. So
the entry basic block requires a prolog, thereby causing shrink wrap to
fail.


Yes, that can be a problem.  The general idea is ok for me, and common 
sense tells me that the performance should be better, but I would like to 
benchmark the patch on x86-64 spec2017 first.  Real applications have 
high register pressure, and the results might not be what we expect.  So I'll 
do it, report the results, and give my approval if there is no big 
performance degradation.  I think the results will be ready on Monday.





[pushed] [RA]: Improve cost calculation of pseudos with equivalences

2023-09-14 Thread Vladimir Makarov via Gcc-patches
I've committed the following patch.  The reason for this patch is 
explained in its commit message.


The patch was successfully bootstrapped and tested on x86-64, aarch64, 
and ppc64le.


commit 3c834d85f2ec42c60995c2b678196a06cb744959
Author: Vladimir N. Makarov 
Date:   Thu Sep 14 10:26:48 2023 -0400

[RA]: Improve cost calculation of pseudos with equivalences

RISCV target developers reported that RA can spill pseudo used in a
loop although there are enough registers to assign.  It happens when
the pseudo has an equivalence outside the loop and the equivalence is
not merged into insns using the pseudo.  IRA sets up that memory cost
to zero when the pseudo has an equivalence and it means that the
pseudo will be probably spilled.  This approach worked well for i686
(different approaches were benchmarked long time ago on spec2k).
Although common sense says that the code is wrong and this was
confirmed by RISCV developers.

I've tried the following patch on I7-9700k and it improved spec17 fp
by 1.5% (21.1 vs 20.8) although spec17 int is a bit worse by 0.45%
(8.54 vs 8.58).  The average generated code size is practically the
same (0.001% difference).

In the future we probably need to try more sophisticated cost
calculation which should take into account that the equiv can not be
combined in usage insns and the costs of reloads because of this.

gcc/ChangeLog:

* ira-costs.cc (find_costs_and_classes): Decrease memory cost
by equiv savings.

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index d9e700e8947..8c93ace5094 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -1947,15 +1947,8 @@ find_costs_and_classes (FILE *dump_file)
 	}
 	  if (i >= first_moveable_pseudo && i < last_moveable_pseudo)
 	i_mem_cost = 0;
-	  else if (equiv_savings < 0)
-	i_mem_cost = -equiv_savings;
-	  else if (equiv_savings > 0)
-	{
-	  i_mem_cost = 0;
-	  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
-		i_costs[k] += equiv_savings;
-	}
-
+	  else
+	i_mem_cost -= equiv_savings;
 	  best_cost = (1 << (HOST_BITS_PER_INT - 2)) - 1;
 	  best = ALL_REGS;
 	  alt_class = NO_REGS;


Re: [PATCH 01/13] [APX EGPR] middle-end: Add insn argument to base_reg_class

2023-09-14 Thread Vladimir Makarov via Gcc-patches



On 9/10/23 00:49, Hongyu Wang wrote:

Vladimir Makarov via Gcc-patches wrote on Sat, Sep 9, 2023 at 01:04:


On 8/31/23 04:20, Hongyu Wang wrote:

@@ -2542,6 +2542,8 @@ the code of the immediately enclosing expression 
(@code{MEM} for the top level
   of an address, @code{ADDRESS} for something that occurs in an
   @code{address_operand}).  @var{index_code} is the code of the corresponding
   index expression if @var{outer_code} is @code{PLUS}; @code{SCRATCH} 
otherwise.
+@code{insn} indicates insn specific base register class should be subset
+of the original base register class.
   @end defmac

I'd prefer a more general description of the 'insn' argument for the macros.
Something like this:

@code{insn} can be used to define an insn-specific base register class.


Sure, will adjust in the V2 patch.
Also, currently we reuse the old macro MODE_CODE_BASE_REG_CLASS, do
you think we need a new macro like INSN_BASE_REG_CLASS as other
parameters are actually unused? Then we don't need to change other
targets like avr/gcn.

I thought about this too.  Adding new macros would definitely be worthwhile, 
especially since you are already adding INSN_INDEX_REG_CLASS.


The names INSN_BASE_REG_CLASS instead of MODE_CODE_BASE_REG_CLASS and 
REGNO_OK_FOR_INSN_BASE_P instead of REGNO_MODE_CODE_OK_FOR_BASE_P are ok 
for me too.


When you submit the v2 patch, I'll review the RA part as soon as 
possible (actually I have already looked at it) and most probably give my 
approval for the RA part, because I prefer your current approach for RA 
over introducing new memory constraints.




Re: [PATCH 01/13] [APX EGPR] middle-end: Add insn argument to base_reg_class

2023-09-08 Thread Vladimir Makarov via Gcc-patches



On 8/31/23 04:20, Hongyu Wang wrote:

@@ -2542,6 +2542,8 @@ the code of the immediately enclosing expression 
(@code{MEM} for the top level
  of an address, @code{ADDRESS} for something that occurs in an
  @code{address_operand}).  @var{index_code} is the code of the corresponding
  index expression if @var{outer_code} is @code{PLUS}; @code{SCRATCH} otherwise.
+@code{insn} indicates insn specific base register class should be subset
+of the original base register class.
  @end defmac


I'd prefer a more general description of the 'insn' argument for the macros.  
Something like this:


@code{insn} can be used to define an insn-specific base register class.




[pushed][PR111225][LRA]: Don't reuse chosen insn alternative with special memory constraint

2023-09-07 Thread Vladimir Makarov via Gcc-patches

The following patch solves

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111225

The patch was successfully bootstrapped and tested on x86-64, aarch64, 
and ppc64le.


commit f7bca44d97ad01b39f9d6e7809df7bf517eeb2fb
Author: Vladimir N. Makarov 
Date:   Thu Sep 7 09:59:10 2023 -0400

[LRA]: Don't reuse chosen insn alternative with special memory constraint

To speed up GCC, LRA reuses chosen alternative from previous
constraint subpass.  A spilled pseudo is considered ok for any memory
constraint although stack slot assigned to the pseudo later might not
satisfy the chosen alternative constraint.  As we don't consider all insn
alternatives on the subsequent LRA sub-passes, it might result in LRA failure
to generate the correct insn.  This patch solves the problem.

gcc/ChangeLog:

PR target/111225
* lra-constraints.cc (goal_reuse_alt_p): New global flag.
(process_alt_operands): Set up the flag.  Clear flag for chosen
alternative with special memory constraints.
(curr_insn_transform): Set up used insn alternative depending on the flag.

gcc/testsuite/ChangeLog:

PR target/111225
* gcc.target/i386/pr111225.c: New test.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index c718bedff32..3aaa4906999 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -1462,6 +1462,9 @@ static int goal_alt_matches[MAX_RECOG_OPERANDS];
 static int goal_alt_dont_inherit_ops_num;
 /* Numbers of operands whose reload pseudos should not be inherited.  */
 static int goal_alt_dont_inherit_ops[MAX_RECOG_OPERANDS];
+/* True if we should try only this alternative for the next constraint sub-pass
+   to speed up the sub-pass.  */
+static bool goal_reuse_alt_p;
 /* True if the insn commutative operands should be swapped.  */
 static bool goal_alt_swapped;
 /* The chosen insn alternative.	 */
@@ -2130,6 +2133,7 @@ process_alt_operands (int only_alternative)
   int curr_alt_dont_inherit_ops_num;
   /* Numbers of operands whose reload pseudos should not be inherited.	*/
   int curr_alt_dont_inherit_ops[MAX_RECOG_OPERANDS];
+  bool curr_reuse_alt_p;
   /* True if output stack pointer reload should be generated for the current
  alternative.  */
   bool curr_alt_out_sp_reload_p;
@@ -2217,6 +2221,7 @@ process_alt_operands (int only_alternative)
   reject += static_reject;
   early_clobbered_regs_num = 0;
   curr_alt_out_sp_reload_p = false;
+  curr_reuse_alt_p = true;
   
   for (nop = 0; nop < n_operands; nop++)
 	{
@@ -2574,7 +2579,10 @@ process_alt_operands (int only_alternative)
 		  if (satisfies_memory_constraint_p (op, cn))
 			win = true;
 		  else if (spilled_pseudo_p (op))
-			win = true;
+			{
+			  curr_reuse_alt_p = false;
+			  win = true;
+			}
 		  break;
 		}
 		  break;
@@ -3318,6 +3326,7 @@ process_alt_operands (int only_alternative)
 	  goal_alt_offmemok[nop] = curr_alt_offmemok[nop];
 	}
 	  goal_alt_dont_inherit_ops_num = curr_alt_dont_inherit_ops_num;
+	  goal_reuse_alt_p = curr_reuse_alt_p;
 	  for (nop = 0; nop < curr_alt_dont_inherit_ops_num; nop++)
 	goal_alt_dont_inherit_ops[nop] = curr_alt_dont_inherit_ops[nop];
 	  goal_alt_swapped = curr_swapped;
@@ -4399,7 +4408,8 @@ curr_insn_transform (bool check_only_p)
 }
 
   lra_assert (goal_alt_number >= 0);
-  lra_set_used_insn_alternative (curr_insn, goal_alt_number);
+  lra_set_used_insn_alternative (curr_insn, goal_reuse_alt_p
+ ? goal_alt_number : LRA_UNKNOWN_ALT);
 
   if (lra_dump_file != NULL)
 {
diff --git a/gcc/testsuite/gcc.target/i386/pr111225.c b/gcc/testsuite/gcc.target/i386/pr111225.c
new file mode 100644
index 000..5d92daf215b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111225.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fsanitize=thread -mforce-drap -mavx512cd" } */
+
+typedef long long __m256i __attribute__ ((__vector_size__ (32)));
+
+int foo (__m256i x, __m256i y)
+{
+  __m256i a = x & ~y;
+  return !__builtin_ia32_ptestz256 (a, a);
+}
+
+int bar (__m256i x, __m256i y)
+{
+  __m256i a = ~x & y;
+  return !__builtin_ia32_ptestz256 (a, a);
+}


Re: [PATCH 01/13] [APX EGPR] middle-end: Add insn argument to base_reg_class

2023-09-07 Thread Vladimir Makarov via Gcc-patches



On 9/7/23 02:23, Uros Bizjak wrote:

On Wed, Sep 6, 2023 at 9:43 PM Vladimir Makarov  wrote:


On 9/1/23 05:07, Hongyu Wang wrote:



I think the approach proposed by the Intel developers is better.  In some way
we already use such an approach when we pass the memory mode to get the base
reg class, although we could use different memory constraints for
different modes when the possible base reg differs for some memory
modes.

Using special memory constraints probably can be implemented too (I
understand the attractiveness of such an approach for the readability of the
machine description).  But in my opinion it will require much bigger
work in IRA/LRA/reload.  It will also significantly slow down RA, as we need
to process insn constraints for each memory operand in many places
(e.g. for the calculation of reg classes and costs in IRA).  Still, I think
there will be a few cases where this approach results in a bigger
probability of assigning a hard reg outside the specific base reg class, and
this will result in additional reloads.

So the approach proposed by Intel is ok for me.  Although if the x86 maintainers
are strongly against this approach and the changes in the x86 machine-dependent
code, and the Intel developers implement Uros' approach, I am
ready to review it.  But I still prefer the current Intel developers'
approach for the reasons I mentioned above.

My above proposal is more or less a wish from a target maintainer PoV.
Ideally, we would have a bunch of different memory constraints, and a
target hook that returns corresponding BASE/INDEX reg classes.
However, I have no idea about the complexity of the implementation in
the infrastructure part of the compiler.

Basically, it needs new hooks which return the base and index classes from 
special memory constraints.  When we process memory in an insn (in a lot of 
places in IRA, LRA, and reload) we should consider all possible memory insn 
constraints, take the intersection of the base and index reg classes for 
those constraints, and use it instead of the default base and index reg 
classes.
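To make the idea concrete, here is a small standalone C++ toy showing just the intersection step (everything in it, including the constraint names, the register-class bit masks, and the types, is invented; it is not the GCC interface):

// Toy sketch of "take the intersection of the base reg classes allowed by
// all memory constraints of an insn".  Everything here is hypothetical;
// real GCC represents register classes and constraints quite differently.
#include <cstdint>
#include <cstdio>
#include <vector>

using hard_reg_set = std::uint32_t;                       // one bit per hard register

constexpr hard_reg_set LEGACY_BASE_REGS   = 0x0000ffffu;  // invented: "r0"-"r15"
constexpr hard_reg_set EXTENDED_BASE_REGS = 0xffffffffu;  // invented: "r0"-"r31"

struct mem_constraint
{
  const char *name;          // invented constraint name
  hard_reg_set base_class;   // base registers this constraint would allow
};

// The base class usable for an address pseudo of one insn would be the
// intersection over all memory constraints that can match the operand.
static hard_reg_set
usable_base_class (const std::vector<mem_constraint> &cns)
{
  hard_reg_set result = EXTENDED_BASE_REGS;
  for (const mem_constraint &c : cns)
    result &= c.base_class;
  return result;
}

int main ()
{
  std::vector<mem_constraint> insn_mem_constraints
    = { { "m", EXTENDED_BASE_REGS },                // ordinary memory
        { "legacy-only-mem", LEGACY_BASE_REGS } };  // invented restricted memory
  std::printf ("usable base regs: 0x%08x\n",
               (unsigned) usable_base_class (insn_mem_constraints));
  return 0;
}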


The required functionality is absent in reload too.

I would say that it is a moderate-size project (1-2 months for me).  It 
still requires introducing new hooks, and I guess there are a few cases 
where we would still assign hard regs outside the desirable base class for 
address pseudos, and this would result in the generation of additional reload 
insns.  It also means many more additional changes in the RA source code and 
the x86 machine-dependent files.


Probably, with this approach there will also be edge cases where we need 
to solve new PRs because of LRA failures to generate the correct code, 
but I believe they can be solved.


Therefore I lean toward the current Intel approach, where to get the base reg 
class we pass the insn as a parameter in addition to the memory mode.





Re: [PATCH 01/13] [APX EGPR] middle-end: Add insn argument to base_reg_class

2023-09-06 Thread Vladimir Makarov via Gcc-patches



On 9/1/23 05:07, Hongyu Wang wrote:

Uros Bizjak via Gcc-patches wrote on Thu, Aug 31, 2023 at 18:16:

On Thu, Aug 31, 2023 at 10:20 AM Hongyu Wang  wrote:

From: Kong Lingling 

Current reload infrastructure does not support selective base_reg_class
for backend insn. Add insn argument to base_reg_class for
lra/reload usage.

I don't think this is the correct approach. Ideally, a memory
constraint should somehow encode its BASE/INDEX register class.
Instead of passing "insn", simply a different constraint could be used
in the constraint string of the relevant insn.

We tried the constraint-only approach at the beginning, but then we found
the reload infrastructure does not work like that.

The BASE/INDEX reg classes are determined before choosing alternatives, in
process_address under curr_insn_transform. Process_address creates the mem
operand according to the BASE/INDEX reg class. Then, the memory operand
constraint check will evaluate the mem op with targetm.legitimate_address_p.

If we want to make use of EGPR in base/index we need to either extend BASE/INDEX
reg class in the backend, or, for specific insns, add a target hook to
tell reload
that the extended reg class with EGPR can be used to construct memory operand.

CC'd Vladimir as git send-mail failed to add recipient.



I think the approach proposed by the Intel developers is better.  In some way
we already use such an approach when we pass the memory mode to get the base
reg class, although we could use different memory constraints for
different modes when the possible base reg differs for some memory
modes.

Using special memory constraints probably can be implemented too (I
understand the attractiveness of such an approach for the readability of the
machine description).  But in my opinion it will require much bigger
work in IRA/LRA/reload.  It will also significantly slow down RA, as we need
to process insn constraints for each memory operand in many places
(e.g. for the calculation of reg classes and costs in IRA).  Still, I think
there will be a few cases where this approach results in a bigger
probability of assigning a hard reg outside the specific base reg class, and
this will result in additional reloads.

So the approach proposed by Intel is ok for me.  Although if the x86 maintainers
are strongly against this approach and the changes in the x86 machine-dependent
code, and the Intel developers implement Uros' approach, I am
ready to review it.  But I still prefer the current Intel developers'
approach for the reasons I mentioned above.



Re: [pushed][LRA]: Spill pseudos assigned to fp when fp->sp elimination became impossible

2023-08-17 Thread Vladimir Makarov via Gcc-patches



On 8/17/23 07:19, senthilkumar.selva...@microchip.com wrote:

On Wed, 2023-08-16 at 12:13 -0400, Vladimir Makarov wrote:

EXTERNAL EMAIL: Do not click links or open attachments unless you know the 
content is safe

The attached patch fixes recently found wrong insn removal in LRA port
for AVR.

The patch was successfully tested and bootstrapped on x86-64 and aarch64.



Hi Vladimir,

   Thanks for working on this. After applying the patch, I'm seeing that the
   pseudo in the frame pointer that got spilled is taking up the same stack
   slot that was already assigned to a spilled pseudo, and that is causing 
execution
   failure (it is also causing a crash when building libgcc for avr)

...
   I tried a hacky workaround (see patch below) to create a new stack slot and
   assign the spilled pseudo to it, and that works.
   
   Not sure if that's the right way to do it though.


The general approach of the solution is right, but I've just committed a 
different version of the patch.





[pushed][LRA]: When assigning stack slots to pseudos previously assigned to fp consider other spilled pseudos

2023-08-17 Thread Vladimir Makarov via Gcc-patches
The following patch fixes a problem with allocating the same stack slots 
to conflicting pseudos.  The problem exists only for the AVR LRA port.


The patch was successfully bootstrapped and tested on x86-64 and aarch64.

commit c024867d1aa9d465e0236fc9d45d8e1d4bb6bd30
Author: Vladimir N. Makarov 
Date:   Thu Aug 17 11:57:45 2023 -0400

[LRA]: When assigning stack slots to pseudos previously assigned to fp 
consider other spilled pseudos

The previous LRA patch can assign slot of conflicting pseudos to
pseudos spilled after prohibiting fp->sp elimination.  This patch
fixes this problem.

gcc/ChangeLog:

* lra-spills.cc (assign_stack_slot_num_and_sort_pseudos): Moving
slots_num initialization from here ...
(lra_spill): ... to here before the 1st call of
assign_stack_slot_num_and_sort_pseudos.  Add the 2nd call after
fp->sp elimination.

diff --git a/gcc/lra-spills.cc b/gcc/lra-spills.cc
index 7e1d35b5e4e..a663a1931e3 100644
--- a/gcc/lra-spills.cc
+++ b/gcc/lra-spills.cc
@@ -363,7 +363,6 @@ assign_stack_slot_num_and_sort_pseudos (int *pseudo_regnos, int n)
 {
   int i, j, regno;
 
-  slots_num = 0;
   /* Assign stack slot numbers to spilled pseudos, use smaller numbers
  for most frequently used pseudos. */
   for (i = 0; i < n; i++)
@@ -628,6 +627,7 @@ lra_spill (void)
   /* Sort regnos according their usage frequencies.  */
   qsort (pseudo_regnos, n, sizeof (int), regno_freq_compare);
   n = assign_spill_hard_regs (pseudo_regnos, n);
+  slots_num = 0;
   assign_stack_slot_num_and_sort_pseudos (pseudo_regnos, n);
   for (i = 0; i < n; i++)
 if (pseudo_slots[pseudo_regnos[i]].mem == NULL_RTX)
@@ -635,6 +635,7 @@ lra_spill (void)
   if ((n2 = lra_update_fp2sp_elimination (pseudo_regnos)) > 0)
 {
   /* Assign stack slots to spilled pseudos assigned to fp.  */
+  assign_stack_slot_num_and_sort_pseudos (pseudo_regnos, n2);
   for (i = 0; i < n2; i++)
if (pseudo_slots[pseudo_regnos[i]].mem == NULL_RTX)
  assign_mem_slot (pseudo_regnos[i]);


[pushed][LRA]: Spill pseudos assigned to fp when fp->sp elimination became impossible

2023-08-16 Thread Vladimir Makarov via Gcc-patches
The attached patch fixes a recently found wrong insn removal in the LRA 
port for AVR.


The patch was successfully tested and bootstrapped on x86-64 and aarch64.


commit 748a77558ff37761faa234e19327ad1decaace33
Author: Vladimir N. Makarov 
Date:   Wed Aug 16 09:13:54 2023 -0400

[LRA]: Spill pseudos assigned to fp when fp->sp elimination became 
impossible

Porting LRA to AVR revealed that creating a stack slot can make fp->sp
elimination impossible.  The previous patch undid the fp assignment after
the stack slot creation but then calculated the live info wrongly.  This
resulted in wrong code generation by deleting some still-live insns.  This
patch fixes this problem.

gcc/ChangeLog:

* lra-int.h (lra_update_fp2sp_elimination): Change the prototype.
* lra-eliminations.cc (spill_pseudos): Record spilled pseudos.
(lra_update_fp2sp_elimination): Ditto.
(update_reg_eliminate): Adjust spill_pseudos call.
* lra-spills.cc (lra_spill): Assign stack slots to pseudos spilled
in lra_update_fp2sp_elimination.

diff --git a/gcc/lra-eliminations.cc b/gcc/lra-eliminations.cc
index 1f4e3fec9e0..3c58d4a3815 100644
--- a/gcc/lra-eliminations.cc
+++ b/gcc/lra-eliminations.cc
@@ -1086,18 +1086,18 @@ eliminate_regs_in_insn (rtx_insn *insn, bool replace_p, bool first_p,
   lra_update_insn_recog_data (insn);
 }
 
-/* Spill pseudos which are assigned to hard registers in SET.  Add
-   affected insns for processing in the subsequent constraint
-   pass.  */
-static void
-spill_pseudos (HARD_REG_SET set)
+/* Spill pseudos which are assigned to hard registers in SET, record them in
+   SPILLED_PSEUDOS unless it is null, and return the recorded pseudos number.
+   Add affected insns for processing in the subsequent constraint pass.  */
+static int
+spill_pseudos (HARD_REG_SET set, int *spilled_pseudos)
 {
-  int i;
+  int i, n;
   bitmap_head to_process;
   rtx_insn *insn;
 
   if (hard_reg_set_empty_p (set))
-return;
+return 0;
   if (lra_dump_file != NULL)
 {
   fprintf (lra_dump_file, "   Spilling non-eliminable hard regs:");
@@ -1107,6 +1107,7 @@ spill_pseudos (HARD_REG_SET set)
   fprintf (lra_dump_file, "\n");
 }
   bitmap_initialize (&to_process, &reg_obstack);
+  n = 0;
   for (i = FIRST_PSEUDO_REGISTER; i < max_reg_num (); i++)
 if (lra_reg_info[i].nrefs != 0 && reg_renumber[i] >= 0
&& overlaps_hard_reg_set_p (set,
@@ -1116,6 +1117,8 @@ spill_pseudos (HARD_REG_SET set)
  fprintf (lra_dump_file, "  Spilling r%d(%d)\n",
   i, reg_renumber[i]);
reg_renumber[i] = -1;
+   if (spilled_pseudos != NULL)
+ spilled_pseudos[n++] = i;
	bitmap_ior_into (&to_process, &lra_reg_info[i].insn_bitmap);
   }
   lra_no_alloc_regs |= set;
@@ -1126,6 +1129,7 @@ spill_pseudos (HARD_REG_SET set)
lra_set_used_insn_alternative (insn, LRA_UNKNOWN_ALT);
   }
   bitmap_clear (&to_process);
+  return n;
 }
 
 /* Update all offsets and possibility for elimination on eliminable
@@ -1238,7 +1242,7 @@ update_reg_eliminate (bitmap insns_with_changed_offsets)
   }
   lra_no_alloc_regs |= temp_hard_reg_set;
   eliminable_regset &= ~temp_hard_reg_set;
-  spill_pseudos (temp_hard_reg_set);
+  spill_pseudos (temp_hard_reg_set, NULL);
   return result;
 }
 
@@ -1382,15 +1386,17 @@ process_insn_for_elimination (rtx_insn *insn, bool final_p, bool first_p)
 
 /* Update frame pointer to stack pointer elimination if we started with
permitted frame pointer elimination and now target reports that we can not
-   do this elimination anymore.  */
-void
-lra_update_fp2sp_elimination (void)
+   do this elimination anymore.  Record spilled pseudos in SPILLED_PSEUDOS
+   unless it is null, and return the recorded pseudos number.  */
+int
+lra_update_fp2sp_elimination (int *spilled_pseudos)
 {
+  int n;
   HARD_REG_SET set;
   class lra_elim_table *ep;
 
   if (frame_pointer_needed || !targetm.frame_pointer_required ())
-return;
+return 0;
   gcc_assert (!elimination_fp2sp_occured_p);
   if (lra_dump_file != NULL)
 fprintf (lra_dump_file,
@@ -1398,10 +1404,11 @@ lra_update_fp2sp_elimination (void)
   frame_pointer_needed = true;
   CLEAR_HARD_REG_SET (set);
   add_to_hard_reg_set (&set, Pmode, HARD_FRAME_POINTER_REGNUM);
-  spill_pseudos (set);
+  n = spill_pseudos (set, spilled_pseudos);
   for (ep = reg_eliminate; ep < &reg_eliminate[NUM_ELIMINABLE_REGS]; ep++)
 if (ep->from == FRAME_POINTER_REGNUM && ep->to == STACK_POINTER_REGNUM)
   setup_can_eliminate (ep, false);
+  return n;
 }
 
 /* Entry function to do final elimination if FINAL_P or to update
diff --git a/gcc/lra-int.h b/gcc/lra-int.h
index 633d9af8058..d0752c2ae50 100644
--- a/gcc/lra-int.h
+++ b/gcc/lra-int.h
@@ -414,7 +414,7 @@ extern int lra_get_elimination_hard_regno (int);
 extern rtx lra_eliminate_regs_1 (rtx_insn *, rtx, machine_mode,
 bool, bool, 

[pushed][LRA]: Process output stack pointer reloads before emitting reload insns

2023-08-14 Thread Vladimir Makarov via Gcc-patches

The patch fixes a failure of building aarch64 port with my yesterday patch.

The patch was successfully bootstrapped on x86-64 and aarch64.
commit c4760c0161f92b92361feba11836e3d066bb330c
Author: Vladimir N. Makarov 
Date:   Mon Aug 14 16:06:27 2023 -0400

[LRA]: Process output stack pointer reloads before emitting reload insns

Previous patch setting up asserts for processing stack pointer reloads
caught an error in code moving sp offset.  This resulted in failure of
building aarch64 port. The code wrongly processed insns beyond the
output reloads of the current insn.  This patch fixes it.

gcc/ChangeLog:

* lra-constraints.cc (curr_insn_transform): Process output stack
pointer reloads before emitting reload insns.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 8d9443adeb6..c718bedff32 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -4840,7 +4840,6 @@ curr_insn_transform (bool check_only_p)
/* Most probably there are no enough registers to satisfy asm insn: */
lra_asm_insn_error (curr_insn);
 }
-  lra_process_new_insns (curr_insn, before, after, "Inserting insn reload");
   if (goal_alt_out_sp_reload_p)
 {
   /* We have an output stack pointer reload -- update sp offset: */
@@ -4863,6 +4862,7 @@ curr_insn_transform (bool check_only_p)
  }
   lra_assert (done_p);
 }
+  lra_process_new_insns (curr_insn, before, after, "Inserting insn reload");
   return change_p;
 }
 


Re: [pushed]LRA]: Fix asserts for output stack pointer reloads

2023-08-14 Thread Vladimir Makarov via Gcc-patches



On 8/14/23 14:37, Prathamesh Kulkarni wrote:

On Mon, 14 Aug 2023 at 06:39, Vladimir Makarov via Gcc-patches
 wrote:

The following patch fixes useless asserts in my latest patch
implementing output stack pointer reloads.

Hi Vladimir,
It seems that this patch caused the following ICE on aarch64-linux-gnu
while building cp-demangle.c:
compile:  
/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/stage1-build/./gcc/xgcc
-B/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/stage1-build/./gcc/
-B/usr/local/aarch64-unknown-linux-gnu/bin/
-B/usr/local/aarch64-unknown-linux-gnu/lib/ -isystem
/usr/local/aarch64-unknown-linux-gnu/include -isystem
/usr/local/aarch64-unknown-linux-gnu/sys-include -DHAVE_CONFIG_H -I..
-I/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/gcc/libstdc++-v3/../libiberty
-I/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/gcc/libstdc++-v3/../include
-D_GLIBCXX_SHARED
-I/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/stage1-build/aarch64-unknown-linux-gnu/libstdc++-v3/include/aarch64-unknown-linux-gnu
-I/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/stage1-build/aarch64-unknown-linux-gnu/libstdc++-v3/include
-I/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/gcc/libstdc++-v3/libsupc++
-g -O2 -DIN_GLIBCPP_V3 -Wno-error -c cp-demangle.c  -fPIC -DPIC -o
cp-demangle.o
during RTL pass: reload
cp-demangle.c: In function ‘d_demangle_callback.constprop’:
cp-demangle.c:6815:1: internal compiler error: in curr_insn_transform,
at lra-constraints.cc:4854
  6815 | }
   | ^
0xce6b37 curr_insn_transform
 ../../gcc/gcc/lra-constraints.cc:4854
0xce7887 lra_constraints(bool)
 ../../gcc/gcc/lra-constraints.cc:5478
0xccdfa7 lra(_IO_FILE*)
 ../../gcc/gcc/lra.cc:2419
0xc7e417 do_reload
 ../../gcc/gcc/ira.cc:5970
0xc7e417 execute
 ../../gcc/gcc/ira.cc:6156
Please submit a full bug report, with preprocessed source (by using
-freport-bug).
Please include the complete backtrace with any bug report.


Sorry, I should have bootstrapped my patch on aarch64.

The asserts actually seem very useful, as I found they caught a bug in 
my previous patch.


I'll push a patch fixing the problems after finishing bootstraps, 
probably in a couple of hours.


Thank you





[pushed]LRA]: Fix asserts for output stack pointer reloads

2023-08-13 Thread Vladimir Makarov via Gcc-patches
The following patch fixes useless asserts in my latest patch 
implementing output stack pointer reloads.
commit 18b417fe1a46d37738243267c1f559cd0acc4886
Author: Vladimir N. Makarov 
Date:   Sun Aug 13 20:54:58 2023 -0400

[LRA]: Fix asserts for output stack pointer reloads

The patch implementing output stack pointer reloads contained superfluous
asserts.  The patch makes them useful.

gcc/ChangeLog:

* lra-constraints.cc (curr_insn_transform): Set done_p up and
check it on true after processing output stack pointer reload.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 26239908747..8d9443adeb6 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -4852,6 +4852,7 @@ curr_insn_transform (bool check_only_p)
&& SET_DEST (set) == stack_pointer_rtx)
  {
lra_assert (!done_p);
+   done_p = true;
curr_id->sp_offset = 0;
lra_insn_recog_data_t id = lra_get_insn_recog_data (insn);
id->sp_offset = sp_offset;
@@ -4860,7 +4861,7 @@ curr_insn_transform (bool check_only_p)
   "Moving sp offset from insn %u to %u\n",
   INSN_UID (curr_insn), INSN_UID (insn));
  }
-  lra_assert (!done_p);
+  lra_assert (done_p);
 }
   return change_p;
 }


[pushed][LRA]: Implement output stack pointer reloads

2023-08-11 Thread Vladimir Makarov via Gcc-patches
Sorry, I had some problems with email.  Therefore there are email 
duplication and they were sent to g...@gcc.gnu.org instead of 
gcc-patches@gcc.gnu.org



On 8/9/23 16:54, Vladimir Makarov wrote:




On 8/9/23 07:15, senthilkumar.selva...@microchip.com wrote:

Hi,

   After turning on FP -> SP elimination after Vlad fixed
   an elimination issue in 
https://gcc.gnu.org/git?p=gcc.git;a=commit;h=2971ff7b1d564ac04b537d907c70e6093af70832,

   I'm now running into reload failure if arithmetic is done on SP.

I think we can permit output reloads to the stack pointer.  The only thing 
we need is to update the sp offset accurately for the original and reload 
insns.  I'll try to make the patch this week.



The following patch fixes the problem.  The patch was successfully 
bootstrapped and tested on x86_64, aarch64, and ppc64le.


The test case is actually one from the GCC test suite.

commit c0121083d07ffd4a8424f4be50de769d9ad0386d
Author: Vladimir N. Makarov 
Date:   Fri Aug 11 07:57:37 2023 -0400

[LRA]: Implement output stack pointer reloads

LRA prohibited output stack pointer reloads but it resulted in LRA
failure for AVR target which has no arithmetic insns working with the
stack pointer register.  Given patch implements the output stack
pointer reloads.

gcc/ChangeLog:

* lra-constraints.cc (goal_alt_out_sp_reload_p): New flag.
(process_alt_operands): Set the flag.
(curr_insn_transform): Modify stack pointer offsets if output
stack pointer reload is generated.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 09ff6de1657..26239908747 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -1466,6 +1466,8 @@ static int goal_alt_dont_inherit_ops[MAX_RECOG_OPERANDS];
 static bool goal_alt_swapped;
 /* The chosen insn alternative.	 */
 static int goal_alt_number;
+/* True if output reload of the stack pointer should be generated.  */
+static bool goal_alt_out_sp_reload_p;
 
 /* True if the corresponding operand is the result of an equivalence
substitution.  */
@@ -2128,6 +2130,9 @@ process_alt_operands (int only_alternative)
   int curr_alt_dont_inherit_ops_num;
   /* Numbers of operands whose reload pseudos should not be inherited.	*/
   int curr_alt_dont_inherit_ops[MAX_RECOG_OPERANDS];
+  /* True if output stack pointer reload should be generated for the current
+ alternative.  */
+  bool curr_alt_out_sp_reload_p;
   rtx op;
   /* The register when the operand is a subreg of register, otherwise the
  operand itself.  */
@@ -2211,7 +2216,8 @@ process_alt_operands (int only_alternative)
 	}
   reject += static_reject;
   early_clobbered_regs_num = 0;
-
+  curr_alt_out_sp_reload_p = false;
+  
   for (nop = 0; nop < n_operands; nop++)
 	{
 	  const char *p;
@@ -2682,12 +2688,10 @@ process_alt_operands (int only_alternative)
 	  bool no_regs_p;
 
 	  reject += op_reject;
-	  /* Never do output reload of stack pointer.  It makes
-		 impossible to do elimination when SP is changed in
-		 RTL.  */
-	  if (op == stack_pointer_rtx && ! frame_pointer_needed
+	  /* Mark output reload of the stack pointer.  */
+	  if (op == stack_pointer_rtx
 		  && curr_static_id->operand[nop].type != OP_IN)
-		goto fail;
+		curr_alt_out_sp_reload_p = true;
 
 	  /* If this alternative asks for a specific reg class, see if there
 		 is at least one allocatable register in that class.  */
@@ -3317,6 +3321,7 @@ process_alt_operands (int only_alternative)
 	  for (nop = 0; nop < curr_alt_dont_inherit_ops_num; nop++)
 	goal_alt_dont_inherit_ops[nop] = curr_alt_dont_inherit_ops[nop];
 	  goal_alt_swapped = curr_swapped;
+	  goal_alt_out_sp_reload_p = curr_alt_out_sp_reload_p;
 	  best_overall = overall;
 	  best_losers = losers;
 	  best_reload_nregs = reload_nregs;
@@ -4836,6 +4841,27 @@ curr_insn_transform (bool check_only_p)
 	lra_asm_insn_error (curr_insn);
 }
   lra_process_new_insns (curr_insn, before, after, "Inserting insn reload");
+  if (goal_alt_out_sp_reload_p)
+{
+  /* We have an output stack pointer reload -- update sp offset: */
+  rtx set;
+  bool done_p = false;
+  poly_int64 sp_offset = curr_id->sp_offset;
+  for (rtx_insn *insn = after; insn != NULL_RTX; insn = NEXT_INSN (insn))
+	if ((set = single_set (insn)) != NULL_RTX
+	&& SET_DEST (set) == stack_pointer_rtx)
+	  {
+	lra_assert (!done_p);
+	curr_id->sp_offset = 0;
+	lra_insn_recog_data_t id = lra_get_insn_recog_data (insn);
+	id->sp_offset = sp_offset;
+	if (lra_dump_file != NULL)
+	  fprintf (lra_dump_file,
+		   "Moving sp offset from insn %u to %u\n",
+		   INSN_UID (curr_insn), INSN_UID (insn));
+	  }
+  lra_assert (!done_p);
+}
   return change_p;
 }
 


Re: [PATCH] rtl-optimization/110587 - speedup find_hard_regno_for_1

2023-08-08 Thread Vladimir Makarov via Gcc-patches



On 8/7/23 09:18, Richard Biener wrote:

On Wed, 2 Aug 2023, Richard Biener wrote:


On Mon, 31 Jul 2023, Jeff Law wrote:



On 7/31/23 04:54, Richard Biener via Gcc-patches wrote:

On Tue, 25 Jul 2023, Richard Biener wrote:


The following applies a micro-optimization to find_hard_regno_for_1,
re-ordering the check so we can easily jump-thread by using an else.
This reduces the time spent in this function by 15% for the testcase
in the PR.

Bootstrap & regtest running on x86_64-unknown-linux-gnu, OK if that
passes?

Ping.


Thanks,
Richard.

  PR rtl-optimization/110587
  * lra-assigns.cc (find_hard_regno_for_1): Re-order checks.
---
   gcc/lra-assigns.cc | 9 +
   1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
index b8582dcafff..d2ebcfd5056 100644
--- a/gcc/lra-assigns.cc
+++ b/gcc/lra-assigns.cc
@@ -522,14 +522,15 @@ find_hard_regno_for_1 (int regno, int *cost, int
@@ try_only_hard_regno,
   r2 != NULL;
   r2 = r2->start_next)
{
- if (r2->regno >= lra_constraint_new_regno_start
+ if (live_pseudos_reg_renumber[r2->regno] < 0
+ && r2->regno >= lra_constraint_new_regno_start
   && lra_reg_info[r2->regno].preferred_hard_regno1 >= 0
- && live_pseudos_reg_renumber[r2->regno] < 0
   && rclass_intersect_p[regno_allocno_class_array[r2->regno]])
 sparseset_set_bit (conflict_reload_and_inheritance_pseudos,
   r2->regno);
- if (live_pseudos_reg_renumber[r2->regno] >= 0
- && rclass_intersect_p[regno_allocno_class_array[r2->regno]])
+ else if (live_pseudos_reg_renumber[r2->regno] >= 0
+  && rclass_intersect_p
+   [regno_allocno_class_array[r2->regno]])
 sparseset_set_bit (live_range_hard_reg_pseudos, r2->regno);

My biggest concern here would be r2->regno < 0  in the new code which could
cause an OOB array reference in the first condition of the test.

Isn't that the point of the original ordering?  Test that r2->regno is
reasonable before using it as an array index?

Note the original code is

   if (r2->regno >= lra_constraint_new_regno_start
...
  if (live_pseudos_reg_renumber[r2->regno] >= 0
...

so we are going to access live_pseudos_reg_renumber[r2->regno]
independently of the r2->regno >= lra_constraint_new_regno_start check,
so I don't think that's the point of the original ordering.  Note
I preserved the ordering with respect to other array accesses,
the speedup seen is because we now have the


if (live_pseudos_reg_renumber[r2->regno] < 0
...
else if (live_pseudos_reg_renumber[r2->regno] >= 0
 ...

structure directly exposed which helps the compiler.

I think the check on r2->regno is to decide whether to alter
conflict_reload_and_inheritance_pseudos or
live_range_hard_reg_pseudos (so it's also somewhat natural to check
that first).

So - OK?


Richard, sorry, I overlooked this thread.

Yes, it is OK to commit.  In general Jeff has a reasonable concern, but 
in this case r2->regno is always >= 0, and I cannot imagine a reason why 
we would change the algorithm in the future in such a way that this is no 
longer true.






[pushed][LRA] Check input insn pattern hard regs against early clobber hard regs for live info

2023-08-04 Thread Vladimir Makarov via Gcc-patches
The following patch fixes a problem found by the LRA port for the avr 
target.  The problem description is in the commit message.


The patch was successfully bootstrapped and tested on x86-64 and aarch64.
commit abf953042ace471720c1dc284b5f38e546fc0595
Author: Vladimir N. Makarov 
Date:   Fri Aug 4 08:04:44 2023 -0400

LRA: Check input insn pattern hard regs against early clobber hard regs for live info

For the test case LRA generates wrong code for AVR cpymem_qi insn:

(insn 16 15 17 3 (parallel [
(set (mem:BLK (reg:HI 26 r26) [0  A8])
(mem:BLK (reg:HI 30 r30) [0  A8]))
(unspec [
(const_int 0 [0])
] UNSPEC_CPYMEM)
(use (reg:QI 52))
(clobber (reg:HI 26 r26))
(clobber (reg:HI 30 r30))
(clobber (reg:QI 0 r0))
(clobber (reg:QI 52))
]) "t.c":16:22 132 {cpymem_qi}

The insn gets the same value in r26 and r30.  The culprit is clobbering
r30 and using r30 as input.  For such situation LRA wrongly assumes that
r30 does not live before the insn.  The patch is fixing it.

gcc/ChangeLog:

* lra-lives.cc (process_bb_lives): Check input insn pattern hard regs
against early clobber hard regs.

gcc/testsuite/ChangeLog:

* gcc.target/avr/lra-cpymem_qi.c: New.

diff --git a/gcc/lra-lives.cc b/gcc/lra-lives.cc
index f7a3ba8d76a..f60e564da82 100644
--- a/gcc/lra-lives.cc
+++ b/gcc/lra-lives.cc
@@ -989,7 +989,7 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p)
 	/* We can have early clobbered non-operand hard reg and
 	   the same hard reg as an insn input.  Don't make hard
 	   reg dead before the insns.  */
-	for (reg2 = curr_id->regs; reg2 != NULL; reg2 = reg2->next)
+	for (reg2 = curr_static_id->hard_regs; reg2 != NULL; reg2 = reg2->next)
 	  if (reg2->type != OP_OUT && reg2->regno == reg->regno)
 		break;
 	if (reg2 == NULL)
diff --git a/gcc/testsuite/gcc.target/avr/lra-cpymem_qi.c b/gcc/testsuite/gcc.target/avr/lra-cpymem_qi.c
new file mode 100644
index 000..fdffb445b45
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/lra-cpymem_qi.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-mmcu=avr51 -Os" } */
+
+#include <stdbool.h>
+
+struct A
+{
+  unsigned int a;
+  unsigned char c1, c2;
+  bool b1 : 1;
+};
+
+void
+foo (const struct A *x, int y)
+{
+  int s = 0, i;
+  for (i = 0; i < y; ++i)
+{
+  const struct A a = x[i];
+  s += a.b1 ? 1 : 0;
+}
+  if (s != 0)
+__builtin_abort ();
+}
+
+/* { dg-final { scan-assembler-not "movw\[^\n\r]*r26,r30" } } */


Re: [PING][PATCH] ira: update allocated_hardreg_p[] in improve_allocation() [PR110254]

2023-08-02 Thread Vladimir Makarov via Gcc-patches



On 8/1/23 01:20, Surya Kumari Jangala wrote:

Ping

Sorry for the delay with the answer.  I was on vacation.

On 21/07/23 3:43 pm, Surya Kumari Jangala via Gcc-patches wrote:

The improve_allocation() routine does not update the
allocated_hardreg_p[] array after an allocno is assigned a register.

If the register chosen in improve_allocation() is one that already has
been assigned to a conflicting allocno, then allocated_hardreg_p[]
already has the corresponding bit set to TRUE, so nothing needs to be
done.

But improve_allocation() can also choose a register that has not been
assigned to a conflicting allocno, and also has not been assigned to any
other allocno. In this case, allocated_hardreg_p[] has to be updated.

The patch is OK for me.  Thank you for finding and fixing this issue.

2023-07-21  Surya Kumari Jangala  

gcc/
PR rtl-optimization/PR110254
* ira-color.cc (improve_allocation): Update array


I guess you missed the next line in the changelog.  I suspect it should 
be "Update array allocated_hardreg_p."


Please fix it before committing the patch.


---

diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
index 1fb2958bddd..5807d6d26f6 100644
--- a/gcc/ira-color.cc
+++ b/gcc/ira-color.cc
@@ -3340,6 +3340,10 @@ improve_allocation (void)
}
/* Assign the best chosen hard register to A.  */
ALLOCNO_HARD_REGNO (a) = best;
+
+  for (j = nregs - 1; j >= 0; j--)
+   allocated_hardreg_p[best + j] = true;
+
if (internal_flag_ira_verbose > 2 && ira_dump_file != NULL)
fprintf (ira_dump_file, "Assigning %d to a%dr%d\n",
 best, ALLOCNO_NUM (a), ALLOCNO_REGNO (a));




Re: [PATCH] rtl-optimization/110587 - remove quadratic regno_in_use_p

2023-08-01 Thread Vladimir Makarov via Gcc-patches



On 7/25/23 09:40, Richard Biener wrote:

The following removes the code checking whether a noop copy
is between something involved in the return sequence composed
of a SET and USE.  Instead of checking for this special-case
the following makes us only ever remove noop copies between
pseudos - which is the case that is necessary for IRA/LRA
interfacing to function according to the comment.  That makes
looking for the return reg special case unnecessary, reducing
the compile-time in LRA non-specific to zero for the testcase.

Bootstrapped and tested on x86_64-unknown-linux-gnu with
all languages and {,-m32}.

OK?


Richard, sorry for the delay with the answer.  I was on vacation.

There is a lot of history of changes to this code.  I believe your change 
is right.  I don't think that RTL will ever contain a noop return move 
insn involving the return hard register, especially after removing hard 
reg propagation a couple of years ago; at least IRA/LRA do not generate 
such insns during their work.


So the patch is OK for me.  I especially like that a big part of the code 
is removed.  No code, no problems (including performance ones).  Thank you 
for the patch.



PR rtl-optimization/110587
* lra-spills.cc (return_regno_p): Remove.
(regno_in_use_p): Likewise.
(lra_final_code_change): Do not remove noop moves
between hard registers.
---
  gcc/lra-spills.cc | 69 +--
  1 file changed, 1 insertion(+), 68 deletions(-)

diff --git a/gcc/lra-spills.cc b/gcc/lra-spills.cc
index 3a7bb7e8cd9..fe58f162d05 100644
--- a/gcc/lra-spills.cc
+++ b/gcc/lra-spills.cc
@@ -705,72 +705,6 @@ alter_subregs (rtx *loc, bool final_p)
return res;
  }




[pushed][LRA]: Fix sparc bootstrap after recent patch for fp elimination for avr LRA port

2023-07-21 Thread Vladimir Makarov via Gcc-patches
The following patch fixes the sparc solaris bootstrap.  The explanation of 
the patch is in the commit message.


The patch was successfully bootstrapped on x86-64, aarch64, and sparc64 
solaris.


commit d17be8f7f36abe257a7d026dad61e5f8d14bdafc
Author: Vladimir N. Makarov 
Date:   Fri Jul 21 20:28:50 2023 -0400

[LRA]: Fix sparc bootstrap after recent patch for fp elimination for avr 
LRA port

The recent patch for fp elimination for the avr LRA port modified an assert
which can be wrong for targets using a hard frame pointer different from
the frame pointer.  Also, for such ports, spilling pseudos assigned to fp
was wrong too in the new code, although this code is not used for any target
currently using LRA except for avr.  The given patch fixes these issues.

gcc/ChangeLog:

* lra-eliminations.cc (update_reg_eliminate): Fix the assert.
(lra_update_fp2sp_elimination): Use HARD_FRAME_POINTER_REGNUM
instead of FRAME_POINTER_REGNUM to spill pseudos.

diff --git a/gcc/lra-eliminations.cc b/gcc/lra-eliminations.cc
index cf0aa94b69a..1f4e3fec9e0 100644
--- a/gcc/lra-eliminations.cc
+++ b/gcc/lra-eliminations.cc
@@ -1179,8 +1179,7 @@ update_reg_eliminate (bitmap insns_with_changed_offsets)
  gcc_assert (ep->to_rtx != stack_pointer_rtx
  || (ep->from == FRAME_POINTER_REGNUM
  && !elimination_fp2sp_occured_p)
- || (ep->from != FRAME_POINTER_REGNUM
- && ep->from < FIRST_PSEUDO_REGISTER
+ || (ep->from < FIRST_PSEUDO_REGISTER
  && fixed_regs [ep->from]));
 
  /* Mark that is not eliminable anymore.  */
@@ -1398,7 +1397,7 @@ lra_update_fp2sp_elimination (void)
 " Frame pointer can not be eliminated anymore\n");
   frame_pointer_needed = true;
   CLEAR_HARD_REG_SET (set);
-  add_to_hard_reg_set (&set, Pmode, FRAME_POINTER_REGNUM);
+  add_to_hard_reg_set (&set, Pmode, HARD_FRAME_POINTER_REGNUM);
   spill_pseudos (set);
   for (ep = reg_eliminate; ep < &reg_eliminate[NUM_ELIMINABLE_REGS]; ep++)
 if (ep->from == FRAME_POINTER_REGNUM && ep->to == STACK_POINTER_REGNUM)


Re: [pushed][LRA]: Check and update frame to stack pointer elimination after stack slot allocation

2023-07-21 Thread Vladimir Makarov via Gcc-patches



On 7/20/23 16:45, Rainer Orth wrote:

Hi Vladimir,


The following patch is necessary for porting avr to LRA.

The patch was successfully bootstrapped and tested on x86-64, aarch64, and
ppc64le.

There is still an avr porting problem with reloading of a subreg of the
frame pointer.  I'll address it later this week.

this patch most likely broke sparc-sun-solaris2.11 bootstrap:

/var/gcc/regression/master/11.4-gcc/build/./gcc/xgcc 
-B/var/gcc/regression/master/11.4-gcc/build/./gcc/ 
-B/vol/gcc/sparc-sun-solaris2.11/bin/ -B/vol/gcc/sparc-sun-solaris2.11/lib/ 
-isystem /vol/gcc/sparc-sun-solaris2.11/include -isystem 
/vol/gcc/sparc-sun-solaris2.11/sys-include   -fchecking=1 -c -g -O2   -W -Wall 
-gnatpg -nostdinc   g-alleve.adb -o g-alleve.o
+===GNAT BUG DETECTED==+
| 14.0.0 20230720 (experimental) [master 
506f068e7d01ad2fb107185b8fb204a0ec23785c] (sparc-sun-solaris2.11) GCC error:|
| in update_reg_eliminate, at lra-eliminations.cc:1179 |
| Error detected around g-alleve.adb:4132:8

This is in stage 3.  I haven't investigated further yet.


Thank you for reporting this.  I'll try to fix it this week.  I have a 
patch, but unfortunately the bootstrap is too slow.  If the patch does not 
work, I'll revert the original patch.





[pushed][LRA]: Exclude reloading of frame pointer in subreg for some cases

2023-07-20 Thread Vladimir Makarov via Gcc-patches
The following patch improves code for the avr LRA port.  More explanation 
of the patch can be found in the commit message.


The patch was successfully bootstrapped and tested on x86-64, aarch64, 
and ppc64le.
commit 4b8878fbf7b74ea5c3405c9f558df0517036f131
Author: Vladimir N. Makarov 
Date:   Thu Jul 20 14:34:26 2023 -0400

[LRA]: Exclude reloading of frame pointer in subreg for some cases

LRA for the avr port reloads the frame pointer in a subreg although we can
just simplify the subreg.  This results in generation of badly performing
code.  The following patch fixes this.

gcc/ChangeLog:

* lra-constraints.cc (simplify_operand_subreg): Check frame pointer
simplification.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 76a155e99c2..f3784cf5a5b 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -1797,6 +1797,16 @@ simplify_operand_subreg (int nop, machine_mode reg_mode)
   alter_subreg (curr_id->operand_loc[nop], false);
   return true;
 }
+  auto fp_subreg_can_be_simplified_after_reload_p = [] (machine_mode innermode,
+   poly_uint64 offset,
+   machine_mode mode) {
+reload_completed = 1;
+bool res = simplify_subreg_regno (FRAME_POINTER_REGNUM,
+ innermode,
+ offset, mode) >= 0;
+reload_completed = 0;
+return res;
+  };
   /* Force a reload of the SUBREG_REG if this is a constant or PLUS or
  if there may be a problem accessing OPERAND in the outer
  mode.  */
@@ -1809,6 +1819,12 @@ simplify_operand_subreg (int nop, machine_mode reg_mode)
   >= hard_regno_nregs (hard_regno, mode))
&& simplify_subreg_regno (hard_regno, innermode,
 SUBREG_BYTE (operand), mode) < 0
+   /* Exclude reloading of frame pointer in subreg if frame pointer can not
+ be simplified here only because the reload is not finished yet.  */
+   && (hard_regno != FRAME_POINTER_REGNUM
+  || !fp_subreg_can_be_simplified_after_reload_p (innermode,
+  SUBREG_BYTE 
(operand),
+  mode))
/* Don't reload subreg for matching reload.  It is actually
  valid subreg in LRA.  */
&& ! LRA_SUBREG_P (operand))


[pushed][LRA]: Check and update frame to stack pointer elimination after stack slot allocation

2023-07-19 Thread Vladimir Makarov via Gcc-patches

The following patch is necessary for porting avr to LRA.

The patch was successfully bootstrapped and tested on x86-64, aarch64, 
and ppc64le.


There is still an avr porting problem with reloading of a subreg of the 
frame pointer.  I'll address it later this week.


commit 2971ff7b1d564ac04b537d907c70e6093af70832
Author: Vladimir N. Makarov 
Date:   Wed Jul 19 09:35:37 2023 -0400

[LRA]: Check and update frame to stack pointer elimination after stack slot 
allocation

Avr is an interesting target which does not use the stack pointer to
address stack slots.  The frame pointer to stack pointer elimination is
impossible if there are stack slots.  While LRA works, stack slots can be
allocated and used, and the elimination cannot be done anymore.  The
situation can be complicated even more if some pseudos were allocated to
the frame pointer.
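
To see why avr is special here: there is no SP-relative addressing mode on
avr, so a stack slot has to be addressed through the frame pointer (Y).
The sketch below is illustrative only (it is not the new lra-elim.c test);
it merely shows a frame slot that needs Y.  The case the patch handles is
harder because the slot is created by LRA itself, after the fp-to-sp
elimination had already been assumed:

char
force_frame_slot (char i)
{
  char buf[16];			/* frame slot, addressed via Y on avr */
  buf[i & 15] = i;
  return buf[(i + 1) & 15];
}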

gcc/ChangeLog:

* lra-int.h (lra_update_fp2sp_elimination): New prototype.
(lra_asm_insn_error): New prototype.
* lra-spills.cc (remove_pseudos): Add check for pseudo slot memory
existence.
(lra_spill): Call lra_update_fp2sp_elimination.
* lra-eliminations.cc: Remove trailing spaces.
(elimination_fp2sp_occured_p): New static flag.
(lra_eliminate_regs_1): Set the flag up.
(update_reg_eliminate): Modify the assert for stack to frame
pointer elimination.
(lra_update_fp2sp_elimination): New function.
(lra_eliminate): Clear flag elimination_fp2sp_occured_p.

gcc/testsuite/ChangeLog:

* gcc.target/avr/lra-elim.c: New test.

diff --git a/gcc/lra-eliminations.cc b/gcc/lra-eliminations.cc
index 68225339cb6..cf0aa94b69a 100644
--- a/gcc/lra-eliminations.cc
+++ b/gcc/lra-eliminations.cc
@@ -286,7 +286,7 @@ move_plus_up (rtx x)
 {
   rtx subreg_reg;
   machine_mode x_mode, subreg_reg_mode;
-  
+
   if (GET_CODE (x) != SUBREG || !subreg_lowpart_p (x))
 return x;
   subreg_reg = SUBREG_REG (x);
@@ -309,6 +309,9 @@ move_plus_up (rtx x)
   return x;
 }
 
+/* Flag that we already did frame pointer to stack pointer elimination.  */
+static bool elimination_fp2sp_occured_p = false;
+
 /* Scan X and replace any eliminable registers (such as fp) with a
replacement (such as sp) if SUBST_P, plus an offset.  The offset is
a change in the offset between the eliminable register and its
@@ -366,6 +369,9 @@ lra_eliminate_regs_1 (rtx_insn *insn, rtx x, machine_mode 
mem_mode,
{
  rtx to = subst_p ? ep->to_rtx : ep->from_rtx;
 
+ if (ep->to_rtx == stack_pointer_rtx && ep->from == 
FRAME_POINTER_REGNUM)
+   elimination_fp2sp_occured_p = true;
+
  if (maybe_ne (update_sp_offset, 0))
{
  if (ep->to_rtx == stack_pointer_rtx)
@@ -396,9 +402,12 @@ lra_eliminate_regs_1 (rtx_insn *insn, rtx x, machine_mode 
mem_mode,
  poly_int64 offset, curr_offset;
  rtx to = subst_p ? ep->to_rtx : ep->from_rtx;
 
+ if (ep->to_rtx == stack_pointer_rtx && ep->from == 
FRAME_POINTER_REGNUM)
+   elimination_fp2sp_occured_p = true;
+
  if (! update_p && ! full_p)
return gen_rtx_PLUS (Pmode, to, XEXP (x, 1));
- 
+
  if (maybe_ne (update_sp_offset, 0))
offset = ep->to_rtx == stack_pointer_rtx ? update_sp_offset : 0;
  else
@@ -456,6 +465,9 @@ lra_eliminate_regs_1 (rtx_insn *insn, rtx x, machine_mode 
mem_mode,
{
  rtx to = subst_p ? ep->to_rtx : ep->from_rtx;
 
+ if (ep->to_rtx == stack_pointer_rtx && ep->from == 
FRAME_POINTER_REGNUM)
+   elimination_fp2sp_occured_p = true;
+
  if (maybe_ne (update_sp_offset, 0))
{
  if (ep->to_rtx == stack_pointer_rtx)
@@ -500,7 +512,7 @@ lra_eliminate_regs_1 (rtx_insn *insn, rtx x, machine_mode 
mem_mode,
 case LE:  case LT:   case LEU:case LTU:
   {
rtx new0 = lra_eliminate_regs_1 (insn, XEXP (x, 0), mem_mode,
-subst_p, update_p, 
+subst_p, update_p,
 update_sp_offset, full_p);
rtx new1 = XEXP (x, 1)
   ? lra_eliminate_regs_1 (insn, XEXP (x, 1), mem_mode,
@@ -749,7 +761,7 @@ mark_not_eliminable (rtx x, machine_mode mem_mode)
  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), 
{
  poly_int64 size = GET_MODE_SIZE (mem_mode);
- 
+
 #ifdef PUSH_ROUNDING
  /* If more bytes than MEM_MODE are pushed, account for
 them.  */
@@ -822,7 +834,7 @@ mark_not_eliminable (rtx x, machine_mode mem_mode)
{
  /* See if this is setting the replacement hard register for
 an elimination.
-
+
 If DEST is the hard frame pointer, we do nothing because
 we assume 

[pushed][RA][PR109520]: Catch error when there are no enough registers for asm insn

2023-07-13 Thread Vladimir Makarov via Gcc-patches

The following patch solves

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109520

The patch was successfully bootstrapped and tested on x86-64, aarch64, 
and ppc64le.


commit b175b4887f928118af997f6d4d75097a64dcec5d
Author: Vladimir N. Makarov 
Date:   Thu Jul 13 10:42:17 2023 -0400

[RA][PR109520]: Catch error when there are no enough registers for asm insn

An asm insn, unlike other insns, can have so many operands that their
constraints cannot be satisfied.  This results in LRA cycling for such a
test case.  The following patch catches this situation and reports the
problem.
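
For illustration only (this is not the committed pr109520.c test), the kind
of asm that should now be diagnosed instead of making LRA cycle, assuming a
32-bit x86 target where only about seven general registers are allocatable:

void
too_many_outputs (void)
{
  int o0, o1, o2, o3, o4, o5, o6, o7, o8, o9;
  __asm__ ("" : "=r" (o0), "=r" (o1), "=r" (o2), "=r" (o3), "=r" (o4),
		"=r" (o5), "=r" (o6), "=r" (o7), "=r" (o8), "=r" (o9));
}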

PR middle-end/109520

gcc/ChangeLog:

* lra-int.h (lra_insn_recog_data): Add member asm_reloads_num.
(lra_asm_insn_error): New prototype.
* lra.cc: Include rtl_error.h.
(lra_set_insn_recog_data): Initialize asm_reloads_num.
(lra_asm_insn_error): New func whose code is taken from ...
* lra-assigns.cc (lra_split_hard_reg_for): ... here.  Use lra_asm_insn_error.
* lra-constraints.cc (curr_insn_transform): Check reloads nummber for asm.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr109520.c: New test.

diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
index 2f95121df06..3555926af66 100644
--- a/gcc/lra-assigns.cc
+++ b/gcc/lra-assigns.cc
@@ -1851,20 +1851,8 @@ lra_split_hard_reg_for (void)
   insn = lra_insn_recog_data[u]->insn;
   if (asm_noperands (PATTERN (insn)) >= 0)
 	{
-	  lra_asm_error_p = asm_p = true;
-	  error_for_asm (insn,
-			 "% operand has impossible constraints");
-	  /* Avoid further trouble with this insn.  */
-	  if (JUMP_P (insn))
-	{
-	  ira_nullify_asm_goto (insn);
-	  lra_update_insn_regno_info (insn);
-	}
-	  else
-	{
-	  PATTERN (insn) = gen_rtx_USE (VOIDmode, const0_rtx);
-	  lra_set_insn_deleted (insn);
-	}
+	  asm_p = true;
+	  lra_asm_insn_error (insn);
 	}
   else if (!asm_p)
 	{
diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 9bfc88149ff..0c6912d6e7d 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -4813,6 +4813,10 @@ curr_insn_transform (bool check_only_p)
   lra_update_operator_dups (curr_id);
   /* Something changes -- process the insn.	 */
   lra_update_insn_regno_info (curr_insn);
+  if (asm_noperands (PATTERN (curr_insn)) >= 0
+	  && ++curr_id->asm_reloads_num >= FIRST_PSEUDO_REGISTER)
+	/* Most probably there are no enough registers to satisfy asm insn: */
+	lra_asm_insn_error (curr_insn);
 }
   lra_process_new_insns (curr_insn, before, after, "Inserting insn reload");
   return change_p;
diff --git a/gcc/lra-int.h b/gcc/lra-int.h
index 4dbe6672f3a..a32359e5772 100644
--- a/gcc/lra-int.h
+++ b/gcc/lra-int.h
@@ -209,6 +209,9 @@ public:
  debug insn.  LRA_NON_CLOBBERED_ALT means ignoring any earlier
  clobbers for the insn.  */
   int used_insn_alternative;
+  /* Defined for asm insn and it is how many times we already generated reloads
+ for the asm insn.  */
+  int asm_reloads_num;
   /* SP offset before the insn relative to one at the func start.  */
   poly_int64 sp_offset;
   /* The insn itself.  */
@@ -307,6 +310,7 @@ extern void lra_delete_dead_insn (rtx_insn *);
 extern void lra_emit_add (rtx, rtx, rtx);
 extern void lra_emit_move (rtx, rtx);
 extern void lra_update_dups (lra_insn_recog_data_t, signed char *);
+extern void lra_asm_insn_error (rtx_insn *insn);
 
 extern void lra_process_new_insns (rtx_insn *, rtx_insn *, rtx_insn *,
    const char *);
diff --git a/gcc/lra.cc b/gcc/lra.cc
index c8b3f139acd..563aff10b96 100644
--- a/gcc/lra.cc
+++ b/gcc/lra.cc
@@ -106,6 +106,7 @@ along with GCC; see the file COPYING3.	If not see
 #include "backend.h"
 #include "target.h"
 #include "rtl.h"
+#include "rtl-error.h"
 #include "tree.h"
 #include "predict.h"
 #include "df.h"
@@ -536,6 +537,27 @@ lra_update_dups (lra_insn_recog_data_t id, signed char *nops)
 	*id->dup_loc[i] = *id->operand_loc[nop];
 }
 
+/* Report asm insn error and modify the asm insn.  */
+void
+lra_asm_insn_error (rtx_insn *insn)
+{
+  lra_asm_error_p = true;
+  error_for_asm (insn,
+		 "% operand has impossible constraints"
+		 " or there are not enough registers");
+  /* Avoid further trouble with this insn.  */
+  if (JUMP_P (insn))
+{
+  ira_nullify_asm_goto (insn);
+  lra_update_insn_regno_info (insn);
+}
+  else
+{
+  PATTERN (insn) = gen_rtx_USE (VOIDmode, const0_rtx);
+  lra_set_insn_deleted (insn);
+}
+}
+
 
 
 /* This page contains code dealing with info about registers in the
@@ -973,6 +995,7 @@ lra_set_insn_recog_data (rtx_insn *insn)
   lra_insn_recog_data[uid] = data;
   data->insn = insn;
   data->used_insn_alternative = LRA_UNKNOWN_ALT;
+  data->asm_reloads_num = 0;
   data->icode = icode;
   data->regs = NULL;
   if (DEBUG_INSN_P (insn))
diff --git a/gcc/testsuite/gcc.target/i386/pr109520.c 

Re: [IRA] Skip empty register classes in setup_reg_class_relations

2023-07-13 Thread Vladimir Makarov via Gcc-patches



On 7/12/23 07:05, senthilkumar.selva...@microchip.com wrote:

Hi,

   I've been spending some (spare) time trying to get LRA working
   for the avr target.


Thank you for addressing this problem.

The code you are changing is very sensitive and was a source of multiple 
PRs in the past.  But I find the change you propose logical and I think it 
will not create problems.  Still, please be alert and revert the patch if 
people report problems with this change.



  After making a couple of changes to get
   libgcc going, I'm now hitting an assert at
   lra-constraints.cc:4423 for a subarch (avrtiny) that has a
   couple of regclasses with no available registers.

   The assert fires because in_class_p (correctly) returns
   false for get_reg_class (regno) = ALL_REGS, and new_class =
   NO_LD_REGS. For avrtiny, NO_LD_REGS is an empty regset, and
   therefore hard_reg_set_subset_p (NO_LD_REGS, lra_no_alloc_regs)
   is always true, making in_class_p return false.

   in_class_p picks NO_LD_REGS as new_class because common_class =
   ira_reg_class_subset[ALL_REGS][NO_REGS] evaluates as
   NO_LD_REGS. This appears wrong to me - it should be NO_REGS
   instead (lra-constraints.cc:4421 checks for NO_REGS).

   ira.cc:setup_reg_class_relations sets up
   ira_reg_class_subset (among other things), and the problem
   appears to be a missing continue statement if
   reg_class_contents[cl3] (in the innermost loop) is empty.

   In this case, for cl1 = ALL_REGS and cl2 = NO_REGS, cl3 =
   NO_LD_REGS, temp_hard_regset and temp_set2 are both empty, and
   hard_reg_subset_p (, ) is always true, so
   ira_reg_class_subset[ALL_REGS][NO_REGS] ends up being set to
   cl3 = NO_LD_REGS. Adding a continue if hard_reg_set_empty_p 
(temp_hard_regset)
   fixes the problem for me.

   Does the below patch look ok? Bootstrapping and regression
   testing passed on x86_64.

OK.
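
For readers following the thread, the pitfall boils down to the fact that an
empty register set is a subset of every set, so an empty class can "win" the
subset search unless it is skipped explicitly.  A stand-alone sketch (the
types here are stand-ins, not GCC's HARD_REG_SET):

#include <bitset>
#include <cassert>

using hard_reg_set = std::bitset<32>;

static bool
subset_p (const hard_reg_set &a, const hard_reg_set &b)
{
  return (a & ~b).none ();	/* every reg of A is also in B */
}

int
main ()
{
  const hard_reg_set empty_class;		/* e.g. avrtiny's NO_LD_REGS */
  const hard_reg_set no_alloc_regs (0xffu);	/* some unallocatable regs */
  /* Trivially true, which is why setup_reg_class_relations has to skip
     empty candidate classes before comparing against the unallocatable
     set.  */
  assert (subset_p (empty_class, no_alloc_regs));
  return 0;
}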



Re: [pushed][LRA][PR110372]: Refine reload pseudo class

2023-07-12 Thread Vladimir Makarov via Gcc-patches



On 7/12/23 12:22, Richard Sandiford wrote:

Vladimir Makarov  writes:

On 7/12/23 06:07, Richard Sandiford wrote:

Vladimir Makarov via Gcc-patches  writes:

diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
index 73fbef29912..2f95121df06 100644
--- a/gcc/lra-assigns.cc
+++ b/gcc/lra-assigns.cc
@@ -1443,10 +1443,11 @@ assign_by_spills (void)
 pass.  Indicate that it is no longer spilled.  */
  bitmap_clear_bit (_spilled_pseudos, regno);
  assign_hard_regno (hard_regno, regno);
- if (! reload_p)
-   /* As non-reload pseudo assignment is changed we
-  should reconsider insns referring for the
-  pseudo.  */
+ if (! reload_p || regno_allocno_class_array[regno] == ALL_REGS)

Is this test meaningful on all targets?  We have some for which
GENERAL_REGS == ALL_REGS (e.g. nios2 and nvptx), so ALL_REGS can
be a valid allocation class.


Richard, thank you for the question.

As I remember nvptx does not use IRA/LRA.

I don't think it is a problem.  For targets with GENERAL_REGS ==
ALL_REGS, it only results in one more insn processing on the next
constraint sub-pass.

Ah, ok, thanks.  If there's no risk of cycling then I agree it
doesn't matter.
No.  There is no additional risk of cycling, as insn processing only 
starts after assigning a hard reg to the reload pseudo, and it can happen 
only once for the reload pseudo before the spilling sub-pass.




Re: [pushed][LRA][PR110372]: Refine reload pseudo class

2023-07-12 Thread Vladimir Makarov via Gcc-patches



On 7/12/23 06:07, Richard Sandiford wrote:

Vladimir Makarov via Gcc-patches  writes:

diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
index 73fbef29912..2f95121df06 100644
--- a/gcc/lra-assigns.cc
+++ b/gcc/lra-assigns.cc
@@ -1443,10 +1443,11 @@ assign_by_spills (void)
 pass.  Indicate that it is no longer spilled.  */
  bitmap_clear_bit (_spilled_pseudos, regno);
  assign_hard_regno (hard_regno, regno);
- if (! reload_p)
-   /* As non-reload pseudo assignment is changed we
-  should reconsider insns referring for the
-  pseudo.  */
+ if (! reload_p || regno_allocno_class_array[regno] == ALL_REGS)

Is this test meaningful on all targets?  We have some for which
GENERAL_REGS == ALL_REGS (e.g. nios2 and nvptx), so ALL_REGS can
be a valid allocation class.


Richard, thank you for the question.

As I remember nvptx does not use IRA/LRA.

I don't think it is a problem.  For targets with GENERAL_REGS == 
ALL_REGS, it only results in one more insn processing on the next 
constraint sub-pass.


I could do a more accurate solution, but it would need introducing new data 
(flags) for pseudos, which I'd like to avoid.




[pushed][LRA][PR110372]: Refine reload pseudo class

2023-07-07 Thread Vladimir Makarov via Gcc-patches

The following patch solves

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110372

The patch was successfully bootstrapped and tested on x86-64.
commit 1f7e5a7b91862b999aab88ee0319052aaf00f0f1
Author: Vladimir N. Makarov 
Date:   Fri Jul 7 09:53:38 2023 -0400

LRA: Refine reload pseudo class

For the given testcase a reload pseudo happened to occur only in reload
insns created on one constraint sub-pass.  Therefore its initial class
(ALL_REGS) was not refined and the reload insns were not processed on
the next constraint sub-passes.  This resulted in a wrong insn.

PR rtl-optimization/110372

gcc/ChangeLog:

* lra-assigns.cc (assign_by_spills): Add reload insns involving
reload pseudos with non-refined class to be processed on the next
sub-pass.
* lra-constraints.cc (enough_allocatable_hard_regs_p): New func.
(in_class_p): Use it.
(print_curr_insn_alt): New func.
(process_alt_operands): Use it.  Improve debug info.
(curr_insn_transform): Use print_curr_insn_alt.  Refine reload
pseudo class if it is not refined yet.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110372.c: New.

diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
index 73fbef29912..2f95121df06 100644
--- a/gcc/lra-assigns.cc
+++ b/gcc/lra-assigns.cc
@@ -1443,10 +1443,11 @@ assign_by_spills (void)
 		 pass.  Indicate that it is no longer spilled.  */
 	  bitmap_clear_bit (_spilled_pseudos, regno);
 	  assign_hard_regno (hard_regno, regno);
-	  if (! reload_p)
-		/* As non-reload pseudo assignment is changed we
-		   should reconsider insns referring for the
-		   pseudo.  */
+	  if (! reload_p || regno_allocno_class_array[regno] == ALL_REGS)
+		/* As non-reload pseudo assignment is changed we should
+		   reconsider insns referring for the pseudo.  Do the same if a
+		   reload pseudo did not refine its class which can happens
+		   when the pseudo occurs only in reload insns.  */
 		bitmap_set_bit (_pseudo_bitmap, regno);
 	}
 	}
diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 4dc2d70c402..123ff662cbc 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -233,6 +233,34 @@ get_reg_class (int regno)
   return NO_REGS;
 }
 
+/* Return true if REG_CLASS has enough allocatable hard regs to keep value of
+   REG_MODE.  */
+static bool
+enough_allocatable_hard_regs_p (enum reg_class reg_class,
+enum machine_mode reg_mode)
+{
+  int i, j, hard_regno, class_size, nregs;
+  
+  if (hard_reg_set_subset_p (reg_class_contents[reg_class], lra_no_alloc_regs))
+return false;
+  class_size = ira_class_hard_regs_num[reg_class];
+  for (i = 0; i < class_size; i++)
+{
+  hard_regno = ira_class_hard_regs[reg_class][i];
+  nregs = hard_regno_nregs (hard_regno, reg_mode);
+  if (nregs == 1)
+	return true;
+  for (j = 0; j < nregs; j++)
+	if (TEST_HARD_REG_BIT (lra_no_alloc_regs, hard_regno + j)
+	|| ! TEST_HARD_REG_BIT (reg_class_contents[reg_class],
+hard_regno + j))
+	  break;
+  if (j >= nregs)
+	return true;
+}
+  return false;
+}
+
 /* Return true if REG satisfies (or will satisfy) reg class constraint
CL.  Use elimination first if REG is a hard register.  If REG is a
reload pseudo created by this constraints pass, assume that it will
@@ -252,7 +280,6 @@ in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class,
   enum reg_class rclass, common_class;
   machine_mode reg_mode;
   rtx src;
-  int class_size, hard_regno, nregs, i, j;
   int regno = REGNO (reg);
 
   if (new_class != NULL)
@@ -291,26 +318,7 @@ in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class,
   common_class = ira_reg_class_subset[rclass][cl];
   if (new_class != NULL)
 	*new_class = common_class;
-  if (hard_reg_set_subset_p (reg_class_contents[common_class],
- lra_no_alloc_regs))
-	return false;
-  /* Check that there are enough allocatable regs.  */
-  class_size = ira_class_hard_regs_num[common_class];
-  for (i = 0; i < class_size; i++)
-	{
-	  hard_regno = ira_class_hard_regs[common_class][i];
-	  nregs = hard_regno_nregs (hard_regno, reg_mode);
-	  if (nregs == 1)
-	return true;
-	  for (j = 0; j < nregs; j++)
-	if (TEST_HARD_REG_BIT (lra_no_alloc_regs, hard_regno + j)
-		|| ! TEST_HARD_REG_BIT (reg_class_contents[common_class],
-	hard_regno + j))
-	  break;
-	  if (j >= nregs)
-	return true;
-	}
-  return false;
+  return enough_allocatable_hard_regs_p (common_class, reg_mode);
 }
 }
 
@@ -2046,6 +2054,23 @@ update_and_check_small_class_inputs (int nop, int nalt,
   return false;
 }
 
+/* Print operand constraints for alternative ALT_NUMBER of the current
+   insn.  */
+static void
+print_curr_insn_alt (int alt_number)
+{
+  for (int i = 0; i < curr_static_id->n_operands; i++)
+{
+  const char *p = 

[pushed] [RA] [PR110215] Ignore conflicts for some pseudos from insns throwing a final exception

2023-06-16 Thread Vladimir Makarov via Gcc-patches

The following patch solves

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110215

The patch was successfully tested and bootstrapped on x86-64, aarch64, 
and ppc64le.


It is difficult to make a stable test for the PR, so there is no test in 
the patch.


commit 154c69039571c66b3a6d16ecfa9e6ff22942f59f
Author: Vladimir N. Makarov 
Date:   Fri Jun 16 11:12:32 2023 -0400

RA: Ignore conflicts for some pseudos from insns throwing a final exception

IRA adds conflicts to the pseudos from insns that can throw exceptions
internally even if the exception code is final for the function and
the pseudo value is not used in the exception code.  This results in
spilling a pseudo in a loop (see
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110215).

The following patch fixes the problem.
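
Since the patch carries no testcase, here is an illustrative shape of the
situation (a sketch only, not claimed to reproduce the PR): acc is live
across a potentially throwing call inside a hot loop, but the only EH
region is a cleanup (the destructor) that never reads acc, so acc is not
live at the landing pad and the extra conflicts are unnecessary:

struct guard { ~guard (); };
void consume (double);		/* may throw */
double step (double) noexcept;

double
hot_loop (double acc, int n)
{
  guard g;			/* cleanup-only EH region */
  for (int i = 0; i < n; i++)
    {
      consume (acc);		/* can throw; acc is dead in the cleanup */
      acc = step (acc);
    }
  return acc;
}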

PR rtl-optimization/110215

gcc/ChangeLog:

* ira-lives.cc: Include except.h.
(process_bb_node_lives): Ignore conflicts from cleanup exceptions
when the pseudo does not live at the exception landing pad.

diff --git a/gcc/ira-lives.cc b/gcc/ira-lives.cc
index 6a3901ee234..bc8493856a4 100644
--- a/gcc/ira-lives.cc
+++ b/gcc/ira-lives.cc
@@ -34,6 +34,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "ira-int.h"
 #include "sparseset.h"
 #include "function-abi.h"
+#include "except.h"
 
 /* The code in this file is similar to one in global but the code
works on the allocno basis and creates live ranges instead of
@@ -1383,14 +1384,24 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node)
 		  SET_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj));
 		  SET_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj));
 		}
-		  if (can_throw_internal (insn))
+		  eh_region r;
+		  eh_landing_pad lp;
+		  rtx_code_label *landing_label;
+		  basic_block landing_bb;
+		  if (can_throw_internal (insn)
+		  && (r = get_eh_region_from_rtx (insn)) != NULL
+		  && (lp = gen_eh_landing_pad (r)) != NULL
+		  && (landing_label = lp->landing_pad) != NULL
+		  && (landing_bb = BLOCK_FOR_INSN (landing_label)) != NULL
+		  && (r->type != ERT_CLEANUP
+			  || bitmap_bit_p (df_get_live_in (landing_bb),
+	   ALLOCNO_REGNO (a
 		{
-		  OBJECT_CONFLICT_HARD_REGS (obj)
-			|= callee_abi.mode_clobbers (ALLOCNO_MODE (a));
-		  OBJECT_TOTAL_CONFLICT_HARD_REGS (obj)
-			|= callee_abi.mode_clobbers (ALLOCNO_MODE (a));
+		  HARD_REG_SET new_conflict_regs
+			= callee_abi.mode_clobbers (ALLOCNO_MODE (a));
+		  OBJECT_CONFLICT_HARD_REGS (obj) |= new_conflict_regs;
+		  OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= new_conflict_regs;
 		}
-
 		  if (sparseset_bit_p (allocnos_processed, num))
 		continue;
 		  sparseset_set_bit (allocnos_processed, num);


Re: [pushed] [PR109541] RA: Constrain class of pic offset table pseudo to general regs

2023-06-07 Thread Vladimir Makarov via Gcc-patches


On 6/7/23 12:20, Jeff Law wrote:



On 6/7/23 09:35, Vladimir Makarov via Gcc-patches wrote:

The following patch fixes



-ENOPATCH


Sorry, here is the patch.

commit 08ca31fb27841cb7f3bff7086be6f139136be1a7
Author: Vladimir N. Makarov 
Date:   Wed Jun 7 09:51:54 2023 -0400

RA: Constrain class of pic offset table pseudo to general regs

On some targets an integer pseudo can be assigned to an FP reg.  For the
pic offset table pseudo it means we will reload the pseudo in this case
and, as a consequence, memory containing the pseudo might be recognized
as a wrong one.  The patch fixes this problem.

PR target/109541

gcc/ChangeLog:

* ira-costs.cc: (find_costs_and_classes): Constrain classes of pic
  offset table pseudo to a general reg subset.

gcc/testsuite/ChangeLog:

* gcc.target/sparc/pr109541.c: New.

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index ae8304ff938..d9e700e8947 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -2016,6 +2016,16 @@ find_costs_and_classes (FILE *dump_file)
 	  ira_assert (regno_aclass[i] != NO_REGS
 			  && ira_reg_allocno_class_p[regno_aclass[i]]);
 	}
+	  if (pic_offset_table_rtx != NULL
+	  && i == (int) REGNO (pic_offset_table_rtx))
+	{
+	  /* For some targets, integer pseudos can be assigned to fp
+		 regs.  As we don't want reload pic offset table pseudo, we
+		 should avoid using non-integer regs.  */
+	  regno_aclass[i]
+		= ira_reg_class_intersect[regno_aclass[i]][GENERAL_REGS];
+	  alt_class = ira_reg_class_intersect[alt_class][GENERAL_REGS];
+	}
 	  if ((new_class
 	   = (reg_class) (targetm.ira_change_pseudo_allocno_class
 			  (i, regno_aclass[i], best))) != regno_aclass[i])
diff --git a/gcc/testsuite/gcc.target/sparc/pr109541.c b/gcc/testsuite/gcc.target/sparc/pr109541.c
new file mode 100644
index 000..1360f101930
--- /dev/null
+++ b/gcc/testsuite/gcc.target/sparc/pr109541.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -mcpu=niagara4 -fpic -w" } */
+
+int rhash_sha512_process_block_A, rhash_sha512_process_block_i,
+rhash_sha512_process_block_block, rhash_sha512_process_block_W_0;
+
+unsigned rhash_sha512_process_block_W_2;
+
+void rhash_sha512_process_block (void)
+{
+  unsigned C, E, F, G, H, W_0, W_4, W_9, W_5, W_3, T1;
+
+  for (; rhash_sha512_process_block_i; rhash_sha512_process_block_i += 6) {
+T1 = F + (rhash_sha512_process_block_W_2 += 6);
+rhash_sha512_process_block_A += H & G + (W_5 += rhash_sha512_process_block_W_0);
+H = C & T1 & E ^ F + (W_9 += rhash_sha512_process_block_W_0);
+G = T1 ^ 6 + (W_0 += rhash_sha512_process_block_block);
+F = (unsigned) 
+T1 = (unsigned) ( + (W_3 += rhash_sha512_process_block_block > 9 > W_4));
+C = (unsigned) (T1 + );
+W_4 += W_5 += rhash_sha512_process_block_W_0;
+  }
+}


[pushed] [PR109541] RA: Constrain class of pic offset table pseudo to general regs

2023-06-07 Thread Vladimir Makarov via Gcc-patches

The following patch fixes

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109541

The patch was successfully bootstrapped and tested on x86-64, aarch64, 
and ppc64le.




[pushed] LRA: Update insn sp offset if its input reload changes SP

2023-05-30 Thread Vladimir Makarov via Gcc-patches
The following patch fixes an LRA bug triggered by switching the H8300 
target from reload to LRA.  The description of the problem is in the 
commit message.


The patch was successfully bootstrapped and tested on x86-64, aarch64, 
and ppc64le.


commit 30038a207c10a2783fa2695b62c7c8458ef05e73
Author: Vladimir N. Makarov 
Date:   Tue May 30 15:54:28 2023 -0400

LRA: Update insn sp offset if its input reload changes SP

The patch fixes a bug when there is an input reload changing SP.  The bug
was triggered by switching the H8300 target to LRA.  The insn in question is

(insn 21 20 22 2 (set (mem/f:SI (pre_dec:SI (reg/f:SI 7 sp)) [3  S4 A32])
(reg/f:SI 31)) "j.c":10:3 19 {*movsi}
 (expr_list:REG_DEAD (reg/f:SI 31)
(expr_list:REG_ARGS_SIZE (const_int 4 [0x4])
(nil

The memory address is reloaded but the SP offset for the original insn was 
not updated.

gcc/ChangeLog:

* lra-int.h (lra_update_sp_offset): Add the prototype.
* lra.cc (setup_sp_offset): Change the return type.  Use
lra_update_sp_offset.
* lra-eliminations.cc (lra_update_sp_offset): New function.
(lra_process_new_insns): Push the current insn to reprocess if the
input reload changes sp offset.

diff --git a/gcc/lra-eliminations.cc b/gcc/lra-eliminations.cc
index 4220639..68225339cb6 100644
--- a/gcc/lra-eliminations.cc
+++ b/gcc/lra-eliminations.cc
@@ -1308,6 +1308,16 @@ init_elimination (void)
   setup_elimination_map ();
 }
 
+/* Update and return stack pointer OFFSET after processing X.  */
+poly_int64
+lra_update_sp_offset (rtx x, poly_int64 offset)
+{
+  curr_sp_change = offset;
+  mark_not_eliminable (x, VOIDmode);
+  return curr_sp_change;
+}
+
+
 /* Eliminate hard reg given by its location LOC.  */
 void
 lra_eliminate_reg_if_possible (rtx *loc)
diff --git a/gcc/lra-int.h b/gcc/lra-int.h
index a400a0f85e2..4dbe6672f3a 100644
--- a/gcc/lra-int.h
+++ b/gcc/lra-int.h
@@ -412,6 +412,7 @@ extern rtx lra_eliminate_regs_1 (rtx_insn *, rtx, 
machine_mode,
 extern void eliminate_regs_in_insn (rtx_insn *insn, bool, bool, poly_int64);
 extern void lra_eliminate (bool, bool);
 
+extern poly_int64 lra_update_sp_offset (rtx, poly_int64);
 extern void lra_eliminate_reg_if_possible (rtx *);
 
 
diff --git a/gcc/lra.cc b/gcc/lra.cc
index eb3ee1f8b63..c8b3f139acd 100644
--- a/gcc/lra.cc
+++ b/gcc/lra.cc
@@ -1838,10 +1838,10 @@ push_insns (rtx_insn *from, rtx_insn *to)
   lra_push_insn (insn);
 }
 
-/* Set up sp offset for insn in range [FROM, LAST].  The offset is
+/* Set up and return sp offset for insns in range [FROM, LAST].  The offset is
taken from the next BB insn after LAST or zero if there in such
insn.  */
-static void
+static poly_int64
 setup_sp_offset (rtx_insn *from, rtx_insn *last)
 {
   rtx_insn *before = next_nonnote_nondebug_insn_bb (last);
@@ -1849,7 +1849,11 @@ setup_sp_offset (rtx_insn *from, rtx_insn *last)
   ? 0 : lra_get_insn_recog_data (before)->sp_offset);
 
   for (rtx_insn *insn = from; insn != NEXT_INSN (last); insn = NEXT_INSN 
(insn))
-lra_get_insn_recog_data (insn)->sp_offset = offset;
+{
+  lra_get_insn_recog_data (insn)->sp_offset = offset;
+  offset = lra_update_sp_offset (PATTERN (insn), offset);
+}
+  return offset;
 }
 
 /* Emit insns BEFORE before INSN and insns AFTER after INSN.  Put the
@@ -1875,8 +1879,25 @@ lra_process_new_insns (rtx_insn *insn, rtx_insn *before, 
rtx_insn *after,
   if (cfun->can_throw_non_call_exceptions)
copy_reg_eh_region_note_forward (insn, before, NULL);
   emit_insn_before (before, insn);
+  poly_int64 old_sp_offset = lra_get_insn_recog_data (insn)->sp_offset;
+  poly_int64 new_sp_offset = setup_sp_offset (before, PREV_INSN (insn));
+  if (maybe_ne (old_sp_offset, new_sp_offset))
+   {
+ if (lra_dump_file != NULL)
+   {
+ fprintf (lra_dump_file, "Changing sp offset from ");
+ print_dec (old_sp_offset, lra_dump_file);
+ fprintf (lra_dump_file, " to ");
+ print_dec (new_sp_offset, lra_dump_file);
+ fprintf (lra_dump_file, " for insn");
+ dump_rtl_slim (lra_dump_file, insn, NULL, -1, 0);
+   }
+ lra_get_insn_recog_data (insn)->sp_offset = new_sp_offset;
+ eliminate_regs_in_insn (insn, false, false,
+ old_sp_offset - new_sp_offset);
+ lra_push_insn (insn);
+   }
   push_insns (PREV_INSN (insn), PREV_INSN (before));
-  setup_sp_offset (before, PREV_INSN (insn));
 }
   if (after != NULL_RTX)
 {


Re: [PATCH] Only use NO_REGS in cost calculation when !hard_regno_mode_ok for GENERAL_REGS and mode.

2023-05-25 Thread Vladimir Makarov via Gcc-patches



On 5/17/23 02:57, liuhongt wrote:

r14-172-g0368d169492017 replaces GENERAL_REGS with NO_REGS in the cost
calculation when the preferred register class is not known yet.
It regressed powerpc PR109610 and PR109858; it looks too aggressive to use
NO_REGS when the mode can be allocated with GENERAL_REGS.
The patch takes a step back: still use GENERAL_REGS when
hard_regno_mode_ok holds for the mode and GENERAL_REGS, otherwise use NO_REGS.
Kewen confirmed the patch fixed PR109858, and I verified it also fixed PR109610.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No big performance impact for SPEC2017 on icelake server.
Ok for trunk?

gcc/ChangeLog:

* ira-costs.cc (scan_one_insn): Only use NO_REGS in cost
calculation when !hard_regno_mode_ok for GENERAL_REGS and
mode, otherwise still use GENERAL_REGS.


Thank you for the patch.  It looks good to me.  It is ok to commit it to 
the trunk.





Re: [PATCH] ira: Don't create copies for earlyclobbered pairs

2023-05-08 Thread Vladimir Makarov via Gcc-patches



On 5/5/23 12:59, Richard Sandiford wrote:

This patch follows on from g:9f635bd13fe9e85872e441b6f3618947f989909a
("the previous patch").  To start by quoting that:

If an insn requires two operands to be tied, and the input operand dies
in the insn, IRA acts as though there were a copy from the input to the
output with the same execution frequency as the insn.  Allocating the
same register to the input and the output then saves the cost of a move.

If there is no such tie, but an input operand nevertheless dies
in the insn, IRA creates a similar move, but with an eighth of the
frequency.  This helps to ensure that chains of instructions reuse
registers in a natural way, rather than using arbitrarily different
registers for no reason.

This heuristic seems to work well in the vast majority of cases.
However, the problem fixed in the previous patch was that we
could create a copy for an operand pair even if, for all relevant
alternatives, the output and input register classes did not have
any registers in common.  It is then impossible for the output
operand to reuse the dying input register.

This left unfixed a further case where copies don't make sense:
there is no point trying to reuse the dying input register if,
for all relevant alternatives, the output is earlyclobbered and
the input doesn't match the output.  (Matched earlyclobbers are fine.)

Handling that case fixes several existing XFAILs and helps with
a follow-on aarch64 patch.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  A SPEC2017 run
on aarch64 showed no differences outside the noise.  Also, I tried
compiling gcc.c-torture, gcc.dg, and g++.dg for at least one target
per cpu directory, using the options -Os -fno-schedule-insns{,2}.
The results below summarise the tests that showed a difference in LOC:

Target   Tests   GoodBad   DeltaBest   Worst  Median
==   =   ===   =   =  ==
amdgcn-amdhsa   14  7  7   3 -18  10  -1
arm-linux-gnueabihf 16 15  1 -22  -4   2  -1
csky-elf 6  6  0 -21  -6  -2  -4
hppa64-hp-hpux11.23  5  5  0  -7  -2  -1  -1
ia64-linux-gnu  16 16  0 -70 -15  -1  -3
m32r-elf53  1 52  64  -2   8   1
mcore-elf2  2  0  -8  -6  -2  -6
microblaze-elf 285283  2-909 -68   4  -1
mmix 7  7  0   -2101   -2091  -1  -1
msp430-elf   1  1  0  -4  -4  -4  -4
pru-elf  8  6  2 -12  -6   2  -2
rx-elf  22 18  4 -40  -5   6  -2
sparc-linux-gnu 15 14  1 -40  -8   1  -2
sparc-wrs-vxworks   15 14  1 -40  -8   1  -2
visium-elf   2  1  1   0  -2   2  -2
xstormy16-elf1  1  0  -2  -2  -2  -2

with other targets showing no sensitivity to the patch.  The only
target that seems to be negatively affected is m32r-elf; otherwise
the patch seems like an extremely minor but still clear improvement.

OK to install?


Yes, Richard.

Thank you for measuring the patch effect.  I wish other people would do 
the same for patches affecting generated code performance.
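
For completeness, a minimal sketch (not taken from the patch or its tests)
of the operand pair being discussed: the output is earlyclobbered and does
not match the input, so even though x dies in the insn it can never share
the output's register, and modelling a copy between them buys nothing:

int
earlyclobber_pair (int x)
{
  int y;
  __asm__ ("" : "=&r" (y) : "r" (x));
  return y;
}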




Re: [PATCH 1/2] Use NO_REGS in cost calculation when the preferred register class are not known yet.

2023-04-21 Thread Vladimir Makarov via Gcc-patches



On 4/19/23 20:46, liuhongt via Gcc-patches wrote:

1547  /* If this insn loads a parameter from its stack slot, then it
1548 represents a savings, rather than a cost, if the parameter is
1549 stored in memory.  Record this fact.
1550
1551 Similarly if we're loading other constants from memory (constant
1552 pool, TOC references, small data areas, etc) and this is the only
1553 assignment to the destination pseudo.

At that time the preferred regclass is unknown, and GENERAL_REGS is used to
record the memory move cost, but it's not accurate, especially for large
vector modes, e.g. a 512-bit vector on x86, which would most probably be
allocated to SSE_REGS instead of GENERAL_REGS. Using GENERAL_REGS here will
overestimate the cost of this load and make RA propagate the memory operand
into many consuming instructions, which causes worse performance.


For this case GENERAL_REGS was used in GCC practically all the time.  
You can check this in the old regclass.c file (which existed until the 
introduction of IRA).


But I guess it is ok to use NO_REGS for this to promote more usage of 
registers instead of equivalent memory, especially as a lot of code has 
changed since then (the old versions of GCC did not even support vector regs).


Although it would be nice to do some benchmarking (SPEC is preferable) 
for this kind of change.


On the other hand, I expect that any performance regression (if any) 
will be reported anyway.


The patch is ok for me.  You can commit it into the trunk.

Thank you for addressing this issue.


Fortunately, NO_REGS is used to record the best scenario, so the patch uses
NO_REGS instead of GENERAL_REGS here, it could help RA in PR108707.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
and aarch64-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

PR rtl-optimization/108707
* ira-costs.cc (scan_one_insn): Use NO_REGS instead of
GENERAL_REGS when preferred reg_class is not known.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr108707.c: New test.




[pushed] [LRA]: Exclude some hard regs for multi-reg inout reload pseudos used in asm in different mode

2023-04-20 Thread Vladimir Makarov via Gcc-patches
The following patch fixes a test failure of 20030222-1.c on the moxie 
port, but the problem can occur on other targets.  The patch actually 
implements the old reload approach for the test case.


The patch was successfully tested and bootstrapped on x86-64, aarch64, 
and ppc64le.


commit 51703ac3c722cd94011ab5b499921f6c9fe9fab5
Author: Vladimir N. Makarov 
Date:   Thu Apr 20 10:02:13 2023 -0400

[LRA]: Exclude some hard regs for multi-reg inout reload pseudos used in 
asm in different mode

See gcc.c-torture/execute/20030222-1.c.  Consider the code for 32-bit (e.g. 
BE) target:
  int i, v; long x; x = v; asm ("" : "=r" (i) : "0" (x));
We generate the following RTL with reload insns:
  1. subreg:si(x:di, 0) = 0;
  2. subreg:si(x:di, 4) = v:si;
  3. t:di = x:di, dead x;
  4. asm ("" : "=r" (subreg:si(t:di,4)) : "0" (t:di))
  5. i:si = subreg:si(t:di,4);
If we assign the hard reg of x to t, dead code elimination will remove insn #2
and we will use an uninitialized hard reg.  So exclude the hard reg of x for t.
We could ignore this problem for a non-empty asm using all of x's value, but it
is hard to check that the asm is expanded into insns really using x and setting r.
The old reload pass used the same approach.

gcc/ChangeLog

* lra-constraints.cc (match_reload): Exclude some hard regs for
multi-reg inout reload pseudos used in asm in different mode.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index b231cb60529..4dc2d70c402 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -1022,6 +1022,34 @@ match_reload (signed char out, signed char *ins, signed 
char *outs,
 are ordered.  */
   if (partial_subreg_p (outmode, inmode))
{
+ bool asm_p = asm_noperands (PATTERN (curr_insn)) >= 0;
+ int hr;
+ HARD_REG_SET temp_hard_reg_set;
+ 
+ if (asm_p && (hr = get_hard_regno (out_rtx)) >= 0
+ && hard_regno_nregs (hr, inmode) > 1)
+   {
+ /* See gcc.c-torture/execute/20030222-1.c.
+Consider the code for 32-bit (e.g. BE) target:
+  int i, v; long x; x = v; asm ("" : "=r" (i) : "0" (x));
+We generate the following RTL with reload insns:
+  1. subreg:si(x:di, 0) = 0;
+  2. subreg:si(x:di, 4) = v:si;
+  3. t:di = x:di, dead x;
+  4. asm ("" : "=r" (subreg:si(t:di,4)) : "0" (t:di))
+  5. i:si = subreg:si(t:di,4);
+If we assign hard reg of x to t, dead code elimination
+will remove insn #2 and we will use unitialized hard reg.
+So exclude the hard reg of x for t.  We could ignore this
+problem for non-empty asm using all x value but it is hard to
+check that the asm are expanded into insn realy using x
+and setting r.  */
+ CLEAR_HARD_REG_SET (temp_hard_reg_set);
+ if (exclude_start_hard_regs != NULL)
+   temp_hard_reg_set = *exclude_start_hard_regs;
+ SET_HARD_REG_BIT (temp_hard_reg_set, hr);
+ exclude_start_hard_regs = &temp_hard_reg_set;
+   }
  reg = new_in_reg
= lra_create_new_reg_with_unique_value (inmode, in_rtx, goal_class,
exclude_start_hard_regs,


Re: [PATCH] Check hard_regno_mode_ok before setting lowest memory move cost for the mode with different reg classes.

2023-04-05 Thread Vladimir Makarov via Gcc-patches



On 4/4/23 21:29, Jeff Law wrote:



On 4/3/23 23:13, liuhongt via Gcc-patches wrote:

There's a potential performance issue when backend returns some
unreasonable value for the mode which can be never be allocate with
reg class.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk(or GCC14 stage1)?

gcc/ChangeLog:

PR rtl-optimization/109351
* ira.cc (setup_class_subset_and_memory_move_costs): Check
hard_regno_mode_ok before setting lowest memory move cost for
the mode with different reg classes.
Not a regression *and* changing register allocation.  This seems like 
it should defer to gcc-14.


Yes, I agree.  It should wait for gcc-14, especially when we are so close 
to the release.  Also, testing only on x86-64 is not enough for such 
changes (although I tried ppc64le and did not find any problem).


Cost-related patches for RA frequently result in new testsuite failures 
on some targets, even if the change seems obvious and is expected to 
improve the generated code.


Target-dependent code sometimes defines the costs correctly only for 
some possible cases, and becoming less dependent on this pitfall is 
good.  So I think the patch moves us in the right direction.


The patch is ok with me to commit to the trunk after the gcc-13 release, 
provided arm64 testing shows no GCC testsuite regressions.


Thank you for working on this issue.




[pushed][PR109052] LRA: Implement commutative operands exchange for combining secondary memory reload and original insn

2023-03-31 Thread Vladimir Makarov via Gcc-patches

This is one more patch for

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109052

The patch adds trying a commutative operand exchange to the recently 
implemented combining of a secondary memory reload and the original insn.


The patch was successfully bootstrapped and tested on x86_64.

commit 378d19cfebfa2bc4f693dfc9e6f0dd993e7c45f7
Author: Vladimir N. Makarov 
Date:   Fri Mar 31 11:04:44 2023 -0400

LRA: Implement commutative operands exchange for combining secondary memory reload and original insn

The patch implements trying a commutative operand exchange when
combining a secondary memory reload and the original insn.

PR rtl-optimization/109052

gcc/ChangeLog:

* lra-constraints.cc: (combine_reload_insn): New function.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr109052-2.c: New.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 405b8b92f5e..ff4e8f06063 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -5061,7 +5061,23 @@ combine_reload_insn (rtx_insn *from, rtx_insn *to)
   curr_insn = to;
   curr_id = lra_get_insn_recog_data (curr_insn);
   curr_static_id = curr_id->insn_static_data;
-  ok_p = !curr_insn_transform (true);
+  for (bool swapped_p = false;;)
+	{
+	  ok_p = !curr_insn_transform (true);
+	  if (ok_p || curr_static_id->commutative < 0)
+	break;
+	  swap_operands (curr_static_id->commutative);
+	  if (lra_dump_file != NULL)
+	{
+	  fprintf (lra_dump_file,
+		   "Swapping %scombined insn operands:\n",
+		   swapped_p ? "back " : "");
+	  dump_insn_slim (lra_dump_file, to);
+	}
+	  if (swapped_p)
+	break;
+	  swapped_p = true;
+	}
   curr_insn = saved_insn;
   curr_id = lra_get_insn_recog_data (curr_insn);
   curr_static_id = curr_id->insn_static_data;
diff --git a/gcc/testsuite/gcc.target/i386/pr109052-2.c b/gcc/testsuite/gcc.target/i386/pr109052-2.c
new file mode 100644
index 000..337d1f49c2b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109052-2.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mfpmath=both -mavx -fno-math-errno" } */
+
+double foo (double a, double b)
+{
+  double z = __builtin_fmod (a, 3.14);
+  return z * b;
+}
+
+/* { dg-final { scan-assembler-not "vmulsd\[ \t]\+%xmm\[0-9]\+, %xmm\[0-9]\+, %xmm\[0-9]\+"} } */


[pushed] [PR109137] LRA: Do not repeat inheritance and live range splitting in case of asm error

2023-03-22 Thread Vladimir Makarov via Gcc-patches

The following patch solves

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109137

The patch was successfully bootstrapped and tested on x86-64.

commit 81d762cbec9685c2f2571da21d48f42c42eff33b
Author: Vladimir N. Makarov 
Date:   Wed Mar 22 12:33:11 2023 -0400

LRA: Do not repeat inheritance and live range splitting in case of asm error

LRA was trying to do live range splitting again and again as there were
not enough regs for the asm.  This patch solves the problem.

PR target/109137

gcc/ChangeLog:

* lra.cc (lra): Do not repeat inheritance and live range splitting
when asm error is found.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr109137.c: New.

diff --git a/gcc/lra.cc b/gcc/lra.cc
index f7fdd601e71..eb3ee1f8b63 100644
--- a/gcc/lra.cc
+++ b/gcc/lra.cc
@@ -2453,7 +2453,7 @@ lra (FILE *f)
 		  lra_hard_reg_split_p = true;
 		}
 	}
-	  while (fails_p);
+	  while (fails_p && !lra_asm_error_p);
 	  if (! live_p) {
 	/* We need the correct reg notes for work of constraint sub-pass.  */
 	lra_create_live_ranges (true, true);
diff --git a/gcc/testsuite/gcc.target/i386/pr109137.c b/gcc/testsuite/gcc.target/i386/pr109137.c
new file mode 100644
index 000..ffd8e8c574b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109137.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-m32 -O3 -march=znver1 -fPIC -mfpmath=sse -w" } */
+#include 
+typedef struct {
+  char bytestream_end;
+} CABACContext;
+int get_cabac___trans_tmp_3, get_cabac_tmp, get_cabac_c,
+decode_cabac_mb_intra4x4_pred_mode_mode, ff_h264_decode_mb_cabac_h_0,
+ff_h264_decode_mb_cabac_bit;
+typedef struct {
+  char intra4x4_pred_mode_cache[2];
+} H264SliceContext;
+H264SliceContext ff_h264_decode_mb_cabac_sl;
+void ff_h264_decode_mb_cabac(void) {
+  memset((void*)ff_h264_decode_mb_cabac_h_0, 6, 48);
+  int i;
+  for (;; i++) {
+__asm__(""/* { dg-error "'asm' operand has impossible constraints" } */
+: "="(ff_h264_decode_mb_cabac_bit), "="(get_cabac_c),
+  "="(get_cabac_c), "="(get_cabac_tmp)
+: "r"(get_cabac___trans_tmp_3),
+  "r"(__builtin_offsetof(CABACContext, bytestream_end))
+: "ecx");
+ff_h264_decode_mb_cabac_sl.intra4x4_pred_mode_cache[i] =
+decode_cabac_mb_intra4x4_pred_mode_mode;
+  }
+}
+


[pushed] [PR109052] LRA: Implement combining secondary memory reload and original insn

2023-03-17 Thread Vladimir Makarov via Gcc-patches

The following patch solves

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109052

The patch was successfully bootstrapped and tested on x86-64, i686, 
aarch64, and ppc64le.
commit 57688950b9328cbb4a9c21eb3199f9132b5119d3
Author: Vladimir N. Makarov 
Date:   Fri Mar 17 08:58:58 2023 -0400

LRA: Implement combining secondary memory reload and original insn

LRA creates secondary memory reload insns but does not try to combine them
with the original insn.  This patch implements simple insn combining
for such cases in LRA.

PR rtl-optimization/109052

gcc/ChangeLog:

* lra-constraints.cc: Include hooks.h.
(combine_reload_insn): New function.
(lra_constraints): Call it.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr109052.c: New.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index c38566a7451..95b534e1a70 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -110,6 +110,7 @@
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
+#include "hooks.h"
 #include "target.h"
 #include "rtl.h"
 #include "tree.h"
@@ -5001,6 +5002,96 @@ contains_reloaded_insn_p (int regno)
   return false;
 }
 
+/* Try combine secondary memory reload insn FROM for insn TO into TO insn.
+   FROM should be a load insn (usually a secondary memory reload insn).  Return
+   TRUE in case of success.  */
+static bool
+combine_reload_insn (rtx_insn *from, rtx_insn *to)
+{
+  bool ok_p;
+  rtx_insn *saved_insn;
+  rtx set, from_reg, to_reg, op;
+  enum reg_class to_class, from_class;
+  int n, nop;
+  signed char changed_nops[MAX_RECOG_OPERANDS + 1];
+  lra_insn_recog_data_t id = lra_get_insn_recog_data (to);
+  struct lra_static_insn_data *static_id = id->insn_static_data;
+  
+  /* Check conditions for second memory reload and original insn:  */
+  if ((targetm.secondary_memory_needed
+   == hook_bool_mode_reg_class_t_reg_class_t_false)
+  || NEXT_INSN (from) != to || CALL_P (to)
+  || id->used_insn_alternative == LRA_UNKNOWN_ALT
+  || (set = single_set (from)) == NULL_RTX)
+return false;
+  from_reg = SET_DEST (set);
+  to_reg = SET_SRC (set);
+  /* Ignore optional reloads: */
+  if (! REG_P (from_reg) || ! REG_P (to_reg)
+  || bitmap_bit_p (_optional_reload_pseudos, REGNO (from_reg)))
+return false;
+  to_class = lra_get_allocno_class (REGNO (to_reg));
+  from_class = lra_get_allocno_class (REGNO (from_reg));
+  /* Check that reload insn is a load:  */
+  if (to_class != NO_REGS || from_class == NO_REGS)
+return false;
+  for (n = nop = 0; nop < static_id->n_operands; nop++)
+{
+  if (static_id->operand[nop].type != OP_IN)
+	continue;
+  op = *id->operand_loc[nop];
+  if (!REG_P (op) || REGNO (op) != REGNO (from_reg))
+	continue;
+  *id->operand_loc[nop] = to_reg;
+  changed_nops[n++] = nop;
+}
+  changed_nops[n] = -1;
+  lra_update_dups (id, changed_nops);
+  lra_update_insn_regno_info (to);
+  ok_p = recog_memoized (to) >= 0;
+  if (ok_p)
+{
+  /* Check that combined insn does not need any reloads: */
+  saved_insn = curr_insn;
+  curr_insn = to;
+  curr_id = lra_get_insn_recog_data (curr_insn);
+  curr_static_id = curr_id->insn_static_data;
+  ok_p = !curr_insn_transform (true);
+  curr_insn = saved_insn;
+  curr_id = lra_get_insn_recog_data (curr_insn);
+  curr_static_id = curr_id->insn_static_data;
+}
+  if (ok_p)
+{
+  id->used_insn_alternative = -1;
+  lra_push_insn_and_update_insn_regno_info (to);
+  if (lra_dump_file != NULL)
+	{
+	  fprintf (lra_dump_file, "Use combined insn:\n");
+	  dump_insn_slim (lra_dump_file, to);
+	}
+  return true;
+}
+  if (lra_dump_file != NULL)
+{
+  fprintf (lra_dump_file, "Failed combined insn:\n");
+  dump_insn_slim (lra_dump_file, to);
+}
+  for (int i = 0; i < n; i++)
+{
+  nop = changed_nops[i];
+  *id->operand_loc[nop] = from_reg;
+}
+  lra_update_dups (id, changed_nops);
+  lra_update_insn_regno_info (to);
+  if (lra_dump_file != NULL)
+{
+  fprintf (lra_dump_file, "Restoring insn after failed combining:\n");
+  dump_insn_slim (lra_dump_file, to);
+}
+  return false;
+}
+
 /* Entry function of LRA constraint pass.  Return true if the
constraint pass did change the code.	 */
 bool
@@ -5010,6 +5101,7 @@ lra_constraints (bool first_p)
   int i, hard_regno, new_insns_num;
   unsigned int min_len, new_min_len, uid;
   rtx set, x, reg, dest_reg;
+  rtx_insn *original_insn;
   basic_block last_bb;
   bitmap_iterator bi;
 
@@ -5119,6 +5211,7 @@ lra_constraints (bool first_p)
   new_insns_num = 0;
   last_bb = NULL;
   changed_p = false;
+  original_insn = NULL;
   while ((new_min_len = lra_insn_stack_length ()) != 0)
 {
   curr_insn = lra_pop_insn ();
@@ -5133,7 +5226,12 @@ lra_constraints (bool first_p)
 	{
 	  min_len = new_min_len;
 	  new_insns_num = 0;
+	  

[pushed] [PR108999] LRA: For clobbered regs use operand mode instead of the biggest mode

2023-03-09 Thread Vladimir Makarov via Gcc-patches

The following patch solves

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108999

The patch was successfully bootstrapped and tested on i686, x86-64, 
aarch64, and ppc64 be/le.
commit 3c75631fc09a22f2513fab80ef502c2a8b0f9121
Author: Vladimir N. Makarov 
Date:   Thu Mar 9 08:41:09 2023 -0500

LRA: For clobbered regs use operand mode instead of the biggest mode

LRA is too conservative in the calculation of conflicts with clobbered regs,
as it uses the biggest access mode.  This results in missing possible reg
coalescing and worse code.  This patch solves the problem.

PR rtl-optimization/108999

gcc/ChangeLog:

* lra-constraints.cc (process_alt_operands): Use operand modes for
clobbered regs instead of the biggest access mode.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/pr108999.c: New.

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index dbfaf0485a5..c38566a7451 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -3108,7 +3108,8 @@ process_alt_operands (int only_alternative)
 	  lra_assert (operand_reg[i] != NULL_RTX);
 	  clobbered_hard_regno = hard_regno[i];
 	  CLEAR_HARD_REG_SET (temp_set);
-	  add_to_hard_reg_set (&temp_set, biggest_mode[i], clobbered_hard_regno);
+	  add_to_hard_reg_set (&temp_set, GET_MODE (*curr_id->operand_loc[i]),
+			   clobbered_hard_regno);
 	  first_conflict_j = last_conflict_j = -1;
 	  for (j = 0; j < n_operands; j++)
 	if (j == i
diff --git a/gcc/testsuite/gcc.target/aarch64/pr108999.c b/gcc/testsuite/gcc.target/aarch64/pr108999.c
new file mode 100644
index 000..a34db85be83
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr108999.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+sve" } */
+#include 
+
+void subreg_coalesce5 (
+svbool_t pg, int64_t* base, int n,
+int64_t *in1, int64_t *in2, int64_t*out
+)
+{
+svint64x2_t result = svld2_s64 (pg, base);
+
+for (int i = 0; i < n; i += 1) {
+svint64_t v18 = svld1_s64(pg, in1 + i);
+svint64_t v19 = svld1_s64(pg, in2 + i);
+result.__val[0] = svmad_s64_z(pg, v18, v19, result.__val[0]);
+result.__val[1] = svmad_s64_z(pg, v18, v19, result.__val[1]);
+}
+svst2_s64(pg, out, result);
+}
+
+/* { dg-final { scan-assembler-not {[ \t]*mov[ \t]*z[0-9]+\.d} } } */


[pushed][PR90706] IRA: Use minimal cost for hard register movement

2023-03-02 Thread Vladimir Makarov via Gcc-patches

The following patch is for

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90706

The patch was successfully bootstrapped and tested on i686, x86-64, 
aarch64, ppc64le.


commit 23661e39df76e07fb4ce1ea015379c7601d947ef
Author: Vladimir N. Makarov 
Date:   Thu Mar 2 16:29:05 2023 -0500

IRA: Use minimal cost for hard register movement

This is the 2nd attempt to fix PR90706.  IRA calculates wrong AVR
costs for moving general hard regs in SFmode.  This was the reason for
spilling a pseudo in the PR.  In this patch we use the smaller of a hard
reg's move costs in its natural mode and in the operand mode.
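
To make the MIN rule concrete with invented numbers (the real avr cost
tables differ):

  move cost reported for SFmode (operand mode)   : 8
  move cost reported for QImode (natural mode)   : 2
  cost recorded after the patch                  : MIN (8, 2) = 2

so a general-reg-to-general-reg SFmode copy no longer looks as expensive as
going through memory, and the pseudo from the PR stays in registers.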

PR rtl-optimization/90706

gcc/ChangeLog:

* ira-costs.cc: Include print-rtl.h.
(record_reg_classes, scan_one_insn): Add code to print debug info.
(record_operand_costs): Find and use smaller cost for hard reg
move.

gcc/testsuite/ChangeLog:

* gcc.target/avr/pr90706.c: New.

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index 4c28171f27d..c0fdef807dd 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -34,6 +34,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "ira-int.h"
 #include "addresses.h"
 #include "reload.h"
+#include "print-rtl.h"
 
 /* The flags is set up every time when we calculate pseudo register
classes through function ira_set_pseudo_classes.  */
@@ -503,6 +504,18 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
   int insn_allows_mem[MAX_RECOG_OPERANDS];
   move_table *move_in_cost, *move_out_cost;
   short (*mem_cost)[2];
+  const char *p;
+
+  if (ira_dump_file != NULL && internal_flag_ira_verbose > 5)
+{
+  fprintf (ira_dump_file, "Processing insn %u", INSN_UID (insn));
+  if (INSN_CODE (insn) >= 0
+	  && (p = get_insn_name (INSN_CODE (insn))) != NULL)
+	fprintf (ira_dump_file, " {%s}", p);
+  fprintf (ira_dump_file, " (freq=%d)\n",
+	   REG_FREQ_FROM_BB (BLOCK_FOR_INSN (insn)));
+  dump_insn_slim (ira_dump_file, insn);
+  }
 
   for (i = 0; i < n_ops; i++)
 insn_allows_mem[i] = 0;
@@ -526,6 +539,21 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
 	  continue;
 	}
 
+  if (ira_dump_file != NULL && internal_flag_ira_verbose > 5)
+	{
+	  fprintf (ira_dump_file, "  Alt %d:", alt);
+	  for (i = 0; i < n_ops; i++)
+	{
+	  p = constraints[i];
+	  if (*p == '\0')
+		continue;
+	  fprintf (ira_dump_file, "  (%d) ", i);
+	  for (; *p != '\0' && *p != ',' && *p != '#'; p++)
+		fputc (*p, ira_dump_file);
+	}
+	  fprintf (ira_dump_file, "\n");
+	}
+
   for (i = 0; i < n_ops; i++)
 	{
 	  unsigned char c;
@@ -593,12 +621,16 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
 		 register, this alternative can't be used.  */
 
 		  if (classes[j] == NO_REGS)
-		alt_fail = 1;
-		  /* Otherwise, add to the cost of this alternative
-		 the cost to copy the other operand to the hard
-		 register used for this operand.  */
+		{
+		  alt_fail = 1;
+		}
 		  else
-		alt_cost += copy_cost (ops[j], mode, classes[j], 1, NULL);
+		/* Otherwise, add to the cost of this alternative the cost
+		   to copy the other operand to the hard register used for
+		   this operand.  */
+		{
+		  alt_cost += copy_cost (ops[j], mode, classes[j], 1, NULL);
+		}
 		}
 	  else
 		{
@@ -1021,18 +1053,45 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
   for (i = 0; i < n_ops; i++)
 	if (REG_P (ops[i]) && REGNO (ops[i]) >= FIRST_PSEUDO_REGISTER)
 	  {
+	int old_cost;
+	bool cost_change_p = false;
 	struct costs *pp = op_costs[i], *qq = this_op_costs[i];
 	int *pp_costs = pp->cost, *qq_costs = qq->cost;
 	int scale = 1 + (recog_data.operand_type[i] == OP_INOUT);
 	cost_classes_t cost_classes_ptr
 	  = regno_cost_classes[REGNO (ops[i])];
 
-	pp->mem_cost = MIN (pp->mem_cost,
+	old_cost = pp->mem_cost;
+	pp->mem_cost = MIN (old_cost,
 (qq->mem_cost + op_cost_add) * scale);
 
+	if (ira_dump_file != NULL && internal_flag_ira_verbose > 5
+		&& pp->mem_cost < old_cost)
+	  {
+		cost_change_p = true;
+		fprintf (ira_dump_file, "op %d(r=%u) new costs MEM:%d",
+			 i, REGNO(ops[i]), pp->mem_cost);
+	  }
 	for (k = cost_classes_ptr->num - 1; k >= 0; k--)
-	  pp_costs[k]
-		= MIN (pp_costs[k], (qq_costs[k] + op_cost_add) * scale);
+	  {
+		old_cost = pp_costs[k];
+		pp_costs[k]
+		  = MIN (old_cost, (qq_costs[k] + op_cost_add) * scale);
+		if (ira_dump_file != NULL && internal_flag_ira_verbose > 5
+		&& pp_costs[k] < old_cost)
+		  {
+		if (!cost_change_p)
+		  fprintf (ira_dump_file, "op %d(r=%u) new costs",
+			   i, REGNO(ops[i]));
+		cost_change_p = true;
+		fprintf (ira_dump_file, " %s:%d",
+			 reg_class_names[cost_classes_ptr->classes[k]],
+			 pp_costs[k]);
+		  }
+	  }
+	if (ira_dump_file != NULL && internal_flag_ira_verbose > 5

[pushed] [PR108774] RA: Clear reg equiv caller_save_p flag when clearing defined_p flag

2023-02-13 Thread Vladimir Makarov via Gcc-patches

The following patch solves

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108774

The patch was successfully bootstrapped and tested on i686, x86_64, and 
aarch64.
commit a33e3dcbd15e73603796e30b5eeec11a0c8bacec
Author: Vladimir N. Makarov 
Date:   Mon Feb 13 16:05:04 2023 -0500

RA: Clear reg equiv caller_save_p flag when clearing defined_p flag

IRA can invalidate an initially set up equivalence in setup_reg_equiv.
The caller_save_p flag was not cleared during the invalidation although
init_insns was cleared, which resulted in a segmentation fault in
get_equiv.  Clearing the flag solves the problem.  As an extra
precaution I also added clearing of the flag in other places, although
it might not be necessary.
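
The invariant the patch enforces can be shown with a minimal standalone
sketch (the struct below is a simplified stand-in for ira_reg_equiv_s, not
the real one): once the equivalence is invalidated, caller_save_p must be
cleared together with defined_p, otherwise a later reader may follow
init_insns that no longer exist.

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct reg_equiv              /* simplified stand-in for ira_reg_equiv_s */
{
  bool defined_p;             /* general equivalence usable */
  bool caller_save_p;         /* equivalence usable only for caller saves */
  void *init_insns;           /* initializing insns; NULL after invalidation */
};

static void
invalidate (struct reg_equiv *e)
{
  e->defined_p = false;
  e->caller_save_p = false;   /* the fix: cleared together with defined_p */
  e->init_insns = NULL;
}

int
main (void)
{
  int dummy;
  struct reg_equiv e = { true, true, &dummy };

  invalidate (&e);
  /* If caller_save_p stayed set while init_insns was gone, a consumer like
     get_equiv would chase a missing insn list; now the fields agree.  */
  assert (!e.caller_save_p && e.init_insns == NULL);
  return 0;
}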

PR rtl-optimization/108774

gcc/ChangeLog:

* ira.cc (ira_update_equiv_info_by_shuffle_insn): Clear equiv
caller_save_p flag when clearing defined_p flag.
(setup_reg_equiv): Ditto.
* lra-constraints.cc (lra_constraints): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr108774.c: New.

diff --git a/gcc/ira.cc b/gcc/ira.cc
index 9f9af808f63..6c7f4901e4c 100644
--- a/gcc/ira.cc
+++ b/gcc/ira.cc
@@ -2725,6 +2725,7 @@ ira_update_equiv_info_by_shuffle_insn (int to_regno, int from_regno, rtx_insn *i
 	  return;
 	}
   ira_reg_equiv[to_regno].defined_p = false;
+  ira_reg_equiv[to_regno].caller_save_p = false;
   ira_reg_equiv[to_regno].memory
 	= ira_reg_equiv[to_regno].constant
 	= ira_reg_equiv[to_regno].invariant
@@ -4193,6 +4194,7 @@ setup_reg_equiv (void)
 			if (ira_reg_equiv[i].memory == NULL_RTX)
 			  {
 			ira_reg_equiv[i].defined_p = false;
+			ira_reg_equiv[i].caller_save_p = false;
 			ira_reg_equiv[i].init_insns = NULL;
 			break;
 			  }
@@ -4203,6 +4205,7 @@ setup_reg_equiv (void)
 	  }
 	  }
 	ira_reg_equiv[i].defined_p = false;
+	ira_reg_equiv[i].caller_save_p = false;
 	ira_reg_equiv[i].init_insns = NULL;
 	break;
   }
diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index dd4f68bbfc0..dbfaf0485a5 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -5100,7 +5100,8 @@ lra_constraints (bool first_p)
 			 && (targetm.preferred_reload_class
 			 (x, lra_get_allocno_class (i)) == NO_REGS))
			|| contains_symbol_ref_p (x))
-	  ira_reg_equiv[i].defined_p = false;
+	  ira_reg_equiv[i].defined_p
+		= ira_reg_equiv[i].caller_save_p = false;
 	if (contains_reg_p (x, false, true))
 	  ira_reg_equiv[i].profitable_p = false;
 	if (get_equiv (reg) != reg)
diff --git a/gcc/testsuite/gcc.target/i386/pr108774.c b/gcc/testsuite/gcc.target/i386/pr108774.c
new file mode 100644
index 000..482bc490cde
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108774.c
@@ -0,0 +1,11 @@
+/* PR target/108774 */
+/* { dg-do compile  { target x86_64-*-* } } */
+/* { dg-options "-Os -ftrapv -mcmodel=large" } */
+
+int i, j;
+
+void
+foo (void)
+{
+  i = ((1 << j) - 1) >> j;
+}


[pushed] [PR108754] RA: Use caller save equivalent memory only for LRA

2023-02-10 Thread Vladimir Makarov via Gcc-patches

The following patch should solve

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108754

The patch simply switches off a new optimization for targets using the 
old reload pass.


The patch was successfully bootstrapped on x86-64.

commit 7757567358a84c3774cb972350bd7ea299daaa8d
Author: Vladimir N. Makarov 
Date:   Fri Feb 10 12:17:07 2023 -0500

RA: Use caller save equivalent memory only for LRA

Recently I submitted a patch to reuse memory with a constant address
for the caller-saves optimization around constant or pure function
calls.  It seems to work only for targets using LRA instead of the old
reload pass, so the patch switches off this optimization when the old
reload pass is used.

PR middle-end/108754

gcc/ChangeLog:

* ira.cc (update_equiv_regs): Set up ira_reg_equiv for
valid_combine only when ira_use_lra_p is true.

diff --git a/gcc/ira.cc b/gcc/ira.cc
index d0b6ea062e8..9f9af808f63 100644
--- a/gcc/ira.cc
+++ b/gcc/ira.cc
@@ -3773,7 +3773,7 @@ update_equiv_regs (void)
 		{
 		  note = set_unique_reg_note (insn, REG_EQUIV, replacement);
 		}
-		  else
+		  else if (ira_use_lra_p)
 		{
 		  /* We still can use this equivalence for caller save
 			 optimization in LRA.  Mark this.  */


[pushed] [PR108500] RA: Use simple LRA for huge functions

2023-02-10 Thread Vladimir Makarov via Gcc-patches

The following patch is for

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108500

The patch improves compilation speed.  Compilation time of the biggest 
test in the PR decreases from 1235s to 709s.


The patch was successfully bootstrapped on x86-64.
commit 02371cdd755d2b53fb580d3e8209c44e0c45c337
Author: Vladimir N. Makarov 
Date:   Fri Feb 10 11:12:37 2023 -0500

RA: Use simple LRA for huge functions

The PR108500 test contains a huge function, and RA spends a lot of time
compiling the test with -O0.  The patch decreases compilation time
considerably for huge functions: compilation time for the PR test drops
from 1235s to 709s on an Intel i7-13600K.
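
The heuristic in the diff below can be read as the following standalone
sketch (not GCC code; the pseudo, block and insn counts are invented, and
param_ira_simple_lra_insn_threshold defaults to 1000, i.e. roughly one
million insns):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  unsigned num_used_regs = 120000;   /* pseudos with a def or use */
  unsigned num_blocks = 800;         /* basic blocks in the function */
  uint64_t max_uid = 1500000;        /* rough stand-in for the insn count */
  uint64_t insn_threshold = 1000;    /* the param, in units of 1K insns */

  /* Simple LRA is chosen when pseudos times blocks is large (>= 2^26) or
     when the insn count exceeds threshold * 1000.  */
  bool lra_simple_p
    = num_used_regs >= (1u << 26) / num_blocks
      || max_uid > insn_threshold * 1000;

  printf ("simple LRA: %s\n", lra_simple_p ? "yes" : "no");
  return 0;
}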

PR tree-optimization/108500

gcc/ChangeLog:

* params.opt (ira-simple-lra-insn-threshold): Add new param.
* ira.cc (ira): Use the param to switch on simple LRA.

diff --git a/gcc/ira.cc b/gcc/ira.cc
index 6143db06c52..d0b6ea062e8 100644
--- a/gcc/ira.cc
+++ b/gcc/ira.cc
@@ -5624,12 +5624,16 @@ ira (FILE *f)
 if (DF_REG_DEF_COUNT (i) || DF_REG_USE_COUNT (i))
   num_used_regs++;
 
-  /* If there are too many pseudos and/or basic blocks (e.g. 10K
- pseudos and 10K blocks or 100K pseudos and 1K blocks), we will
- use simplified and faster algorithms in LRA.  */
+  /* If there are too many pseudos and/or basic blocks (e.g. 10K pseudos and
+ 10K blocks or 100K pseudos and 1K blocks) or we have too many function
+ insns, we will use simplified and faster algorithms in LRA.  */
   lra_simple_p
-= ira_use_lra_p
-  && num_used_regs >= (1U << 26) / last_basic_block_for_fn (cfun);
+= (ira_use_lra_p
+   && (num_used_regs >= (1U << 26) / last_basic_block_for_fn (cfun)
+   /* max uid is a good evaluation of the number of insns as most
+  optimizations are done on tree-SSA level.  */
+   || ((uint64_t) get_max_uid ()
+	   > (uint64_t) param_ira_simple_lra_insn_threshold * 1000)));
 
   if (lra_simple_p)
 {
diff --git a/gcc/params.opt b/gcc/params.opt
index 8a128c321c9..c7913d9063a 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -302,6 +302,10 @@ The number of registers in each class kept unused by loop invariant motion.
 Common Joined UInteger Var(param_ira_max_conflict_table_size) Init(1000) Param Optimization
 Max size of conflict table in MB.
 
+-param=ira-simple-lra-insn-threshold=
+Common Joined UInteger Var(param_ira_simple_lra_insn_threshold) Init(1000) Param Optimization
+Approximate function insn number in 1K units triggering simple local RA.
+
 -param=ira-max-loops-num=
 Common Joined UInteger Var(param_ira_max_loops_num) Init(100) Param Optimization
 Max loops number for regional RA.


[pushed] [PR103541] RA: Implement reuse of equivalent memory for caller saves optimization (version 2)

2023-02-09 Thread Vladimir Makarov via Gcc-patches

This is another try to solve

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103541

The patch was successfully bootstrapped (--enable-languages=all) and
tested on x86, x86-64, and aarch64.
commit 1ad898d18904ac68432ba9b8ffa2b083d007cc2d
Author: Vladimir N. Makarov 
Date:   Thu Feb 9 15:18:48 2023 -0500

RA: Implement reuse of equivalent memory for caller saves optimization (2nd version)

The test pr103541.c shows an opportunity to reuse memory with a constant
address for the caller-saves optimization around a constant or pure function
call.  The patch implements this memory reuse.
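
The kind of code that benefits looks roughly like this (a hand-written
illustration in the spirit of the test, not the actual pr103541.c; the names
are invented):

/* 'scale' is loaded from a fixed-address global and only read afterwards,
   so its register copy is equivalent to that memory.  Because the call is
   const, the memory cannot change across it, and LRA can simply reload the
   value from 'scale' instead of saving and restoring it through a stack
   slot.  */
extern int expensive (int) __attribute__ ((const));

int scale;                      /* memory with a constant address */

int
apply (int x)
{
  int s = scale;                /* register value equivalent to MEM[&scale] */
  int y = expensive (x);        /* const call clobbers call-used regs */
  return y * s;                 /* 's' can be re-read from 'scale' here */
}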

PR rtl-optimization/103541
PR rtl-optimization/108711

gcc/ChangeLog:

* ira.h (struct ira_reg_equiv_s): Add new field caller_save_p.
* ira.cc (validate_equiv_mem): Check memref address variance.
(no_equiv): Clear caller_save_p flag.
(update_equiv_regs): Define caller save equivalence for
valid_combine.
(setup_reg_equiv): Clear defined_p flag for caller save equivalence.
* lra-constraints.cc (lra_copy_reg_equiv): Add new arg
call_save_p.  Use caller save equivalence depending on the arg.
(split_reg): Adjust the call.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr103541.c: New.
* g++.target/i386/pr108711.C: New.

diff --git a/gcc/ira.cc b/gcc/ira.cc
index 66df03e8a59..6143db06c52 100644
--- a/gcc/ira.cc
+++ b/gcc/ira.cc
@@ -3070,6 +3070,8 @@ validate_equiv_mem_from_store (rtx dest, const_rtx set ATTRIBUTE_UNUSED,
 info->equiv_mem_modified = true;
 }
 
+static int equiv_init_varies_p (rtx x);
+
 enum valid_equiv { valid_none, valid_combine, valid_reload };
 
 /* Verify that no store between START and the death of REG invalidates
@@ -3113,7 +3115,8 @@ validate_equiv_mem (rtx_insn *start, rtx reg, rtx memref)
 	 been changed and all hell breaks loose.  */
 	  ret = valid_combine;
 	  if (!MEM_READONLY_P (memref)
-	  && !RTL_CONST_OR_PURE_CALL_P (insn))
+	  && (!RTL_CONST_OR_PURE_CALL_P (insn)
+		  || equiv_init_varies_p (XEXP (memref, 0))))
 	return valid_none;
 	}
 
@@ -3414,6 +3417,7 @@ no_equiv (rtx reg, const_rtx store ATTRIBUTE_UNUSED,
   if (reg_equiv[regno].is_arg_equivalence)
 return;
   ira_reg_equiv[regno].defined_p = false;
+  ira_reg_equiv[regno].caller_save_p = false;
   ira_reg_equiv[regno].init_insns = NULL;
   for (; list; list = list->next ())
 {
@@ -3766,7 +3770,18 @@ update_equiv_regs (void)
 		{
 		  replacement = copy_rtx (SET_SRC (set));
 		  if (validity == valid_reload)
-		note = set_unique_reg_note (insn, REG_EQUIV, replacement);
+		{
+		  note = set_unique_reg_note (insn, REG_EQUIV, replacement);
+		}
+		  else
+		{
+		  /* We still can use this equivalence for caller save
+			 optimization in LRA.  Mark this.  */
+		  ira_reg_equiv[regno].caller_save_p = true;
+		  ira_reg_equiv[regno].init_insns
+			= gen_rtx_INSN_LIST (VOIDmode, insn,
+	 ira_reg_equiv[regno].init_insns);
+		}
 		}
 	}
 
@@ -4156,7 +4171,7 @@ setup_reg_equiv (void)
 		   legitimate, we ignore such REG_EQUIV notes.  */
 		if (memory_operand (x, VOIDmode))
 		  {
-		ira_reg_equiv[i].defined_p = true;
+		ira_reg_equiv[i].defined_p = !ira_reg_equiv[i].caller_save_p;
 		ira_reg_equiv[i].memory = x;
 		continue;
 		  }
diff --git a/gcc/ira.h b/gcc/ira.h
index 58b50dbe8a2..3d35025a46e 100644
--- a/gcc/ira.h
+++ b/gcc/ira.h
@@ -175,8 +175,11 @@ extern struct target_ira *this_target_ira;
 /* Major structure describing equivalence info for a pseudo.  */
 struct ira_reg_equiv_s
 {
-  /* True if we can use this equivalence.  */
+  /* True if we can use this as a general equivalence.  */
   bool defined_p;
+  /* True if we can use this equivalence only for caller save/restore
+ location.  */
+  bool caller_save_p;
   /* True if the usage of the equivalence is profitable.  */
   bool profitable_p;
   /* Equiv. memory, constant, invariant, and initializing insns of
diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 7bffbc07ee2..dd4f68bbfc0 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -5771,14 +5771,17 @@ choose_split_class (enum reg_class allocno_class,
   return best_cl;
 }
 
-/* Copy any equivalence information from ORIGINAL_REGNO to NEW_REGNO.
-   It only makes sense to call this function if NEW_REGNO is always
-   equal to ORIGINAL_REGNO.  */
+/* Copy any equivalence information from ORIGINAL_REGNO to NEW_REGNO.  It only
+   makes sense to call this function if NEW_REGNO is always equal to
+   ORIGINAL_REGNO.  Set up defined_p flag when caller_save_p flag is set up and
+   CALL_SAVE_P is true.  */
 
 static void
-lra_copy_reg_equiv (unsigned int new_regno, unsigned int original_regno)
+lra_copy_reg_equiv (unsigned int new_regno, unsigned int original_regno,
+		bool call_save_p)
 {
-  if (!ira_reg_equiv[original_regno].defined_p)
