Re: [PATCH] Decrease compile time memory with heavy find_base_{value,term} on i?86/x86_64 (PR rtl-optimization/63191, take 2)

2017-03-22 Thread Bernd Schmidt

On 03/22/2017 04:38 PM, Uros Bizjak wrote:


LGTM, but I don't want to step on Bernd's toes, so let's wait for his opinion.


I was waiting for yours really, that's the one that counts.


Bernd



Re: [PATCH] Decrease compile time memory with heavy find_base_{value,term} on i?86/x86_64 (PR rtl-optimization/63191, take 2)

2017-03-22 Thread Uros Bizjak
On Mon, Mar 20, 2017 at 12:15 PM, Jakub Jelinek  wrote:
> Hi!
>
> On Fri, Mar 10, 2017 at 07:57:39PM +0100, Jakub Jelinek wrote:
>> On Fri, Mar 10, 2017 at 07:52:37PM +0100, Bernd Schmidt wrote:
>> > On 03/10/2017 06:53 PM, Jakub Jelinek wrote:
>> > > +
>> > > +template 
>> > > +static inline rtx
>> > > +ix86_delegitimize_address_tmpl (rtx x)
>> > >  {
>> >
>> > Why is this a template and not a function arg?
>>
>> That was just an attempt to ensure that the compiler actually
>> either inlines it, or doesn't share code between the two versions, so the
>> base_term_p argument isn't checked at runtime.
>> But, as I said, I have no problem changing that, i.e.
>> remove the template line, s/_tmpl/_1/, add , bool base_term_p
>> and tweak both callers.
>
> Here is the other variant of the patch with inline function instead of
> template.  Also bootstrapped/regtested on x86_64-linux and i686-linux, ok
> for trunk (or is the other patch ok)?
>
> 2017-03-20  Jakub Jelinek  
>
> PR rtl-optimization/63191
> * config/i386/i386.c (ix86_delegitimize_address): Turn into small
> wrapper function, moved the whole old content into ...
> (ix86_delegitimize_address_1): ... this.  New inline function.
> (ix86_find_base_term): Use ix86_delegitimize_address_1 with
> true as last argument instead of ix86_delegitimize_address.

LGTM, but I don't want to step on Bernd's toes, so let's wait for his opinion.

Thanks,
Uros.

> --- gcc/config/i386/i386.c.jj   2017-03-07 20:04:52.0 +0100
> +++ gcc/config/i386/i386.c  2017-03-10 14:46:24.351775710 +0100
> @@ -17255,10 +17255,16 @@ ix86_delegitimize_tls_address (rtx orig_
> has a different PIC label for each routine but the DWARF debugging
> information is not associated with any particular routine, so it's
> necessary to remove references to the PIC label from RTL stored by
> -   the DWARF output code.  */
> +   the DWARF output code.
>
> -static rtx
> -ix86_delegitimize_address (rtx x)
> +   This helper is used in the normal ix86_delegitimize_address
> +   entrypoint (e.g. used in the target delegitimization hook) and
> +   in ix86_find_base_term.  As compile time memory optimization, we
> +   avoid allocating rtxes that will not change anything on the outcome
> +   of the callers (find_base_value and find_base_term).  */
> +
> +static inline rtx
> +ix86_delegitimize_address_1 (rtx x, bool base_term_p)
>  {
>rtx orig_x = delegitimize_mem_from_attrs (x);
>/* addend is NULL or some rtx if x is something+GOTOFF where
> @@ -17285,6 +17291,10 @@ ix86_delegitimize_address (rtx x)
>&& GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
>&& XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
>  {
> + /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
> +base.  A CONST can't be arg_pointer_rtx based.  */
> + if (base_term_p && MEM_P (orig_x))
> +   return orig_x;
>   rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
>   x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
>   if (MEM_P (orig_x))
> @@ -17361,7 +17371,9 @@ ix86_delegitimize_address (rtx x)
>if (! result)
>  return ix86_delegitimize_tls_address (orig_x);
>
> -  if (const_addend)
> +  /* For (PLUS something CONST_INT) both find_base_{value,term} just
> + recurse on the first operand.  */
> +  if (const_addend && !base_term_p)
>  result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, 
> const_addend));
>if (reg_addend)
>  result = gen_rtx_PLUS (Pmode, reg_addend, result);
> @@ -17399,6 +17411,14 @@ ix86_delegitimize_address (rtx x)
>return result;
>  }
>
> +/* The normal instantiation of the above template.  */
> +
> +static rtx
> +ix86_delegitimize_address (rtx x)
> +{
> +  return ix86_delegitimize_address_1 (x, false);
> +}
> +
>  /* If X is a machine specific address (i.e. a symbol or label being
> referenced as a displacement from the GOT implemented using an
> UNSPEC), then return the base term.  Otherwise return X.  */
> @@ -17424,7 +17444,7 @@ ix86_find_base_term (rtx x)
>return XVECEXP (term, 0, 0);
>  }
>
> -  return ix86_delegitimize_address (x);
> +  return ix86_delegitimize_address_1 (x, true);
>  }
>
>  static void
>
>
> Jakub


[PATCH] Decrease compile time memory with heavy find_base_{value,term} on i?86/x86_64 (PR rtl-optimization/63191, take 2)

2017-03-20 Thread Jakub Jelinek
Hi!

On Fri, Mar 10, 2017 at 07:57:39PM +0100, Jakub Jelinek wrote:
> On Fri, Mar 10, 2017 at 07:52:37PM +0100, Bernd Schmidt wrote:
> > On 03/10/2017 06:53 PM, Jakub Jelinek wrote:
> > > +
> > > +template 
> > > +static inline rtx
> > > +ix86_delegitimize_address_tmpl (rtx x)
> > >  {
> > 
> > Why is this a template and not a function arg?
> 
> That was just an attempt to ensure that the compiler actually
> either inlines it, or doesn't share code between the two versions, so the
> base_term_p argument isn't checked at runtime.
> But, as I said, I have no problem changing that, i.e.
> remove the template line, s/_tmpl/_1/, add , bool base_term_p
> and tweak both callers.

Here is the other variant of the patch with inline function instead of
template.  Also bootstrapped/regtested on x86_64-linux and i686-linux, ok
for trunk (or is the other patch ok)?

2017-03-20  Jakub Jelinek  

PR rtl-optimization/63191
* config/i386/i386.c (ix86_delegitimize_address): Turn into small
wrapper function, moved the whole old content into ...
(ix86_delegitimize_address_1): ... this.  New inline function.
(ix86_find_base_term): Use ix86_delegitimize_address_1 with
true as last argument instead of ix86_delegitimize_address.

--- gcc/config/i386/i386.c.jj   2017-03-07 20:04:52.0 +0100
+++ gcc/config/i386/i386.c  2017-03-10 14:46:24.351775710 +0100
@@ -17255,10 +17255,16 @@ ix86_delegitimize_tls_address (rtx orig_
has a different PIC label for each routine but the DWARF debugging
information is not associated with any particular routine, so it's
necessary to remove references to the PIC label from RTL stored by
-   the DWARF output code.  */
+   the DWARF output code.
 
-static rtx
-ix86_delegitimize_address (rtx x)
+   This helper is used in the normal ix86_delegitimize_address
+   entrypoint (e.g. used in the target delegitimization hook) and
+   in ix86_find_base_term.  As compile time memory optimization, we
+   avoid allocating rtxes that will not change anything on the outcome
+   of the callers (find_base_value and find_base_term).  */
+
+static inline rtx
+ix86_delegitimize_address_1 (rtx x, bool base_term_p)
 {
   rtx orig_x = delegitimize_mem_from_attrs (x);
   /* addend is NULL or some rtx if x is something+GOTOFF where
@@ -17285,6 +17291,10 @@ ix86_delegitimize_address (rtx x)
   && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
   && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
 {
+ /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
+base.  A CONST can't be arg_pointer_rtx based.  */
+ if (base_term_p && MEM_P (orig_x))
+   return orig_x;
  rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
  x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
  if (MEM_P (orig_x))
@@ -17361,7 +17371,9 @@ ix86_delegitimize_address (rtx x)
   if (! result)
 return ix86_delegitimize_tls_address (orig_x);
 
-  if (const_addend)
+  /* For (PLUS something CONST_INT) both find_base_{value,term} just
+ recurse on the first operand.  */
+  if (const_addend && !base_term_p)
 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
   if (reg_addend)
 result = gen_rtx_PLUS (Pmode, reg_addend, result);
@@ -17399,6 +17411,14 @@ ix86_delegitimize_address (rtx x)
   return result;
 }
 
+/* The normal instantiation of the above template.  */
+
+static rtx
+ix86_delegitimize_address (rtx x)
+{
+  return ix86_delegitimize_address_1 (x, false);
+}
+
 /* If X is a machine specific address (i.e. a symbol or label being
referenced as a displacement from the GOT implemented using an
UNSPEC), then return the base term.  Otherwise return X.  */
@@ -17424,7 +17444,7 @@ ix86_find_base_term (rtx x)
   return XVECEXP (term, 0, 0);
 }
 
-  return ix86_delegitimize_address (x);
+  return ix86_delegitimize_address_1 (x, true);
 }
 
 static void


Jakub


Re: [PATCH] Decrease compile time memory with heavy find_base_{value,term} on i?86/x86_64 (PR rtl-optimization/63191)

2017-03-10 Thread Jakub Jelinek
On Fri, Mar 10, 2017 at 07:52:37PM +0100, Bernd Schmidt wrote:
> On 03/10/2017 06:53 PM, Jakub Jelinek wrote:
> > +
> > +template 
> > +static inline rtx
> > +ix86_delegitimize_address_tmpl (rtx x)
> >  {
> 
> Why is this a template and not a function arg?

That was just an attempt to ensure that the compiler actually
either inlines it, or doesn't share code between the two versions, so the
base_term_p argument isn't checked at runtime.
But, as I said, I have no problem changing that, i.e.
remove the template line, s/_tmpl/_1/, add , bool base_term_p
and tweak both callers.

Jakub


Re: [PATCH] Decrease compile time memory with heavy find_base_{value,term} on i?86/x86_64 (PR rtl-optimization/63191)

2017-03-10 Thread Bernd Schmidt

On 03/10/2017 06:53 PM, Jakub Jelinek wrote:

+
+template 
+static inline rtx
+ix86_delegitimize_address_tmpl (rtx x)
 {


Why is this a template and not a function arg?


Bernd


[PATCH] Decrease compile time memory with heavy find_base_{value,term} on i?86/x86_64 (PR rtl-optimization/63191)

2017-03-10 Thread Jakub Jelinek
Hi!

On the testcase in the PR (too large for our testsuite) we waste several
gigabytes of memory from allocations in ix86_delegitimize_address called
from find_base_{value,term}.
E.g. for find_base_term
 (plus:SI (value:SI 1:1 @0x2c60f50/0x2c50f40)
 (const:SI (plus:SI (unspec:SI [
 (symbol_ref:SI ("_ZL2Zs") [flags 0x2] )
 ] UNSPEC_GOTOFF)
 (const_int 8 [0x8]
on which it returns
 (const:SI (plus:SI (symbol_ref:SI ("_ZL2Zs") [flags 0x2] )
 (const_int 8 [0x8])))
it needs to allocate the plus and const, but it is all wasted, the caller
for CONST just looks through it and for PLUS if the other operand is
CONST_INT also just recurses on the first operand.
The following patch fixes that by handling delegitimization slightly
differently when called from ix86_find_base_term - we don't allocate memory
that is known to be not useful to the caller.
The template is kind of forced inline, so that we don't slow down the normal
delegitimization (but if you prefer normal static inline with bool
base_term_p argument, it is easy to change that).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2017-03-10  Jakub Jelinek  

PR rtl-optimization/63191
* config/i386/i386.c (ix86_delegitimize_address): Turn into small
wrapper function, moved the whole old content into ...
(ix86_delegitimize_address_tmpl): ... this.  New template.
(ix86_find_base_term): Use ix86_delegitimize_address_tmpl
instead of ix86_delegitimize_address.

--- gcc/config/i386/i386.c.jj   2017-03-07 20:04:52.0 +0100
+++ gcc/config/i386/i386.c  2017-03-10 14:46:24.351775710 +0100
@@ -17255,10 +17255,17 @@ ix86_delegitimize_tls_address (rtx orig_
has a different PIC label for each routine but the DWARF debugging
information is not associated with any particular routine, so it's
necessary to remove references to the PIC label from RTL stored by
-   the DWARF output code.  */
+   the DWARF output code.
 
-static rtx
-ix86_delegitimize_address (rtx x)
+   This template is used in the normal ix86_delegitimize_address
+   entrypoint (e.g. used in the target delegitimization hook) and
+   in ix86_find_base_term.  As compile time memory optimization, we
+   avoid allocating rtxes that will not change anything on the outcome
+   of the callers (find_base_value and find_base_term).  */
+
+template 
+static inline rtx
+ix86_delegitimize_address_tmpl (rtx x)
 {
   rtx orig_x = delegitimize_mem_from_attrs (x);
   /* addend is NULL or some rtx if x is something+GOTOFF where
@@ -17285,6 +17292,10 @@ ix86_delegitimize_address (rtx x)
   && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
   && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
 {
+ /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
+base.  A CONST can't be arg_pointer_rtx based.  */
+ if (base_term_p && MEM_P (orig_x))
+   return orig_x;
  rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
  x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
  if (MEM_P (orig_x))
@@ -17361,7 +17372,9 @@ ix86_delegitimize_address (rtx x)
   if (! result)
 return ix86_delegitimize_tls_address (orig_x);
 
-  if (const_addend)
+  /* For (PLUS something CONST_INT) both find_base_{value,term} just
+ recurse on the first operand.  */
+  if (const_addend && !base_term_p)
 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
   if (reg_addend)
 result = gen_rtx_PLUS (Pmode, reg_addend, result);
@@ -17399,6 +17412,14 @@ ix86_delegitimize_address (rtx x)
   return result;
 }
 
+/* The normal instantiation of the above template.  */
+
+static rtx
+ix86_delegitimize_address (rtx x)
+{
+  return ix86_delegitimize_address_tmpl (x);
+}
+
 /* If X is a machine specific address (i.e. a symbol or label being
referenced as a displacement from the GOT implemented using an
UNSPEC), then return the base term.  Otherwise return X.  */
@@ -17424,7 +17445,7 @@ ix86_find_base_term (rtx x)
   return XVECEXP (term, 0, 0);
 }
 
-  return ix86_delegitimize_address (x);
+  return ix86_delegitimize_address_tmpl (x);
 }
 
 static void

Jakub