Re: Implement ggc_trim

2019-10-18 Thread Jakub Jelinek
On Fri, Oct 11, 2019 at 09:03:53AM +0200, Jan Hubicka wrote:
> Bootstrapped/regtested x86_64-linux, OK?
> 
>   * ggc-page.c (release_pages): Output statistics when !quiet_flag.
>   (ggc_collect): Dump later to not interfere with release_page dump.
>   (ggc_trim): New function.
>   * ggc-none.c (ggc_trim): New.
> 
>   * lto.c (lto_wpa_write_files): Call ggc_trim.

> @@ -1152,10 +1156,20 @@ release_pages (void)
>   *gp = g->next;
>   G.bytes_mapped -= g->alloc_size;
>   free (g->allocation);
> + n1 += g->alloc_size;
>}
>  else
>gp = &g->next;
>  #endif

This broke !defined(USING_MMAP) support, the second g->alloc_size read
is after the memory containing *g is freed.

Fixed thusly, tested with #undef USING_MMAP in the file (without the patch
self-test ICEs, with it succeeds), committed to trunk as obvious.

2019-10-18  Jakub Jelinek  

PR middle-end/92153
* ggc-page.c (release_pages): Read g->alloc_size before free rather
than after it.

--- gcc/ggc-page.c.jj   2019-10-11 14:10:44.987386981 +0200
+++ gcc/ggc-page.c  2019-10-18 19:13:59.458085610 +0200
@@ -1155,8 +1155,8 @@ release_pages (void)
   {
*gp = g->next;
G.bytes_mapped -= g->alloc_size;
-   free (g->allocation);
n1 += g->alloc_size;
+   free (g->allocation);
   }
 else
   gp = &g->next;


Jakub


Re: Implement ggc_trim

2019-10-11 Thread Richard Biener
On October 11, 2019 9:03:53 AM GMT+02:00, Jan Hubicka  wrote:
>Hi,
>this patch adds ggc_trim that releases free pages used by GGC memory
>to system.  This is useful to reduce memory footprint of WPA streaming:
>WPA streaming ought to not use any more GGC memory (patches in testing
>for that) and trimming the memory makes it available to fork used
>by stream out machinery.
>
>I collected some stats for cc1 for both GGC and heap (using mallinfo).
>Memory footprints are as follows:
>
>After streaming in global stream: 123MB GGC;  25MB of heap.
>After streaming in callgraph: 228MB GGC;  45MB of heap.
>After streaming in summaries: 373MB GGC; 126MB of heap.
>After symbol merging   : 348MB GGC; 130MB of heap.
>After IPA-ICF  : 501MB GGC; 160MB of heap. (this is all ICF)
>After IPA-CP   : 528MB GGC; 163MB of heap.
>After IPA-SRA  : 532MB GGC; 163MB of heap.
>After Inline   : 644MB GGC; 173MB of heap
>   This is after collecting of 118MB of
>   garbage and returning 740k to system
>   by madvise_dontneed
>After ipa-reference: 644MB GGC; 370MB of heap
>   I checked this all goes into the
>   bitmaps; I have WIP patch for that
>After releasing summaries  : 431MB GGC; 383MB of heap
>   Trim releases 43MB by unmap
>   and 321MB by madvise_dontneed
>
>At least I learnt a new fact about ipa-reference consuming 200MB of
>memory which was not obvious from our detailed mem stats.
>
>I think the lowest hanging fruit after this patch is to add
>malloc_madvise which further reduces footprint and fix ipa-reference.
>Hopefully Martin will do a bit about ipa-icf.
>
>I will dig into what inliner does but it produces a lot of clones so I
>think it is mostly clone and summary duplication. Perhaps we can avoid
>copying some of summaries for inline clones.
>
>In TOP I see about 900MB instead of 1.4GB before WPA streaming starts
>with both ggc_trim and madvise.
>
>Note that I also tried to hack ggc_free to recognize free pages but at
>least in simple implementation it is a loss since it makes ggc_alloc
>more expensive (it needs to bring pages back and add into freelists)
>which hurts stream-in performance.
>
>I think sweeping once per WPA is no problem, it is definitely less than
>1% of WPA time.
>
>Bootstrapped/regtested x86_64-linux, OK?

Ok.

Richard. 

>   * ggc-page.c (release_pages): Output statistics when !quiet_flag.
>   (ggc_collect): Dump later to not interfere with release_page dump.
>   (ggc_trim): New function.
>   * ggc-none.c (ggc_trim): New.
>
>   * lto.c (lto_wpa_write_files): Call ggc_trim.
>Index: ggc-page.c
>===
>--- ggc-page.c (revision 276707)
>+++ ggc-page.c (working copy)
>@@ -529,7 +529,6 @@ static void clear_page_group_in_use (pag
> #endif
> static struct page_entry * alloc_page (unsigned);
> static void free_page (struct page_entry *);
>-static void release_pages (void);
> static void clear_marks (void);
> static void sweep_pages (void);
> static void ggc_recalculate_in_use_p (page_entry *);
>@@ -1016,6 +1015,8 @@ free_page (page_entry *entry)
> static void
> release_pages (void)
> {
>+  size_t n1 = 0;
>+  size_t n2 = 0;
> #ifdef USING_MADVISE
>   page_entry *p, *start_p;
>   char *start;
>@@ -1061,6 +1062,7 @@ release_pages (void)
>   else
> G.free_pages = p;
>   G.bytes_mapped -= mapped_len;
>+n1 += len;
> continue;
> }
>   prev = newprev;
>@@ -1092,6 +1094,7 @@ release_pages (void)
>/* Don't count those pages as mapped to not touch the garbage collector
>  unnecessarily. */
>   G.bytes_mapped -= len;
>+  n2 += len;
>   while (start_p != p)
> {
>   start_p->discarded = true;
>@@ -1124,6 +1127,7 @@ release_pages (void)
>   }
> 
>   munmap (start, len);
>+  n1 += len;
>   G.bytes_mapped -= len;
> }
> 
>@@ -1152,10 +1156,20 @@ release_pages (void)
>   *gp = g->next;
>   G.bytes_mapped -= g->alloc_size;
>   free (g->allocation);
>+  n1 += g->alloc_size;
>   }
> else
>   gp = &g->next;
> #endif
>+  if (!quiet_flag && (n1 || n2))
>+{
>+  fprintf (stderr, " {GC");
>+  if (n1)
>+  fprintf (stderr, " released %luk", (unsigned long)(n1 / 1024));
>+  if (n2)
>+  fprintf (stderr, " madv_dontneed %luk", (unsigned long)(n2 / 1024));
>+  fprintf (stderr, "}");
>+}
> }
> 
> /* This table provides a fast way to determine ceil(log_2(size)) for
>@@ -2178,19 +2192,22 @@ ggc_collect (void)
> return;
> 
>   timevar_push (TV_GC);
>-  if (!quiet_flag)
>-fprintf (stderr, " {GC %luk -> ", (unsigned long) G.allocated /
>1024);
>   if (GGC_DEBUG_LEVEL >= 2)
> fprintf (G.debug_file, 

Implement ggc_trim

2019-10-11 Thread Jan Hubicka
Hi,
this patch adds ggc_trim that releases free pages used by GGC memory
to system.  This is useful to reduce memory footprint of WPA streaming:
WPA streaming ought to not use any more GGC memory (patches in testing
for that) and trimming the memory makes it available to fork used
by stream out machinery.

I collected some stats for cc1 for both GGC and heap (using mallinfo).
Memory footprints are as follows:

After streaming in global stream: 123MB GGC;  25MB of heap.
After streaming in callgraph: 228MB GGC;  45MB of heap.
After streaming in summaries: 373MB GGC; 126MB of heap.
After symbol merging: 348MB GGC; 130MB of heap.
After IPA-ICF   : 501MB GGC; 160MB of heap. (this is all ICF)
After IPA-CP: 528MB GGC; 163MB of heap.
After IPA-SRA   : 532MB GGC; 163MB of heap.
After Inline: 644MB GGC; 173MB of heap
This is after collecting of 118MB of
garbage and returning 740k to system
by madvise_dontneed
After ipa-reference : 644MB GGC; 370MB of heap
I checked this all goes into the
bitmaps; I have WIP patch for that
After releasing summaries   : 431MB GGC; 383MB of heap
Trim releases 43MB by unmap
and 321MB by madvise_dontneed

At least I learnt a new fact about ipa-reference consuming 200MB of
memory which was not obvious from our detailed mem stats.

I think the lowest hanging fruit after this patch is to add
malloc_madvise which further reduces footprint and fix ipa-reference.
Hopefully Martin will do a bit about ipa-icf.

I will dig into what inliner does but it produces a lot of clones so I
think it is mostly clone and summary duplication. Perhaps we can avoid
copying some of summaries for inline clones.

In TOP I see about 900MB instead of 1.4GB before WPA streaming starts
with both ggc_trim and madvise.

Note that I also tried to hack ggc_free to recognize free pages but at
least in simple implementation it is a loss since it makes ggc_alloc
more expensive (it needs to bring pages back and add into freelists)
which hurts stream-in performance.

I think sweeping once per WPA is no problem, it is definitely less than
1% of WPA time.

Bootstrapped/regtested x86_64-linux, OK?

* ggc-page.c (release_pages): Output statistics when !quiet_flag.
(ggc_collect): Dump later to not interfere with release_page dump.
(ggc_trim): New function.
* ggc-none.c (ggc_trim): New.

* lto.c (lto_wpa_write_files): Call ggc_trim.
Index: ggc-page.c
===
--- ggc-page.c  (revision 276707)
+++ ggc-page.c  (working copy)
@@ -529,7 +529,6 @@ static void clear_page_group_in_use (pag
 #endif
 static struct page_entry * alloc_page (unsigned);
 static void free_page (struct page_entry *);
-static void release_pages (void);
 static void clear_marks (void);
 static void sweep_pages (void);
 static void ggc_recalculate_in_use_p (page_entry *);
@@ -1016,6 +1015,8 @@ free_page (page_entry *entry)
 static void
 release_pages (void)
 {
+  size_t n1 = 0;
+  size_t n2 = 0;
 #ifdef USING_MADVISE
   page_entry *p, *start_p;
   char *start;
@@ -1061,6 +1062,7 @@ release_pages (void)
   else
 G.free_pages = p;
   G.bytes_mapped -= mapped_len;
+ n1 += len;
  continue;
 }
   prev = newprev;
@@ -1092,6 +1094,7 @@ release_pages (void)
   /* Don't count those pages as mapped to not touch the garbage collector
  unnecessarily. */
   G.bytes_mapped -= len;
+  n2 += len;
   while (start_p != p)
 {
   start_p->discarded = true;
@@ -1124,6 +1127,7 @@ release_pages (void)
}
 
   munmap (start, len);
+  n1 += len;
   G.bytes_mapped -= len;
 }
 
@@ -1152,10 +1156,20 @@ release_pages (void)
*gp = g->next;
G.bytes_mapped -= g->alloc_size;
free (g->allocation);
+   n1 += g->alloc_size;
   }
 else
   gp = &g->next;
 #endif
+  if (!quiet_flag && (n1 || n2))
+{
+  fprintf (stderr, " {GC");
+  if (n1)
+   fprintf (stderr, " released %luk", (unsigned long)(n1 / 1024));
+  if (n2)
+   fprintf (stderr, " madv_dontneed %luk", (unsigned long)(n2 / 1024));
+  fprintf (stderr, "}");
+}
 }
 
 /* This table provides a fast way to determine ceil(log_2(size)) for
@@ -2178,19 +2192,22 @@ ggc_collect (void)
 return;
 
   timevar_push (TV_GC);
-  if (!quiet_flag)
-fprintf (stderr, " {GC %luk -> ", (unsigned long) G.allocated / 1024);
   if (GGC_DEBUG_LEVEL >= 2)
 fprintf (G.debug_file, "BEGIN COLLECTING\n");
 
   /* Zero the total allocated bytes.  This will be recalculated in the
  sweep phase.  */
+  size_t allocated = G.allocated;
   G.allocated = 0;