Hello community, here is the log from the commit of package glibc for openSUSE:Factory checked in at 2015-06-16 14:04:08 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/glibc (Old) and /work/SRC/openSUSE:Factory/.glibc.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "glibc" Changes: -------- --- /work/SRC/openSUSE:Factory/glibc/glibc-testsuite.changes 2015-05-04 06:48:03.000000000 +0200 +++ /work/SRC/openSUSE:Factory/.glibc.new/glibc-testsuite.changes 2015-06-16 14:04:10.000000000 +0200 @@ -1,0 +2,15 @@ +Tue Jun 9 08:16:46 UTC 2015 - sch...@suse.de + +- Add /usr/include/gnu/lib-names-.*.h to baselibs +- pthread-join-deadlock.patch: Don't require rtld lock to store static TLS + offset in the DTV (bsc#930015, BZ #18457) +- heap-top-corruption.patch: Do not corrupt the top of a threaded heap if + top chunk is MINSIZE (BZ #18502) + +------------------------------------------------------------------- +Wed Apr 8 12:50:39 UTC 2015 - mgor...@suse.com + +- threaded-trim-threshold.patch: Fix regression in threaded application + malloc performance (bsc#915955, BZ #17195) + +------------------------------------------------------------------- glibc-utils.changes: same change glibc.changes: same change New: ---- heap-top-corruption.patch pthread-join-deadlock.patch threaded-trim-threshold.patch ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ glibc-testsuite.spec ++++++ --- /var/tmp/diff_new_pack.UoJUVY/_old 2015-06-16 14:04:13.000000000 +0200 +++ /var/tmp/diff_new_pack.UoJUVY/_new 2015-06-16 14:04:13.000000000 +0200 @@ -243,6 +243,14 @@ Patch1004: powerpc-software-sqrt.patch # PATCH-FIX-UPSTREAM Fix DTV race, assert, DTV_SURPLUS Static TLS limit, and nptl_db garbage (bsc#919678, BZ #17090, BZ #17620, BZ #17621, BZ #17628) Patch1005: static-tls-dtv-limit.patch +# PATCH-FIX-UPSTREAM Fix regression in threaded application malloc performance (bsc#915955, BZ #17195) +Patch1006: threaded-trim-threshold.patch +# PATCH-FIX-UPSTREAM Simplify handling of nameserver configuration in resolver +Patch1007: resolv-nameserver-handling.patch +# PATCH-FIX-UPSTREAM Separate internal state between getXXent and getXXbyYY NSS calls (bsc#918187, BZ #18007) +Patch1008: 
nss-separate-state-getXXent.patch +# PATCH-FIX-UPSTREAM aarch64: Increase MINSIGSTKSZ and SIGSTKSZ (BZ #16850) +Patch1009: aarch64-sigstksz.patch ### # Patches awaiting upstream approval @@ -257,12 +265,10 @@ Patch2005: glibc-memset-nontemporal.diff # PATCH-FIX-UPSTREAM Avoid redundant shift character in iconv output at block boundary (BZ #17197) Patch2006: ibm93x-redundant-shift-si.patch -# PATCH-FIX-UPSTREAM Rewrite handling of nameserver configuration in resolver -Patch2007: resolv-nameserver-handling.patch -# PATCH-FIX-UPSTREAM Separate internal state between getXXent and getXXbyYY NSS calls (bsc#918187, BZ #18007) -Patch2008: nss-separate-state-getXXent.patch -# PATCH-FIX-UPSTREAM aarch64: Increase MINSIGSTKSZ and SIGSTKSZ (BZ #16850) -Patch2009: aarch64-sigstksz.patch +# PATCH-FIX-UPSTREAM Don't require rtld lock to store static TLS offset in the DTV (BZ #18457) +Patch2007: pthread-join-deadlock.patch +# PATCH-FIX-UPSTREAM malloc: Do not corrupt the top of a threaded heap if top chunk is MINSIZE (BZ #18502) +Patch2008: heap-top-corruption.patch # Non-glibc patches # PATCH-FIX-OPENSUSE Remove debianisms from manpages @@ -469,6 +475,10 @@ %patch1003 -p1 %patch1004 -p1 %patch1005 -p1 +%patch1006 -p1 +%patch1007 -p1 +%patch1008 -p1 +%patch1009 -p1 %patch2000 -p1 %patch2002 -p1 @@ -477,7 +487,6 @@ %patch2006 -p1 %patch2007 -p1 %patch2008 -p1 -%patch2009 -p1 %patch3000 @@ -917,8 +926,8 @@ # Create ld.so.conf # cat > %{buildroot}/etc/ld.so.conf <<EOF -%if "%{_lib}" == "lib64" -/usr/local/lib64 +%if "%{_lib}" != "lib" +/usr/local/%{_lib} %endif %ifarch ppc /usr/local/lib64 glibc-utils.spec: same change ++++++ glibc.spec ++++++ --- /var/tmp/diff_new_pack.UoJUVY/_old 2015-06-16 14:04:13.000000000 +0200 +++ /var/tmp/diff_new_pack.UoJUVY/_new 2015-06-16 14:04:13.000000000 +0200 @@ -243,6 +243,14 @@ Patch1004: powerpc-software-sqrt.patch # PATCH-FIX-UPSTREAM Fix DTV race, assert, DTV_SURPLUS Static TLS limit, and nptl_db garbage (bsc#919678, BZ #17090, BZ #17620, BZ 
#17621, BZ #17628) Patch1005: static-tls-dtv-limit.patch +# PATCH-FIX-UPSTREAM Fix regression in threaded application malloc performance (bsc#915955, BZ #17195) +Patch1006: threaded-trim-threshold.patch +# PATCH-FIX-UPSTREAM Simplify handling of nameserver configuration in resolver +Patch1007: resolv-nameserver-handling.patch +# PATCH-FIX-UPSTREAM Separate internal state between getXXent and getXXbyYY NSS calls (bsc#918187, BZ #18007) +Patch1008: nss-separate-state-getXXent.patch +# PATCH-FIX-UPSTREAM aarch64: Increase MINSIGSTKSZ and SIGSTKSZ (BZ #16850) +Patch1009: aarch64-sigstksz.patch ### # Patches awaiting upstream approval @@ -257,12 +265,10 @@ Patch2005: glibc-memset-nontemporal.diff # PATCH-FIX-UPSTREAM Avoid redundant shift character in iconv output at block boundary (BZ #17197) Patch2006: ibm93x-redundant-shift-si.patch -# PATCH-FIX-UPSTREAM Rewrite handling of nameserver configuration in resolver -Patch2007: resolv-nameserver-handling.patch -# PATCH-FIX-UPSTREAM Separate internal state between getXXent and getXXbyYY NSS calls (bsc#918187, BZ #18007) -Patch2008: nss-separate-state-getXXent.patch -# PATCH-FIX-UPSTREAM aarch64: Increase MINSIGSTKSZ and SIGSTKSZ (BZ #16850) -Patch2009: aarch64-sigstksz.patch +# PATCH-FIX-UPSTREAM Don't require rtld lock to store static TLS offset in the DTV (BZ #18457) +Patch2007: pthread-join-deadlock.patch +# PATCH-FIX-UPSTREAM malloc: Do not corrupt the top of a threaded heap if top chunk is MINSIZE (BZ #18502) +Patch2008: heap-top-corruption.patch # Non-glibc patches # PATCH-FIX-OPENSUSE Remove debianisms from manpages @@ -469,6 +475,10 @@ %patch1003 -p1 %patch1004 -p1 %patch1005 -p1 +%patch1006 -p1 +%patch1007 -p1 +%patch1008 -p1 +%patch1009 -p1 %patch2000 -p1 %patch2002 -p1 @@ -477,7 +487,6 @@ %patch2006 -p1 %patch2007 -p1 %patch2008 -p1 -%patch2009 -p1 %patch3000 @@ -917,8 +926,8 @@ # Create ld.so.conf # cat > %{buildroot}/etc/ld.so.conf <<EOF -%if "%{_lib}" == "lib64" -/usr/local/lib64 +%if "%{_lib}" != "lib" 
+/usr/local/%{_lib} %endif %ifarch ppc /usr/local/lib64 ++++++ baselibs.conf ++++++ --- /var/tmp/diff_new_pack.UoJUVY/_old 2015-06-16 14:04:13.000000000 +0200 +++ /var/tmp/diff_new_pack.UoJUVY/_new 2015-06-16 14:04:13.000000000 +0200 @@ -14,6 +14,7 @@ glibc-devel requires "glibc-<targettype> = %version" arch i586 block! + +^/usr/include/gnu/lib-names-.*\.h$ +^/usr/include/gnu/stubs-.*\.h$ glibc-devel-static arch i586 block! ++++++ heap-top-corruption.patch ++++++ From: Mel Gorman <mgor...@suse.de> Subject: [PATCH] [v3] malloc: Do not corrupt the top of a threaded heap if top chunk is MINSIZE [BZ #18502] Date: Mon, 8 Jun 2015 13:36:13 +0100 mksquashfs was reported in openSUSE to be causing segmentation faults when creating installation images. Testing showed that mksquashfs sometimes failed and could be reproduced within 10 attempts. The core dump looked like the heap top was corrupted and was pointing to an unmapped area. In other cases, this has been due to an application corrupting glibc structures but mksquashfs appears to be fine in this regard. The problem is that heap_trim is "growing" the top into unmapped space. If the top chunk == MINSIZE then top_area is -1 and this check does not behave as expected due to a signed/unsigned comparison if (top_area <= pad) return 0; The next calculation extra = ALIGN_DOWN(top_area - pad, pagesz) calculates extra as a negative number which also is unnoticed due to a signed/unsigned comparison. We then call shrink_heap(heap, negative_number) which crashes later. This patch adds a simple check against MINSIZE to make sure extra does not become negative. It adds a cast to hint to the reader that this is a signed vs unsigned issue. Without the patch, mksquash fails within 10 attempts. With it applied, it completed 1000 times without error. The standard test suite "make check" showed no changes in the summary of test results. 
2015-06-08 Mel Gorman <mgor...@suse.de> [BZ #18502] * malloc/arena.c: Avoid corruption of the top of heaps for threads --- malloc/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: glibc-2.21/malloc/arena.c =================================================================== --- glibc-2.21.orig/malloc/arena.c +++ glibc-2.21/malloc/arena.c @@ -699,7 +699,7 @@ heap_trim (heap_info *heap, size_t pad) by preserving the top pad and at least a page. */ top_size = chunksize (top_chunk); top_area = top_size - MINSIZE - 1; - if (top_area <= pad) + if (top_area < 0 || (size_t) top_area <= pad) return 0; extra = ALIGN_DOWN(top_area - pad, pagesz); ++++++ pthread-join-deadlock.patch ++++++ [PR dynamic-link/18457] * elf/dl-tls.c (tls_get_addr_tail): Don't take the rtld lock if we already have a final static TLS offset. * nptl/tst-join7.c, nptl/tst-join7mod.c: New. Index: glibc-2.21/elf/dl-tls.c =================================================================== --- glibc-2.21.orig/elf/dl-tls.c +++ glibc-2.21/elf/dl-tls.c @@ -755,30 +755,44 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t the_map = listp->slotinfo[idx].map; } - /* Make sure that, if a dlopen running in parallel forces the - variable into static storage, we'll wait until the address in the - static TLS block is set up, and use that. If we're undecided - yet, make sure we make the decision holding the lock as well. */ - if (__glibc_unlikely (the_map->l_tls_offset - != FORCED_DYNAMIC_TLS_OFFSET)) + /* If the TLS block for the map is already assigned to dynamic or to + static TLS, avoid the lock. Be careful to use the same value for + both tests; if we reloaded it, the second test might mistake + forced dynamic for an offset. Now, if the decision hasn't been + made, take the rtld lock, so that an ongoing dlopen gets a chance + to complete, and then retest; if the decision is still pending, + force the module to dynamic TLS. 
*/ + ptrdiff_t offset = atomic_load_relaxed (&the_map->l_tls_offset); + if (__glibc_unlikely (offset != FORCED_DYNAMIC_TLS_OFFSET)) { + if (__glibc_unlikely (offset != NO_TLS_OFFSET)) + goto static_tls; __rtld_lock_lock_recursive (GL(dl_load_lock)); - if (__glibc_likely (the_map->l_tls_offset == NO_TLS_OFFSET)) + offset = the_map->l_tls_offset; + if (__glibc_likely (offset == NO_TLS_OFFSET)) { the_map->l_tls_offset = FORCED_DYNAMIC_TLS_OFFSET; __rtld_lock_unlock_recursive (GL(dl_load_lock)); } - else if (__glibc_likely (the_map->l_tls_offset - != FORCED_DYNAMIC_TLS_OFFSET)) + else if (__glibc_likely (offset != FORCED_DYNAMIC_TLS_OFFSET)) { + /* The decision is made, and it is final. We use the value + we've already loaded, but we could even load the offset + after releasing the lock, since it won't change. Should + the module be released while another thread references + one of its TLS variables, that's undefined behavior. */ + __rtld_lock_unlock_recursive (GL(dl_load_lock)); + + static_tls: + ; + #if TLS_TCB_AT_TP - void *p = (char *) THREAD_SELF - the_map->l_tls_offset; + void *p = (char *) THREAD_SELF - offset; #elif TLS_DTV_AT_TP - void *p = (char *) THREAD_SELF + the_map->l_tls_offset + TLS_PRE_TCB_SIZE; + void *p = (char *) THREAD_SELF + offset + TLS_PRE_TCB_SIZE; #else # error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined" #endif - __rtld_lock_unlock_recursive (GL(dl_load_lock)); dtv[GET_ADDR_MODULE].pointer.is_static = true; dtv[GET_ADDR_MODULE].pointer.val = p; Index: glibc-2.21/nptl/Makefile =================================================================== --- glibc-2.21.orig/nptl/Makefile +++ glibc-2.21/nptl/Makefile @@ -234,7 +234,7 @@ tests = tst-typesizes \ tst-basic7 \ tst-kill1 tst-kill2 tst-kill3 tst-kill4 tst-kill5 tst-kill6 \ tst-raise1 \ - tst-join1 tst-join2 tst-join3 tst-join4 tst-join5 tst-join6 \ + tst-join1 tst-join2 tst-join3 tst-join4 tst-join5 tst-join6 tst-join7 \ tst-detach1 \ tst-eintr1 tst-eintr2 tst-eintr3 tst-eintr4 
tst-eintr5 \ tst-tsd1 tst-tsd2 tst-tsd3 tst-tsd4 tst-tsd5 tst-tsd6 \ @@ -312,7 +312,8 @@ endif modules-names = tst-atfork2mod tst-tls3mod tst-tls4moda tst-tls4modb \ tst-tls5mod tst-tls5moda tst-tls5modb tst-tls5modc \ tst-tls5modd tst-tls5mode tst-tls5modf tst-stack4mod \ - tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod + tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod \ + tst-join7mod extra-test-objs += $(addsuffix .os,$(strip $(modules-names))) tst-cleanup4aux.o test-extras += $(modules-names) tst-cleanup4aux test-modules = $(addprefix $(objpfx),$(addsuffix .so,$(modules-names))) @@ -517,6 +518,11 @@ $(objpfx)tst-tls6.out: tst-tls6.sh $(obj $(evaluate-test) endif +$(objpfx)tst-join7: $(libdl) $(shared-thread-library) +$(objpfx)tst-join7.out: $(objpfx)tst-join7mod.so +$(objpfx)tst-join7mod.so: $(shared-thread-library) +LDFLAGS-tst-join7mod.so = -Wl,-soname,tst-join7mod.so + $(objpfx)tst-dlsym1: $(libdl) $(shared-thread-library) $(objpfx)tst-fini1: $(shared-thread-library) $(objpfx)tst-fini1mod.so Index: glibc-2.21/nptl/tst-join7.c =================================================================== --- /dev/null +++ glibc-2.21/nptl/tst-join7.c @@ -0,0 +1,12 @@ +#include <dlfcn.h> + +int +do_test (void) +{ + void *f = dlopen ("tst-join7mod.so", RTLD_NOW | RTLD_GLOBAL); + if (f) dlclose (f); else return 1; + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" Index: glibc-2.21/nptl/tst-join7mod.c =================================================================== --- /dev/null +++ glibc-2.21/nptl/tst-join7mod.c @@ -0,0 +1,29 @@ +#include <stdio.h> +#include <pthread.h> + +static pthread_t th; +static int running = 1; + +static void * +test_run (void *p) +{ + while (running) + fprintf (stderr, "XXX test_run\n"); + fprintf (stderr, "XXX test_run FINISHED\n"); + return NULL; +} + +static void __attribute__ ((constructor)) +do_init (void) +{ + pthread_create (&th, NULL, test_run, NULL); +} + +static void __attribute__ 
((destructor)) +do_end (void) +{ + running = 0; + fprintf (stderr, "thread_join...\n"); + pthread_join (th, NULL); + fprintf (stderr, "thread_join DONE\n"); +} ++++++ threaded-trim-threshold.patch ++++++ >From c26efef9798914e208329c0e8c3c73bb1135d9e3 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgor...@suse.de> Date: Thu, 2 Apr 2015 12:14:14 +0530 Subject: [PATCH] malloc: Consistently apply trim_threshold to all heaps [BZ #17195] Trimming heaps is a balance between saving memory and the system overhead required to update page tables and discard allocated pages. The malloc option M_TRIM_THRESHOLD is a tunable that users are meant to use to decide where this balance point is but it is only applied to the main arena. For scalability reasons, glibc malloc has per-thread heaps but these are shrunk with madvise() if there is one page free at the top of the heap. In some circumstances this can lead to high system overhead if a thread has a control flow like while (data_to_process) { buf = malloc(large_size); do_stuff(); free(buf); } For a large size, the free() will call madvise (pagetable teardown, page free and TLB flush) every time followed immediately by a malloc (fault, kernel page alloc, zeroing and charge accounting). The kernel overhead can dominate such a workload. This patch allows the user to tune when madvise gets called by applying the trim threshold to the per-thread heaps and using similar logic to the main arena when deciding whether to shrink. Alternatively if the dynamic brk/mmap threshold gets adjusted then the new values will be obeyed by the per-thread heaps. Bug 17195 was a test case motivated by a problem encountered in scientific applications written in python that perform badly due to high page fault overhead. The basic operation of such a program was posted by Julian Taylor https://sourceware.org/ml/libc-alpha/2015-02/msg00373.html With this patch applied, the overhead is eliminated.
All numbers in this report are in seconds and were recorded by running Julian's program 30 times. pyarray glibc madvise 2.21 v2 System min 1.81 ( 0.00%) 0.00 (100.00%) System mean 1.93 ( 0.00%) 0.02 ( 99.20%) System stddev 0.06 ( 0.00%) 0.01 ( 88.99%) System max 2.06 ( 0.00%) 0.03 ( 98.54%) Elapsed min 3.26 ( 0.00%) 2.37 ( 27.30%) Elapsed mean 3.39 ( 0.00%) 2.41 ( 28.84%) Elapsed stddev 0.14 ( 0.00%) 0.02 ( 82.73%) Elapsed max 4.05 ( 0.00%) 2.47 ( 39.01%) glibc madvise 2.21 v2 User 141.86 142.28 System 57.94 0.60 Elapsed 102.02 72.66 Note that almost a minute's worth of system time is eliminated and the program completes 28% faster on average. To illustrate the problem without python this is a basic test-case for the worst case scenario where every free is a madvise followed by an alloc /* gcc bench-free.c -lpthread -o bench-free */ static int num = 1024; void __attribute__((noinline,noclone)) dostuff (void *p) { } void *worker (void *data) { int i; for (i = num; i--;) { void *m = malloc (48*4096); dostuff (m); free (m); } return NULL; } int main() { int i; pthread_t t; void *ret; if (pthread_create (&t, NULL, worker, NULL)) exit (2); if (pthread_join (t, &ret)) exit (3); return 0; } Before the patch, this resulted in 1024 calls to madvise. With the patch applied, madvise is called twice because the default trim threshold is high enough to avoid this. This is a more complex case where there is a mix of frees. It's simply a different worker function for the test case above void *worker (void *data) { int i; int j = 0; void *free_index[num]; for (i = num; i--;) { void *m = malloc ((i % 58) *4096); dostuff (m); if (i % 2 == 0) { free (m); } else { free_index[j++] = m; } } for (; j >= 0; j--) { free(free_index[j]); } return NULL; } glibc 2.21 calls malloc 90305 times but with the patch applied, it's called 13438. Increasing the trim threshold will decrease the number of times it's called with the option of eliminating the overhead.
ebizzy is meant to generate a workload resembling common web application server workloads. It is threaded with a large working set that at its core has an allocation, do_stuff, free loop that also hits this case. The primary metric of the benchmark is records processed per second. This is running on my desktop which is a single socket machine with an I7-4770 and 8 cores. Each thread count was run for 30 seconds. It was only run once as the performance difference is so high that the variation is insignificant. glibc 2.21 patch threads 1 10230 44114 threads 2 19153 84925 threads 4 34295 134569 threads 8 51007 183387 Note that the saving happens to be a coincidence as the size allocated by ebizzy was less than the default threshold. If a different number of chunks were specified then it may also be necessary to tune the threshold to compensate. This is roughly quadrupling the performance of this benchmark. The difference in system CPU usage illustrates why. ebizzy running 1 thread with glibc 2.21 10230 records/s 306904 real 30.00 s user 7.47 s sys 22.49 s 22.49 seconds was spent in the kernel for a workload running 30 seconds. With the patch applied ebizzy running 1 thread with patch applied 44126 records/s 1323792 real 30.00 s user 29.97 s sys 0.00 s system CPU usage was zero with the patch applied. strace shows that glibc running this workload calls madvise approximately 9000 times a second. With the patch applied madvise was called twice during the workload (or 0.06 times per second). 2015-02-10 Mel Gorman <mgor...@suse.de> [BZ #17195] * malloc/arena.c (free): Apply trim threshold to per-thread heaps as well as the main arena. --- --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -241,6 +241,8 @@ /* For MIN, MAX, powerof2. */ #include <sys/param.h> +/* For ALIGN_DOWN.
*/ +#include <libc-internal.h> /* Debugging: --- a/malloc/arena.c +++ b/malloc/arena.c @@ -658,7 +658,7 @@ heap_trim (heap_info *heap, size_t pad) unsigned long pagesz = GLRO (dl_pagesize); mchunkptr top_chunk = top (ar_ptr), p, bck, fwd; heap_info *prev_heap; - long new_size, top_size, extra, prev_size, misalign; + long new_size, top_size, top_area, extra, prev_size, misalign; /* Can this heap go away completely? */ while (top_chunk == chunk_at_offset (heap, sizeof (*heap))) @@ -694,9 +694,16 @@ heap_trim (heap_info *heap, size_t pad) set_head (top_chunk, new_size | PREV_INUSE); /*check_chunk(ar_ptr, top_chunk);*/ } + + /* Uses similar logic for per-thread arenas as the main arena with systrim + by preserving the top pad and at least a page. */ top_size = chunksize (top_chunk); - extra = (top_size - pad - MINSIZE - 1) & ~(pagesz - 1); - if (extra < (long) pagesz) + top_area = top_size - MINSIZE - 1; + if (top_area <= pad) + return 0; + + extra = ALIGN_DOWN(top_area - pad, pagesz); + if ((unsigned long) extra < mp_.trim_threshold) return 0; /* Try to shrink. */