Script 'mail_helper' called by obssrc

Hello community,

here is the log from the commit of package glibc for openSUSE:Factory
checked in at 2023-08-30 10:19:27

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/glibc (Old)
 and      /work/SRC/openSUSE:Factory/.glibc.new.1766 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "glibc" Wed Aug 30 10:19:27 2023 rev:278 rq:1107915 version:2.38 Changes: -------- --- /work/SRC/openSUSE:Factory/glibc/glibc.changes 2023-08-12 15:05:36.866279675 +0200 +++ /work/SRC/openSUSE:Factory/.glibc.new.1766/glibc.changes 2023-08-30 10:22:03.991538598 +0200 @@ -0,0 +1,26 @@ +Mon Aug 28 11:56:10 UTC 2023 - Richard Biener <rguent...@suse.com> + +- Add cross-ppc64le package + +------------------------------------------------------------------- +Tue Aug 22 15:46:51 UTC 2023 - Andreas Schwab <sch...@suse.de> + +- posix-memalign-fragmentation.patch: malloc: Enable merging of remainders + in memalign, remove bin scanning from memalign (BZ #30723) +- Limit build counter sync to i686 flavor, to reduce needs for rebuilds + +------------------------------------------------------------------- +Tue Aug 22 11:24:16 UTC 2023 - Richard Biener <rguent...@suse.com> + +- Add cross-s390x package (bsc#1214460) + +------------------------------------------------------------------- +Mon Aug 14 08:12:28 UTC 2023 - Andreas Schwab <sch...@suse.de> + +- Require that elf/check-localplt does not fail +- glibc-2.3.90-langpackdir.diff: add hidden alias for __strcpy_chk +- cache-amd-legacy.patch: x86: Fix for cache computation on AMD legacy + cpus +- cache-intel-shared.patch: x86: Fix incorrect scope of setting + `shared_per_thread` (BZ# 30745) + New: ---- cache-amd-legacy.patch cache-intel-shared.patch posix-memalign-fragmentation.patch ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ glibc.spec ++++++ --- /var/tmp/diff_new_pack.5vaMCH/_old 2023-08-30 10:22:07.039647398 +0200 +++ /var/tmp/diff_new_pack.5vaMCH/_new 2023-08-30 10:22:07.047647683 +0200 @@ -30,15 +30,23 @@ # can use only simple RPM expressions, no lua, no shell, no '{expand:' # expression :-/ Ideally we'd like to just strip the 'cross_' prefix, # but we can't. So enumerate the possibilities for now. 
+%define cross_arch %{cross_cpu} %if "%flavor" == "cross-aarch64" -%define cross_arch aarch64 +%define cross_cpu aarch64 %endif %if "%flavor" == "cross-riscv64" -%define cross_arch riscv64 +%define cross_cpu riscv64 +%endif +%if "%flavor" == "cross-s390x" +%define cross_cpu s390x +%endif +%if "%flavor" == "cross-ppc64le" +%define cross_arch ppc64le +%define cross_cpu powerpc64le %endif -%if 0%{?cross_arch:1} -%define binutils_os %{cross_arch}-suse-linux +%if 0%{?cross_cpu:1} +%define binutils_os %{cross_cpu}-suse-linux # use same sysroot as in binutils.spec %define sysroot %{_prefix}/%{binutils_os}/sys-root %endif @@ -81,7 +89,7 @@ %define build_utils 0 %define build_testsuite 1 %endif -%if 0%{?cross_arch:1} +%if 0%{?cross_cpu:1} %define build_main 0 %define build_utils 0 %define build_testsuite 0 @@ -89,7 +97,7 @@ %undefine _build_create_debug ExcludeArch: %{cross_arch} %endif -%define host_arch %{?cross_arch}%{!?cross_arch:%{_target_cpu}} +%define host_arch %{?cross_cpu}%{!?cross_cpu:%{_target_cpu}} %if %{build_main} %define name_suffix %{nil} @@ -232,6 +240,8 @@ %if "%flavor" == "i686" ExclusiveArch: i586 i686 BuildArch: i686 +# Sync only this build counter with the main build +#!BcntSyncTag: glibc %endif ### @@ -289,8 +299,14 @@ ### # Patches from upstream ### -# PATCH-FIX-OPENSUSE iconv: restore verbosity with unrecognized encoding names (BZ #30694) +# PATCH-FIX-UPSTREAM iconv: restore verbosity with unrecognized encoding names (BZ #30694) Patch1000: iconv-error-verbosity.patch +# PATCH-FIX-UPSTREAM x86: Fix for cache computation on AMD legacy cpus +Patch1001: cache-amd-legacy.patch +# PATCH-FIX-UPSTREAM x86: Fix incorrect scope of setting `shared_per_thread` (BZ# 30745) +Patch1002: cache-intel-shared.patch +# PATCH-FIX-UPSTREAM malloc: Enable merging of remainders in memalign, remove bin scanning from memalign (BZ #30723) +Patch1003: posix-memalign-fragmentation.patch ### # Patches awaiting upstream approval @@ -515,6 +531,9 @@ %if %{without snapshot} %patch1000 -p1 +%patch1001 -p1 +%patch1002 -p1 +%patch1003 -p1 %endif %patch2000 -p1 @@ -590,8 +609,8 @@ #now overwrite for some architectures # %if %{build_cross} -BuildCC=%{cross_arch}-suse-linux-gcc -BuildCCplus=%{cross_arch}-suse-linux-g++ +BuildCC=%{cross_cpu}-suse-linux-gcc +BuildCCplus=%{cross_cpu}-suse-linux-g++ %else %ifarch sparc64 BuildFlags="-O2 -mcpu=ultrasparc -mvis -fcall-used-g6" @@ -754,6 +773,7 @@ # Exceptions: # None! make %{?_smp_mflags} %{?make_output_sync} -C cc-base check-abi +make %{?_smp_mflags} %{?make_output_sync} -C cc-base test t=elf/check-localplt %endif %define rtldlib %{_lib} ++++++ _multibuild ++++++ --- /var/tmp/diff_new_pack.5vaMCH/_old 2023-08-30 10:22:07.095649397 +0200 +++ /var/tmp/diff_new_pack.5vaMCH/_new 2023-08-30 10:22:07.095649397 +0200 @@ -3,6 +3,8 @@ <package>utils</package> <package>testsuite</package> <package>cross-aarch64</package> + <package>cross-ppc64le</package> <package>cross-riscv64</package> + <package>cross-s390x</package> </multibuild> ++++++ cache-amd-legacy.patch ++++++ >From ced101ed9d3b7cfd12d97ef24940cb00b8658c81 Mon Sep 17 00:00:00 2001 From: Sajan Karumanchi <sajan.karuman...@amd.com> Date: Tue, 1 Aug 2023 15:20:55 +0000 Subject: [PATCH] x86: Fix for cache computation on AMD legacy cpus. Some legacy AMD CPUs and hypervisors have the _cpuid_ '0x8000_001D' set to Zero, thus resulting in zeroed-out computed cache values. This patch reintroduces the old way of cache computation as a fail-safe option to handle these exceptions. 
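As a standalone illustration of that fallback (an editorial sketch, not part of the patch), the detection can be written with GCC's <cpuid.h> helpers; the hypothetical l1d_cache_size() below covers only the L1 data cache size and mirrors the bit layouts used in the hunk further down:

#include <cpuid.h>
#include <stdio.h>

/* Sketch only: prefer CPUID leaf 0x8000001D when the CPU/hypervisor
   reports it, otherwise fall back to the legacy leaf 0x80000005.  */
static unsigned long
l1d_cache_size (void)
{
  unsigned int eax, ebx, ecx, edx;
  unsigned int max_ext = __get_cpuid_max (0x80000000, 0);

  if (max_ext >= 0x8000001d
      && __get_cpuid_count (0x8000001d, 0, &eax, &ebx, &ecx, &edx)
      && ecx != 0)
    /* (ways) * (line size) * (sets), as in the non-legacy path.  */
    return (((ebx >> 22) & 0x3ff) + 1)
           * ((ebx & 0xfff) + 1)
           * ((unsigned long) ecx + 1);

  if (max_ext >= 0x80000005
      && __get_cpuid (0x80000005, &eax, &ebx, &ecx, &edx))
    /* Legacy leaf: bits 31..24 of ECX hold the L1D size in KiB.  */
    return ((ecx >> 24) & 0xff) * 1024UL;

  return 0;  /* Unknown; callers must treat this as "no data".  */
}

int
main (void)
{
  printf ("L1d cache: %lu bytes\n", l1d_cache_size ());
  return 0;
}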
Fixed 'level4_cache_size' value through handle_amd(). Reviewed-by: Premachandra Mallappa <premachandra.malla...@amd.com> Tested-by: Florian Weimer <fwei...@redhat.com> --- sysdeps/x86/dl-cacheinfo.h | 226 ++++++++++++++++++++++++++++++++----- 1 file changed, 199 insertions(+), 27 deletions(-) diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index cd4d0351ae..285773039f 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -315,40 +315,206 @@ handle_amd (int name) { unsigned int eax; unsigned int ebx; - unsigned int ecx; + unsigned int ecx = 0; unsigned int edx; - unsigned int count = 0x1; + unsigned int max_cpuid = 0; + unsigned int fn = 0; /* No level 4 cache (yet). */ if (name > _SC_LEVEL3_CACHE_LINESIZE) return 0; - if (name >= _SC_LEVEL3_CACHE_SIZE) - count = 0x3; - else if (name >= _SC_LEVEL2_CACHE_SIZE) - count = 0x2; - else if (name >= _SC_LEVEL1_DCACHE_SIZE) - count = 0x0; + __cpuid (0x80000000, max_cpuid, ebx, ecx, edx); + + if (max_cpuid >= 0x8000001D) + /* Use __cpuid__ '0x8000_001D' to compute cache details. */ + { + unsigned int count = 0x1; + + if (name >= _SC_LEVEL3_CACHE_SIZE) + count = 0x3; + else if (name >= _SC_LEVEL2_CACHE_SIZE) + count = 0x2; + else if (name >= _SC_LEVEL1_DCACHE_SIZE) + count = 0x0; + + __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx); + + if (ecx != 0) + { + switch (name) + { + case _SC_LEVEL1_ICACHE_ASSOC: + case _SC_LEVEL1_DCACHE_ASSOC: + case _SC_LEVEL2_CACHE_ASSOC: + case _SC_LEVEL3_CACHE_ASSOC: + return ((ebx >> 22) & 0x3ff) + 1; + case _SC_LEVEL1_ICACHE_LINESIZE: + case _SC_LEVEL1_DCACHE_LINESIZE: + case _SC_LEVEL2_CACHE_LINESIZE: + case _SC_LEVEL3_CACHE_LINESIZE: + return (ebx & 0xfff) + 1; + case _SC_LEVEL1_ICACHE_SIZE: + case _SC_LEVEL1_DCACHE_SIZE: + case _SC_LEVEL2_CACHE_SIZE: + case _SC_LEVEL3_CACHE_SIZE: + return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1); + default: + __builtin_unreachable (); + } + return -1; + } + } + + /* Legacy cache computation for CPUs prior to Bulldozer family. + This is also a fail-safe mechanism for some hypervisors that + accidentally configure __cpuid__ '0x8000_001D' to Zero. */ - __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx); + fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE); + + if (max_cpuid < fn) + return 0; + + __cpuid (fn, eax, ebx, ecx, edx); + + if (name < _SC_LEVEL1_DCACHE_SIZE) + { + name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE; + ecx = edx; + } switch (name) { - case _SC_LEVEL1_ICACHE_ASSOC: - case _SC_LEVEL1_DCACHE_ASSOC: - case _SC_LEVEL2_CACHE_ASSOC: + case _SC_LEVEL1_DCACHE_SIZE: + return (ecx >> 14) & 0x3fc00; + + case _SC_LEVEL1_DCACHE_ASSOC: + ecx >>= 16; + if ((ecx & 0xff) == 0xff) + { + /* Fully associative. */ + return (ecx << 2) & 0x3fc00; + } + return ecx & 0xff; + + case _SC_LEVEL1_DCACHE_LINESIZE: + return ecx & 0xff; + + case _SC_LEVEL2_CACHE_SIZE: + return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00; + + case _SC_LEVEL2_CACHE_ASSOC: + switch ((ecx >> 12) & 0xf) + { + case 0: + case 1: + case 2: + case 4: + return (ecx >> 12) & 0xf; + case 6: + return 8; + case 8: + return 16; + case 10: + return 32; + case 11: + return 48; + case 12: + return 64; + case 13: + return 96; + case 14: + return 128; + case 15: + return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff); + default: + return 0; + } + + case _SC_LEVEL2_CACHE_LINESIZE: + return (ecx & 0xf000) == 0 ? 
0 : ecx & 0xff; + + case _SC_LEVEL3_CACHE_SIZE: + { + long int total_l3_cache = 0, l3_cache_per_thread = 0; + unsigned int threads = 0; + const struct cpu_features *cpu_features; + + if ((edx & 0xf000) == 0) + return 0; + + total_l3_cache = (edx & 0x3ffc0000) << 1; + cpu_features = __get_cpu_features (); + + /* Figure out the number of logical threads that share L3. */ + if (max_cpuid >= 0x80000008) + { + /* Get width of APIC ID. */ + __cpuid (0x80000008, eax, ebx, ecx, edx); + threads = (ecx & 0xff) + 1; + } + + if (threads == 0) + { + /* If APIC ID width is not available, use logical + processor count. */ + __cpuid (0x00000001, eax, ebx, ecx, edx); + if ((edx & (1 << 28)) != 0) + threads = (ebx >> 16) & 0xff; + } + + /* Cap usage of highest cache level to the number of + supported threads. */ + if (threads > 0) + l3_cache_per_thread = total_l3_cache/threads; + + /* Get shared cache per ccx for Zen architectures. */ + if (cpu_features->basic.family >= 0x17) + { + long int l3_cache_per_ccx = 0; + /* Get number of threads share the L3 cache in CCX. */ + __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx); + unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1; + l3_cache_per_ccx = l3_cache_per_thread * threads_per_ccx; + return l3_cache_per_ccx; + } + else + { + return l3_cache_per_thread; + } + } + case _SC_LEVEL3_CACHE_ASSOC: - return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0; - case _SC_LEVEL1_ICACHE_LINESIZE: - case _SC_LEVEL1_DCACHE_LINESIZE: - case _SC_LEVEL2_CACHE_LINESIZE: + switch ((edx >> 12) & 0xf) + { + case 0: + case 1: + case 2: + case 4: + return (edx >> 12) & 0xf; + case 6: + return 8; + case 8: + return 16; + case 10: + return 32; + case 11: + return 48; + case 12: + return 64; + case 13: + return 96; + case 14: + return 128; + case 15: + return ((edx & 0x3ffc0000) << 1) / (edx & 0xff); + default: + return 0; + } + case _SC_LEVEL3_CACHE_LINESIZE: - return ecx ? (ebx & 0xfff) + 1 : 0; - case _SC_LEVEL1_ICACHE_SIZE: - case _SC_LEVEL1_DCACHE_SIZE: - case _SC_LEVEL2_CACHE_SIZE: - case _SC_LEVEL3_CACHE_SIZE: - return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1): 0; + return (edx & 0xf000) == 0 ? 0 : edx & 0xff; + default: __builtin_unreachable (); } @@ -703,7 +869,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); core = handle_amd (_SC_LEVEL2_CACHE_SIZE); shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); - shared_per_thread = shared; level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE); level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE); @@ -716,13 +881,20 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) level3_cache_size = shared; level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC); level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE); + level4_cache_size = handle_amd (_SC_LEVEL4_CACHE_SIZE); if (shared <= 0) - /* No shared L3 cache. All we have is the L2 cache. */ - shared = core; + { + /* No shared L3 cache. All we have is the L2 cache. */ + shared = core; + } + else if (cpu_features->basic.family < 0x17) + { + /* Account for exclusive L2 and L3 caches. 
*/ + shared += core; + } - if (shared_per_thread <= 0) - shared_per_thread = shared; + shared_per_thread = shared; } cpu_features->level1_icache_size = level1_icache_size; -- 2.41.0 ++++++ cache-intel-shared.patch ++++++ >From 5ea70cc02626d9b85f1570153873d8648a47bf95 Mon Sep 17 00:00:00 2001 From: Noah Goldstein <goldstein....@gmail.com> Date: Thu, 10 Aug 2023 19:28:24 -0500 Subject: [PATCH] x86: Fix incorrect scope of setting `shared_per_thread` [BZ# 30745] The: ``` if (shared_per_thread > 0 && threads > 0) shared_per_thread /= threads; ``` Code was accidentally moved to inside the else scope. This doesn't match how it was previously (before af992e7abd). This patch fixes that by putting the division after the `else` block. (cherry picked from commit 084fb31bc2c5f95ae0b9e6df4d3cf0ff43471ede) --- sysdeps/x86/dl-cacheinfo.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index 285773039f..5ddb35c9d9 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -770,11 +770,10 @@ get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, u level. */ threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16) & 0xff); - - /* Get per-thread size of highest level cache. */ - if (shared_per_thread > 0 && threads > 0) - shared_per_thread /= threads; } + /* Get per-thread size of highest level cache. */ + if (shared_per_thread > 0 && threads > 0) + shared_per_thread /= threads; } /* Account for non-inclusive L2 and L3 caches. */ -- 2.41.0 ++++++ glibc-2.3.90-langpackdir.diff ++++++ --- /var/tmp/diff_new_pack.5vaMCH/_old 2023-08-30 10:22:07.151651396 +0200 +++ /var/tmp/diff_new_pack.5vaMCH/_new 2023-08-30 10:22:07.155651538 +0200 @@ -1,3 +1,24 @@ +Index: glibc-2.38/debug/strcpy_chk.c +=================================================================== +--- glibc-2.38.orig/debug/strcpy_chk.c ++++ glibc-2.38/debug/strcpy_chk.c +@@ -31,3 +31,4 @@ __strcpy_chk (char *dest, const char *sr + + return memcpy (dest, src, len + 1); + } ++libc_hidden_builtin_def (__strcpy_chk) +Index: glibc-2.38/include/string.h +=================================================================== +--- glibc-2.38.orig/include/string.h ++++ glibc-2.38/include/string.h +@@ -213,6 +213,7 @@ libc_hidden_builtin_proto (__memcpy_chk) + libc_hidden_builtin_proto (__memmove_chk) + libc_hidden_builtin_proto (__mempcpy_chk) + libc_hidden_builtin_proto (__memset_chk) ++libc_hidden_builtin_proto (__strcpy_chk) + libc_hidden_builtin_proto (__stpcpy_chk) + + #endif Index: glibc-2.38/intl/loadmsgcat.c =================================================================== --- glibc-2.38.orig/intl/loadmsgcat.c ++++++ posix-memalign-fragmentation.patch ++++++ >From 542b1105852568c3ebc712225ae78b8c8ba31a78 Mon Sep 17 00:00:00 2001 From: Florian Weimer <fwei...@redhat.com> Date: Fri, 11 Aug 2023 11:18:17 +0200 Subject: [PATCH] malloc: Enable merging of remainders in memalign (bug 30723) Previously, calling _int_free from _int_memalign could put remainders into the tcache or into fastbins, where they are invisible to the low-level allocator. This results in missed merge opportunities because once these freed chunks become available to the low-level allocator, further memalign allocations (even of the same size are) likely obstructing merges. Furthermore, during forwards merging in _int_memalign, do not completely give up when the remainder is too small to serve as a chunk on its own. 
We can still give it back if it can be merged with the following unused chunk. This makes it more likely that memalign calls in a loop achieve a compact memory layout, independently of initial heap layout. Drop some useless (unsigned long) casts along the way, and tweak the style to more closely match GNU on changed lines. Reviewed-by: DJ Delorie <d...@redhat.com> --- malloc/malloc.c | 197 +++++++++++++++++++++++++++++------------------- 1 file changed, 121 insertions(+), 76 deletions(-) diff --git a/malloc/malloc.c b/malloc/malloc.c index e2f1a615a4..948f9759af 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -1086,6 +1086,11 @@ typedef struct malloc_chunk* mchunkptr; static void* _int_malloc(mstate, size_t); static void _int_free(mstate, mchunkptr, int); +static void _int_free_merge_chunk (mstate, mchunkptr, INTERNAL_SIZE_T); +static INTERNAL_SIZE_T _int_free_create_chunk (mstate, + mchunkptr, INTERNAL_SIZE_T, + mchunkptr, INTERNAL_SIZE_T); +static void _int_free_maybe_consolidate (mstate, INTERNAL_SIZE_T); static void* _int_realloc(mstate, mchunkptr, INTERNAL_SIZE_T, INTERNAL_SIZE_T); static void* _int_memalign(mstate, size_t, size_t); @@ -4637,31 +4642,52 @@ _int_free (mstate av, mchunkptr p, int have_lock) if (!have_lock) __libc_lock_lock (av->mutex); - nextchunk = chunk_at_offset(p, size); - - /* Lightweight tests: check whether the block is already the - top block. */ - if (__glibc_unlikely (p == av->top)) - malloc_printerr ("double free or corruption (top)"); - /* Or whether the next chunk is beyond the boundaries of the arena. */ - if (__builtin_expect (contiguous (av) - && (char *) nextchunk - >= ((char *) av->top + chunksize(av->top)), 0)) - malloc_printerr ("double free or corruption (out)"); - /* Or whether the block is actually not marked used. */ - if (__glibc_unlikely (!prev_inuse(nextchunk))) - malloc_printerr ("double free or corruption (!prev)"); - - nextsize = chunksize(nextchunk); - if (__builtin_expect (chunksize_nomask (nextchunk) <= CHUNK_HDR_SZ, 0) - || __builtin_expect (nextsize >= av->system_mem, 0)) - malloc_printerr ("free(): invalid next size (normal)"); + _int_free_merge_chunk (av, p, size); - free_perturb (chunk2mem(p), size - CHUNK_HDR_SZ); + if (!have_lock) + __libc_lock_unlock (av->mutex); + } + /* + If the chunk was allocated via mmap, release via munmap(). + */ + + else { + munmap_chunk (p); + } +} + +/* Try to merge chunk P of SIZE bytes with its neighbors. Put the + resulting chunk on the appropriate bin list. P must not be on a + bin list yet, and it can be in use. */ +static void +_int_free_merge_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size) +{ + mchunkptr nextchunk = chunk_at_offset(p, size); + + /* Lightweight tests: check whether the block is already the + top block. */ + if (__glibc_unlikely (p == av->top)) + malloc_printerr ("double free or corruption (top)"); + /* Or whether the next chunk is beyond the boundaries of the arena. */ + if (__builtin_expect (contiguous (av) + && (char *) nextchunk + >= ((char *) av->top + chunksize(av->top)), 0)) + malloc_printerr ("double free or corruption (out)"); + /* Or whether the block is actually not marked used. 
*/ + if (__glibc_unlikely (!prev_inuse(nextchunk))) + malloc_printerr ("double free or corruption (!prev)"); + + INTERNAL_SIZE_T nextsize = chunksize(nextchunk); + if (__builtin_expect (chunksize_nomask (nextchunk) <= CHUNK_HDR_SZ, 0) + || __builtin_expect (nextsize >= av->system_mem, 0)) + malloc_printerr ("free(): invalid next size (normal)"); + + free_perturb (chunk2mem(p), size - CHUNK_HDR_SZ); - /* consolidate backward */ - if (!prev_inuse(p)) { - prevsize = prev_size (p); + /* Consolidate backward. */ + if (!prev_inuse(p)) + { + INTERNAL_SIZE_T prevsize = prev_size (p); size += prevsize; p = chunk_at_offset(p, -((long) prevsize)); if (__glibc_unlikely (chunksize(p) != prevsize)) @@ -4669,9 +4695,25 @@ _int_free (mstate av, mchunkptr p, int have_lock) unlink_chunk (av, p); } - if (nextchunk != av->top) { + /* Write the chunk header, maybe after merging with the following chunk. */ + size = _int_free_create_chunk (av, p, size, nextchunk, nextsize); + _int_free_maybe_consolidate (av, size); +} + +/* Create a chunk at P of SIZE bytes, with SIZE potentially increased + to cover the immediately following chunk NEXTCHUNK of NEXTSIZE + bytes (if NEXTCHUNK is unused). The chunk at P is not actually + read and does not have to be initialized. After creation, it is + placed on the appropriate bin list. The function returns the size + of the new chunk. */ +static INTERNAL_SIZE_T +_int_free_create_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size, + mchunkptr nextchunk, INTERNAL_SIZE_T nextsize) +{ + if (nextchunk != av->top) + { /* get and clear inuse bit */ - nextinuse = inuse_bit_at_offset(nextchunk, nextsize); + bool nextinuse = inuse_bit_at_offset (nextchunk, nextsize); /* consolidate forward */ if (!nextinuse) { @@ -4686,8 +4728,8 @@ _int_free (mstate av, mchunkptr p, int have_lock) been given one chance to be used in malloc. */ - bck = unsorted_chunks(av); - fwd = bck->fd; + mchunkptr bck = unsorted_chunks (av); + mchunkptr fwd = bck->fd; if (__glibc_unlikely (fwd->bk != bck)) malloc_printerr ("free(): corrupted unsorted chunks"); p->fd = fwd; @@ -4706,61 +4748,52 @@ _int_free (mstate av, mchunkptr p, int have_lock) check_free_chunk(av, p); } - /* - If the chunk borders the current high end of memory, - consolidate into top - */ - - else { + else + { + /* If the chunk borders the current high end of memory, + consolidate into top. */ size += nextsize; set_head(p, size | PREV_INUSE); av->top = p; check_chunk(av, p); } - /* - If freeing a large space, consolidate possibly-surrounding - chunks. Then, if the total unused topmost memory exceeds trim - threshold, ask malloc_trim to reduce top. - - Unless max_fast is 0, we don't know if there are fastbins - bordering top, so we cannot tell for sure whether threshold - has been reached unless fastbins are consolidated. But we - don't want to consolidate on each free. As a compromise, - consolidation is performed if FASTBIN_CONSOLIDATION_THRESHOLD - is reached. - */ + return size; +} - if ((unsigned long)(size) >= FASTBIN_CONSOLIDATION_THRESHOLD) { +/* If freeing a large space, consolidate possibly-surrounding + chunks. Then, if the total unused topmost memory exceeds trim + threshold, ask malloc_trim to reduce top. */ +static void +_int_free_maybe_consolidate (mstate av, INTERNAL_SIZE_T size) +{ + /* Unless max_fast is 0, we don't know if there are fastbins + bordering top, so we cannot tell for sure whether threshold has + been reached unless fastbins are consolidated. But we don't want + to consolidate on each free. 
As a compromise, consolidation is + performed if FASTBIN_CONSOLIDATION_THRESHOLD is reached. */ + if (size >= FASTBIN_CONSOLIDATION_THRESHOLD) + { if (atomic_load_relaxed (&av->have_fastchunks)) malloc_consolidate(av); - if (av == &main_arena) { + if (av == &main_arena) + { #ifndef MORECORE_CANNOT_TRIM - if ((unsigned long)(chunksize(av->top)) >= - (unsigned long)(mp_.trim_threshold)) - systrim(mp_.top_pad, av); + if (chunksize (av->top) >= mp_.trim_threshold) + systrim (mp_.top_pad, av); #endif - } else { - /* Always try heap_trim(), even if the top chunk is not - large, because the corresponding heap might go away. */ - heap_info *heap = heap_for_ptr(top(av)); + } + else + { + /* Always try heap_trim, even if the top chunk is not large, + because the corresponding heap might go away. */ + heap_info *heap = heap_for_ptr (top (av)); - assert(heap->ar_ptr == av); - heap_trim(heap, mp_.top_pad); - } + assert (heap->ar_ptr == av); + heap_trim (heap, mp_.top_pad); + } } - - if (!have_lock) - __libc_lock_unlock (av->mutex); - } - /* - If the chunk was allocated via mmap, release via munmap(). - */ - - else { - munmap_chunk (p); - } } /* @@ -5221,7 +5254,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes) (av != &main_arena ? NON_MAIN_ARENA : 0)); set_inuse_bit_at_offset (newp, newsize); set_head_size (p, leadsize | (av != &main_arena ? NON_MAIN_ARENA : 0)); - _int_free (av, p, 1); + _int_free_merge_chunk (av, p, leadsize); p = newp; assert (newsize >= nb && @@ -5232,15 +5265,27 @@ _int_memalign (mstate av, size_t alignment, size_t bytes) if (!chunk_is_mmapped (p)) { size = chunksize (p); - if ((unsigned long) (size) > (unsigned long) (nb + MINSIZE)) + mchunkptr nextchunk = chunk_at_offset(p, size); + INTERNAL_SIZE_T nextsize = chunksize(nextchunk); + if (size > nb) { remainder_size = size - nb; - remainder = chunk_at_offset (p, nb); - set_head (remainder, remainder_size | PREV_INUSE | - (av != &main_arena ? NON_MAIN_ARENA : 0)); - set_head_size (p, nb); - _int_free (av, remainder, 1); - } + if (remainder_size >= MINSIZE + || nextchunk == av->top + || !inuse_bit_at_offset (nextchunk, nextsize)) + { + /* We can only give back the tail if it is larger than + MINSIZE, or if the following chunk is unused (top + chunk or unused in-heap chunk). Otherwise we would + create a chunk that is smaller than MINSIZE. */ + remainder = chunk_at_offset (p, nb); + set_head_size (p, nb); + remainder_size = _int_free_create_chunk (av, remainder, + remainder_size, + nextchunk, nextsize); + _int_free_maybe_consolidate (av, remainder_size); + } + } } check_inuse_chunk (av, p); -- 2.41.0 >From 0dc7fc1cf094406a138e4d1bcf9553e59edcf89d Mon Sep 17 00:00:00 2001 From: Florian Weimer <fwei...@redhat.com> Date: Thu, 10 Aug 2023 19:36:56 +0200 Subject: [PATCH] malloc: Remove bin scanning from memalign (bug 30723) On the test workload (mpv --cache=yes with VP9 video decoding), the bin scanning has a very poor success rate (less than 2%). The tcache scanning has about 50% success rate, so keep that. Update comments in malloc/tst-memalign-2 to indicate the purpose of the tests. Even with the scanning removed, the additional merging opportunities since commit 542b1105852568c3ebc712225ae78b ("malloc: Enable merging of remainders in memalign (bug 30723)") are sufficient to pass the existing large bins test. Remove leftover variables from _int_free from refactoring in the same commit. 
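The retained fallback path -- call malloc with worst-case padding, then align within the returned block -- can be sketched at user level as below; this memalign_sketch() helper is hypothetical and only shows the padding arithmetic, whereas glibc's _int_memalign additionally gives the trimmed head and tail back to the allocator:

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical helper: over-allocate by alignment - 1 bytes so that an
   address with the requested alignment is guaranteed to exist inside the
   block, then round up to it.  The caller must free *base_out, not the
   returned pointer.  Assumes alignment is a power of two.  */
static void *
memalign_sketch (size_t alignment, size_t size, void **base_out)
{
  void *base = malloc (size + alignment - 1);
  if (base == NULL)
    return NULL;
  uintptr_t aligned = ((uintptr_t) base + alignment - 1)
                      & ~((uintptr_t) alignment - 1);
  *base_out = base;
  return (void *) aligned;
}

int
main (void)
{
  void *base;
  void *p = memalign_sketch (64, 1000, &base);
  if (p != NULL)
    free (base);
  return 0;
}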
Reviewed-by: DJ Delorie <d...@redhat.com> --- malloc/malloc.c | 169 ++-------------------------------------- malloc/tst-memalign-2.c | 7 +- 2 files changed, 10 insertions(+), 166 deletions(-) diff --git a/malloc/malloc.c b/malloc/malloc.c index 948f9759af..d0bbbf3710 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -4488,12 +4488,6 @@ _int_free (mstate av, mchunkptr p, int have_lock) { INTERNAL_SIZE_T size; /* its size */ mfastbinptr *fb; /* associated fastbin */ - mchunkptr nextchunk; /* next contiguous chunk */ - INTERNAL_SIZE_T nextsize; /* its size */ - int nextinuse; /* true if nextchunk is used */ - INTERNAL_SIZE_T prevsize; /* size of previous contiguous chunk */ - mchunkptr bck; /* misc temp for linking */ - mchunkptr fwd; /* misc temp for linking */ size = chunksize (p); @@ -5032,42 +5026,6 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize, ------------------------------ memalign ------------------------------ */ -/* Returns 0 if the chunk is not and does not contain the requested - aligned sub-chunk, else returns the amount of "waste" from - trimming. NB is the *chunk* byte size, not the user byte - size. */ -static size_t -chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t nb) -{ - void *m = chunk2mem (p); - INTERNAL_SIZE_T size = chunksize (p); - void *aligned_m = m; - - if (__glibc_unlikely (misaligned_chunk (p))) - malloc_printerr ("_int_memalign(): unaligned chunk detected"); - - aligned_m = PTR_ALIGN_UP (m, alignment); - - INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m; - - /* We can't trim off the front as it's too small. */ - if (front_extra > 0 && front_extra < MINSIZE) - return 0; - - /* If it's a perfect fit, it's an exception to the return value rule - (we would return zero waste, which looks like "not usable"), so - handle it here by returning a small non-zero value instead. */ - if (size == nb && front_extra == 0) - return 1; - - /* If the block we need fits in the chunk, calculate total waste. */ - if (size > nb + front_extra) - return size - nb; - - /* Can't use this chunk. */ - return 0; -} - /* BYTES is user requested bytes, not requested chunksize bytes. */ static void * _int_memalign (mstate av, size_t alignment, size_t bytes) @@ -5082,7 +5040,6 @@ _int_memalign (mstate av, size_t alignment, size_t bytes) mchunkptr remainder; /* spare room at end to split off */ unsigned long remainder_size; /* its size */ INTERNAL_SIZE_T size; - mchunkptr victim; nb = checked_request2size (bytes); if (nb == 0) @@ -5101,129 +5058,13 @@ _int_memalign (mstate av, size_t alignment, size_t bytes) we don't find anything in those bins, the common malloc code will scan starting at 2x. */ - /* This will be set if we found a candidate chunk. */ - victim = NULL; - - /* Fast bins are singly-linked, hard to remove a chunk from the middle - and unlikely to meet our alignment requirements. We have not done - any experimentation with searching for aligned fastbins. 
*/ - - if (av != NULL) - { - int first_bin_index; - int first_largebin_index; - int last_bin_index; - - if (in_smallbin_range (nb)) - first_bin_index = smallbin_index (nb); - else - first_bin_index = largebin_index (nb); - - if (in_smallbin_range (nb * 2)) - last_bin_index = smallbin_index (nb * 2); - else - last_bin_index = largebin_index (nb * 2); - - first_largebin_index = largebin_index (MIN_LARGE_SIZE); - - int victim_index; /* its bin index */ - - for (victim_index = first_bin_index; - victim_index < last_bin_index; - victim_index ++) - { - victim = NULL; - - if (victim_index < first_largebin_index) - { - /* Check small bins. Small bin chunks are doubly-linked despite - being the same size. */ - - mchunkptr fwd; /* misc temp for linking */ - mchunkptr bck; /* misc temp for linking */ - - bck = bin_at (av, victim_index); - fwd = bck->fd; - while (fwd != bck) - { - if (chunk_ok_for_memalign (fwd, alignment, nb) > 0) - { - victim = fwd; - - /* Unlink it */ - victim->fd->bk = victim->bk; - victim->bk->fd = victim->fd; - break; - } - - fwd = fwd->fd; - } - } - else - { - /* Check large bins. */ - mchunkptr fwd; /* misc temp for linking */ - mchunkptr bck; /* misc temp for linking */ - mchunkptr best = NULL; - size_t best_size = 0; - - bck = bin_at (av, victim_index); - fwd = bck->fd; + /* Call malloc with worst case padding to hit alignment. */ + m = (char *) (_int_malloc (av, nb + alignment + MINSIZE)); - while (fwd != bck) - { - int extra; - - if (chunksize (fwd) < nb) - break; - extra = chunk_ok_for_memalign (fwd, alignment, nb); - if (extra > 0 - && (extra <= best_size || best == NULL)) - { - best = fwd; - best_size = extra; - } + if (m == 0) + return 0; /* propagate failure */ - fwd = fwd->fd; - } - victim = best; - - if (victim != NULL) - { - unlink_chunk (av, victim); - break; - } - } - - if (victim != NULL) - break; - } - } - - /* Strategy: find a spot within that chunk that meets the alignment - request, and then possibly free the leading and trailing space. - This strategy is incredibly costly and can lead to external - fragmentation if header and footer chunks are unused. */ - - if (victim != NULL) - { - p = victim; - m = chunk2mem (p); - set_inuse (p); - if (av != &main_arena) - set_non_main_arena (p); - } - else - { - /* Call malloc with worst case padding to hit alignment. */ - - m = (char *) (_int_malloc (av, nb + alignment + MINSIZE)); - - if (m == 0) - return 0; /* propagate failure */ - - p = mem2chunk (m); - } + p = mem2chunk (m); if ((((unsigned long) (m)) % alignment) != 0) /* misaligned */ { diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c index f229283dbf..ecd6fa249e 100644 --- a/malloc/tst-memalign-2.c +++ b/malloc/tst-memalign-2.c @@ -86,7 +86,8 @@ do_test (void) TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2); } - /* Test for non-head tcache hits. */ + /* Test for non-head tcache hits. This exercises the memalign + scanning code to find matching allocations. */ for (i = 0; i < array_length (ptr); ++ i) { if (i == 4) @@ -113,7 +114,9 @@ do_test (void) free (p); TEST_VERIFY (count > 0); - /* Large bins test. */ + /* Large bins test. This verifies that the over-allocated parts + that memalign releases for future allocations can be reused by + memalign itself at least in some cases. */ for (i = 0; i < LN; ++ i) { -- 2.41.0
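
For reference, a rough way to observe the behavior the two memalign changes address is to drive posix_memalign in a loop and compare the allocator statistics before and after the update; the program below is only an approximation (the sizes and alignment are invented), not the bug 30723 workload itself:

#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

/* Repeatedly allocate and free aligned blocks of mixed sizes, the kind of
   pattern that used to leave unmergeable remainders behind, then print the
   allocator statistics so heap growth can be compared across glibc builds.  */
int
main (void)
{
  enum { SLOTS = 256, ROUNDS = 10000 };
  static void *slots[SLOTS];

  for (int i = 0; i < ROUNDS; ++i)
    {
      int idx = rand () % SLOTS;
      free (slots[idx]);
      size_t size = 4096 + (rand () % 8) * 4096;
      if (posix_memalign (&slots[idx], 64, size) != 0)
        slots[idx] = NULL;
    }

  malloc_stats ();  /* writes arena and in-use totals to stderr */

  for (int i = 0; i < SLOTS; ++i)
    free (slots[i]);
  return 0;
}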