On Nov 17, 2025, at 00:24, Mark Millard <[email protected]> wrote: > On Nov 16, 2025, at 13:11, Michal Meloun <[email protected]> wrote: > >> On 16.11.2025 18:51, Warner Losh wrote: >>> Maybe try main with the following patch. Adrian noticed the TLS mismatch. I >>> don't think it will matter, but TLS thread model stuff always gives me a >>> big headache. If the following fails to apply, just copy the >>> JEMALLOC_TLS_MODEL line from i386 to arm. The default changed elsewhere, >>> but this wasn't updated here. >>> Warner >> >> Unfortunately, that doesn't help. I'm out of ideas on how to debug this, all >> of my attempts have failed. >> >> The problem only occurs when Clang compiles a larger project and is >> intermediate. Attempt to compile the clang generated reproducer is always >> successful. >> It's clear that the parallelism introduced by make plays a significant role. >> But the system never reached an OOM condition before failure. >> >> I would be grateful for any help and ideas on what to do next. >> Michal > > [Note: The context is an official pkgbase distribution context > and so the /usr/src/ is not tied to git. /usr/src-investigation/ > is a copy of /usr/src/ that was then modified. Also, this is > via a armv7 chroot on the aarch64 Windows Dev Kit 2023, not > via armv7-only hardware.] > > The crude hack reported later below has shown the first failure > indicated as happening during base_alloc_edata by reporting: > > p[i] == 0 && which_base_extent_context == 0x11u > > as the failure message. 
> > > # diff -u /usr/src/contrib/jemalloc/include/jemalloc/internal/ehooks.h > /usr/src-investigation/contrib/jemalloc/include/jemalloc/ > --- /usr/src/contrib/jemalloc/include/jemalloc/internal/ehooks.h 2025-11-12 > 02:24:28.000000000 -0800 > +++ > /usr/src-investigation/contrib/jemalloc/include/jemalloc/internal/ehooks.h > 2025-11-16 23:47:10.965711000 -0800 > @@ -1,6 +1,7 @@ > #ifndef JEMALLOC_INTERNAL_EHOOKS_H > #define JEMALLOC_INTERNAL_EHOOKS_H > > +#include <signal.h> > #include "jemalloc/internal/atomic.h" > #include "jemalloc/internal/extent_mmap.h" > > @@ -158,6 +159,7 @@ > * This isn't really ehooks-specific (i.e. anyone can check for zeroed > memory). > * But incorrect zero information indicates an ehook bug. > */ > +__attribute__ ((visibility ("internal"))) extern volatile sig_atomic_t > which_base_extent_context; // HACK FOR DEBUGGING USE > static inline void > ehooks_debug_zero_check(void *addr, size_t size) { > assert(((uintptr_t)addr & PAGE_MASK) == 0); > @@ -167,7 +169,45 @@ > /* Check the whole first page. 
*/ > size_t *p = (size_t *)addr; > for (size_t i = 0; i < PAGE / sizeof(size_t); i++) { > - assert(p[i] == 0); > +switch (which_base_extent_context) > +{ > +case 0x10u: // base_alloc > + assert(p[i] == 0 && which_base_extent_context == 0x10u); > + which_base_extent_context= 0x0u; > + break; > +case 0x11u: // base_alloc_edata > + assert(p[i] == 0 && which_base_extent_context == 0x11u); > + which_base_extent_context= 0x0u; > + break; > +case 0x12u: // base_new > + assert(p[i] == 0 && which_base_extent_context == 0x12u); > + which_base_extent_context= 0x0u; > + break; > +case 0x13u: // base_boot > + assert(p[i] == 0 && which_base_extent_context == 0x13u); > + which_base_extent_context= 0x0u; > + break; > +case 0x20u: // extent_commit_wrapper > + assert(p[i] == 0 && which_base_extent_context == 0x20u); > + which_base_extent_context= 0x0u; > + break; > +case 0x21u: // extent_commit_zero > + assert(p[i] == 0 && which_base_extent_context == 0x21u); > + which_base_extent_context= 0x0u; > + break; > +case 0x22u: // ecache_alloc_grow > + assert(p[i] == 0 && which_base_extent_context == 0x22u); > + which_base_extent_context= 0x0u; > + break; > +case 0x00u: // None known > + assert(p[i] == 0 && which_base_extent_context == 0x00u); > + which_base_extent_context= 0x0u; > + break; > +default: // Some other context > + assert(p[i] == 0 && which_base_extent_context != 0x00u); > + which_base_extent_context= 0x0u; > +} > + //assert(p[i] == 0); > } > /* > * And 4 spots within. 
There's a tradeoff here; the larger > > > # diff -u /usr/src/contrib/jemalloc/src/base.c > /usr/src-investigation/contrib/jemalloc/src/base.c > --- /usr/src/contrib/jemalloc/src/base.c 2025-11-12 02:24:28.000000000 -0800 > +++ /usr/src-investigation/contrib/jemalloc/src/base.c 2025-11-16 > 23:50:14.396483000 -0800 > @@ -1,3 +1,4 @@ > +#include <signal.h> > #include "jemalloc/internal/jemalloc_preamble.h" > #include "jemalloc/internal/jemalloc_internal_includes.h" > > @@ -340,12 +341,15 @@ > b0get(void) { > return b0; > } > + > +__attribute__ ((visibility ("internal"))) volatile sig_atomic_t > which_base_extent_context=0x0u; // HACK FOR DEBUGGING USE > > base_t * > base_new(tsdn_t *tsdn, unsigned ind, const extent_hooks_t *extent_hooks, > bool metadata_use_hooks) { > pszind_t pind_last = 0; > size_t extent_sn_next = 0; > +which_base_extent_context= 0x12u; > > /* > * The base will contain the ehooks eventually, but it itself is > @@ -476,12 +480,14 @@ > */ > void * > base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) { > +which_base_extent_context= 0x10u; > return base_alloc_impl(tsdn, base, size, alignment, NULL); > } > > edata_t * > base_alloc_edata(tsdn_t *tsdn, base_t *base) { > size_t esn; > +which_base_extent_context= 0x11u; > edata_t *edata = base_alloc_impl(tsdn, base, sizeof(edata_t), > EDATA_ALIGNMENT, &esn); > if (edata == NULL) { > @@ -523,6 +529,7 @@ > > bool > base_boot(tsdn_t *tsdn) { > +which_base_extent_context= 0x13u; > b0 = base_new(tsdn, 0, (extent_hooks_t *)&ehooks_default_extent_hooks, > /* metadata_use_hooks */ true); > return (b0 == NULL); > > > # diff -u /usr/src/contrib/jemalloc/src/extent.c > /usr/src-investigation/contrib/jemalloc/src/extent.c > --- /usr/src/contrib/jemalloc/src/extent.c 2025-11-12 02:24:28.000000000 -0800 > +++ /usr/src-investigation/contrib/jemalloc/src/extent.c 2025-11-16 > 23:49:55.820658000 -0800 > @@ -1,3 +1,4 @@ > +#include <signal.h> > #include "jemalloc/internal/jemalloc_preamble.h" > #include 
"jemalloc/internal/jemalloc_internal_includes.h" > > @@ -90,11 +91,14 @@ > assert(edata == NULL || edata_guarded_get(edata) == guarded); > return edata; > } > + > +__attribute__ ((visibility ("internal"))) extern volatile sig_atomic_t > which_base_extent_context; // HACK FOR DEBUGGING USE > > edata_t * > ecache_alloc_grow(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t > *ecache, > edata_t *expand_edata, size_t size, size_t alignment, bool zero, > bool guarded) { > +which_base_extent_context= 0x22u; > assert(size != 0); > assert(alignment != 0); > witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), > @@ -1114,6 +1118,7 @@ > bool > extent_commit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, > size_t offset, size_t length) { > +which_base_extent_context= 0x20u; > return extent_commit_impl(tsdn, ehooks, edata, offset, length, > /* growing_retained */ false); > } > @@ -1297,6 +1302,7 @@ > bool > extent_commit_zero(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, > bool commit, bool zero, bool growing_retained) { > +which_base_extent_context= 0x21u; > witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), > WITNESS_RANK_CORE, growing_retained ? 1 : 0);
Well, with this hack, the behavior looks to have changed to always fail, leading to an initial: *** [libzpool.so.2.full] Error code 1 The hack may disturb things too much and may not be sufficiently close to valid code for the context. Using -j1 produced a first failure message: Failed assertion: p[i] == 0 && which_base_extent_context == 0x22u That would be during ecache_alloc_grow. Still: *** [libzpool.so.2.full] Error code 1 === Mark Millard marklmi at yahoo.com
