On 05/07/17(Wed) 09:44, David Gwynne wrote:
> the following adds a mutex to malloc and free to protect their
> internal state. this should be enough to make the api mpsafe,
> assuming the way they interact with uvm is mpsafe.
> 
> this only uses a single mutex around the entire malloc subsystem,
> but shuffles the code slightly to avoid holding it around calls
> into uvm. the shuffling is mostly to account for an allocation (ie,
> bump memuse early) before releasing the mutex to try and allocate
> it from uvm. if uvm fails, it rolls this back.
> 
> this is modelled on how the item accounting and locking is done in
> pools.
> 
> can anyone comment on the uvm safety?
> 
> also, it is obvious that the malloc debug code has started to rot
> a bit. ive fixed it up a bit, but now there's also a possible mp
> race between when items are moved from the used list to the free
> list and when the item is unmapped. id like to remove the malloc
> debug code and improve the robustness of malloc itself. pools have
> shown they can be fast and strict at the same time.

I'd welcome an MP-safe malloc(9), but I'm not sure it is worth spending
too much time making it shine.  I thought the plan was to base it on
pool(9) as soon as all the free(xx, 0) are converted.  Only ~700 left!

That said, we're being slowed down by a non-MP-safe malloc(9) and
free(9).
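
For readers following along, the reserve-then-rollback pattern the diff
uses boils down to something like the sketch below.  It is a minimal
illustration, not the diff itself: res_mtx, res_used, res_limit and
backend_alloc() are invented names standing in for malloc_mtx,
ks_memuse, ks_limit and uvm_km_kmemalloc_pla().

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mutex.h>

/* invented stand-in for the sleeping uvm allocation */
void	*backend_alloc(long, int);

struct mutex res_mtx = MUTEX_INITIALIZER(IPL_VM);
long res_used;
long res_limit;

void *
res_alloc(long size, int flags)
{
	void *p;

	mtx_enter(&res_mtx);
	while (res_used >= res_limit) {
		if (flags & M_NOWAIT) {
			mtx_leave(&res_mtx);
			return (NULL);
		}
		/* wait for a free to bring us back under the limit */
		msleep(&res_used, &res_mtx, PSWP+2, "resalc", 0);
	}
	res_used += size;	/* account for this early */
	mtx_leave(&res_mtx);

	/* call into the backend without holding the mutex */
	p = backend_alloc(size, flags);
	if (p == NULL) {
		/* the backend failed, roll the early accounting back */
		mtx_enter(&res_mtx);
		res_used -= size;
		mtx_leave(&res_mtx);
		wakeup(&res_used);
	}

	return (p);
}

Dropping the mutex around the backend call matters because uvm may
sleep, and sleeping while holding a mutex is not allowed.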

> Index: kern_malloc.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_malloc.c,v
> retrieving revision 1.129
> diff -u -p -r1.129 kern_malloc.c
> --- kern_malloc.c	7 Jun 2017 13:30:36 -0000	1.129
> +++ kern_malloc.c	3 Jul 2017 10:14:49 -0000
> @@ -39,6 +39,7 @@
>  #include <sys/systm.h>
>  #include <sys/sysctl.h>
>  #include <sys/time.h>
> +#include <sys/mutex.h>
>  #include <sys/rwlock.h>
>  
>  #include <uvm/uvm_extern.h>
> @@ -94,6 +95,7 @@ u_int nkmempages_min = 0;
>  #endif
>  u_int nkmempages_max = 0;
>  
> +struct mutex malloc_mtx = MUTEX_INITIALIZER(IPL_VM);
>  struct kmembuckets bucket[MINBUCKET + 16];
>  #ifdef KMEMSTATS
>  struct kmemstats kmemstats[M_LAST];
> @@ -151,8 +153,8 @@ malloc(size_t size, int type, int flags)
>  	struct kmemusage *kup;
>  	struct kmem_freelist *freep;
>  	long indx, npg, allocsize;
> -	int s;
>  	caddr_t va, cp;
> +	int s;
>  #ifdef DIAGNOSTIC
>  	int freshalloc;
>  	char *savedtype;
> @@ -164,9 +166,6 @@ malloc(size_t size, int type, int flags)
>  		panic("malloc: bogus type %d", type);
>  #endif
>  
> -	if (!cold)
> -		KERNEL_ASSERT_LOCKED();
> -
>  	KASSERT(flags & (M_WAITOK | M_NOWAIT));
>  
>  	if ((flags & M_NOWAIT) == 0) {
> @@ -176,10 +175,6 @@ malloc(size_t size, int type, int flags)
>  		if (pool_debug == 2)
>  			yield();
>  #endif
> -		if (!cold && pool_debug) {
> -			KERNEL_UNLOCK();
> -			KERNEL_LOCK();
> -		}
>  	}
>  
>  #ifdef MALLOC_DEBUG
> @@ -193,6 +188,7 @@ malloc(size_t size, int type, int flags)
>  	if (size > 65535 * PAGE_SIZE) {
>  		if (flags & M_CANFAIL) {
> #ifndef SMALL_KERNEL
> +			/* XXX lock */
>  			if (ratecheck(&malloc_lasterr, &malloc_errintvl))
>  				printf("malloc(): allocation too large, "
>  				    "type = %d, size = %lu\n", type, size);
> @@ -204,33 +200,38 @@ malloc(size_t size, int type, int flags)
>  	}
>  
>  	indx = BUCKETINDX(size);
> +	if (size > MAXALLOCSAVE)
> +		allocsize = round_page(size);
> +	else
> +		allocsize = 1 << indx;
>  	kbp = &bucket[indx];
> -	s = splvm();
> +	mtx_enter(&malloc_mtx);
>  #ifdef KMEMSTATS
>  	while (ksp->ks_memuse >= ksp->ks_limit) {
>  		if (flags & M_NOWAIT) {
> -			splx(s);
> +			mtx_leave(&malloc_mtx);
>  			return (NULL);
>  		}
>  		if (ksp->ks_limblocks < 65535)
>  			ksp->ks_limblocks++;
> -		tsleep(ksp, PSWP+2, memname[type], 0);
> +		msleep(ksp, &malloc_mtx, PSWP+2, memname[type], 0);
>  	}
> +	ksp->ks_memuse += allocsize; /* account for this early */
>  	ksp->ks_size |= 1 << indx;
>  #endif
> -	if (size > MAXALLOCSAVE)
> -		allocsize = round_page(size);
> -	else
> -		allocsize = 1 << indx;
>  	if (XSIMPLEQ_FIRST(&kbp->kb_freelist) == NULL) {
> +		mtx_leave(&malloc_mtx);
>  		npg = atop(round_page(allocsize));
> +		s = splvm();
>  		va = (caddr_t)uvm_km_kmemalloc_pla(kmem_map, NULL,
>  		    (vsize_t)ptoa(npg), 0,
>  		    ((flags & M_NOWAIT) ? UVM_KMF_NOWAIT : 0) |
>  		    ((flags & M_CANFAIL) ? UVM_KMF_CANFAIL : 0),
>  		    no_constraint.ucr_low, no_constraint.ucr_high,
>  		    0, 0, 0);
> +		splx(s);
>  		if (va == NULL) {
> +			long wake;
>  			/*
>  			 * Kmem_malloc() can return NULL, even if it can
>  			 * wait, if there is no map space available, because
> @@ -241,9 +242,18 @@ malloc(size_t size, int type, int flags)
>  			 */
>  			if ((flags & (M_NOWAIT|M_CANFAIL)) == 0)
>  				panic("malloc: out of space in kmem_map");
> -			splx(s);
> +
> +			mtx_enter(&malloc_mtx);
> +			ksp->ks_memuse -= allocsize;
> +			wake = ksp->ks_memuse + allocsize >= ksp->ks_limit &&
> +			    ksp->ks_memuse < ksp->ks_limit;
> +			mtx_leave(&malloc_mtx);
> +			if (wake)
> +				wakeup(ksp);
> +
>  			return (NULL);
>  		}
> +		mtx_enter(&malloc_mtx);
>  #ifdef KMEMSTATS
>  		kbp->kb_total += kbp->kb_elmpercl;
>  #endif
> @@ -254,9 +264,6 @@ malloc(size_t size, int type, int flags)
>  #endif
>  	if (allocsize > MAXALLOCSAVE) {
>  		kup->ku_pagecnt = npg;
> -#ifdef KMEMSTATS
> -		ksp->ks_memuse += allocsize;
> -#endif
>  		goto out;
>  	}
>  #ifdef KMEMSTATS
> @@ -334,17 +341,14 @@ malloc(size_t size, int type, int flags)
>  			panic("malloc: lost data");
>  		kup->ku_freecnt--;
>  		kbp->kb_totalfree--;
> -		ksp->ks_memuse += 1 << indx;
>  out:
>  	kbp->kb_calls++;
>  	ksp->ks_inuse++;
>  	ksp->ks_calls++;
> -	if (ksp->ks_memuse > ksp->ks_maxused)
> -		ksp->ks_maxused = ksp->ks_memuse;
>  #else
>  out:
>  #endif
> -	splx(s);
> +	mtx_leave(&malloc_mtx);
>  
>  	if ((flags & M_ZERO) && va != NULL)
>  		memset(va, 0, size);
> @@ -369,9 +373,6 @@ free(void *addr, int type, size_t freeds
>  	struct kmemstats *ksp = &kmemstats[type];
>  #endif
>  
> -	if (!cold)
> -		KERNEL_ASSERT_LOCKED();
> -
>  	if (addr == NULL)
>  		return;
>  
> @@ -391,7 +392,6 @@ free(void *addr, int type, size_t freeds
>  	kbp = &bucket[kup->ku_indx];
>  	if (size > MAXALLOCSAVE)
>  		size = kup->ku_pagecnt << PAGE_SHIFT;
> -	s = splvm();
>  #ifdef DIAGNOSTIC
>  	if (freedsize != 0 && freedsize > size)
>  		panic("free: size too large %zu > %ld (%p) type %s",
> @@ -412,8 +412,11 @@ free(void *addr, int type, size_t freeds
>  		    addr, size, memname[type], alloc);
>  #endif /* DIAGNOSTIC */
>  	if (size > MAXALLOCSAVE) {
> +		s = splvm();
>  		uvm_km_free(kmem_map, (vaddr_t)addr, ptoa(kup->ku_pagecnt));
> +		splx(s);
>  #ifdef KMEMSTATS
> +		mtx_enter(&malloc_mtx);
>  		ksp->ks_memuse -= size;
>  		kup->ku_indx = 0;
>  		kup->ku_pagecnt = 0;
> @@ -422,11 +425,12 @@ free(void *addr, int type, size_t freeds
>  			wakeup(ksp);
>  		ksp->ks_inuse--;
>  		kbp->kb_total -= 1;
> +		mtx_leave(&malloc_mtx);
>  #endif
> -		splx(s);
>  		return;
>  	}
>  	freep = (struct kmem_freelist *)addr;
> +	mtx_enter(&malloc_mtx);
>  #ifdef DIAGNOSTIC
>  	/*
>  	 * Check for multiple frees. Use a quick check to see if
> @@ -468,7 +472,7 @@ free(void *addr, int type, size_t freeds
>  	ksp->ks_inuse--;
>  #endif
>  	XSIMPLEQ_INSERT_TAIL(&kbp->kb_freelist, freep, kf_flist);
> -	splx(s);
> +	mtx_leave(&malloc_mtx);
>  }
>  
>  /*
> @@ -578,6 +582,9 @@ sysctl_malloc(int *name, u_int namelen,
>      size_t newlen, struct proc *p)
>  {
>  	struct kmembuckets kb;
> +#ifdef KMEMSTATS
> +	struct kmemstats km;
> +#endif
>  #if defined(KMEMSTATS) || defined(DIAGNOSTIC) || defined(FFS_SOFTUPDATES)
>  	int error;
>  #endif
> @@ -606,15 +613,19 @@ sysctl_malloc(int *name, u_int namelen,
>  		return (sysctl_rdstring(oldp, oldlenp, newp, buckstring));
>  
>  	case KERN_MALLOC_BUCKET:
> +		mtx_enter(&malloc_mtx);
>  		memcpy(&kb, &bucket[BUCKETINDX(name[1])], sizeof(kb));
> +		mtx_leave(&malloc_mtx);
>  		memset(&kb.kb_freelist, 0, sizeof(kb.kb_freelist));
>  		return (sysctl_rdstruct(oldp, oldlenp, newp, &kb, sizeof(kb)));
>  	case KERN_MALLOC_KMEMSTATS:
>  #ifdef KMEMSTATS
>  		if ((name[1] < 0) || (name[1] >= M_LAST))
>  			return (EINVAL);
> -		return (sysctl_rdstruct(oldp, oldlenp, newp,
> -		    &kmemstats[name[1]], sizeof(struct kmemstats)));
> +		mtx_enter(&malloc_mtx);
> +		memcpy(&km, &kmemstats[name[1]], sizeof(km));
> +		mtx_leave(&malloc_mtx);
> +		return (sysctl_rdstruct(oldp, oldlenp, newp, &km, sizeof(km)));
>  #else
>  		return (EOPNOTSUPP);
>  #endif
> Index: kern_malloc_debug.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_malloc_debug.c,v
> retrieving revision 1.34
> diff -u -p -r1.34 kern_malloc_debug.c
> --- kern_malloc_debug.c	16 Nov 2014 12:31:00 -0000	1.34
> +++ kern_malloc_debug.c	3 Jul 2017 10:14:49 -0000
> @@ -56,8 +56,10 @@
>  #include <sys/malloc.h>
>  #include <sys/systm.h>
>  #include <sys/pool.h>
> +#include <sys/mutex.h>
>  
>  #include <uvm/uvm_extern.h>
> +#include <uvm/uvm.h>
>  
>  /*
>   * debug_malloc_type and debug_malloc_size define the type and size of
> @@ -104,12 +106,13 @@ int debug_malloc_chunks_on_freelist;
>  int debug_malloc_initialized;
>  
>  struct pool debug_malloc_pool;
> +struct mutex debug_malloc_mtx = MUTEX_INITIALIZER(IPL_VM);
>  
>  int
>  debug_malloc(unsigned long size, int type, int flags, void **addr)
>  {
>  	struct debug_malloc_entry *md = NULL;
> -	int s, wait = (flags & M_NOWAIT) == 0;
> +	int wait = (flags & M_NOWAIT) == 0;
>  
>  	/* Careful not to compare unsigned long to int -1 */
>  	if (((type != debug_malloc_type && debug_malloc_type != 0) ||
> @@ -123,13 +126,14 @@ debug_malloc(unsigned long size, int typ
>  	if (size > PAGE_SIZE)
>  		return (0);
>  
> -	s = splvm();
>  	if (debug_malloc_chunks_on_freelist < MALLOC_DEBUG_CHUNKS)
>  		debug_malloc_allocate_free(wait);
>  
> +	mtx_enter(&debug_malloc_mtx);
> +
>  	md = TAILQ_FIRST(&debug_malloc_freelist);
>  	if (md == NULL) {
> -		splx(s);
> +		mtx_leave(&debug_malloc_mtx);
>  		return (0);
>  	}
>  	TAILQ_REMOVE(&debug_malloc_freelist, md, md_list);
> @@ -137,14 +141,15 @@ debug_malloc(unsigned long size, int typ
>  
>  	TAILQ_INSERT_HEAD(&debug_malloc_usedlist, md, md_list);
>  	debug_malloc_allocs++;
> -	splx(s);
> -
> -	pmap_kenter_pa(md->md_va, md->md_pa, PROT_READ | PROT_WRITE);
> -	pmap_update(pmap_kernel());
>  
>  	md->md_size = size;
>  	md->md_type = type;
>  
> +	mtx_leave(&debug_malloc_mtx);
> +
> +	pmap_kenter_pa(md->md_va, md->md_pa, PROT_READ | PROT_WRITE);
> +	pmap_update(pmap_kernel());
> +
>  	/*
>  	 * Align the returned addr so that it ends where the first page
>  	 * ends. roundup to get decent alignment.
> @@ -158,7 +163,6 @@ debug_free(void *addr, int type)
>  {
>  	struct debug_malloc_entry *md;
>  	vaddr_t va;
> -	int s;
>  
>  	if (type != debug_malloc_type && debug_malloc_type != 0 &&
>  	    type != M_DEBUG)
> @@ -169,7 +173,7 @@ debug_free(void *addr, int type)
>  	 */
>  	va = trunc_page((vaddr_t)addr);
>  
> -	s = splvm();
> +	mtx_enter(&debug_malloc_mtx);
>  	TAILQ_FOREACH(md, &debug_malloc_usedlist, md_list)
>  		if (md->md_va == va)
>  			break;
> @@ -185,21 +189,24 @@ debug_free(void *addr, int type)
>  		TAILQ_FOREACH(md, &debug_malloc_freelist, md_list)
>  			if (md->md_va == va)
>  				panic("debug_free: already free");
> -		splx(s);
> +		mtx_leave(&debug_malloc_mtx);
>  		return (0);
>  	}
>  
>  	debug_malloc_frees++;
>  	TAILQ_REMOVE(&debug_malloc_usedlist, md, md_list);
> +	mtx_leave(&debug_malloc_mtx);
>  
> -	TAILQ_INSERT_TAIL(&debug_malloc_freelist, md, md_list);
> -	debug_malloc_chunks_on_freelist++;
>  	/*
>  	 * unmap the page.
>  	 */
>  	pmap_kremove(md->md_va, PAGE_SIZE);
>  	pmap_update(pmap_kernel());
> -	splx(s);
> +
> +	mtx_enter(&debug_malloc_mtx);
> +	TAILQ_INSERT_TAIL(&debug_malloc_freelist, md, md_list);
> +	debug_malloc_chunks_on_freelist++;
> +	mtx_leave(&debug_malloc_mtx);
>  
>  	return (1);
>  }
> @@ -217,7 +224,7 @@ debug_malloc_init(void)
>  	debug_malloc_chunks_on_freelist = 0;
>  
>  	pool_init(&debug_malloc_pool, sizeof(struct debug_malloc_entry),
> -	    0, 0, 0, "mdbepl", NULL);
> +	    0, 0, IPL_VM, "mdbepl", NULL);
>  
>  	debug_malloc_initialized = 1;
>  }
> @@ -233,19 +240,18 @@ debug_malloc_allocate_free(int wait)
>  	vaddr_t va, offset;
>  	struct vm_page *pg;
>  	struct debug_malloc_entry *md;
> -
> -	splassert(IPL_VM);
> +	int s;
>  
>  	md = pool_get(&debug_malloc_pool, wait ? PR_WAITOK : PR_NOWAIT);
>  	if (md == NULL)
>  		return;
>  
> +	s = splvm();
> +
>  	va = uvm_km_kmemalloc(kmem_map, NULL, PAGE_SIZE * 2,
>  	    UVM_KMF_VALLOC | (wait ? 0 : UVM_KMF_NOWAIT));
> -	if (va == 0) {
> -		pool_put(&debug_malloc_pool, md);
> -		return;
> -	}
> +	if (va == 0)
> +		goto put;
>  
>  	offset = va - vm_map_min(kernel_map);
>  	for (;;) {
> @@ -258,20 +264,31 @@ debug_malloc_allocate_free(int wait)
>  		if (pg)
>  			break;
>  
> -		if (wait == 0) {
> -			uvm_unmap(kmem_map, va, va + PAGE_SIZE * 2);
> -			pool_put(&debug_malloc_pool, md);
> -			return;
> -		}
> +		if (wait == 0)
> +			goto unmap;
> +
>  		uvm_wait("debug_malloc");
>  	}
>  
> +	splx(s);
> +
>  	md->md_va = va;
>  	md->md_pa = VM_PAGE_TO_PHYS(pg);
>  
> +	mtx_enter(&debug_malloc_mtx);
>  	debug_malloc_pages++;
>  	TAILQ_INSERT_HEAD(&debug_malloc_freelist, md, md_list);
>  	debug_malloc_chunks_on_freelist++;
> +	mtx_leave(&debug_malloc_mtx);
> +
> +	return;
> +
> +unmap:
> +	uvm_unmap(kmem_map, va, va + PAGE_SIZE * 2);
> +put:
> +	splx(s);
> +	pool_put(&debug_malloc_pool, md);
> +	return;
>  }
>  
>  void
> @@ -311,7 +328,7 @@ debug_malloc_printit(
>  		if (addr >= md->md_va &&
>  		    addr < md->md_va + 2 * PAGE_SIZE) {
>  			(*pr)("Memory at address 0x%lx is in a freed "
> -			    "area. type %d, size: %d\n ",
> +			    "area. type %d, size: %zu\n ",
>  			    addr, md->md_type, md->md_size);
>  			return;
>  		}
> @@ -320,7 +337,7 @@ debug_malloc_printit(
>  		if (addr >= md->md_va + PAGE_SIZE &&
>  		    addr < md->md_va + 2 * PAGE_SIZE) {
>  			(*pr)("Memory at address 0x%lx is just outside "
> -			    "an allocated area. type %d, size: %d\n",
> +			    "an allocated area. type %d, size: %zu\n",
>  			    addr, md->md_type, md->md_size);
>  			return;
>  		}
> 
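
One nit on the malloc() failure path: the `wake' computation is subtle
enough that I'd add a comment.  ks_memuse is bumped before the mutex is
dropped, so by the time uvm fails other threads may be asleep in the
ks_limit loop purely because of our reservation; the rollback only
needs to wakeup(9) if it takes ks_memuse from at-or-above the limit
back under it.  Isolated as a standalone test, with illustrative names
(this helper does not exist in the diff):

/*
 * "before" is ks_memuse with the failed reservation still counted,
 * "after" is ks_memuse once it has been rolled back.  Sleepers can
 * only exist if "before" was at or over the limit, and they can
 * only make progress if "after" is under it.
 */
int
rollback_needs_wakeup(long before, long size, long limit)
{
	long after = before - size;

	return (before >= limit && after < limit);
}

Skipping the wakeup() in the other cases avoids waking sleepers that
still cannot make progress.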