On 2022/01/09 14:54, Otto Moerbeek wrote:
> Currently malloc caches a number of freed regions up to 128k in
> size. This cache is indexed by size (in # of pages), so it is very
> quick to check.
>
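The per-size indexing is what makes the existing small cache cheap: the
size in pages is the array index, so a hit costs one lookup. A minimal
sketch of the idea (simplified; the real code also picks a random slot
when several pages are cached for a size):

	#include <sys/types.h>	/* ushort, size_t */
	#include <stddef.h>	/* NULL */

	struct smallcache {
		void **pages;	/* stack of cached regions */
		ushort length;	/* regions currently held */
		ushort max;	/* capacity for this size class */
	};

	static void *
	smallcache_get(struct smallcache *caches, size_t psz)
	{
		struct smallcache *c = &caches[psz - 1];	/* O(1) by size */

		return c->length > 0 ? c->pages[--c->length] : NULL;
	}
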
> Some programs allocate and deallocate larger regions at a frantic
> pace. Accommodate those programs by also keeping a cache of regions
> between 128k and 2M, implemented as a single cache of variable sized
> regions.
>
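Since there is no longer one bucket per size, each big-cache slot has to
record the size of the region it holds, and lookups become linear scans
over the slot array. Conceptually (a sketch, not the diff itself):

	#include <stddef.h>

	/* one slot of the big cache: a page-rounded region of arbitrary
	 * size, or an empty slot when psize == 0 */
	struct bigcache {
		void *page;	/* the cached region */
		size_t psize;	/* its size in pages, 0 = slot free */
	};

The hunks below bound the cost of those scans in both directions.
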
> My test case speeds up by about a factor of two. A make build gets a
> small speedup.
>
> I have tested this quite intensively myself on amd64. I am asking
> for more tests, especially on more "exotic" platforms; I will do
> arm64 myself soon. Testing can consist of running your favorite
> programs, doing make builds, or running the tests in
> regress/lib/libc/malloc.
This has been through mkr and a ports bulk build on i386.
Ports build times vary too much to say whether it's faster there,
but no issues have been seen.
> Thanks in advance!
>
> -Otto
>
> Index: stdlib/malloc.c
> ===================================================================
> RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
> retrieving revision 1.272
> diff -u -p -r1.272 malloc.c
> --- stdlib/malloc.c 19 Sep 2021 09:15:22 -0000 1.272
> +++ stdlib/malloc.c 9 Jan 2022 13:10:35 -0000
> @@ -113,13 +113,28 @@ struct region_info {
>
> LIST_HEAD(chunk_head, chunk_info);
>
> -#define MAX_CACHEABLE_SIZE 32
> -struct cache {
> - void *pages[MALLOC_MAXCACHE];
> +/*
> + * Two caches, one for "small" regions, one for "big".
> + * Small cache is an array per size, big cache is one array with
> + * variable sized regions.
> + */
> +#define MAX_SMALLCACHEABLE_SIZE 32
> +#define MAX_BIGCACHEABLE_SIZE 512
> +#define BIGCACHE_SIZE MALLOC_MAXCACHE
> +/* If the total # of pages is larger than this, evict before inserting */
> +#define BIGCACHE_FILL(sz) (MAX_BIGCACHEABLE_SIZE * (sz) / 4)
> +
> +struct smallcache {
> + void **pages;
> ushort length;
> ushort max;
> };
>
> +struct bigcache {
> + void *page;
> + size_t psize;
> +};
> +
> struct dir_info {
> u_int32_t canary1;
> int active; /* status of malloc */
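To put numbers on the new constants, assuming the usual 4k page size:
MAX_SMALLCACHEABLE_SIZE is 32 pages = 128k and MAX_BIGCACHEABLE_SIZE is
512 pages = 2M, matching the limits in the description. The fill mark
works out to

	BIGCACHE_FILL(sz) = 512 * sz / 4 = 128 * sz pages

so a big cache of, say, 64 slots starts evicting once about 8192 pages
(32M with 4k pages) are cached in total. (64 is just an example slot
count here, not necessarily the configured default.)
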
> @@ -139,7 +154,10 @@ struct dir_info {
> void *delayed_chunks[MALLOC_DELAYED_CHUNK_MASK + 1];
> u_char rbytes[32]; /* random bytes */
> /* free pages cache */
> - struct cache cache[MAX_CACHEABLE_SIZE];
> + struct smallcache smallcache[MAX_SMALLCACHEABLE_SIZE];
> + ushort bigcache_size;
> + size_t bigcache_used;
> + struct bigcache bigcache[BIGCACHE_SIZE];
> #ifdef MALLOC_STATS
> size_t inserts;
> size_t insert_collisions;
> @@ -714,18 +732,61 @@ unmap(struct dir_info *d, void *p, size_
> size_t psz = sz >> MALLOC_PAGESHIFT;
> void *r;
> u_short i;
> - struct cache *cache;
> + struct smallcache *cache;
>
> if (sz != PAGEROUND(sz) || psz == 0)
> wrterror(d, "munmap round");
>
> - if (psz > MAX_CACHEABLE_SIZE || d->cache[psz - 1].max == 0) {
> + if (d->bigcache_size > 0 && psz > MAX_SMALLCACHEABLE_SIZE &&
> + psz <= MAX_BIGCACHEABLE_SIZE) {
> + u_short base = getrbyte(d);
> + u_short j;
> +
> + /* don't look through all slots */
> + for (j = 0; j < d->bigcache_size / 4; j++) {
> + i = (base + j) % d->bigcache_size;
> + if (d->bigcache_used <
> + BIGCACHE_FILL(d->bigcache_size)) {
> + if (d->bigcache[i].psize == 0)
> + break;
> + } else {
> + if (d->bigcache[i].psize != 0)
> + break;
> + }
> + }
> + /* if we didn't find a preferred slot, use random one */
> + if (d->bigcache[i].psize != 0) {
> + size_t tmp;
> +
> + r = d->bigcache[i].page;
> + d->bigcache_used -= d->bigcache[i].psize;
> + tmp = d->bigcache[i].psize << MALLOC_PAGESHIFT;
> + if (!mopts.malloc_freeunmap)
> + validate_junk(d, r, tmp);
> + if (munmap(r, tmp))
> + wrterror(d, "munmap %p", r);
> + STATS_SUB(d->malloc_used, tmp);
> + }
> +
> + if (clear > 0)
> + explicit_bzero(p, clear);
> + if (mopts.malloc_freeunmap) {
> + if (mprotect(p, sz, PROT_NONE))
> + wrterror(d, "mprotect %p", r);
> + } else
> + junk_free(d->malloc_junk, p, sz);
> + d->bigcache[i].page = p;
> + d->bigcache[i].psize = psz;
> + d->bigcache_used += psz;
> + return;
> + }
> + if (psz > MAX_SMALLCACHEABLE_SIZE || d->smallcache[psz - 1].max == 0) {
> if (munmap(p, sz))
> wrterror(d, "munmap %p", p);
> STATS_SUB(d->malloc_used, sz);
> return;
> }
> - cache = &d->cache[psz - 1];
> + cache = &d->smallcache[psz - 1];
> if (cache->length == cache->max) {
> /* use a random slot */
> i = getrbyte(d) % cache->max;
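The slot choice in the unmap() path deserves a note: start at a random
slot, probe at most a quarter of the cache, and look for an empty slot
while below the fill mark, but for an occupied one (a victim to evict)
once above it. Pulled out as a standalone sketch (not the diff itself;
this version also initializes i up front so it stays defined when
nslots / 4 is zero):

	#include <sys/types.h>

	struct bigcache { void *page; size_t psize; };

	static u_short
	bigcache_pick_slot(const struct bigcache *cache, u_short nslots,
	    size_t used, size_t fill, u_short base)
	{
		u_short i = base % nslots, j;

		for (j = 0; j < nslots / 4; j++) {
			i = (base + j) % nslots;
			if (used < fill) {
				if (cache[i].psize == 0)	/* free slot */
					break;
			} else {
				if (cache[i].psize != 0)	/* victim */
					break;
			}
		}
		return i;	/* may be occupied; the caller evicts */
	}
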
> @@ -753,9 +814,10 @@ unmap(struct dir_info *d, void *p, size_
> static void *
> map(struct dir_info *d, size_t sz, int zero_fill)
> {
> - size_t i, psz = sz >> MALLOC_PAGESHIFT;
> + size_t psz = sz >> MALLOC_PAGESHIFT;
> + u_short i;
> void *p;
> - struct cache *cache;
> + struct smallcache *cache;
>
> if (mopts.malloc_canary != (d->canary1 ^ (u_int32_t)(uintptr_t)d) ||
> d->canary1 != ~d->canary2)
> @@ -764,8 +826,35 @@ map(struct dir_info *d, size_t sz, int z
> wrterror(d, "map round");
>
>
> - if (psz <= MAX_CACHEABLE_SIZE && d->cache[psz - 1].max > 0) {
> - cache = &d->cache[psz - 1];
> + if (d->bigcache_size > 0 && psz > MAX_SMALLCACHEABLE_SIZE &&
> + psz <= MAX_BIGCACHEABLE_SIZE) {
> + size_t base = getrbyte(d);
> + size_t cached = d->bigcache_used;
> + ushort j;
> +
> + for (j = 0; j < d->bigcache_size && cached >= psz; j++) {
> + i = (j + base) % d->bigcache_size;
> + if (d->bigcache[i].psize == psz) {
> + p = d->bigcache[i].page;
> + d->bigcache_used -= psz;
> + d->bigcache[i].page = NULL;
> + d->bigcache[i].psize = 0;
> +
> + if (!mopts.malloc_freeunmap)
> + validate_junk(d, p, sz);
> + if (mopts.malloc_freeunmap)
> + mprotect(p, sz, PROT_READ | PROT_WRITE);
> + if (zero_fill)
> + memset(p, 0, sz);
> + else if (mopts.malloc_freeunmap)
> + junk_free(d->malloc_junk, p, sz);
> + return p;
> + }
> + cached -= d->bigcache[i].psize;
> + }
> + }
> + if (psz <= MAX_SMALLCACHEABLE_SIZE && d->smallcache[psz - 1].max > 0) {
> + cache = &d->smallcache[psz - 1];
> if (cache->length > 0) {
> if (cache->length == 1)
> p = cache->pages[--cache->length];
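The early exit in the map() lookup is worth spelling out: cached tracks
how many cached pages remain in the slots not yet visited, so the scan
stops as soon as the rest of the cache cannot possibly hold a region of
the wanted size. As a standalone sketch, minus the junk validation and
mprotect handling of the real code:

	#include <sys/types.h>
	#include <stddef.h>

	struct bigcache { void *page; size_t psize; };

	static void *
	bigcache_find(struct bigcache *cache, u_short nslots, size_t used,
	    size_t psz, u_short base)
	{
		size_t cached = used;	/* pages left in unvisited slots */
		u_short i, j;

		for (j = 0; j < nslots && cached >= psz; j++) {
			i = (base + j) % nslots;
			if (cache[i].psize == psz) {	/* exact fit only */
				void *p = cache[i].page;

				cache[i].page = NULL;
				cache[i].psize = 0;
				return p;
			}
			cached -= cache[i].psize;
		}
		return NULL;
	}
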
> @@ -1224,13 +1313,31 @@ _malloc_init(int from_rthreads)
> if (i == 0) {
> omalloc_poolinit(&d, MAP_CONCEAL);
> d->malloc_junk = 2;
> - for (j = 0; j < MAX_CACHEABLE_SIZE; j++)
> - d->cache[j].max = 0;
> + d->bigcache_size = 0;
> + for (j = 0; j < MAX_SMALLCACHEABLE_SIZE; j++)
> + d->smallcache[j].max = 0;
> } else {
> + size_t sz = 0;
> +
> omalloc_poolinit(&d, 0);
> d->malloc_junk = mopts.def_malloc_junk;
> - for (j = 0; j < MAX_CACHEABLE_SIZE; j++)
> - d->cache[j].max = mopts.def_maxcache >> (j / 8);
> + d->bigcache_size = mopts.def_maxcache;
> + for (j = 0; j < MAX_SMALLCACHEABLE_SIZE; j++) {
> + d->smallcache[j].max =
> + mopts.def_maxcache >> (j / 8);
> + sz += d->smallcache[j].max;
> + }
> + if (sz > 0) {
> + void *p = MMAP(sz * sizeof(void *), 0);
> + if (p == MAP_FAILED)
> + wrterror(NULL,
> + "malloc init mmap failed");
> + for (j = 0; j < MAX_SMALLCACHEABLE_SIZE; j++) {
> + d->smallcache[j].pages = p;
> + p = (char *)p + d->smallcache[j].max *
> + sizeof(void *);
> + }
> + }
> }
> d->mutex = i;
> mopts.malloc_pool[i] = d;
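With pages now a pointer instead of a fixed-size member, _malloc_init()
allocates one block for all the per-size arrays and slices it up. The
same carve-up in isolation, using calloc as a stand-in for libc's
internal MMAP so the sketch compiles on its own:

	#include <stdlib.h>
	#include <sys/types.h>

	struct smallcache { void **pages; ushort length, max; };

	static int
	smallcache_setup(struct smallcache *sc, size_t n, ushort maxcache)
	{
		size_t j, total = 0;
		void **block;

		for (j = 0; j < n; j++) {
			sc[j].max = maxcache >> (j / 8); /* fewer slots for bigger sizes */
			sc[j].length = 0;
			total += sc[j].max;
		}
		if (total == 0)
			return 0;
		if ((block = calloc(total, sizeof(void *))) == NULL)
			return -1;
		for (j = 0; j < n; j++) {	/* consecutive slices */
			sc[j].pages = block;
			block += sc[j].max;
		}
		return 0;
	}
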
> @@ -1399,6 +1506,8 @@ ofree(struct dir_info **argpool, void *p
> }
> STATS_SUB(pool->malloc_guarded, mopts.malloc_guard);
> }
> + if (!check)
> + argsz -= mopts.malloc_guard;
> unmap(pool, p, PAGEROUND(sz), clear ? argsz : 0);
> delete(pool, r);
> } else {
> @@ -2196,15 +2305,23 @@ dump_free_chunk_info(int fd, struct dir_
> static void
> dump_free_page_info(int fd, struct dir_info *d)
> {
> - struct cache *cache;
> + struct smallcache *cache;
> size_t i, total = 0;
>
> - dprintf(fd, "Cached:\n");
> - for (i = 0; i < MAX_CACHEABLE_SIZE; i++) {
> - cache = &d->cache[i];
> + dprintf(fd, "Cached in small cache:\n");
> + for (i = 0; i < MAX_SMALLCACHEABLE_SIZE; i++) {
> + cache = &d->smallcache[i];
> if (cache->length != 0)
> dprintf(fd, "%zu(%u): %u = %zu\n", i + 1, cache->max,
> cache->length, cache->length * (i + 1));
> total += cache->length * (i + 1);
> + }
> +
> + dprintf(fd, "Cached in big cache: %zu/%zu\n", d->bigcache_used,
> + d->bigcache_size);
> + for (i = 0; i < d->bigcache_size; i++) {
> + if (d->bigcache[i].psize != 0)
> + dprintf(fd, "%zu: %zu\n", i, d->bigcache[i].psize);
> + total += d->bigcache[i].psize;
> }
> dprintf(fd, "Free pages cached: %zu\n", total);
> }
>
>