Hi, today I did some performance measurements on OpenBSD-current, with and
without your diff.
I use a custom HTTP proxy. Performance in this case is requests/s and
bandwidth. To measure requests/s I use requests with a very small
response. To measure bandwidth I use big responses. This custom proxy
uses socket splicing to improve the performance for bigger chunks of
data. The following results are an average over multiple test runs.

without your diff: requests/s = 3929, bandwidth in Mbit/s = 1196
with your diff:    requests/s = 4093, bandwidth in Mbit/s = 1428

Just ask if you want more details on the test setup.

BR
Simon

2016-10-27 5:54 GMT+02:00, David Gwynne <[email protected]>:
> On Tue, Oct 25, 2016 at 10:35:45AM +1000, David Gwynne wrote:
>> On Mon, Oct 24, 2016 at 04:24:13PM +1000, David Gwynne wrote:
>> > ive posted this before as part of a much bigger diff, but smaller
>> > is better.
>> >
>> > it basically lets things ask for per-cpu item caches to be enabled
>> > on pools. the most obvious use case for this is the mbuf pools.
>> >
>> > the caches are modelled on whats described in the "Magazines and
>> > Vmem: Extending the Slab Allocator to Many CPUs and Arbitrary
>> > Resources" paper by Jeff Bonwick and Jonathan Adams. pools are
>> > modelled on slabs, which bonwick described in a previous paper.
>> >
>> > the main inspiration the paper provided was around how many objects
>> > to cache on each cpu, and how often to move sets of objects between
>> > the cpu caches and a global list of objects. unlike the paper we
>> > do not care about maintaining constructed objects on the free lists,
>> > so we reuse the objects themselves to build the free list.
>> >
>> > id like to get this in so we can tinker with it in tree. the things
>> > i think we need to tinker with are what poisoning we can get away
>> > with on the per cpu caches, and what limits can we enforce at the
>> > pool level.
>> >
>> > i think poisoning will be relatively simple to add. the limits one
>> > is more challenging because we dont want the pools to have to
>> > coordinate between cpus for every get or put operation. my thought
>> > there was to limit the number of pages that a pool can allocate
>> > from its backend rather than limit the items the pool can provide.
>> > limiting the pages could also be done at a lower level. eg, the
>> > mbuf clusters could share a common backend that limits the pages
>> > the pools can get, rather than have the cluster pools account for
>> > pages separately.
>> >
>> > anyway, either way i would like to get this in so we can work on
>> > this stuff.
>> >
>> > ok?
>>
>> this adds per-cpu caches to the mbuf pools so people can actually
>> try and see if the code works or not.
>
> this fixes a crash hrvoje and i found independently. avoid holding
> a mutex when calling yield().
>
> also some whitespace fixes.
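[As a reading aid for the diff quoted below, here is a minimal user-space
sketch of the "reuse the objects themselves to build the free list" idea:
a small list header is written into the memory of each freed item, so the
per-CPU cache needs no allocations of its own. The names (struct item_list,
struct cpu_cache, cache_put/cache_get) are made up for illustration and are
not the diff's; the real code below layers per-CPU access, hand-off of full
lists to a global depot, and statistics on top of this.]

#include <assert.h>
#include <stdlib.h>

struct item_list {
	struct item_list	*il_next;	/* next free item */
	unsigned long		 il_nitems;	/* items from here down */
};

struct cpu_cache {
	struct item_list	*cc_actv;	/* list of cached free items */
	unsigned long		 cc_nactv;	/* cached length of cc_actv */
};

/* cache a freed item: overlay the list header on the item memory itself */
static void
cache_put(struct cpu_cache *cc, void *v)
{
	struct item_list *il = v;

	il->il_next = cc->cc_actv;
	il->il_nitems = ++cc->cc_nactv;
	cc->cc_actv = il;
}

/* take an item back out: pop the head and hand that memory out again */
static void *
cache_get(struct cpu_cache *cc)
{
	struct item_list *il = cc->cc_actv;

	if (il == NULL)
		return (NULL);

	cc->cc_actv = il->il_next;
	cc->cc_nactv = il->il_nitems - 1;
	return (il);
}

int
main(void)
{
	struct cpu_cache cc = { NULL, 0 };
	void *a = malloc(64), *b = malloc(64);

	/* items must be at least sizeof(struct item_list) bytes */
	cache_put(&cc, a);
	cache_put(&cc, b);
	assert(cache_get(&cc) == b);	/* LIFO: last freed, first reused */
	assert(cache_get(&cc) == a);
	assert(cache_get(&cc) == NULL);

	free(a);
	free(b);
	return (0);
}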
>
> Index: kern/subr_pool.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/subr_pool.c,v
> retrieving revision 1.198
> diff -u -p -r1.198 subr_pool.c
> --- kern/subr_pool.c	15 Sep 2016 02:00:16 -0000	1.198
> +++ kern/subr_pool.c	27 Oct 2016 03:51:10 -0000
> @@ -42,6 +42,7 @@
>  #include <sys/sysctl.h>
>  #include <sys/task.h>
>  #include <sys/timeout.h>
> +#include <sys/percpu.h>
>  
>  #include <uvm/uvm_extern.h>
>  
> @@ -96,6 +97,33 @@ struct pool_item {
>  };
>  #define POOL_IMAGIC(ph, pi)	((u_long)(pi) ^ (ph)->ph_magic)
>  
> +#ifdef MULTIPROCESSOR
> +struct pool_list {
> +	struct pool_list	*pl_next;	/* next in list */
> +	unsigned long		 pl_cookie;
> +	struct pool_list	*pl_nextl;	/* next list */
> +	unsigned long		 pl_nitems;	/* items in list */
> +};
> +
> +struct pool_cache {
> +	struct pool_list	*pc_actv;
> +	unsigned long		 pc_nactv;	/* cache pc_actv nitems */
> +	struct pool_list	*pc_prev;
> +
> +	uint64_t		 pc_gen;	/* generation number */
> +	uint64_t		 pc_gets;
> +	uint64_t		 pc_puts;
> +	uint64_t		 pc_fails;
> +
> +	int			 pc_nout;
> +};
> +
> +void	*pool_cache_get(struct pool *);
> +void	 pool_cache_put(struct pool *, void *);
> +void	 pool_cache_destroy(struct pool *);
> +#endif
> +void	 pool_cache_info(struct pool *, struct kinfo_pool *);
> +
>  #ifdef POOL_DEBUG
>  int pool_debug = 1;
>  #else
> @@ -355,6 +383,11 @@ pool_destroy(struct pool *pp)
>  	struct pool_item_header *ph;
>  	struct pool *prev, *iter;
>  
> +#ifdef MULTIPROCESSOR
> +	if (pp->pr_cache != NULL)
> +		pool_cache_destroy(pp);
> +#endif
> +
>  #ifdef DIAGNOSTIC
>  	if (pp->pr_nout != 0)
>  		panic("%s: pool busy: still out: %u", __func__, pp->pr_nout);
> @@ -421,6 +454,14 @@ pool_get(struct pool *pp, int flags)
>  	void *v = NULL;
>  	int slowdown = 0;
>  
> +#ifdef MULTIPROCESSOR
> +	if (pp->pr_cache != NULL) {
> +		v = pool_cache_get(pp);
> +		if (v != NULL)
> +			goto good;
> +	}
> +#endif
> +
>  	KASSERT(flags & (PR_WAITOK | PR_NOWAIT));
>  
>  	mtx_enter(&pp->pr_mtx);
> @@ -453,6 +494,9 @@
>  		v = mem.v;
>  	}
>  
> +#ifdef MULTIPROCESSOR
> +good:
> +#endif
>  	if (ISSET(flags, PR_ZERO))
>  		memset(v, 0, pp->pr_size);
>  
> @@ -631,6 +675,13 @@ pool_put(struct pool *pp, void *v)
>  		panic("%s: NULL item", __func__);
>  #endif
>  
> +#ifdef MULTIPROCESSOR
> +	if (pp->pr_cache != NULL && TAILQ_EMPTY(&pp->pr_requests)) {
> +		pool_cache_put(pp, v);
> +		return;
> +	}
> +#endif
> +
>  	mtx_enter(&pp->pr_mtx);
>  
>  	splassert(pp->pr_ipl);
> @@ -1333,6 +1384,8 @@ sysctl_dopool(int *name, u_int namelen,
>  		pi.pr_nidle = pp->pr_nidle;
>  		mtx_leave(&pp->pr_mtx);
>  
> +		pool_cache_info(pp, &pi);
> +
>  		rv = sysctl_rdstruct(oldp, oldlenp, NULL, &pi, sizeof(pi));
>  		break;
>  	}
> @@ -1499,3 +1552,265 @@ pool_multi_free_ni(struct pool *pp, void
>  	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
>  	KERNEL_UNLOCK();
>  }
> +
> +#ifdef MULTIPROCESSOR
> +
> +struct pool pool_caches; /* per cpu cache entries */
> +
> +void
> +pool_cache_init(struct pool *pp)
> +{
> +	struct cpumem *cm;
> +	struct pool_cache *pc;
> +	struct cpumem_iter i;
> +
> +	if (pool_caches.pr_size == 0) {
> +		pool_init(&pool_caches, sizeof(struct pool_cache), 64,
> +		    IPL_NONE, PR_WAITOK, "plcache", NULL);
> +	}
> +
> +	KASSERT(pp->pr_size >= sizeof(*pc));
> +
> +	cm = cpumem_get(&pool_caches);
> +
> +	mtx_init(&pp->pr_cache_mtx, pp->pr_ipl);
> +	pp->pr_cache_list = NULL;
> +	pp->pr_cache_nlist = 0;
> +	pp->pr_cache_items = 8;
> +	pp->pr_cache_contention = 0;
> +
> +	CPUMEM_FOREACH(pc, &i, cm) {
> +		pc->pc_actv = NULL;
> +		pc->pc_nactv = 0;
> +		pc->pc_prev = NULL;
> +
> +		pc->pc_gets = 0;
> +		pc->pc_puts = 0;
> +		pc->pc_fails = 0;
> +		pc->pc_nout = 0;
> +	}
> +
> +	pp->pr_cache = cm;
> +}
> +
> +static inline void
> +pool_list_enter(struct pool *pp)
> +{
> +	if (mtx_enter_try(&pp->pr_cache_mtx) == 0) {
> +		mtx_enter(&pp->pr_cache_mtx);
> +		pp->pr_cache_contention++;
> +	}
> +}
> +
> +static inline void
> +pool_list_leave(struct pool *pp)
> +{
> +	mtx_leave(&pp->pr_cache_mtx);
> +}
> +
> +static inline struct pool_list *
> +pool_list_alloc(struct pool *pp, struct pool_cache *pc)
> +{
> +	struct pool_list *pl;
> +
> +	pool_list_enter(pp);
> +	pl = pp->pr_cache_list;
> +	if (pl != NULL) {
> +		pp->pr_cache_list = pl->pl_nextl;
> +		pp->pr_cache_nlist--;
> +	}
> +
> +	pp->pr_cache_nout += pc->pc_nout;
> +	pc->pc_nout = 0;
> +	pool_list_leave(pp);
> +
> +	return (pl);
> +}
> +
> +static inline void
> +pool_list_free(struct pool *pp, struct pool_cache *pc, struct pool_list *pl)
> +{
> +	pool_list_enter(pp);
> +	pl->pl_nextl = pp->pr_cache_list;
> +	pp->pr_cache_list = pl;
> +	pp->pr_cache_nlist++;
> +
> +	pp->pr_cache_nout += pc->pc_nout;
> +	pc->pc_nout = 0;
> +	pool_list_leave(pp);
> +}
> +
> +static inline struct pool_cache *
> +pool_cache_enter(struct pool *pp, int *s)
> +{
> +	struct pool_cache *pc;
> +
> +	pc = cpumem_enter(pp->pr_cache);
> +	*s = splraise(pp->pr_ipl);
> +	pc->pc_gen++;
> +
> +	return (pc);
> +}
> +
> +static inline void
> +pool_cache_leave(struct pool *pp, struct pool_cache *pc, int s)
> +{
> +	pc->pc_gen++;
> +	splx(s);
> +	cpumem_leave(pp->pr_cache, pc);
> +}
> +
> +void *
> +pool_cache_get(struct pool *pp)
> +{
> +	struct pool_cache *pc;
> +	struct pool_list *pl;
> +	int s;
> +
> +	pc = pool_cache_enter(pp, &s);
> +
> +	if (pc->pc_actv != NULL) {
> +		pl = pc->pc_actv;
> +	} else if (pc->pc_prev != NULL) {
> +		pl = pc->pc_prev;
> +		pc->pc_prev = NULL;
> +	} else if ((pl = pool_list_alloc(pp, pc)) == NULL) {
> +		pc->pc_fails++;
> +		goto done;
> +	}
> +
> +	pc->pc_actv = pl->pl_next;
> +	pc->pc_nactv = pl->pl_nitems - 1;
> +	pc->pc_gets++;
> +	pc->pc_nout++;
> +done:
> +	pool_cache_leave(pp, pc, s);
> +
> +	return (pl);
> +}
> +
> +void
> +pool_cache_put(struct pool *pp, void *v)
> +{
> +	struct pool_cache *pc;
> +	struct pool_list *pl = v;
> +	unsigned long cache_items = pp->pr_cache_items;
> +	unsigned long nitems;
> +	int s;
> +
> +	pc = pool_cache_enter(pp, &s);
> +
> +	nitems = pc->pc_nactv;
> +	if (nitems >= cache_items) {
> +		if (pc->pc_prev != NULL)
> +			pool_list_free(pp, pc, pc->pc_prev);
> +
> +		pc->pc_prev = pc->pc_actv;
> +
> +		pc->pc_actv = NULL;
> +		pc->pc_nactv = 0;
> +		nitems = 0;
> +	}
> +
> +	pl->pl_next = pc->pc_actv;
> +	pl->pl_nitems = ++nitems;
> +
> +	pc->pc_actv = pl;
> +	pc->pc_nactv = nitems;
> +
> +	pc->pc_puts++;
> +	pc->pc_nout--;
> +
> +	pool_cache_leave(pp, pc, s);
> +}
> +
> +struct pool_list *
> +pool_list_put(struct pool *pp, struct pool_list *pl)
> +{
> +	struct pool_list *rpl, *npl;
> +
> +	if (pl == NULL)
> +		return (NULL);
> +
> +	rpl = (struct pool_list *)pl->pl_next;
> +
> +	do {
> +		npl = pl->pl_next;
> +		pool_put(pp, pl);
> +		pl = npl;
> +	} while (pl != NULL);
> +
> +	return (rpl);
> +}
> +
> +void
> +pool_cache_destroy(struct pool *pp)
> +{
> +	struct pool_cache *pc;
> +	struct pool_list *pl;
> +	struct cpumem_iter i;
> +	struct cpumem *cm;
> +
> +	cm = pp->pr_cache;
> +	pp->pr_cache = NULL; /* make pool_put avoid the cache */
> +
> +	CPUMEM_FOREACH(pc, &i, cm) {
> +		pool_list_put(pp, pc->pc_actv);
> +		pool_list_put(pp, pc->pc_prev);
> +	}
> +
> +	cpumem_put(&pool_caches, cm);
> +
> +	pl = pp->pr_cache_list;
> +	while (pl != NULL)
> +		pl = pool_list_put(pp, pl);
> +}
> +
> +void
> +pool_cache_info(struct pool *pp, struct kinfo_pool *pi)
> +{
> +	struct pool_cache *pc;
> +	struct cpumem_iter i;
> +
> +	if (pp->pr_cache == NULL)
> +		return;
> +
> +	/* loop through the caches twice to collect stats */
> +
> +	/* once without the mtx so we can yield while reading nget/nput */
> +	CPUMEM_FOREACH(pc, &i, pp->pr_cache) {
> +		uint64_t gen, nget, nput;
> +
> +		do {
> +			while ((gen = pc->pc_gen) & 1)
> +				yield();
> +
> +			nget = pc->pc_gets;
> +			nput = pc->pc_puts;
> +		} while (gen != pc->pc_gen);
> +
> +		pi->pr_nget += nget;
> +		pi->pr_nput += nput;
> +	}
> +
> +	/* and once with the mtx so we can get consistent nout values */
> +	mtx_enter(&pp->pr_cache_mtx);
> +	CPUMEM_FOREACH(pc, &i, pp->pr_cache)
> +		pi->pr_nout += pc->pc_nout;
> +
> +	pi->pr_nout += pp->pr_cache_nout;
> +	mtx_leave(&pp->pr_cache_mtx);
> +}
> +#else /* MULTIPROCESSOR */
> +void
> +pool_cache_init(struct pool *pp)
> +{
> +	/* nop */
> +}
> +
> +void
> +pool_cache_info(struct pool *pp, struct kinfo_pool *pi)
> +{
> +	/* nop */
> +}
> +#endif /* MULTIPROCESSOR */
> Index: kern/uipc_mbuf.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
> retrieving revision 1.237
> diff -u -p -r1.237 uipc_mbuf.c
> --- kern/uipc_mbuf.c	27 Oct 2016 03:29:55 -0000	1.237
> +++ kern/uipc_mbuf.c	27 Oct 2016 03:51:10 -0000
> @@ -186,7 +186,15 @@ mbinit(void)
>  void
>  mbcpuinit()
>  {
> +	int i;
> +
>  	mbstat = counters_alloc_ncpus(mbstat, MBSTAT_COUNT, M_DEVBUF);
> +
> +	pool_cache_init(&mbpool);
> +	pool_cache_init(&mtagpool);
> +
> +	for (i = 0; i < nitems(mclsizes); i++)
> +		pool_cache_init(&mclpools[i]);
>  }
>  
>  void
> Index: sys/pool.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/pool.h,v
> retrieving revision 1.63
> diff -u -p -r1.63 pool.h
> --- sys/pool.h	15 Sep 2016 02:00:16 -0000	1.63
> +++ sys/pool.h	27 Oct 2016 03:51:10 -0000
> @@ -84,6 +84,9 @@ struct pool_allocator {
>  
>  TAILQ_HEAD(pool_pagelist, pool_item_header);
>  
> +struct pool_list;
> +struct cpumem;
> +
>  struct pool {
>  	struct mutex	pr_mtx;
>  	SIMPLEQ_ENTRY(pool)
> @@ -124,6 +127,15 @@ struct pool {
>  	RBT_HEAD(phtree, pool_item_header)
>  			pr_phtree;
>  
> +	struct cpumem *	pr_cache;
> +	struct mutex	pr_cache_mtx;
> +	struct pool_list *
> +			pr_cache_list;
> +	u_int		pr_cache_nlist;
> +	u_int		pr_cache_items;
> +	u_int		pr_cache_contention;
> +	int		pr_cache_nout;
> +
>  	u_int		pr_align;
>  	u_int		pr_maxcolors;	/* Cache coloring */
>  	int		pr_phoffset;	/* Offset in page of page header */
> @@ -175,6 +187,7 @@ struct pool_request {
>  
>  void		pool_init(struct pool *, size_t, u_int, int, int,
>  		    const char *, struct pool_allocator *);
> +void		pool_cache_init(struct pool *);
>  void		pool_destroy(struct pool *);
>  void		pool_setlowat(struct pool *, int);
>  void		pool_sethiwat(struct pool *, int);
>
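[For completeness, this is roughly how a subsystem would opt a pool into the
per-CPU caches with the diff above applied, mirroring what the uipc_mbuf.c
hunk does in mbcpuinit(): set the pool up with pool_init() as before, then
call pool_cache_init() once per-CPU memory is available; pool_get()/pool_put()
callers stay unchanged. The example pool (ex_pool/"expool") and its size and
IPL are made up for illustration.]

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pool.h>

struct pool ex_pool;	/* hypothetical example pool */

void
ex_init(void)
{
	/* ordinary pool setup; size, align and ipl here are arbitrary */
	pool_init(&ex_pool, 256, 0, IPL_NONE, PR_WAITOK, "expool", NULL);

	/*
	 * opt in to the per-cpu item caches. the diff does this for the
	 * mbuf pools from mbcpuinit(), i.e. after per-cpu memory is set
	 * up, and the item size must be big enough to hold the free list
	 * header the cache writes into free items.
	 */
	pool_cache_init(&ex_pool);
}

void *
ex_get(void)
{
	/*
	 * unchanged for callers: served from the local cpu's cache when
	 * it has items, otherwise it falls through to the pool proper.
	 */
	return (pool_get(&ex_pool, PR_NOWAIT));
}

void
ex_put(void *v)
{
	/* normally just pushed onto the local cpu's free list */
	pool_put(&ex_pool, v);
}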
