Hi, today I did some performance measurements on OpenBSD-current, with and
without your diff.
I use a custom HTTP proxy. Performance in this case is requests/s and
bandwidth. To measure requests/s I use requests with a very small
response. To measure bandwidth I use big responses. This custom proxy
uses socket splicing to improve the performance for bigger chunks of
data. The following results are an average over multiple test runs.

without your diff: requests/s = 3929, bandwidth in Mbit/s = 1196
with your diff:    requests/s = 4093, bandwidth in Mbit/s = 1428

Just ask if you want more details on the test setup.

BR
Simon

2016-10-27 5:54 GMT+02:00, David Gwynne <[email protected]>:
> On Tue, Oct 25, 2016 at 10:35:45AM +1000, David Gwynne wrote:
>> On Mon, Oct 24, 2016 at 04:24:13PM +1000, David Gwynne wrote:
>> > ive posted this before as part of a much bigger diff, but smaller
>> > is better.
>> >
>> > it basically lets things ask for per-cpu item caches to be enabled
>> > on pools. the most obvious use case for this is the mbuf pools.
>> >
>> > the caches are modelled on whats described in the "Magazines and
>> > Vmem: Extending the Slab Allocator to Many CPUs and Arbitrary
>> > Resources" paper by Jeff Bonwick and Jonathan Adams. pools are
>> > modelled on slabs, which bonwick described in a previous paper.
>> >
>> > the main inspiration the paper provided was around how many objects
>> > to cache on each cpu, and how often to move sets of objects between
>> > the cpu caches and a global list of objects. unlike the paper we
>> > do not care about maintaining constructed objects on the free lists,
>> > so we reuse the objects themselves to build the free list.
>> >
>> > id like to get this in so we can tinker with it in tree. the things
>> > i think we need to tinker with are what poisoning we can get away
>> > with on the per cpu caches, and what limits can we enforce at the
>> > pool level.
>> >
>> > i think poisoning will be relatively simple to add. the limits one
>> > is more challenging because we dont want the pools to have to
>> > coordinate between cpus for every get or put operation. my thought
>> > there was to limit the number of pages that a pool can allocate
>> > from its backend rather than limit the items the pool can provide.
>> > limiting the pages could also be done at a lower level. eg, the
>> > mbuf clusters could share a common backend that limits the pages
>> > the pools can get, rather than have the cluster pools account for
>> > pages separately.
>> >
>> > anyway, either way i would like to get this in so we can work on
>> > this stuff.
>> >
>> > ok?
>>
>> this adds per-cpu caches to the mbuf pools so people can actually
>> try and see if the code works or not.
>
> this fixes a crash hrvoje and i found independently. avoid holding
> a mutex when calling yield().
>
> also some whitespace fixes.
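[As a reading aid for the diff quoted below, here is a minimal user-space
sketch of the "reuse the objects themselves to build the free list" idea:
a small list header is written into the memory of each freed item, so the
per-CPU cache needs no allocations of its own. The names (struct item_list,
struct cpu_cache, cache_put/cache_get) are made up for illustration and are
not the diff's; the real code below layers per-CPU access, hand-off of full
lists to a global depot, and statistics on top of this.]

#include <assert.h>
#include <stdlib.h>

struct item_list {
	struct item_list	*il_next;	/* next free item */
	unsigned long		 il_nitems;	/* items from here down */
};

struct cpu_cache {
	struct item_list	*cc_actv;	/* list of cached free items */
	unsigned long		 cc_nactv;	/* cached length of cc_actv */
};

/* cache a freed item: overlay the list header on the item memory itself */
static void
cache_put(struct cpu_cache *cc, void *v)
{
	struct item_list *il = v;

	il->il_next = cc->cc_actv;
	il->il_nitems = ++cc->cc_nactv;
	cc->cc_actv = il;
}

/* take an item back out: pop the head and hand that memory out again */
static void *
cache_get(struct cpu_cache *cc)
{
	struct item_list *il = cc->cc_actv;

	if (il == NULL)
		return (NULL);

	cc->cc_actv = il->il_next;
	cc->cc_nactv = il->il_nitems - 1;
	return (il);
}

int
main(void)
{
	struct cpu_cache cc = { NULL, 0 };
	void *a = malloc(64), *b = malloc(64);

	/* items must be at least sizeof(struct item_list) bytes */
	cache_put(&cc, a);
	cache_put(&cc, b);
	assert(cache_get(&cc) == b);	/* LIFO: last freed, first reused */
	assert(cache_get(&cc) == a);
	assert(cache_get(&cc) == NULL);

	free(a);
	free(b);
	return (0);
}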
>
> Index: kern/subr_pool.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/subr_pool.c,v
> retrieving revision 1.198
> diff -u -p -r1.198 subr_pool.c
> --- kern/subr_pool.c	15 Sep 2016 02:00:16 -0000	1.198
> +++ kern/subr_pool.c	27 Oct 2016 03:51:10 -0000
> @@ -42,6 +42,7 @@
>  #include <sys/sysctl.h>
>  #include <sys/task.h>
>  #include <sys/timeout.h>
> +#include <sys/percpu.h>
>  
>  #include <uvm/uvm_extern.h>
>  
> @@ -96,6 +97,33 @@ struct pool_item {
>  };
>  #define POOL_IMAGIC(ph, pi)	((u_long)(pi) ^ (ph)->ph_magic)
>  
> +#ifdef MULTIPROCESSOR
> +struct pool_list {
> +	struct pool_list	*pl_next;	/* next in list */
> +	unsigned long		 pl_cookie;
> +	struct pool_list	*pl_nextl;	/* next list */
> +	unsigned long		 pl_nitems;	/* items in list */
> +};
> +
> +struct pool_cache {
> +	struct pool_list	*pc_actv;
> +	unsigned long		 pc_nactv;	/* cache pc_actv nitems */
> +	struct pool_list	*pc_prev;
> +
> +	uint64_t		 pc_gen;	/* generation number */
> +	uint64_t		 pc_gets;
> +	uint64_t		 pc_puts;
> +	uint64_t		 pc_fails;
> +
> +	int			 pc_nout;
> +};
> +
> +void	*pool_cache_get(struct pool *);
> +void	 pool_cache_put(struct pool *, void *);
> +void	 pool_cache_destroy(struct pool *);
> +#endif
> +void	 pool_cache_info(struct pool *, struct kinfo_pool *);
> +
>  #ifdef POOL_DEBUG
>  int pool_debug = 1;
>  #else
> @@ -355,6 +383,11 @@ pool_destroy(struct pool *pp)
>  	struct pool_item_header *ph;
>  	struct pool *prev, *iter;
>  
> +#ifdef MULTIPROCESSOR
> +	if (pp->pr_cache != NULL)
> +		pool_cache_destroy(pp);
> +#endif
> +
>  #ifdef DIAGNOSTIC
>  	if (pp->pr_nout != 0)
>  		panic("%s: pool busy: still out: %u", __func__, pp->pr_nout);
> @@ -421,6 +454,14 @@ pool_get(struct pool *pp, int flags)
>  	void *v = NULL;
>  	int slowdown = 0;
>  
> +#ifdef MULTIPROCESSOR
> +	if (pp->pr_cache != NULL) {
> +		v = pool_cache_get(pp);
> +		if (v != NULL)
> +			goto good;
> +	}
> +#endif
> +
>  	KASSERT(flags & (PR_WAITOK | PR_NOWAIT));
>  
>  	mtx_enter(&pp->pr_mtx);
> @@ -453,6 +494,9 @@
>  		v = mem.v;
>  	}
>  
> +#ifdef MULTIPROCESSOR
> +good:
> +#endif
>  	if (ISSET(flags, PR_ZERO))
>  		memset(v, 0, pp->pr_size);
>  
> @@ -631,6 +675,13 @@ pool_put(struct pool *pp, void *v)
>  		panic("%s: NULL item", __func__);
>  #endif
>  
> +#ifdef MULTIPROCESSOR
> +	if (pp->pr_cache != NULL && TAILQ_EMPTY(&pp->pr_requests)) {
> +		pool_cache_put(pp, v);
> +		return;
> +	}
> +#endif
> +
>  	mtx_enter(&pp->pr_mtx);
>  
>  	splassert(pp->pr_ipl);
> @@ -1333,6 +1384,8 @@ sysctl_dopool(int *name, u_int namelen,
>  		pi.pr_nidle = pp->pr_nidle;
>  		mtx_leave(&pp->pr_mtx);
>  
> +		pool_cache_info(pp, &pi);
> +
>  		rv = sysctl_rdstruct(oldp, oldlenp, NULL, &pi, sizeof(pi));
>  		break;
>  	}
> @@ -1499,3 +1552,265 @@ pool_multi_free_ni(struct pool *pp, void
>  	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
>  	KERNEL_UNLOCK();
>  }
> +
> +#ifdef MULTIPROCESSOR
> +
> +struct pool pool_caches; /* per cpu cache entries */
> +
> +void
> +pool_cache_init(struct pool *pp)
> +{
> +	struct cpumem *cm;
> +	struct pool_cache *pc;
> +	struct cpumem_iter i;
> +
> +	if (pool_caches.pr_size == 0) {
> +		pool_init(&pool_caches, sizeof(struct pool_cache), 64,
> +		    IPL_NONE, PR_WAITOK, "plcache", NULL);
> +	}
> +
> +	KASSERT(pp->pr_size >= sizeof(*pc));
> +
> +	cm = cpumem_get(&pool_caches);
> +
> +	mtx_init(&pp->pr_cache_mtx, pp->pr_ipl);
> +	pp->pr_cache_list = NULL;
> +	pp->pr_cache_nlist = 0;
> +	pp->pr_cache_items = 8;
> +	pp->pr_cache_contention = 0;
> +
> +	CPUMEM_FOREACH(pc, &i, cm) {
> +		pc->pc_actv = NULL;
> +		pc->pc_nactv = 0;
> +		pc->pc_prev = NULL;
> +
> +		pc->pc_gets = 0;
> +		pc->pc_puts = 0;
> +		pc->pc_fails = 0;
> +		pc->pc_nout = 0;
> +	}
> +
> +	pp->pr_cache = cm;
> +}
> +
> +static inline void
> +pool_list_enter(struct pool *pp)
> +{
> +	if (mtx_enter_try(&pp->pr_cache_mtx) == 0) {
> +		mtx_enter(&pp->pr_cache_mtx);
> +		pp->pr_cache_contention++;
> +	}
> +}
> +
> +static inline void
> +pool_list_leave(struct pool *pp)
> +{
> +	mtx_leave(&pp->pr_cache_mtx);
> +}
> +
> +static inline struct pool_list *
> +pool_list_alloc(struct pool *pp, struct pool_cache *pc)
> +{
> +	struct pool_list *pl;
> +
> +	pool_list_enter(pp);
> +	pl = pp->pr_cache_list;
> +	if (pl != NULL) {
> +		pp->pr_cache_list = pl->pl_nextl;
> +		pp->pr_cache_nlist--;
> +	}
> +
> +	pp->pr_cache_nout += pc->pc_nout;
> +	pc->pc_nout = 0;
> +	pool_list_leave(pp);
> +
> +	return (pl);
> +}
> +
> +static inline void
> +pool_list_free(struct pool *pp, struct pool_cache *pc, struct pool_list *pl)
> +{
> +	pool_list_enter(pp);
> +	pl->pl_nextl = pp->pr_cache_list;
> +	pp->pr_cache_list = pl;
> +	pp->pr_cache_nlist++;
> +
> +	pp->pr_cache_nout += pc->pc_nout;
> +	pc->pc_nout = 0;
> +	pool_list_leave(pp);
> +}
> +
> +static inline struct pool_cache *
> +pool_cache_enter(struct pool *pp, int *s)
> +{
> +	struct pool_cache *pc;
> +
> +	pc = cpumem_enter(pp->pr_cache);
> +	*s = splraise(pp->pr_ipl);
> +	pc->pc_gen++;
> +
> +	return (pc);
> +}
> +
> +static inline void
> +pool_cache_leave(struct pool *pp, struct pool_cache *pc, int s)
> +{
> +	pc->pc_gen++;
> +	splx(s);
> +	cpumem_leave(pp->pr_cache, pc);
> +}
> +
> +void *
> +pool_cache_get(struct pool *pp)
> +{
> +	struct pool_cache *pc;
> +	struct pool_list *pl;
> +	int s;
> +
> +	pc = pool_cache_enter(pp, &s);
> +
> +	if (pc->pc_actv != NULL) {
> +		pl = pc->pc_actv;
> +	} else if (pc->pc_prev != NULL) {
> +		pl = pc->pc_prev;
> +		pc->pc_prev = NULL;
> +	} else if ((pl = pool_list_alloc(pp, pc)) == NULL) {
> +		pc->pc_fails++;
> +		goto done;
> +	}
> +
> +	pc->pc_actv = pl->pl_next;
> +	pc->pc_nactv = pl->pl_nitems - 1;
> +	pc->pc_gets++;
> +	pc->pc_nout++;
> +done:
> +	pool_cache_leave(pp, pc, s);
> +
> +	return (pl);
> +}
> +
> +void
> +pool_cache_put(struct pool *pp, void *v)
> +{
> +	struct pool_cache *pc;
> +	struct pool_list *pl = v;
> +	unsigned long cache_items = pp->pr_cache_items;
> +	unsigned long nitems;
> +	int s;
> +
> +	pc = pool_cache_enter(pp, &s);
> +
> +	nitems = pc->pc_nactv;
> +	if (nitems >= cache_items) {
> +		if (pc->pc_prev != NULL)
> +			pool_list_free(pp, pc, pc->pc_prev);
> +
> +		pc->pc_prev = pc->pc_actv;
> +
> +		pc->pc_actv = NULL;
> +		pc->pc_nactv = 0;
> +		nitems = 0;
> +	}
> +
> +	pl->pl_next = pc->pc_actv;
> +	pl->pl_nitems = ++nitems;
> +
> +	pc->pc_actv = pl;
> +	pc->pc_nactv = nitems;
> +
> +	pc->pc_puts++;
> +	pc->pc_nout--;
> +
> +	pool_cache_leave(pp, pc, s);
> +}
> +
> +struct pool_list *
> +pool_list_put(struct pool *pp, struct pool_list *pl)
> +{
> +	struct pool_list *rpl, *npl;
> +
> +	if (pl == NULL)
> +		return (NULL);
> +
> +	rpl = (struct pool_list *)pl->pl_next;
> +
> +	do {
> +		npl = pl->pl_next;
> +		pool_put(pp, pl);
> +		pl = npl;
> +	} while (pl != NULL);
> +
> +	return (rpl);
> +}
> +
> +void
> +pool_cache_destroy(struct pool *pp)
> +{
> +	struct pool_cache *pc;
> +	struct pool_list *pl;
> +	struct cpumem_iter i;
> +	struct cpumem *cm;
> +
> +	cm = pp->pr_cache;
> +	pp->pr_cache = NULL; /* make pool_put avoid the cache */
> +
> +	CPUMEM_FOREACH(pc, &i, cm) {
> +		pool_list_put(pp, pc->pc_actv);
> +		pool_list_put(pp, pc->pc_prev);
> +	}
> +
> +	cpumem_put(&pool_caches, cm);
> +
> +	pl = pp->pr_cache_list;
> +	while (pl != NULL)
> +		pl = pool_list_put(pp, pl);
> +}
> +
> +void
> +pool_cache_info(struct pool *pp, struct kinfo_pool *pi)
> +{
> +	struct pool_cache *pc;
> +	struct cpumem_iter i;
> +
> +	if (pp->pr_cache == NULL)
> +		return;
> +
> +	/* loop through the caches twice to collect stats */
> +
> +	/* once without the mtx so we can yield while reading nget/nput */
> +	CPUMEM_FOREACH(pc, &i, pp->pr_cache) {
> +		uint64_t gen, nget, nput;
> +
> +		do {
> +			while ((gen = pc->pc_gen) & 1)
> +				yield();
> +
> +			nget = pc->pc_gets;
> +			nput = pc->pc_puts;
> +		} while (gen != pc->pc_gen);
> +
> +		pi->pr_nget += nget;
> +		pi->pr_nput += nput;
> +	}
> +
> +	/* and once with the mtx so we can get consistent nout values */
> +	mtx_enter(&pp->pr_cache_mtx);
> +	CPUMEM_FOREACH(pc, &i, pp->pr_cache)
> +		pi->pr_nout += pc->pc_nout;
> +
> +	pi->pr_nout += pp->pr_cache_nout;
> +	mtx_leave(&pp->pr_cache_mtx);
> +}
> +#else /* MULTIPROCESSOR */
> +void
> +pool_cache_init(struct pool *pp)
> +{
> +	/* nop */
> +}
> +
> +void
> +pool_cache_info(struct pool *pp, struct kinfo_pool *pi)
> +{
> +	/* nop */
> +}
> +#endif /* MULTIPROCESSOR */
> Index: kern/uipc_mbuf.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
> retrieving revision 1.237
> diff -u -p -r1.237 uipc_mbuf.c
> --- kern/uipc_mbuf.c	27 Oct 2016 03:29:55 -0000	1.237
> +++ kern/uipc_mbuf.c	27 Oct 2016 03:51:10 -0000
> @@ -186,7 +186,15 @@ mbinit(void)
>  void
>  mbcpuinit()
>  {
> +	int i;
> +
>  	mbstat = counters_alloc_ncpus(mbstat, MBSTAT_COUNT, M_DEVBUF);
> +
> +	pool_cache_init(&mbpool);
> +	pool_cache_init(&mtagpool);
> +
> +	for (i = 0; i < nitems(mclsizes); i++)
> +		pool_cache_init(&mclpools[i]);
>  }
>  
>  void
> Index: sys/pool.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/pool.h,v
> retrieving revision 1.63
> diff -u -p -r1.63 pool.h
> --- sys/pool.h	15 Sep 2016 02:00:16 -0000	1.63
> +++ sys/pool.h	27 Oct 2016 03:51:10 -0000
> @@ -84,6 +84,9 @@ struct pool_allocator {
>  
>  TAILQ_HEAD(pool_pagelist, pool_item_header);
>  
> +struct pool_list;
> +struct cpumem;
> +
>  struct pool {
>  	struct mutex	pr_mtx;
>  	SIMPLEQ_ENTRY(pool)
> @@ -124,6 +127,15 @@ struct pool {
>  	RBT_HEAD(phtree, pool_item_header)
>  			pr_phtree;
>  
> +	struct cpumem *	pr_cache;
> +	struct mutex	pr_cache_mtx;
> +	struct pool_list *
> +			pr_cache_list;
> +	u_int		pr_cache_nlist;
> +	u_int		pr_cache_items;
> +	u_int		pr_cache_contention;
> +	int		pr_cache_nout;
> +
>  	u_int		pr_align;
>  	u_int		pr_maxcolors;	/* Cache coloring */
>  	int		pr_phoffset;	/* Offset in page of page header */
> @@ -175,6 +187,7 @@ struct pool_request {
>  
>  void		pool_init(struct pool *, size_t, u_int, int, int,
>  		    const char *, struct pool_allocator *);
> +void		pool_cache_init(struct pool *);
>  void		pool_destroy(struct pool *);
>  void		pool_setlowat(struct pool *, int);
>  void		pool_sethiwat(struct pool *, int);
>
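[For completeness, this is roughly how a subsystem would opt a pool into the
per-CPU caches with the diff above applied, mirroring what the uipc_mbuf.c
hunk does in mbcpuinit(): set the pool up with pool_init() as before, then
call pool_cache_init() once per-CPU memory is available; pool_get()/pool_put()
callers stay unchanged. The example pool (ex_pool/"expool") and its size and
IPL are made up for illustration.]

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pool.h>

struct pool ex_pool;	/* hypothetical example pool */

void
ex_init(void)
{
	/* ordinary pool setup; size, align and ipl here are arbitrary */
	pool_init(&ex_pool, 256, 0, IPL_NONE, PR_WAITOK, "expool", NULL);

	/*
	 * opt in to the per-cpu item caches. the diff does this for the
	 * mbuf pools from mbcpuinit(), i.e. after per-cpu memory is set
	 * up, and the item size must be big enough to hold the free list
	 * header the cache writes into free items.
	 */
	pool_cache_init(&ex_pool);
}

void *
ex_get(void)
{
	/*
	 * unchanged for callers: served from the local cpu's cache when
	 * it has items, otherwise it falls through to the pool proper.
	 */
	return (pool_get(&ex_pool, PR_NOWAIT));
}

void
ex_put(void *v)
{
	/* normally just pushed onto the local cpu's free list */
	pool_put(&ex_pool, v);
}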
