> On 22 Nov 2016, at 21:20, Mark Kettenis <[email protected]> wrote:
>
>> Date: Tue, 22 Nov 2016 12:42:39 +1000
>> From: David Gwynne <[email protected]>
>>
>> right now pools that make up mbufs are each limited individually.
>>
>> the following diff instead has the mbuf layer have a global limit
>> on the amount of memory that can be allocated to the pools. this
>> is enforced by wrapping the multi page pool allocator with something
>> that checks the mbuf memory limit first.
>>
>> this means all mbufs will use a max of 2k * nmbclust bytes instead
>> of each pool being able to use that amount each.
>>
>> ok?
>
> Mostly makes sense to me. Not sure the complexity of copying the
> supported page sizes from the multi-page pool allocator is worth the
> additional complication. I'd probably just initialize it the same way
> using POOL_ALLOC_SIZES(PAGE_SIZE, 1UL<<31, POOL_ALLOC_ALIGNED).
id have to remember to change both if the one in subr_pool gets changed though.
> Wouldn't it make sense to use atomic operations to keep track of the
> amount of memory that was allocated?
mutexes are full of atomic ops. to conditionally add to mbuf_mem_alloc id have
to use a cas loop, which isnt easy to read. eg, instead of:
if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit)
return (NULL);
mtx_enter(&m_pool_mtx);
if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit)
avail = 0;
else
mbuf_mem_alloc += pp->pr_pgsize;
mtx_leave(&m_pool_mtx);
id need:
unsigned int cur, new;
cur = mbuf_mem_alloc;
for (;;) {
new = cur + pp->pr_pgsize;
if (new > mbuf_mem_limit)
return (NULL);
new = atomic_cas_uint(&mbuf_mem_alloc, cur, new);
if (new == cur)
break;
cur = new;
}
or something.
the mtx may be slow, but it is a slow path id like to keep simple.
>
> Long run I suppose we want to drop nmbclust and let users tune the
> total amount of memory available for clusters and set the initial
> amount to a percentage of physical memory?
i do want to move to specifying memory in terms of bytes instead of 2k clusters.
how much should be available by default is a complicated issue.
>
>
>> Index: sys/pool.h
>> ===================================================================
>> RCS file: /cvs/src/sys/sys/pool.h,v
>> retrieving revision 1.68
>> diff -u -p -r1.68 pool.h
>> --- sys/pool.h 21 Nov 2016 01:44:06 -0000 1.68
>> +++ sys/pool.h 22 Nov 2016 02:31:47 -0000
>> @@ -205,6 +205,7 @@ struct pool {
>> #ifdef _KERNEL
>>
>> extern struct pool_allocator pool_allocator_single;
>> +extern struct pool_allocator pool_allocator_multi;
>>
>> struct pool_request {
>> TAILQ_ENTRY(pool_request) pr_entry;
>> Index: sys/mbuf.h
>> ===================================================================
>> RCS file: /cvs/src/sys/sys/mbuf.h,v
>> retrieving revision 1.222
>> diff -u -p -r1.222 mbuf.h
>> --- sys/mbuf.h 24 Oct 2016 04:38:44 -0000 1.222
>> +++ sys/mbuf.h 22 Nov 2016 02:31:47 -0000
>> @@ -416,6 +416,7 @@ struct mbuf_queue {
>> };
>>
>> #ifdef _KERNEL
>> +struct pool;
>>
>> extern int nmbclust; /* limit on the # of clusters */
>> extern int mblowat; /* mbuf low water mark */
>> @@ -444,6 +445,7 @@ int m_leadingspace(struct mbuf *);
>> int m_trailingspace(struct mbuf *);
>> struct mbuf *m_clget(struct mbuf *, int, u_int);
>> void m_extref(struct mbuf *, struct mbuf *);
>> +void m_pool_init(struct pool *, u_int, u_int, const char *);
>> void m_extfree_pool(caddr_t, u_int, void *);
>> void m_adj(struct mbuf *, int);
>> int m_copyback(struct mbuf *, int, int, const void *, int);
>> Index: kern/uipc_mbuf.c
>> ===================================================================
>> RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
>> retrieving revision 1.238
>> diff -u -p -r1.238 uipc_mbuf.c
>> --- kern/uipc_mbuf.c 9 Nov 2016 08:55:11 -0000 1.238
>> +++ kern/uipc_mbuf.c 22 Nov 2016 02:31:47 -0000
>> @@ -133,6 +133,19 @@ void m_extfree(struct mbuf *);
>> void nmbclust_update(void);
>> void m_zero(struct mbuf *);
>>
>> +struct mutex m_pool_mtx = MUTEX_INITIALIZER(IPL_NET);
>> +unsigned int mbuf_mem_limit; /* how much memory can be allocated */
>> +unsigned int mbuf_mem_alloc; /* how much memory has been allocated */
>> +
>> +void *m_pool_alloc(struct pool *, int, int *);
>> +void m_pool_free(struct pool *, void *);
>> +
>> +struct pool_allocator m_pool_allocator = {
>> + m_pool_alloc,
>> + m_pool_free,
>> + 0 /* will be copied from pool_allocator_multi */
>> +};
>> +
>> static void (*mextfree_fns[4])(caddr_t, u_int, void *);
>> static u_int num_extfree_fns;
>>
>> @@ -148,6 +161,11 @@ mbinit(void)
>> int i;
>> unsigned int lowbits;
>>
>> + m_pool_allocator.pa_pagesz = pool_allocator_multi.pa_pagesz;
>> +
>> + nmbclust_update();
>> + mbuf_mem_alloc = 0;
>> +
>> #if DIAGNOSTIC
>> if (mclsizes[0] != MCLBYTES)
>> panic("mbinit: the smallest cluster size != MCLBYTES");
>> @@ -155,9 +173,7 @@ mbinit(void)
>> panic("mbinit: the largest cluster size != MAXMCLBYTES");
>> #endif
>>
>> - pool_init(&mbpool, MSIZE, 0, IPL_NET, 0, "mbufpl", NULL);
>> - pool_set_constraints(&mbpool, &kp_dma_contig);
>> - pool_setlowat(&mbpool, mblowat);
>> + m_pool_init(&mbpool, MSIZE, 64, "mbufpl");
>>
>> pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0,
>> IPL_NET, 0, "mtagpl", NULL);
>> @@ -171,47 +187,32 @@ mbinit(void)
>> snprintf(mclnames[i], sizeof(mclnames[0]), "mcl%dk",
>> mclsizes[i] >> 10);
>> }
>> - pool_init(&mclpools[i], mclsizes[i], 64, IPL_NET, 0,
>> - mclnames[i], NULL);
>> - pool_set_constraints(&mclpools[i], &kp_dma_contig);
>> - pool_setlowat(&mclpools[i], mcllowat);
>> +
>> + m_pool_init(&mclpools[i], mclsizes[i], 64, mclnames[i]);
>> }
>>
>> (void)mextfree_register(m_extfree_pool);
>> KASSERT(num_extfree_fns == 1);
>> -
>> - nmbclust_update();
>> }
>>
>> void
>> mbcpuinit()
>> {
>> + int i;
>> +
>> mbstat = counters_alloc_ncpus(mbstat, MBSTAT_COUNT, M_DEVBUF);
>> +
>> + pool_cache_init(&mbpool);
>> + pool_cache_init(&mtagpool);
>> +
>> + for (i = 0; i < nitems(mclsizes); i++)
>> + pool_cache_init(&mclpools[i]);
>> }
>>
>> void
>> nmbclust_update(void)
>> {
>> - unsigned int i, n;
>> -
>> - /*
>> - * Set the hard limit on the mclpools to the number of
>> - * mbuf clusters the kernel is to support. Log the limit
>> - * reached message max once a minute.
>> - */
>> - for (i = 0; i < nitems(mclsizes); i++) {
>> - n = (unsigned long long)nmbclust * MCLBYTES / mclsizes[i];
>> - (void)pool_sethardlimit(&mclpools[i], n, mclpool_warnmsg, 60);
>> - /*
>> - * XXX this needs to be reconsidered.
>> - * Setting the high water mark to nmbclust is too high
>> - * but we need to have enough spare buffers around so that
>> - * allocations in interrupt context don't fail or mclgeti()
>> - * drivers may end up with empty rings.
>> - */
>> - pool_sethiwat(&mclpools[i], n);
>> - }
>> - pool_sethiwat(&mbpool, nmbclust);
>> + mbuf_mem_limit = nmbclust * MCLBYTES;
>> }
>>
>> /*
>> @@ -1377,6 +1378,52 @@ m_dup_pkt(struct mbuf *m0, unsigned int
>> fail:
>> m_freem(m);
>> return (NULL);
>> +}
>> +
>> +void *
>> +m_pool_alloc(struct pool *pp, int flags, int *slowdown)
>> +{
>> + void *v = NULL;
>> + int avail = 1;
>> +
>> + if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit)
>> + return (NULL);
>> +
>> + mtx_enter(&m_pool_mtx);
>> + if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit)
>> + avail = 0;
>> + else
>> + mbuf_mem_alloc += pp->pr_pgsize;
>> + mtx_leave(&m_pool_mtx);
>> +
>> + if (avail) {
>> + v = (*pool_allocator_multi.pa_alloc)(pp, flags, slowdown);
>> +
>> + if (v == NULL) {
>> + mtx_enter(&m_pool_mtx);
>> + mbuf_mem_alloc -= pp->pr_pgsize;
>> + mtx_leave(&m_pool_mtx);
>> + }
>> + }
>> +
>> + return (v);
>> +}
>> +
>> +void
>> +m_pool_free(struct pool *pp, void *v)
>> +{
>> + (*pool_allocator_multi.pa_free)(pp, v);
>> +
>> + mtx_enter(&m_pool_mtx);
>> + mbuf_mem_alloc -= pp->pr_pgsize;
>> + mtx_leave(&m_pool_mtx);
>> +}
>> +
>> +void
>> +m_pool_init(struct pool *pp, u_int size, u_int align, const char *wmesg)
>> +{
>> + pool_init(pp, size, align, IPL_NET, 0, wmesg, &m_pool_allocator);
>> + pool_set_constraints(pp, &kp_dma_contig);
>> }
>>
>> #ifdef DDB
>> Index: dev/pci/if_myx.c
>> ===================================================================
>> RCS file: /cvs/src/sys/dev/pci/if_myx.c,v
>> retrieving revision 1.99
>> diff -u -p -r1.99 if_myx.c
>> --- dev/pci/if_myx.c 31 Oct 2016 01:38:57 -0000 1.99
>> +++ dev/pci/if_myx.c 22 Nov 2016 02:31:47 -0000
>> @@ -294,8 +294,6 @@ myx_attach(struct device *parent, struct
>>
>> /* this is sort of racy */
>> if (myx_mcl_pool == NULL) {
>> - extern struct kmem_pa_mode kp_dma_contig;
>> -
>> myx_mcl_pool = malloc(sizeof(*myx_mcl_pool), M_DEVBUF,
>> M_WAITOK);
>> if (myx_mcl_pool == NULL) {
>> @@ -303,9 +301,9 @@ myx_attach(struct device *parent, struct
>> DEVNAME(sc));
>> goto unmap;
>> }
>> - pool_init(myx_mcl_pool, MYX_RXBIG_SIZE, MYX_BOUNDARY, IPL_NET,
>> - 0, "myxmcl", NULL);
>> - pool_set_constraints(myx_mcl_pool, &kp_dma_contig);
>> +
>> + m_pool_init(myx_mcl_pool, MYX_RXBIG_SIZE, MYX_BOUNDARY,
>> + "myxmcl");
>> }
>>
>> if (myx_pcie_dc(sc, pa) != 0)
>>
>>