> On 23 Dec 2014, at 11:38, David Gwynne <[email protected]> wrote:
>
> On Mon, Dec 22, 2014 at 10:54:16AM -0500, Ted Unangst wrote:
>> On Mon, Dec 22, 2014 at 14:59, Mike Belopuhov wrote:
>>> 1) how is it different from what we have now?
>>>
>>> 2) why can't you do it when you pool_put?
>>
>> Right now, if you allocate a bunch of things, then free them, the
>> pages won't be freed because the timestamp is new. But you've already
>> freed everything, so there won't be a future pool_put to trigger the
>> final page freeing. This keeps pages from getting stuck.
>>
>>> 3) why can't you call it from uvm when there's memory pressure since
>>> from what i understand pool_reclaim was supposed to work like that?
>>
>> Calling reclaim only when we're low on memory is sometimes too late.
>
> and we never do. reclaim is only called when sysctl kern.pool_debug
> is fiddled with.
>
>>
>>> 4) i assume you don't want to call pool_reclaim from pool_gc_pages
>>> because of the mutex_enter_try benefits, but why does logic in
>>> these functions differ, e.g. why did you omit the pr_itemsperpage
>>> bit?
>>
>> We're not trying to release all free pages. Only the ones that are both
>> too many and too old. This is the same logic that is already in
>> pool_put.
>>
>>> 7) it looks like pool_reclaim_all should also raise an IPL since it
>>> does the same thing. wasn't it noteced before?
>>
>> Likely. I don't think reclaim_all gets called very often.
>
> or at all, really.
>
> mikeb, id really appreciate it if you could benchmark with this diff.
>
> if you're interested in seeing the effect of freeing at different
> intervals, you could try the diff below. it adds kern.pool.wait_free
> and kern.pool.wait_gc tunables so you can set how long a page has
> to be idle before a pool_put and the gc respectively will select a
> page for freeing.
>
> id prefer to pick values that Just Work(tm) all the time, but being
> able to test different values easily might be useful in the short
> term.
ive been running the diff below in production for nearly a month now without
issue.
anyone want to ok this? if i commit this would anyone object?
dlg
>
> Index: sbin/sysctl/sysctl.c
> ===================================================================
> RCS file: /cvs/src/sbin/sysctl/sysctl.c,v
> retrieving revision 1.207
> diff -u -p -r1.207 sysctl.c
> --- sbin/sysctl/sysctl.c 19 Nov 2014 18:04:54 -0000 1.207
> +++ sbin/sysctl/sysctl.c 22 Dec 2014 23:44:52 -0000
> @@ -44,6 +44,7 @@
> #include <sys/sched.h>
> #include <sys/sensors.h>
> #include <sys/vmmeter.h>
> +#include <sys/pool.h>
> #include <net/route.h>
> #include <net/if.h>
>
> @@ -125,6 +126,7 @@ struct ctlname semname[] = CTL_KERN_SEMI
> struct ctlname shmname[] = CTL_KERN_SHMINFO_NAMES;
> struct ctlname watchdogname[] = CTL_KERN_WATCHDOG_NAMES;
> struct ctlname tcname[] = CTL_KERN_TIMECOUNTER_NAMES;
> +struct ctlname poolname[] = CTL_KERN_POOL_NAMES;
> struct ctlname *vfsname;
> #ifdef CTL_MACHDEP_NAMES
> struct ctlname machdepname[] = CTL_MACHDEP_NAMES;
> @@ -207,6 +209,7 @@ int sysctl_seminfo(char *, char **, int
> int sysctl_shminfo(char *, char **, int *, int, int *);
> int sysctl_watchdog(char *, char **, int *, int, int *);
> int sysctl_tc(char *, char **, int *, int, int *);
> +int sysctl_pool(char *, char **, int *, int, int *);
> int sysctl_sensors(char *, char **, int *, int, int *);
> void print_sensordev(char *, int *, u_int, struct sensordev *);
> void print_sensor(struct sensor *);
> @@ -388,6 +391,11 @@ parse(char *string, int flags)
> return;
> warnx("use dmesg to view %s", string);
> return;
> + case KERN_POOL:
> + len = sysctl_pool(string, &bufp, mib, flags, &type);
> + if (len < 0)
> + return;
> + break;
> case KERN_PROC:
> if (flags == 0)
> return;
> @@ -1633,6 +1641,7 @@ struct list semlist = { semname, KERN_SE
> struct list shmlist = { shmname, KERN_SHMINFO_MAXID };
> struct list watchdoglist = { watchdogname, KERN_WATCHDOG_MAXID };
> struct list tclist = { tcname, KERN_TIMECOUNTER_MAXID };
> +struct list poollist = { poolname, KERN_POOL_MAXID };
>
> /*
> * handle vfs namei cache statistics
> @@ -2302,6 +2311,25 @@ sysctl_tc(char *string, char **bufpp, in
> return (-1);
> mib[2] = indx;
> *typep = tclist.list[indx].ctl_type;
> + return (3);
> +}
> +
> +/*
> + * Handle pools support
> + */
> +int
> +sysctl_pool(char *string, char **bufpp, int mib[], int flags, int *typep)
> +{
> + int indx;
> +
> + if (*bufpp == NULL) {
> + listall(string, &poollist);
> + return (-1);
> + }
> + if ((indx = findname(string, "third", bufpp, &poollist)) == -1)
> + return (-1);
> + mib[2] = indx;
> + *typep = poollist.list[indx].ctl_type;
> return (3);
> }
>
> Index: sys/kern/init_main.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/init_main.c,v
> retrieving revision 1.227
> diff -u -p -r1.227 init_main.c
> --- sys/kern/init_main.c 15 Dec 2014 02:24:23 -0000 1.227
> +++ sys/kern/init_main.c 22 Dec 2014 23:44:55 -0000
> @@ -121,6 +121,8 @@ extern struct timeout setperf_to;
> void setperf_auto(void *);
> #endif
>
> +extern struct timeout pool_gc_tick;
> +
> int cmask = CMASK;
> extern struct user *proc0paddr;
>
> @@ -554,6 +556,11 @@ main(void *framep)
> #ifndef SMALL_KERNEL
> timeout_set(&setperf_to, setperf_auto, NULL);
> #endif
> +
> + /*
> + * Start the idle pool page garbage collector
> + */
> + timeout_add_sec(&pool_gc_tick, 1);
>
> /*
> * proc0: nothing to do, back to sleep
> Index: sys/kern/kern_sysctl.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
> retrieving revision 1.277
> diff -u -p -r1.277 kern_sysctl.c
> --- sys/kern/kern_sysctl.c 12 Dec 2014 07:45:46 -0000 1.277
> +++ sys/kern/kern_sysctl.c 22 Dec 2014 23:44:55 -0000
> @@ -496,7 +496,8 @@ kern_sysctl(int *name, u_int namelen, vo
> case KERN_NPROCS:
> return (sysctl_rdint(oldp, oldlenp, newp, nprocesses));
> case KERN_POOL:
> - return (sysctl_dopool(name + 1, namelen - 1, oldp, oldlenp));
> + return (sysctl_pool(name + 1, namelen - 1, oldp, oldlenp,
> + newp, newlen));
> case KERN_STACKGAPRANDOM:
> stackgap = stackgap_random;
> error = sysctl_int(oldp, oldlenp, newp, newlen, &stackgap);
> Index: sys/kern/subr_pool.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/subr_pool.c,v
> retrieving revision 1.173
> diff -u -p -r1.173 subr_pool.c
> --- sys/kern/subr_pool.c 22 Dec 2014 00:33:40 -0000 1.173
> +++ sys/kern/subr_pool.c 22 Dec 2014 23:44:55 -0000
> @@ -40,6 +40,8 @@
> #include <sys/syslog.h>
> #include <sys/rwlock.h>
> #include <sys/sysctl.h>
> +#include <sys/task.h>
> +#include <sys/timeout.h>
>
> #include <uvm/uvm_extern.h>
>
> @@ -160,6 +162,14 @@ void pool_print1(struct pool *, const c
>
> #define pool_sleep(pl) msleep(pl, &pl->pr_mtx, PSWP, pl->pr_wchan, 0)
>
> +/* stale page garbage collectors */
> +void pool_gc_sched(void *);
> +struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL);
> +void pool_gc_pages(void *, void *);
> +struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL, NULL);
> +int pool_wait_free = 1;
> +int pool_wait_gc = 8;
> +
> static inline int
> phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
> {
> @@ -678,7 +688,7 @@ pool_put(struct pool *pp, void *v)
> /* is it time to free a page? */
> if (pp->pr_nidle > pp->pr_maxpages &&
> (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
> - (ticks - ph->ph_tick) > hz) {
> + (ticks - ph->ph_tick) > (hz * pool_wait_free)) {
> freeph = ph;
> pool_p_remove(pp, freeph);
> }
> @@ -1269,11 +1279,13 @@ pool_walk(struct pool *pp, int full,
> * kern.pool.name.<pool#> - the name for pool#.
> */
> int
> -sysctl_dopool(int *name, u_int namelen, char *oldp, size_t *oldlenp)
> +sysctl_pool(int *name, u_int namelen, char *oldp, size_t *oldlenp,
> + void *newp, size_t newlen)
> {
> struct kinfo_pool pi;
> struct pool *pp;
> int rv = ENOENT;
> + int wait;
>
> switch (name[0]) {
> case KERN_POOL_NPOOLS:
> @@ -1284,6 +1296,35 @@ sysctl_dopool(int *name, u_int namelen,
> case KERN_POOL_NAME:
> case KERN_POOL_POOL:
> break;
> +
> + case KERN_POOL_WAITFREE:
> + if (namelen != 1)
> + return (ENOTDIR);
> + wait = pool_wait_free;
> + rv = sysctl_int(oldp, oldlenp, newp, newlen, &wait);
> + if (rv != 0)
> + return (rv);
> + if (newp == NULL)
> + return (0);
> + if (wait < 1 || wait >= pool_wait_gc)
> + return (EINVAL);
> + pool_wait_free = wait;
> + return (0);
> +
> + case KERN_POOL_WAITGC:
> + if (namelen != 1)
> + return (ENOTDIR);
> + wait = pool_wait_gc;
> + rv = sysctl_int(oldp, oldlenp, newp, newlen, &wait);
> + if (rv != 0)
> + return (rv);
> + if (newp == NULL)
> + return (0);
> + if (wait <= pool_wait_free || wait > 3600)
> + return (EINVAL);
> + pool_wait_gc = wait;
> + return (0);
> +
> default:
> return (EOPNOTSUPP);
> }
> @@ -1337,6 +1378,46 @@ done:
> rw_exit_read(&pool_lock);
>
> return (rv);
> +}
> +
> +void
> +pool_gc_sched(void *x)
> +{
> + task_add(systq, &pool_gc_task);
> +}
> +
> +void
> +pool_gc_pages(void *x, void *y)
> +{
> + extern int ticks;
> + struct pool *pp;
> + struct pool_item_header *ph, *freeph;
> + int s;
> +
> + rw_enter_read(&pool_lock);
> + s = splvm();
> + SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
> + if (!mtx_enter_try(&pp->pr_mtx))
> + continue;
> +
> + /* is it time to free a page? */
> + if (pp->pr_nidle > pp->pr_minpages &&
> + (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
> + (ticks - ph->ph_tick) > (hz * pool_wait_gc)) {
> + freeph = ph;
> + pool_p_remove(pp, freeph);
> + } else
> + freeph = NULL;
> +
> + mtx_leave(&pp->pr_mtx);
> +
> + if (freeph != NULL)
> + pool_p_free(pp, freeph);
> + }
> + splx(s); /* XXX */
> + rw_exit_read(&pool_lock);
> +
> + timeout_add_sec(&pool_gc_tick, 1);
> }
>
> /*
> Index: sys/sys/pool.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/pool.h,v
> retrieving revision 1.55
> diff -u -p -r1.55 pool.h
> --- sys/sys/pool.h 19 Dec 2014 02:15:25 -0000 1.55
> +++ sys/sys/pool.h 22 Dec 2014 23:44:55 -0000
> @@ -39,10 +39,24 @@
> * kern.pool.npools
> * kern.pool.name.<number>
> * kern.pool.pool.<number>
> + * kern.pool.wait_free
> + * kern.pool.wait_gc
> */
> #define KERN_POOL_NPOOLS 1
> #define KERN_POOL_NAME 2
> #define KERN_POOL_POOL 3
> +#define KERN_POOL_WAITFREE 4
> +#define KERN_POOL_WAITGC 5
> +#define KERN_POOL_MAXID 6
> +
> +#define CTL_KERN_POOL_NAMES { \
> + { 0, 0 }, \
> + { "npools", CTLTYPE_INT }, \
> + { "name", CTLTYPE_NODE }, \
> + { "pool", CTLTYPE_NODE }, \
> + { "wait_free", CTLTYPE_INT }, \
> + { "wait_gc", CTLTYPE_INT } \
> +}
>
> struct kinfo_pool {
> unsigned int pr_size; /* size of a pool item */
> Index: sys/sys/sysctl.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/sysctl.h,v
> retrieving revision 1.153
> diff -u -p -r1.153 sysctl.h
> --- sys/sys/sysctl.h 12 Dec 2014 07:45:46 -0000 1.153
> +++ sys/sys/sysctl.h 22 Dec 2014 23:44:55 -0000
> @@ -945,7 +945,7 @@ int sysctl_vnode(char *, size_t *, struc
> #ifdef GPROF
> int sysctl_doprof(int *, u_int, void *, size_t *, void *, size_t);
> #endif
> -int sysctl_dopool(int *, u_int, char *, size_t *);
> +int sysctl_pool(int *, u_int, char *, size_t *, void *, size_t);
>
> int kern_sysctl(int *, u_int, void *, size_t *, void *, size_t,
> struct proc *);