Re: idle pool page gc

2015-01-18 Thread Ted Unangst
On Mon, Jan 19, 2015 at 11:28, David Gwynne wrote:

 if you're interested in seeing the effect of freeing at different
 intervals, you could try the diff below. it adds kern.pool.wait_free
 and kern.pool.wait_gc tunables so you can set how long a page has
 to be idle before a pool_put and the gc respectively will select a
 page for freeing.

 id prefer to pick values that Just Work(tm) all the time, but being
 able to test different values easily might be useful in the short
 term.
 
 ive been running the diff below in production for nearly a month now
 without issue.
 
 anyone want to ok this? if i commit this would anyone object?

I thought the sysctl code was ok for testing, but not for commit. I
don't think we should be adding more knobs like this. A diff without
that would be much more attractive.



Re: idle pool page gc

2015-01-18 Thread David Gwynne

 On 23 Dec 2014, at 11:38, David Gwynne da...@gwynne.id.au wrote:
 
 On Mon, Dec 22, 2014 at 10:54:16AM -0500, Ted Unangst wrote:
 On Mon, Dec 22, 2014 at 14:59, Mike Belopuhov wrote:
 1) how is it different from what we have now?
 
 2) why can't you do it when you pool_put?
 
 Right now, if you allocate a bunch of things, then free them, the
 pages won't be freed because the timestamp is new. But you've already
 freed everything, so there won't be a future pool_put to trigger the
 final page freeing. This keeps pages from getting stuck.
 
 3) why can't you call it from uvm when there's memory pressure since
 from what i understand pool_reclaim was supposed to work like that?
 
 Calling reclaim only when we're low on memory is sometimes too late.
 
 and we never do. reclaim is only called when sysctl kern.pool_debug
 is fiddled with.
 
 
 4) i assume you don't want to call pool_reclaim from pool_gc_pages
 because of the mutex_enter_try benefits, but why does logic in
 these functions differ, e.g. why did you omit the pr_itemsperpage
 bit?
 
 We're not trying to release all free pages. Only the ones that are both
 too many and too old. This is the same logic that is already in
 pool_put.
 
 7) it looks like pool_reclaim_all should also raise an IPL since it
 does the same thing.  wasn't it noticed before?
 
 Likely. I don't think reclaim_all gets called very often.
 
 or at all, really.
 
 mikeb, id really appreciate it if you could benchmark with this diff.
 
 if you're interested in seeing the effect of freeing at different
 intervals, you could try the diff below. it adds kern.pool.wait_free
 and kern.pool.wait_gc tunables so you can set how long a page has
 to be idle before a pool_put and the gc respectively will select a
 page for freeing.
 
 id prefer to pick values that Just Work(tm) all the time, but being
 able to test different values easily might be useful in the short
 term.

ive been running the diff below in production for nearly a month now without 
issue.

anyone want to ok this? if i commit this would anyone object?

dlg

 
 Index: sbin/sysctl/sysctl.c
 ===
 RCS file: /cvs/src/sbin/sysctl/sysctl.c,v
 retrieving revision 1.207
 diff -u -p -r1.207 sysctl.c
 --- sbin/sysctl/sysctl.c  19 Nov 2014 18:04:54 -0000  1.207
 +++ sbin/sysctl/sysctl.c  22 Dec 2014 23:44:52 -0000
 @@ -44,6 +44,7 @@
 #include <sys/sched.h>
 #include <sys/sensors.h>
 #include <sys/vmmeter.h>
 +#include <sys/pool.h>
 #include <net/route.h>
 #include <net/if.h>
 
 @@ -125,6 +126,7 @@ struct ctlname semname[] = CTL_KERN_SEMI
 struct ctlname shmname[] = CTL_KERN_SHMINFO_NAMES;
 struct ctlname watchdogname[] = CTL_KERN_WATCHDOG_NAMES;
 struct ctlname tcname[] = CTL_KERN_TIMECOUNTER_NAMES;
 +struct ctlname poolname[] = CTL_KERN_POOL_NAMES;
 struct ctlname *vfsname;
 #ifdef CTL_MACHDEP_NAMES
 struct ctlname machdepname[] = CTL_MACHDEP_NAMES;
 @@ -207,6 +209,7 @@ int sysctl_seminfo(char *, char **, int 
 int sysctl_shminfo(char *, char **, int *, int, int *);
 int sysctl_watchdog(char *, char **, int *, int, int *);
 int sysctl_tc(char *, char **, int *, int, int *);
 +int sysctl_pool(char *, char **, int *, int, int *);
 int sysctl_sensors(char *, char **, int *, int, int *);
 void print_sensordev(char *, int *, u_int, struct sensordev *);
 void print_sensor(struct sensor *);
 @@ -388,6 +391,11 @@ parse(char *string, int flags)
   return;
  warnx("use dmesg to view %s", string);
   return;
 + case KERN_POOL:
 + len = sysctl_pool(string, &bufp, mib, flags, &type);
 + if (len < 0)
 + return;
 + break;
   case KERN_PROC:
   if (flags == 0)
   return;
 @@ -1633,6 +1641,7 @@ struct list semlist = { semname, KERN_SE
 struct list shmlist = { shmname, KERN_SHMINFO_MAXID };
 struct list watchdoglist = { watchdogname, KERN_WATCHDOG_MAXID };
 struct list tclist = { tcname, KERN_TIMECOUNTER_MAXID };
 +struct list poollist = { poolname, KERN_POOL_MAXID };
 
 /*
  * handle vfs namei cache statistics
 @@ -2302,6 +2311,25 @@ sysctl_tc(char *string, char **bufpp, in
   return (-1);
   mib[2] = indx;
   *typep = tclist.list[indx].ctl_type;
 + return (3);
 +}
 +
 +/*
 + * Handle pools support
 + */
 +int
 +sysctl_pool(char *string, char **bufpp, int mib[], int flags, int *typep)
 +{
 + int indx;
 +
 + if (*bufpp == NULL) {
 + listall(string, &poollist);
 + return (-1);
 + }
 + if ((indx = findname(string, "third", bufpp, &poollist)) == -1)
 + return (-1);
 + mib[2] = indx;
 + *typep = poollist.list[indx].ctl_type;
   return (3);
 }
 
 Index: sys/kern/init_main.c
 ===
 RCS file: /cvs/src/sys/kern/init_main.c,v
 

Re: idle pool page gc

2014-12-22 Thread Mike Belopuhov
On 22 December 2014 at 06:43, David Gwynne da...@gwynne.id.au wrote:
 this introduces a global gc task that loops over all the pools
 looking for pages that havent been used for a very long time so
 they can be freed.

 this is the simplest way of doing this without introducing per pool
 timeouts/tasks which in turn could introduce races with pool_destroy,
 or more shared global state that gets touched on pool_get/put ops.

 im adding the timeout in src/sys/kern/init_main.c cos having pool_init
 do it causes faults when a pool is created before the timeout
 subsystem is set up.

 i have tested this on sparc64, amd64, and hppa, and it seems to do
 the advertised job.
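
In rough code, the scheme described above boils down to a single periodic pass over every pool. The sketch below is for illustration only: pool_gc_pages and mtx_enter_try are named in this thread, but pool_head/pr_poollist, pr_emptypages, ph_tick, pool_wait_gc, pool_free_idle_page() and the one-second rearm are names and values assumed here, not taken from the diff, and treating the wait as ticks is also a guess.

void
pool_gc_pages(void *unused)
{
	struct pool *pp;
	struct pool_item_header *ph;
	int s;

	s = splvm();	/* XXX raise the IPL while poking at pools (question 6) */
	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
		/* never sleep on a pool's mutex from the gc */
		if (!mtx_enter_try(&pp->pr_mtx))
			continue;

		/*
		 * free a page only if the pool is holding more idle
		 * pages than it wants to keep and that page has aged
		 * past the gc threshold
		 */
		ph = TAILQ_FIRST(&pp->pr_emptypages);
		if (pp->pr_nidle > pp->pr_minpages && ph != NULL &&
		    (ticks - ph->ph_tick) > pool_wait_gc)
			pool_free_idle_page(pp, ph);	/* assumed helper */

		mtx_leave(&pp->pr_mtx);
	}
	splx(s);

	/* rearm for the next pass */
	timeout_add_sec(&pool_gc_tick, 1);
}

Using mtx_enter_try() means a busy pool is simply skipped on this pass rather than stalling the gc, and the driving timeout is armed from init_main.c once the timeout subsystem is up, as explained above.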


i have a couple of questions to understand the reasoning behind this.

1) how is it different from what we have now?

2) why can't you do it when you pool_put?

3) why can't you call it from uvm when there's memory pressure since
   from what i understand pool_reclaim was supposed to work like that?

4) i assume you don't want to call pool_reclaim from pool_gc_pages
   because of the mutex_enter_try benefits, but why does logic in
   these functions differ, e.g. why did you omit the pr_itemsperpage
   bit?

5) why is 8 ticks a very long time to free a page?
   why not 10 seconds?

6) why is there an XXX next to the splx?

7) it looks like pool_reclaim_all should also raise an IPL since it
   does the same thing.  wasn't it noticed before?



Re: idle pool page gc

2014-12-22 Thread Ted Unangst
On Mon, Dec 22, 2014 at 14:59, Mike Belopuhov wrote:
 1) how is it different from what we have now?
 
 2) why can't you do it when you pool_put?

Right now, if you allocate a bunch of things, then free them, the
pages won't be freed because the timestamp is new. But you've already
freed everything, so there won't be a future pool_put to trigger the
final page freeing. This keeps pages from getting stuck.

 3) why can't you call it from uvm when there's memory pressure since
 from what i understand pool_reclaim was supposed to work like that?

Calling reclaim only when we're low on memory is sometimes too late.

 4) i assume you don't want to call pool_reclaim from pool_gc_pages
 because of the mutex_enter_try benefits, but why does logic in
 these functions differ, e.g. why did you omit the pr_itemsperpage
 bit?

We're not trying to release all free pages. Only the ones that are both
too many and too old. This is the same logic that is already in
pool_put.
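
As a sketch, the test being described is roughly the following; the field and variable names (pr_nidle, pr_minpages, ph_tick, a wait measured in ticks) are assumptions for illustration, not the committed code:

static inline int
pool_page_reclaimable(const struct pool *pp,
    const struct pool_item_header *ph, int wait)
{
	/* "too many": the pool holds more idle pages than it wants to keep */
	if (pp->pr_nidle <= pp->pr_minpages)
		return (0);

	/* "too old": this page has sat unused past the idle threshold */
	return ((ticks - ph->ph_tick) > wait);
}

pool_put() and the gc would apply the same test, each with its own idle threshold.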

 7) it looks like pool_reclaim_all should also raise an IPL since it
 does the same thing.  wasn't it noticed before?

Likely. I don't think reclaim_all gets called very often.



Re: idle pool page gc

2014-12-22 Thread David Gwynne
On Mon, Dec 22, 2014 at 10:54:16AM -0500, Ted Unangst wrote:
 On Mon, Dec 22, 2014 at 14:59, Mike Belopuhov wrote:
  1) how is it different from what we have now?
  
  2) why can't you do it when you pool_put?
 
 Right now, if you allocate a bunch of things, then free them, the
 pages won't be freed because the timestamp is new. But you've already
 freed everything, so there won't be a future pool_put to trigger the
 final page freeing. This keeps pages from getting stuck.
 
  3) why can't you call it from uvm when there's memory pressure since
  from what i understand pool_reclaim was supposed to work like that?
 
 Calling reclaim only when we're low on memory is sometimes too late.

and we never do. reclaim is only called when sysctl kern.pool_debug
is fiddled with.

 
  4) i assume you don't want to call pool_reclaim from pool_gc_pages
  because of the mutex_enter_try benefits, but why does logic in
  these functions differ, e.g. why did you omit the pr_itemsperpage
  bit?
 
 We're not trying to release all free pages. Only the ones that are both
 too many and too old. This is the same logic that is already in
 pool_put.
 
  7) it looks like pool_reclaim_all should also raise an IPL since it
  does the same thing.  wasn't it noticed before?
 
 Likely. I don't think reclaim_all gets called very often.

or at all, really.

mikeb, id really appreciate it if you could benchmark with this diff.

if you're interested in seeing the effect of freeing at different
intervals, you could try the diff below. it adds kern.pool.wait_free
and kern.pool.wait_gc tunables so you can set how long a page has
to be idle before a pool_put and the gc respectively will select a
page for freeing.

id prefer to pick values that Just Work(tm) all the time, but being
able to test different values easily might be useful in the short
term.
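
For what it's worth, the kernel-side consumers of two such knobs would presumably be a pair of ints read at the points described above, exported through the new KERN_POOL sysctl node. The sketch below is an assumption for illustration; the variable names, the KERN_POOL_WAIT_* identifiers and the handler shape are not taken from the diff:

int	pool_wait_free;	/* kern.pool.wait_free: idle time before pool_put() frees a page */
int	pool_wait_gc;	/* kern.pool.wait_gc: idle time before the gc frees a page */

int
pool_wait_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case KERN_POOL_WAIT_FREE:
		return (sysctl_int(oldp, oldlenp, newp, newlen,
		    &pool_wait_free));
	case KERN_POOL_WAIT_GC:
		return (sysctl_int(oldp, oldlenp, newp, newlen,
		    &pool_wait_gc));
	default:
		return (EOPNOTSUPP);
	}
}

With the sysctl(8) side below wired up, the thresholds could then be changed at run time with sysctl kern.pool.wait_free=N and kern.pool.wait_gc=N while benchmarking.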

Index: sbin/sysctl/sysctl.c
===
RCS file: /cvs/src/sbin/sysctl/sysctl.c,v
retrieving revision 1.207
diff -u -p -r1.207 sysctl.c
--- sbin/sysctl/sysctl.c	19 Nov 2014 18:04:54 -0000	1.207
+++ sbin/sysctl/sysctl.c	22 Dec 2014 23:44:52 -0000
@@ -44,6 +44,7 @@
 #include <sys/sched.h>
 #include <sys/sensors.h>
 #include <sys/vmmeter.h>
+#include <sys/pool.h>
 #include <net/route.h>
 #include <net/if.h>
 
@@ -125,6 +126,7 @@ struct ctlname semname[] = CTL_KERN_SEMI
 struct ctlname shmname[] = CTL_KERN_SHMINFO_NAMES;
 struct ctlname watchdogname[] = CTL_KERN_WATCHDOG_NAMES;
 struct ctlname tcname[] = CTL_KERN_TIMECOUNTER_NAMES;
+struct ctlname poolname[] = CTL_KERN_POOL_NAMES;
 struct ctlname *vfsname;
 #ifdef CTL_MACHDEP_NAMES
 struct ctlname machdepname[] = CTL_MACHDEP_NAMES;
@@ -207,6 +209,7 @@ int sysctl_seminfo(char *, char **, int 
 int sysctl_shminfo(char *, char **, int *, int, int *);
 int sysctl_watchdog(char *, char **, int *, int, int *);
 int sysctl_tc(char *, char **, int *, int, int *);
+int sysctl_pool(char *, char **, int *, int, int *);
 int sysctl_sensors(char *, char **, int *, int, int *);
 void print_sensordev(char *, int *, u_int, struct sensordev *);
 void print_sensor(struct sensor *);
@@ -388,6 +391,11 @@ parse(char *string, int flags)
return;
warnx("use dmesg to view %s", string);
return;
+   case KERN_POOL:
+   len = sysctl_pool(string, &bufp, mib, flags, &type);
+   if (len < 0)
+   return;
+   break;
case KERN_PROC:
if (flags == 0)
return;
@@ -1633,6 +1641,7 @@ struct list semlist = { semname, KERN_SE
 struct list shmlist = { shmname, KERN_SHMINFO_MAXID };
 struct list watchdoglist = { watchdogname, KERN_WATCHDOG_MAXID };
 struct list tclist = { tcname, KERN_TIMECOUNTER_MAXID };
+struct list poollist = { poolname, KERN_POOL_MAXID };
 
 /*
  * handle vfs namei cache statistics
@@ -2302,6 +2311,25 @@ sysctl_tc(char *string, char **bufpp, in
return (-1);
mib[2] = indx;
*typep = tclist.list[indx].ctl_type;
+   return (3);
+}
+
+/*
+ * Handle pools support
+ */
+int
+sysctl_pool(char *string, char **bufpp, int mib[], int flags, int *typep)
+{
+   int indx;
+
+   if (*bufpp == NULL) {
+   listall(string, &poollist);
+   return (-1);
+   }
+   if ((indx = findname(string, "third", bufpp, &poollist)) == -1)
+   return (-1);
+   mib[2] = indx;
+   *typep = poollist.list[indx].ctl_type;
return (3);
 }
 
Index: sys/kern/init_main.c
===
RCS file: /cvs/src/sys/kern/init_main.c,v
retrieving revision 1.227
diff -u -p -r1.227 init_main.c
--- sys/kern/init_main.c	15 Dec 2014 02:24:23 -0000	1.227
+++ sys/kern/init_main.c	22 Dec 2014 23:44:55 -0000
@@ -121,6 +121,8 @@ extern struct timeout setperf_to;