I don't think a different interface would necessarily fix all consistency problems, because there are probably additional consistency problems within ZFS itself. The ZFS stats just weren't designed to provide a consistent view, like "zpool status". But a different interface certainly could be much faster; sysctl is slow. Do you have any good ideas for how to easily create such an alternate interface?
On Wed, Sep 28, 2022 at 10:19 AM Maxim Sobolev <sobo...@freebsd.org> wrote: > > This also brings a question as to whether sysctl is the right interface to > pull this data from the kernel in the first place? From my somewhat ignorant > look this approach is likely to be poised with all sorts of race conditions, > such so if configuration changes while you are pulling it out you'd get some > inconsistent view that is not here not there. Wouldn't it be easier to use > some other mechanism to pull configuration of all 1,000 datasets as one blob > in one or few system calls? Like read(2) from /dev/zfsstats or something like > that? Then you can iterate over it as much as you need in userland. > > -Max > > On Tue, Sep 27, 2022, 3:04 AM Alan Somers <asom...@freebsd.org> wrote: >> >> The branch main has been updated by asomers: >> >> URL: >> https://cgit.FreeBSD.org/src/commit/?id=d3f96f661050e9bd21fe29931992a8b9e67ff189 >> >> commit d3f96f661050e9bd21fe29931992a8b9e67ff189 >> Author: Alan Somers <asom...@freebsd.org> >> AuthorDate: 2022-09-07 14:12:49 +0000 >> Commit: Alan Somers <asom...@freebsd.org> >> CommitDate: 2022-09-27 00:03:34 +0000 >> >> Fix O(n^2) behavior in sysctl >> >> Sysctl OIDs were internally stored in linked lists, triggering O(n^2) >> behavior when userland iterates over many of them. The slowdown is >> noticeable for MIBs that have > 100 children (for example, vm.uma). But >> it's unignorable for kstat.zfs when a pool has > 1000 datasets. >> >> Convert the linked lists into RB trees. This produces a ~25x speedup >> for listing kstat.zfs with 4100 datasets, and no measurable penalty for >> small dataset counts. >> >> Bump __FreeBSD_version for the KPI change. >> >> Sponsored by: Axcient >> Reviewed by: mjg >> Differential Revision: https://reviews.freebsd.org/D36500 >> --- >> sys/compat/linuxkpi/common/include/linux/sysfs.h | 2 +- >> sys/kern/kern_sysctl.c | 149 >> +++++++++++------------ >> sys/kern/vfs_init.c | 2 +- >> sys/sys/param.h | 2 +- >> sys/sys/sysctl.h | 31 ++++- >> 5 files changed, 99 insertions(+), 87 deletions(-) >> >> diff --git a/sys/compat/linuxkpi/common/include/linux/sysfs.h >> b/sys/compat/linuxkpi/common/include/linux/sysfs.h >> index 0b6b479d9362..881a72e62ed9 100644 >> --- a/sys/compat/linuxkpi/common/include/linux/sysfs.h >> +++ b/sys/compat/linuxkpi/common/include/linux/sysfs.h >> @@ -246,7 +246,7 @@ sysfs_unmerge_group(struct kobject *kobj, const struct >> attribute_group *grp) >> struct attribute **attr; >> struct sysctl_oid *oidp; >> >> - SLIST_FOREACH(oidp, SYSCTL_CHILDREN(kobj->oidp), oid_link) { >> + RB_FOREACH(oidp, sysctl_oid_list, SYSCTL_CHILDREN(kobj->oidp)) { >> if (strcmp(oidp->oid_name, grp->name) != 0) >> continue; >> for (attr = grp->attrs; *attr != NULL; attr++) { >> diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c >> index 9bc595f111cc..e1cd6ea4bd61 100644 >> --- a/sys/kern/kern_sysctl.c >> +++ b/sys/kern/kern_sysctl.c >> @@ -84,6 +84,8 @@ static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal >> magic"); >> static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); >> static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer"); >> >> +RB_GENERATE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid); >> + >> /* >> * The sysctllock protects the MIB tree. It also protects sysctl >> * contexts used with dynamic sysctls. The sysctl_register_oid() and >> @@ -120,7 +122,7 @@ static struct sx sysctlstringlock; >> static int sysctl_root(SYSCTL_HANDLER_ARGS); >> >> /* Root list */ >> -struct sysctl_oid_list sysctl__children = >> SLIST_HEAD_INITIALIZER(&sysctl__children); >> +struct sysctl_oid_list sysctl__children = RB_INITIALIZER(&sysctl__children); >> >> static char* sysctl_escape_name(const char*); >> static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, >> @@ -134,7 +136,7 @@ sysctl_find_oidname(const char *name, struct >> sysctl_oid_list *list) >> struct sysctl_oid *oidp; >> >> SYSCTL_ASSERT_LOCKED(); >> - SLIST_FOREACH(oidp, list, oid_link) { >> + RB_FOREACH(oidp, sysctl_oid_list, list) { >> if (strcmp(oidp->oid_name, name) == 0) { >> return (oidp); >> } >> @@ -356,11 +358,14 @@ sysctl_search_oid(struct sysctl_oid **nodes, struct >> sysctl_oid *needle) >> indx = 0; >> while (indx < CTL_MAXNAME && indx >= 0) { >> if (nodes[indx] == NULL && indx == 0) >> - nodes[indx] = SLIST_FIRST(&sysctl__children); >> + nodes[indx] = RB_MIN(sysctl_oid_list, >> + &sysctl__children); >> else if (nodes[indx] == NULL) >> - nodes[indx] = SLIST_FIRST(&nodes[indx - >> 1]->oid_children); >> + nodes[indx] = RB_MIN(sysctl_oid_list, >> + &nodes[indx - 1]->oid_children); >> else >> - nodes[indx] = SLIST_NEXT(nodes[indx], oid_link); >> + nodes[indx] = RB_NEXT(sysctl_oid_list, >> + &nodes[indx - 1]->oid_children, nodes[indx]); >> >> if (nodes[indx] == needle) >> return (indx + 1); >> @@ -425,8 +430,7 @@ void >> sysctl_register_oid(struct sysctl_oid *oidp) >> { >> struct sysctl_oid_list *parent = oidp->oid_parent; >> - struct sysctl_oid *p; >> - struct sysctl_oid *q; >> + struct sysctl_oid *p, key; >> int oid_number; >> int timeout = 2; >> >> @@ -476,25 +480,21 @@ sysctl_register_oid(struct sysctl_oid *oidp) >> * Insert the OID into the parent's list sorted by OID number. >> */ >> retry: >> - q = NULL; >> - SLIST_FOREACH(p, parent, oid_link) { >> - /* check if the current OID number is in use */ >> - if (oid_number == p->oid_number) { >> - /* get the next valid OID number */ >> - if (oid_number < CTL_AUTO_START || >> - oid_number == 0x7fffffff) { >> - /* wraparound - restart */ >> - oid_number = CTL_AUTO_START; >> - /* don't loop forever */ >> - if (!timeout--) >> - panic("sysctl: Out of OID >> numbers\n"); >> - goto retry; >> - } else { >> - oid_number++; >> - } >> - } else if (oid_number < p->oid_number) >> - break; >> - q = p; >> + key.oid_number = oid_number; >> + p = RB_FIND(sysctl_oid_list, parent, &key); >> + if (p) { >> + /* get the next valid OID number */ >> + if (oid_number < CTL_AUTO_START || >> + oid_number == 0x7fffffff) { >> + /* wraparound - restart */ >> + oid_number = CTL_AUTO_START; >> + /* don't loop forever */ >> + if (!timeout--) >> + panic("sysctl: Out of OID numbers\n"); >> + goto retry; >> + } else { >> + oid_number++; >> + } >> } >> /* check for non-auto OID number collision */ >> if (oidp->oid_number >= 0 && oidp->oid_number < CTL_AUTO_START && >> @@ -504,10 +504,7 @@ retry: >> } >> /* update the OID number, if any */ >> oidp->oid_number = oid_number; >> - if (q != NULL) >> - SLIST_INSERT_AFTER(q, oidp, oid_link); >> - else >> - SLIST_INSERT_HEAD(parent, oidp, oid_link); >> + RB_INSERT(sysctl_oid_list, parent, oidp); >> >> if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && >> #ifdef VIMAGE >> @@ -556,7 +553,6 @@ sysctl_enable_oid(struct sysctl_oid *oidp) >> void >> sysctl_unregister_oid(struct sysctl_oid *oidp) >> { >> - struct sysctl_oid *p; >> int error; >> >> SYSCTL_ASSERT_WLOCKED(); >> @@ -564,14 +560,8 @@ sysctl_unregister_oid(struct sysctl_oid *oidp) >> error = EINVAL; >> } else { >> error = ENOENT; >> - SLIST_FOREACH(p, oidp->oid_parent, oid_link) { >> - if (p == oidp) { >> - SLIST_REMOVE(oidp->oid_parent, oidp, >> - sysctl_oid, oid_link); >> - error = 0; >> - break; >> - } >> - } >> + if (RB_REMOVE(sysctl_oid_list, oidp->oid_parent, oidp)) >> + error = 0; >> } >> >> /* >> @@ -732,17 +722,14 @@ int >> sysctl_remove_name(struct sysctl_oid *parent, const char *name, >> int del, int recurse) >> { >> - struct sysctl_oid *p, *tmp; >> + struct sysctl_oid *p; >> int error; >> >> error = ENOENT; >> SYSCTL_WLOCK(); >> - SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) { >> - if (strcmp(p->oid_name, name) == 0) { >> - error = sysctl_remove_oid_locked(p, del, recurse); >> - break; >> - } >> - } >> + p = sysctl_find_oidname(name, &parent->oid_children); >> + if (p) >> + error = sysctl_remove_oid_locked(p, del, recurse); >> SYSCTL_WUNLOCK(); >> >> return (error); >> @@ -811,14 +798,16 @@ sysctl_remove_oid_locked(struct sysctl_oid *oidp, int >> del, int recurse) >> */ >> if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { >> if (oidp->oid_refcnt == 1) { >> - SLIST_FOREACH_SAFE(p, >> - SYSCTL_CHILDREN(oidp), oid_link, tmp) { >> + for(p = RB_MIN(sysctl_oid_list, &oidp->oid_children); >> + p != NULL; p = tmp) { >> if (!recurse) { >> printf("Warning: failed attempt to " >> "remove oid %s with child %s\n", >> oidp->oid_name, p->oid_name); >> return (ENOTEMPTY); >> } >> + tmp = RB_NEXT(sysctl_oid_list, >> + &oidp->oid_children, p); >> error = sysctl_remove_oid_locked(p, del, >> recurse); >> if (error) >> @@ -895,7 +884,7 @@ sysctl_add_oid(struct sysctl_ctx_list *clist, struct >> sysctl_oid_list *parent, >> } >> oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, >> M_WAITOK|M_ZERO); >> oidp->oid_parent = parent; >> - SLIST_INIT(&oidp->oid_children); >> + RB_INIT(&oidp->oid_children); >> oidp->oid_number = number; >> oidp->oid_refcnt = 1; >> oidp->oid_name = escaped; >> @@ -1016,7 +1005,7 @@ sysctl_sysctl_debug_dump_node(struct sysctl_oid_list >> *l, int i) >> struct sysctl_oid *oidp; >> >> SYSCTL_ASSERT_LOCKED(); >> - SLIST_FOREACH(oidp, l, oid_link) { >> + RB_FOREACH(oidp, sysctl_oid_list, l) { >> for (k=0; k<i; k++) >> printf(" "); >> >> @@ -1081,7 +1070,7 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) >> int *name = (int *) arg1; >> u_int namelen = arg2; >> int error; >> - struct sysctl_oid *oid; >> + struct sysctl_oid *oid, key; >> struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; >> struct rm_priotracker tracker; >> char buf[10]; >> @@ -1105,10 +1094,9 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) >> continue; >> } >> lsp2 = NULL; >> - SLIST_FOREACH(oid, lsp, oid_link) { >> - if (oid->oid_number != *name) >> - continue; >> - >> + key.oid_number = *name; >> + oid = RB_FIND(sysctl_oid_list, lsp, &key); >> + if (oid) { >> if (req->oldidx) >> error = SYSCTL_OUT(req, ".", 1); >> if (!error) >> @@ -1120,14 +1108,9 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) >> namelen--; >> name++; >> >> - if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) >> - break; >> - >> - if (oid->oid_handler) >> - break; >> - >> - lsp2 = SYSCTL_CHILDREN(oid); >> - break; >> + if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE && >> + !oid->oid_handler) >> + lsp2 = SYSCTL_CHILDREN(oid); >> } >> lsp = lsp2; >> } >> @@ -1239,13 +1222,25 @@ static bool >> sysctl_sysctl_next_action(struct sysctl_oid_list *lsp, int *name, u_int >> namelen, >> int *next, int *len, int level, bool honor_skip) >> { >> - struct sysctl_oid *oidp; >> + struct sysctl_oid_list *next_lsp; >> + struct sysctl_oid *oidp = NULL, key; >> bool success = false; >> enum sysctl_iter_action action; >> >> SYSCTL_ASSERT_LOCKED(); >> - SLIST_FOREACH(oidp, lsp, oid_link) { >> - action = sysctl_sysctl_next_node(oidp, name, namelen, >> honor_skip); >> + /* >> + * Start the search at the requested oid. But if not found, then >> scan >> + * through all children. >> + */ >> + if (namelen > 0) { >> + key.oid_number = *name; >> + oidp = RB_FIND(sysctl_oid_list, lsp, &key); >> + } >> + if (!oidp) >> + oidp = RB_MIN(sysctl_oid_list, lsp); >> + for(; oidp != NULL; oidp = RB_NEXT(sysctl_oid_list, lsp, oidp)) { >> + action = sysctl_sysctl_next_node(oidp, name, namelen, >> + honor_skip); >> if (action == ITER_SIBLINGS) >> continue; >> if (action == ITER_FOUND) { >> @@ -1254,13 +1249,13 @@ sysctl_sysctl_next_action(struct sysctl_oid_list >> *lsp, int *name, u_int namelen, >> } >> KASSERT((action== ITER_CHILDREN), ("ret(%d)!=ITER_CHILDREN", >> action)); >> >> - lsp = SYSCTL_CHILDREN(oidp); >> + next_lsp = SYSCTL_CHILDREN(oidp); >> if (namelen == 0) { >> - success = sysctl_sysctl_next_action(lsp, NULL, 0, >> + success = sysctl_sysctl_next_action(next_lsp, NULL, >> 0, >> next + 1, len, level + 1, honor_skip); >> } else { >> - success = sysctl_sysctl_next_action(lsp, name + 1, >> namelen - 1, >> - next + 1, len, level + 1, honor_skip); >> + success = sysctl_sysctl_next_action(next_lsp, name + >> 1, >> + namelen - 1, next + 1, len, level + 1, >> honor_skip); >> if (!success) { >> >> /* >> @@ -1332,13 +1327,12 @@ name2oid(char *name, int *oid, int *len, struct >> sysctl_oid **oidpp) >> for (*len = 0; *len < CTL_MAXNAME;) { >> p = strsep(&name, "."); >> >> - oidp = SLIST_FIRST(lsp); >> - for (;; oidp = SLIST_NEXT(oidp, oid_link)) { >> - if (oidp == NULL) >> - return (ENOENT); >> + RB_FOREACH(oidp, sysctl_oid_list, lsp) { >> if (strcmp(p, oidp->oid_name) == 0) >> break; >> } >> + if (oidp == NULL) >> + return (ENOENT); >> *oid++ = oidp->oid_number; >> (*len)++; >> >> @@ -2162,16 +2156,15 @@ sysctl_find_oid(int *name, u_int namelen, struct >> sysctl_oid **noid, >> { >> struct sysctl_oid_list *lsp; >> struct sysctl_oid *oid; >> + struct sysctl_oid key; >> int indx; >> >> SYSCTL_ASSERT_LOCKED(); >> lsp = &sysctl__children; >> indx = 0; >> while (indx < CTL_MAXNAME) { >> - SLIST_FOREACH(oid, lsp, oid_link) { >> - if (oid->oid_number == name[indx]) >> - break; >> - } >> + key.oid_number = name[indx]; >> + oid = RB_FIND(sysctl_oid_list, lsp, &key); >> if (oid == NULL) >> return (ENOENT); >> >> diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c >> index 6572a8e362c2..6e2e78aaf597 100644 >> --- a/sys/kern/vfs_init.c >> +++ b/sys/kern/vfs_init.c >> @@ -525,7 +525,7 @@ vfs_register(struct vfsconf *vfc) >> * number. >> */ >> sysctl_wlock(); >> - SLIST_FOREACH(oidp, SYSCTL_CHILDREN(&sysctl___vfs), oid_link) { >> + RB_FOREACH(oidp, sysctl_oid_list, SYSCTL_CHILDREN(&sysctl___vfs)) { >> if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) { >> sysctl_unregister_oid(oidp); >> oidp->oid_number = vfc->vfc_typenum; >> diff --git a/sys/sys/param.h b/sys/sys/param.h >> index f875d839d41f..3f5da06ef951 100644 >> --- a/sys/sys/param.h >> +++ b/sys/sys/param.h >> @@ -76,7 +76,7 @@ >> * cannot include sys/param.h and should only be updated here. >> */ >> #undef __FreeBSD_version >> -#define __FreeBSD_version 1400070 >> +#define __FreeBSD_version 1400071 >> >> /* >> * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, >> diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h >> index 451d83bbe125..3bd77cf87243 100644 >> --- a/sys/sys/sysctl.h >> +++ b/sys/sys/sysctl.h >> @@ -39,7 +39,8 @@ >> #define _SYS_SYSCTL_H_ >> >> #ifdef _KERNEL >> -#include <sys/queue.h> >> +#include <sys/tree.h> >> +#include <sys/systm.h> >> #endif >> >> /* >> @@ -173,20 +174,25 @@ struct sysctl_req { >> int flags; >> }; >> >> -SLIST_HEAD(sysctl_oid_list, sysctl_oid); >> +struct sysctl_oid; >> + >> +/* RB Tree handling */ >> +RB_HEAD(sysctl_oid_list, sysctl_oid); >> >> /* >> * This describes one "oid" in the MIB tree. Potentially more nodes can >> * be hidden behind it, expanded by the handler. >> */ >> struct sysctl_oid { >> - struct sysctl_oid_list oid_children; >> - struct sysctl_oid_list *oid_parent; >> - SLIST_ENTRY(sysctl_oid) oid_link; >> + struct sysctl_oid_list oid_children; >> + struct sysctl_oid_list* oid_parent; >> + RB_ENTRY(sysctl_oid) oid_link; >> + /* Sort key for all siblings, and lookup key for userland */ >> int oid_number; >> u_int oid_kind; >> void *oid_arg1; >> intmax_t oid_arg2; >> + /* Must be unique amongst all siblings. */ >> const char *oid_name; >> int (*oid_handler)(SYSCTL_HANDLER_ARGS); >> const char *oid_fmt; >> @@ -196,6 +202,19 @@ struct sysctl_oid { >> const char *oid_label; >> }; >> >> +static inline int >> +cmp_sysctl_oid(struct sysctl_oid *a, struct sysctl_oid *b) >> +{ >> + if (a->oid_number > b->oid_number) >> + return (1); >> + else if (a->oid_number < b->oid_number) >> + return (-1); >> + else >> + return (0); >> +} >> + >> +RB_PROTOTYPE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid); >> + >> #define SYSCTL_IN(r, p, l) (r->newfunc)(r, p, l) >> #define SYSCTL_OUT(r, p, l) (r->oldfunc)(r, p, l) >> #define SYSCTL_OUT_STR(r, p) (r->oldfunc)(r, p, strlen(p) + 1) >> @@ -275,7 +294,7 @@ TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry); >> #define SYSCTL_OID_RAW(id, parent_child_head, nbr, name, kind, a1, >> a2, handler, fmt, descr, label) \ >> struct sysctl_oid id = { \ >> .oid_parent = (parent_child_head), \ >> - .oid_children = SLIST_HEAD_INITIALIZER(&id.oid_children), \ >> + .oid_children = RB_INITIALIZER(&id.oid_children), \ >> .oid_number = (nbr), \ >> .oid_kind = (kind), \ >> .oid_arg1 = (a1), \ >>