> Date: Tue, 30 Apr 2024 11:08:13 +0200
> From: Martin Pieuchot <[email protected]>
>
> On 27/04/24(Sat) 13:44, Visa Hankala wrote:
> > On Tue, Apr 23, 2024 at 02:48:32PM +0200, Martin Pieuchot wrote:
> > > [...]
> > > I agree. Now I'd be very grateful if someone could dig into WITNESS to
> > > figure out why we see such reports. Are these false positives or are we
> > > missing data from the code paths that we think are incorrect?
> >
> > WITNESS currently cannot show lock cycles longer than two locks.
> > To fix this, WITNESS needs to do a path search in the lock order graph.
>
> Lovely!
>
> > However, there is also something else wrong in WITNESS, possibly
> > related to situations where the kernel lock comes between two rwlocks
> > in the lock order. I still need to study this more.
>
> I greatly appreciate your dedication in this area.
>
> > Below is a patch that adds the cycle search and printing. The patch
> > also tweaks a few prints to show more context.
>
> This is ok mpi@
>
> > With the patch, the nfsnode-vmmaplk reversal looks like this:
>
> So the issue here is due to NFS entering the network stack after the
> VFS. Alexander, Vitaly, are we far from a NET_LOCK()-free sosend()?
> Is that something we should consider?
>
> On the other hand, would it make sense to have a NET_LOCK()-free
> sysctl path?
I don't think it is necessary to make sysctl calls NET_LOCK()-free,
but it does make a lot of sense to avoid calling copyin() and
copyout() while holding locks as much as possible.
For the network sysctl, that means that instead of something like:
NET_LOCK();
...
sysctl_int_bounded(oldp, oldlenp, newp, newlen, ...); /* copyin/copyout */
...
NET_UNLOCK();
we really should be doing something like:
sysctl_int_bounded(NULL, 0, newp, newlen, ...); /* copyin */
NET_LOCK();
...
NET_UNLOCK();
sysctl_int_bounded(oldp, oldlenp, NULL, 0, ...); /* copyout */
> > witness: lock order reversal:
> > 1st 0xfffffd8126deacf8 vmmaplk (&map->lock)
> > 2nd 0xffff800039831948 nfsnode (&np->n_lock)
> > lock order [1] vmmaplk (&map->lock) -> [2] nfsnode (&np->n_lock)
> > #0 rw_enter+0x6d
> > #1 rrw_enter+0x5e
> > #2 VOP_LOCK+0x5f
> > #3 vn_lock+0xbc
> > #4 vn_rdwr+0x83
> > #5 vndstrategy+0x2ca
> > #6 physio+0x204
> > #7 spec_write+0x9e
> > #8 VOP_WRITE+0x6e
> > #9 vn_write+0x100
> > #10 dofilewritev+0x143
> > #11 sys_pwrite+0x60
> > #12 syscall+0x588
> > #13 Xsyscall+0x128
> > lock order [2] nfsnode (&np->n_lock) -> [3] netlock (netlock)
> > #0 rw_enter_read+0x50
> > #1 solock_shared+0x3a
> > #2 sosend+0x10c
> > #3 nfs_send+0x8d
> > #4 nfs_request+0x258
> > #5 nfs_getattr+0xcb
> > #6 VOP_GETATTR+0x55
> > #7 mountnfs+0x37c
> > #8 nfs_mount+0x125
> > #9 sys_mount+0x343
> > #10 syscall+0x561
> > #11 Xsyscall+0x128
> > lock order [3] netlock (netlock) -> [1] vmmaplk (&map->lock)
> > #0 rw_enter_read+0x50
> > #1 uvmfault_lookup+0x8a
> > #2 uvm_fault_check+0x36
> > #3 uvm_fault+0xfb
> > #4 kpageflttrap+0x158
> > #5 kerntrap+0x94
> > #6 alltraps_kern_meltdown+0x7b
> > #7 _copyin+0x62
> > #8 sysctl_bounded_arr+0x83
> > #9 tcp_sysctl+0x546
> > #10 sys_sysctl+0x17b
> > #11 syscall+0x561
> > #12 Xsyscall+0x128
> >
> >
> > Index: kern/subr_witness.c
> > ===================================================================
> > RCS file: src/sys/kern/subr_witness.c,v
> > retrieving revision 1.50
> > diff -u -p -r1.50 subr_witness.c
> > --- kern/subr_witness.c 30 May 2023 08:30:01 -0000 1.50
> > +++ kern/subr_witness.c 27 Apr 2024 13:08:43 -0000
> > @@ -369,6 +369,13 @@ static struct witness_lock_order_data *w
> > struct witness *child);
> > static void witness_list_lock(struct lock_instance *instance,
> > int (*prnt)(const char *fmt, ...));
> > +static void witness_print_cycle(int(*prnt)(const char *fmt, ...),
> > + struct witness *parent, struct witness *child);
> > +static void witness_print_cycle_edge(int(*prnt)(const char *fmt, ...),
> > + struct witness *parent, struct witness *child,
> > + int step, int last);
> > +static int witness_search(struct witness *w, struct witness *target,
> > + struct witness **path, int depth, int *remaining);
> > static void witness_setflag(struct lock_object *lock, int flag, int set);
> >
> > /*
> > @@ -652,8 +659,9 @@ witness_ddb_display_descendants(int(*prn
> >
> > for (i = 0; i < indent; i++)
> > prnt(" ");
> > - prnt("%s (type: %s, depth: %d)",
> > - w->w_type->lt_name, w->w_class->lc_name, w->w_ddb_level);
> > + prnt("%s (%s) (type: %s, depth: %d)",
> > + w->w_subtype, w->w_type->lt_name,
> > + w->w_class->lc_name, w->w_ddb_level);
> > if (w->w_displayed) {
> > prnt(" -- (already displayed)\n");
> > return;
> > @@ -719,7 +727,8 @@ witness_ddb_display(int(*prnt)(const cha
> > SLIST_FOREACH(w, &w_all, w_list) {
> > if (w->w_acquired)
> > continue;
> > - prnt("%s (type: %s, depth: %d)\n", w->w_type->lt_name,
> > + prnt("%s (%s) (type: %s, depth: %d)\n",
> > + w->w_subtype, w->w_type->lt_name,
> > w->w_class->lc_name, w->w_ddb_level);
> > }
> > }
> > @@ -1066,47 +1075,8 @@ witness_checkorder(struct lock_object *l
> > printf(" 3rd %p %s (%s)\n", lock,
> > lock->lo_name, w->w_type->lt_name);
> > }
> > - if (witness_watch > 1) {
> > - struct witness_lock_order_data *wlod1, *wlod2;
> > -
> > - mtx_enter(&w_mtx);
> > - wlod1 = witness_lock_order_get(w, w1);
> > - wlod2 = witness_lock_order_get(w1, w);
> > - mtx_leave(&w_mtx);
> > -
> > - /*
> > - * It is safe to access saved stack traces,
> > - * w_type, and w_class without the lock.
> > - * Once written, they never change.
> > - */
> > -
> > - if (wlod1 != NULL) {
> > - printf("lock order \"%s\"(%s) -> "
> > - "\"%s\"(%s) first seen at:\n",
> > - w->w_type->lt_name,
> > - w->w_class->lc_name,
> > - w1->w_type->lt_name,
> > - w1->w_class->lc_name);
> > - stacktrace_print(
> > - &wlod1->wlod_stack, printf);
> > - } else {
> > - printf("lock order data "
> > - "w2 -> w1 missing\n");
> > - }
> > - if (wlod2 != NULL) {
> > - printf("lock order \"%s\"(%s) -> "
> > - "\"%s\"(%s) first seen at:\n",
> > - w1->w_type->lt_name,
> > - w1->w_class->lc_name,
> > - w->w_type->lt_name,
> > - w->w_class->lc_name);
> > - stacktrace_print(
> > - &wlod2->wlod_stack, printf);
> > - } else {
> > - printf("lock order data "
> > - "w1 -> w2 missing\n");
> > - }
> > - }
> > + if (witness_watch > 1)
> > + witness_print_cycle(printf, w1, w);
> > witness_debugger(0);
> > goto out_splx;
> > }
> > @@ -1875,6 +1845,92 @@ witness_list_lock(struct lock_instance *
> > stacktrace_print(&instance->li_stack->ls_stack, prnt);
> > }
> >
> > +static int
> > +witness_search(struct witness *w, struct witness *target,
> > + struct witness **path, int depth, int *remaining)
> > +{
> > + int i, any_remaining;
> > +
> > + if (depth == 0) {
> > + *remaining = 1;
> > + return (w == target);
> > + }
> > +
> > + any_remaining = 0;
> > + for (i = 1; i <= w_max_used_index; i++) {
> > + if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
> > + if (witness_search(&w_data[i], target, path, depth - 1,
> > + remaining)) {
> > + path[depth - 1] = &w_data[i];
> > + *remaining = 1;
> > + return 1;
> > + }
> > + if (remaining)
> > + any_remaining = 1;
> > + }
> > + }
> > + *remaining = any_remaining;
> > + return 0;
> > +}
> > +
> > +static void
> > +witness_print_cycle_edge(int(*prnt)(const char *fmt, ...),
> > + struct witness *parent, struct witness *child, int step, int last)
> > +{
> > + struct witness_lock_order_data *wlod;
> > + int next;
> > +
> > + if (last)
> > + next = 1;
> > + else
> > + next = step + 1;
> > + prnt("lock order [%d] %s (%s) -> [%d] %s (%s)\n",
> > + step, parent->w_subtype, parent->w_type->lt_name,
> > + next, child->w_subtype, child->w_type->lt_name);
> > + if (witness_watch > 1) {
> > + mtx_enter(&w_mtx);
> > + wlod = witness_lock_order_get(parent, child);
> > + mtx_leave(&w_mtx);
> > +
> > + if (wlod != NULL)
> > + stacktrace_print(&wlod->wlod_stack, printf);
> > + else
> > + prnt("lock order data %p -> %p is missing\n",
> > + parent->w_type->lt_name, child->w_type->lt_name);
> > + }
> > +}
> > +
> > +static void
> > +witness_print_cycle(int(*prnt)(const char *fmt, ...),
> > + struct witness *parent, struct witness *child)
> > +{
> > + struct witness *path[4];
> > + struct witness *w;
> > + int depth, remaining;
> > + int step = 0;
> > +
> > + /*
> > + * Use depth-limited search to find the shortest path
> > + * from child to parent.
> > + */
> > + for (depth = 1; depth < nitems(path); depth++) {
> > + if (witness_search(child, parent, path, depth, &remaining))
> > + goto found;
> > + if (!remaining)
> > + break;
> > + }
> > + prnt("witness: incomplete path, depth %d\n", depth);
> > + return;
> > +
> > +found:
> > + witness_print_cycle_edge(prnt, parent, child, ++step, 0);
> > + for (w = child; depth > 0; depth--) {
> > + witness_print_cycle_edge(prnt, w, path[depth - 1], ++step,
> > + depth == 1);
> > + w = path[depth - 1];
> > + }
> > +}
> > +
> > #ifdef DDB
> > static int
> > witness_thread_has_locks(struct proc *p)
> > @@ -2123,9 +2179,6 @@ db_witness_list_all(db_expr_t addr, int
> > void
> > witness_print_badstacks(void)
> > {
> > - static struct witness tmp_w1, tmp_w2;
> > - static struct witness_lock_order_data tmp_data1, tmp_data2;
> > - struct witness_lock_order_data *data1, *data2;
> > struct witness *w1, *w2;
> > int error, generation, i, j;
> >
> > @@ -2139,11 +2192,6 @@ witness_print_badstacks(void)
> > }
> > error = 0;
> >
> > - memset(&tmp_w1, 0, sizeof(tmp_w1));
> > - memset(&tmp_w2, 0, sizeof(tmp_w2));
> > - memset(&tmp_data1, 0, sizeof(tmp_data1));
> > - memset(&tmp_data2, 0, sizeof(tmp_data2));
> > -
> > restart:
> > mtx_enter(&w_mtx);
> > generation = w_generation;
> > @@ -2165,12 +2213,9 @@ restart:
> > mtx_leave(&w_mtx);
> > continue;
> > }
> > -
> > - /* Copy w1 locally so we can release the spin lock. */
> > - tmp_w1 = *w1;
> > mtx_leave(&w_mtx);
> >
> > - if (tmp_w1.w_reversed == 0)
> > + if (w1->w_reversed == 0)
> > continue;
> > for (j = 1; j < w_max_used_index; j++) {
> > if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
> > @@ -2187,47 +2232,13 @@ restart:
> > }
> >
> > w2 = &w_data[j];
> > - data1 = witness_lock_order_get(w1, w2);
> > - data2 = witness_lock_order_get(w2, w1);
> > -
> > - /*
> > - * Copy information locally so we can release the
> > - * spin lock.
> > - */
> > - tmp_w2 = *w2;
> > -
> > - if (data1)
> > - tmp_data1.wlod_stack = data1->wlod_stack;
> > - if (data2 && data2 != data1)
> > - tmp_data2.wlod_stack = data2->wlod_stack;
> > mtx_leave(&w_mtx);
> >
> > db_printf("\nLock order reversal between \"%s\"(%s) "
> > "and \"%s\"(%s)!\n",
> > - tmp_w1.w_type->lt_name, tmp_w1.w_class->lc_name,
> > - tmp_w2.w_type->lt_name, tmp_w2.w_class->lc_name);
> > - if (data1) {
> > - db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
> > - "first seen at:\n",
> > - tmp_w1.w_type->lt_name,
> > - tmp_w1.w_class->lc_name,
> > - tmp_w2.w_type->lt_name,
> > - tmp_w2.w_class->lc_name);
> > - stacktrace_print(&tmp_data1.wlod_stack,
> > - db_printf);
> > - db_printf("\n");
> > - }
> > - if (data2 && data2 != data1) {
> > - db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
> > - "first seen at:\n",
> > - tmp_w2.w_type->lt_name,
> > - tmp_w2.w_class->lc_name,
> > - tmp_w1.w_type->lt_name,
> > - tmp_w1.w_class->lc_name);
> > - stacktrace_print(&tmp_data2.wlod_stack,
> > - db_printf);
> > - db_printf("\n");
> > - }
> > + w1->w_type->lt_name, w1->w_class->lc_name,
> > + w2->w_type->lt_name, w2->w_class->lc_name);
> > + witness_print_cycle(db_printf, w1, w2);
> > }
> > }
> > mtx_enter(&w_mtx);
>
>