On 27/04/24(Sat) 13:44, Visa Hankala wrote:
> On Tue, Apr 23, 2024 at 02:48:32PM +0200, Martin Pieuchot wrote:
> > [...]
> > I agree. Now I'd be very grateful if someone could dig into WITNESS to
> > figure out why we see such reports. Are these false positive or are we
> > missing data from the code path that we think are incorrect?
>
> WITNESS currently cannot show lock cycles longer than two locks.
> To fix this, WITNESS needs to do a path search in the lock order graph.
Lovely!
> However, there is also something else wrong in WITNESS, possibly
> related to situations where the kernel lock comes between two rwlocks
> in the lock order. I still need to study this more.
I greatly appreciate your dedication in this area.
> Below is a patch that adds the cycle search and printing. The patch
> also tweaks a few prints to show more context.
This is ok mpi@
> With the patch, the nfsnode-vmmaplk reversal looks like this:
So the issue here is due to NFS entering the network stack after the
VFS. Alexander, Vitaly, are we far from a NET_LOCK()-free sosend()?
Is that something we should consider?
On the other hand, would it make sense to have a NET_LOCK()-free
sysctl path?
> witness: lock order reversal:
> 1st 0xfffffd8126deacf8 vmmaplk (&map->lock)
> 2nd 0xffff800039831948 nfsnode (&np->n_lock)
> lock order [1] vmmaplk (&map->lock) -> [2] nfsnode (&np->n_lock)
> #0 rw_enter+0x6d
> #1 rrw_enter+0x5e
> #2 VOP_LOCK+0x5f
> #3 vn_lock+0xbc
> #4 vn_rdwr+0x83
> #5 vndstrategy+0x2ca
> #6 physio+0x204
> #7 spec_write+0x9e
> #8 VOP_WRITE+0x6e
> #9 vn_write+0x100
> #10 dofilewritev+0x143
> #11 sys_pwrite+0x60
> #12 syscall+0x588
> #13 Xsyscall+0x128
> lock order [2] nfsnode (&np->n_lock) -> [3] netlock (netlock)
> #0 rw_enter_read+0x50
> #1 solock_shared+0x3a
> #2 sosend+0x10c
> #3 nfs_send+0x8d
> #4 nfs_request+0x258
> #5 nfs_getattr+0xcb
> #6 VOP_GETATTR+0x55
> #7 mountnfs+0x37c
> #8 nfs_mount+0x125
> #9 sys_mount+0x343
> #10 syscall+0x561
> #11 Xsyscall+0x128
> lock order [3] netlock (netlock) -> [1] vmmaplk (&map->lock)
> #0 rw_enter_read+0x50
> #1 uvmfault_lookup+0x8a
> #2 uvm_fault_check+0x36
> #3 uvm_fault+0xfb
> #4 kpageflttrap+0x158
> #5 kerntrap+0x94
> #6 alltraps_kern_meltdown+0x7b
> #7 _copyin+0x62
> #8 sysctl_bounded_arr+0x83
> #9 tcp_sysctl+0x546
> #10 sys_sysctl+0x17b
> #11 syscall+0x561
> #12 Xsyscall+0x128
>
>
> Index: kern/subr_witness.c
> ===================================================================
> RCS file: src/sys/kern/subr_witness.c,v
> retrieving revision 1.50
> diff -u -p -r1.50 subr_witness.c
> --- kern/subr_witness.c 30 May 2023 08:30:01 -0000 1.50
> +++ kern/subr_witness.c 27 Apr 2024 13:08:43 -0000
> @@ -369,6 +369,13 @@ static struct witness_lock_order_data *w
> struct witness *child);
> static void witness_list_lock(struct lock_instance *instance,
> int (*prnt)(const char *fmt, ...));
> +static void witness_print_cycle(int(*prnt)(const char *fmt, ...),
> + struct witness *parent, struct witness *child);
> +static void witness_print_cycle_edge(int(*prnt)(const char *fmt, ...),
> + struct witness *parent, struct witness *child,
> + int step, int last);
> +static int witness_search(struct witness *w, struct witness *target,
> + struct witness **path, int depth, int *remaining);
> static void witness_setflag(struct lock_object *lock, int flag, int set);
>
> /*
> @@ -652,8 +659,9 @@ witness_ddb_display_descendants(int(*prn
>
> for (i = 0; i < indent; i++)
> prnt(" ");
> - prnt("%s (type: %s, depth: %d)",
> - w->w_type->lt_name, w->w_class->lc_name, w->w_ddb_level);
> + prnt("%s (%s) (type: %s, depth: %d)",
> + w->w_subtype, w->w_type->lt_name,
> + w->w_class->lc_name, w->w_ddb_level);
> if (w->w_displayed) {
> prnt(" -- (already displayed)\n");
> return;
> @@ -719,7 +727,8 @@ witness_ddb_display(int(*prnt)(const cha
> SLIST_FOREACH(w, &w_all, w_list) {
> if (w->w_acquired)
> continue;
> - prnt("%s (type: %s, depth: %d)\n", w->w_type->lt_name,
> + prnt("%s (%s) (type: %s, depth: %d)\n",
> + w->w_subtype, w->w_type->lt_name,
> w->w_class->lc_name, w->w_ddb_level);
> }
> }
> @@ -1066,47 +1075,8 @@ witness_checkorder(struct lock_object *l
> printf(" 3rd %p %s (%s)\n", lock,
> lock->lo_name, w->w_type->lt_name);
> }
> - if (witness_watch > 1) {
> - struct witness_lock_order_data *wlod1, *wlod2;
> -
> - mtx_enter(&w_mtx);
> - wlod1 = witness_lock_order_get(w, w1);
> - wlod2 = witness_lock_order_get(w1, w);
> - mtx_leave(&w_mtx);
> -
> - /*
> - * It is safe to access saved stack traces,
> - * w_type, and w_class without the lock.
> - * Once written, they never change.
> - */
> -
> - if (wlod1 != NULL) {
> - printf("lock order \"%s\"(%s) -> "
> - "\"%s\"(%s) first seen at:\n",
> - w->w_type->lt_name,
> - w->w_class->lc_name,
> - w1->w_type->lt_name,
> - w1->w_class->lc_name);
> - stacktrace_print(
> - &wlod1->wlod_stack, printf);
> - } else {
> - printf("lock order data "
> - "w2 -> w1 missing\n");
> - }
> - if (wlod2 != NULL) {
> - printf("lock order \"%s\"(%s) -> "
> - "\"%s\"(%s) first seen at:\n",
> - w1->w_type->lt_name,
> - w1->w_class->lc_name,
> - w->w_type->lt_name,
> - w->w_class->lc_name);
> - stacktrace_print(
> - &wlod2->wlod_stack, printf);
> - } else {
> - printf("lock order data "
> - "w1 -> w2 missing\n");
> - }
> - }
> + if (witness_watch > 1)
> + witness_print_cycle(printf, w1, w);
> witness_debugger(0);
> goto out_splx;
> }
> @@ -1875,6 +1845,92 @@ witness_list_lock(struct lock_instance *
> stacktrace_print(&instance->li_stack->ls_stack, prnt);
> }
>
> +static int
> +witness_search(struct witness *w, struct witness *target,
> + struct witness **path, int depth, int *remaining)
> +{
> + int i, any_remaining;
> +
> + if (depth == 0) {
> + *remaining = 1;
> + return (w == target);
> + }
> +
> + any_remaining = 0;
> + for (i = 1; i <= w_max_used_index; i++) {
> + if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
> + if (witness_search(&w_data[i], target, path, depth - 1,
> + remaining)) {
> + path[depth - 1] = &w_data[i];
> + *remaining = 1;
> + return 1;
> + }
> +			if (*remaining)
> +				any_remaining = 1;
> + }
> + }
> + *remaining = any_remaining;
> + return 0;
> +}
> +
> +static void
> +witness_print_cycle_edge(int(*prnt)(const char *fmt, ...),
> + struct witness *parent, struct witness *child, int step, int last)
> +{
> + struct witness_lock_order_data *wlod;
> + int next;
> +
> + if (last)
> + next = 1;
> + else
> + next = step + 1;
> + prnt("lock order [%d] %s (%s) -> [%d] %s (%s)\n",
> + step, parent->w_subtype, parent->w_type->lt_name,
> + next, child->w_subtype, child->w_type->lt_name);
> + if (witness_watch > 1) {
> + mtx_enter(&w_mtx);
> + wlod = witness_lock_order_get(parent, child);
> + mtx_leave(&w_mtx);
> +
> + if (wlod != NULL)
> + stacktrace_print(&wlod->wlod_stack, printf);
> + else
> +			prnt("lock order data %s -> %s is missing\n",
> +			    parent->w_type->lt_name, child->w_type->lt_name);
> + }
> +}
> +
> +static void
> +witness_print_cycle(int(*prnt)(const char *fmt, ...),
> + struct witness *parent, struct witness *child)
> +{
> + struct witness *path[4];
> + struct witness *w;
> + int depth, remaining;
> + int step = 0;
> +
> + /*
> + * Use depth-limited search to find the shortest path
> + * from child to parent.
> + */
> + for (depth = 1; depth < nitems(path); depth++) {
> + if (witness_search(child, parent, path, depth, &remaining))
> + goto found;
> + if (!remaining)
> + break;
> + }
> + prnt("witness: incomplete path, depth %d\n", depth);
> + return;
> +
> +found:
> + witness_print_cycle_edge(prnt, parent, child, ++step, 0);
> + for (w = child; depth > 0; depth--) {
> + witness_print_cycle_edge(prnt, w, path[depth - 1], ++step,
> + depth == 1);
> + w = path[depth - 1];
> + }
> +}
> +
> #ifdef DDB
> static int
> witness_thread_has_locks(struct proc *p)
> @@ -2123,9 +2179,6 @@ db_witness_list_all(db_expr_t addr, int
> void
> witness_print_badstacks(void)
> {
> - static struct witness tmp_w1, tmp_w2;
> - static struct witness_lock_order_data tmp_data1, tmp_data2;
> - struct witness_lock_order_data *data1, *data2;
> struct witness *w1, *w2;
> int error, generation, i, j;
>
> @@ -2139,11 +2192,6 @@ witness_print_badstacks(void)
> }
> error = 0;
>
> - memset(&tmp_w1, 0, sizeof(tmp_w1));
> - memset(&tmp_w2, 0, sizeof(tmp_w2));
> - memset(&tmp_data1, 0, sizeof(tmp_data1));
> - memset(&tmp_data2, 0, sizeof(tmp_data2));
> -
> restart:
> mtx_enter(&w_mtx);
> generation = w_generation;
> @@ -2165,12 +2213,9 @@ restart:
> mtx_leave(&w_mtx);
> continue;
> }
> -
> - /* Copy w1 locally so we can release the spin lock. */
> - tmp_w1 = *w1;
> mtx_leave(&w_mtx);
>
> - if (tmp_w1.w_reversed == 0)
> + if (w1->w_reversed == 0)
> continue;
> for (j = 1; j < w_max_used_index; j++) {
> if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
> @@ -2187,47 +2232,13 @@ restart:
> }
>
> w2 = &w_data[j];
> - data1 = witness_lock_order_get(w1, w2);
> - data2 = witness_lock_order_get(w2, w1);
> -
> - /*
> - * Copy information locally so we can release the
> - * spin lock.
> - */
> - tmp_w2 = *w2;
> -
> - if (data1)
> - tmp_data1.wlod_stack = data1->wlod_stack;
> - if (data2 && data2 != data1)
> - tmp_data2.wlod_stack = data2->wlod_stack;
> mtx_leave(&w_mtx);
>
> db_printf("\nLock order reversal between \"%s\"(%s) "
> "and \"%s\"(%s)!\n",
> - tmp_w1.w_type->lt_name, tmp_w1.w_class->lc_name,
> - tmp_w2.w_type->lt_name, tmp_w2.w_class->lc_name);
> - if (data1) {
> - db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
> - "first seen at:\n",
> - tmp_w1.w_type->lt_name,
> - tmp_w1.w_class->lc_name,
> - tmp_w2.w_type->lt_name,
> - tmp_w2.w_class->lc_name);
> - stacktrace_print(&tmp_data1.wlod_stack,
> - db_printf);
> - db_printf("\n");
> - }
> - if (data2 && data2 != data1) {
> - db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
> - "first seen at:\n",
> - tmp_w2.w_type->lt_name,
> - tmp_w2.w_class->lc_name,
> - tmp_w1.w_type->lt_name,
> - tmp_w1.w_class->lc_name);
> - stacktrace_print(&tmp_data2.wlod_stack,
> - db_printf);
> - db_printf("\n");
> - }
> + w1->w_type->lt_name, w1->w_class->lc_name,
> + w2->w_type->lt_name, w2->w_class->lc_name);
> + witness_print_cycle(db_printf, w1, w2);
> }
> }
> mtx_enter(&w_mtx);