On 27/04/24(Sat) 13:44, Visa Hankala wrote:
> On Tue, Apr 23, 2024 at 02:48:32PM +0200, Martin Pieuchot wrote:
> > [...]
> > I agree. Now I'd be very grateful if someone could dig into WITNESS to
> > figure out why we see such reports. Are these false positive or are we
> > missing data from the code path that we think are incorrect?
>
> WITNESS currently cannot show lock cycles longer than two locks.
> To fix this, WITNESS needs to do a path search in the lock order graph.
Lovely!
> However, there is also something else wrong in WITNESS, possibly
> related to situations where the kernel lock comes between two rwlocks
> in the lock order. I still need to study this more.
I greatly appreciate your dedication in this area.
> Below is a patch that adds the cycle search and printing. The patch
> also tweaks a few prints to show more context.
This is ok mpi@
> With the patch, the nfsnode-vmmaplk reversal looks like this:
So the issue here is due to NFS entering the network stack after the
VFS. Alexander, Vitaly, are we far from a NET_LOCK()-free sosend()?
Is that something we should consider?
On the other hand, would it make sense to have a NET_LOCK()-free
sysctl path?
> witness: lock order reversal:
> 1st 0xfffffd8126deacf8 vmmaplk (&map->lock)
> 2nd 0xffff800039831948 nfsnode (&np->n_lock)
> lock order [1] vmmaplk (&map->lock) -> [2] nfsnode (&np->n_lock)
> #0 rw_enter+0x6d
> #1 rrw_enter+0x5e
> #2 VOP_LOCK+0x5f
> #3 vn_lock+0xbc
> #4 vn_rdwr+0x83
> #5 vndstrategy+0x2ca
> #6 physio+0x204
> #7 spec_write+0x9e
> #8 VOP_WRITE+0x6e
> #9 vn_write+0x100
> #10 dofilewritev+0x143
> #11 sys_pwrite+0x60
> #12 syscall+0x588
> #13 Xsyscall+0x128
> lock order [2] nfsnode (&np->n_lock) -> [3] netlock (netlock)
> #0 rw_enter_read+0x50
> #1 solock_shared+0x3a
> #2 sosend+0x10c
> #3 nfs_send+0x8d
> #4 nfs_request+0x258
> #5 nfs_getattr+0xcb
> #6 VOP_GETATTR+0x55
> #7 mountnfs+0x37c
> #8 nfs_mount+0x125
> #9 sys_mount+0x343
> #10 syscall+0x561
> #11 Xsyscall+0x128
> lock order [3] netlock (netlock) -> [1] vmmaplk (&map->lock)
> #0 rw_enter_read+0x50
> #1 uvmfault_lookup+0x8a
> #2 uvm_fault_check+0x36
> #3 uvm_fault+0xfb
> #4 kpageflttrap+0x158
> #5 kerntrap+0x94
> #6 alltraps_kern_meltdown+0x7b
> #7 _copyin+0x62
> #8 sysctl_bounded_arr+0x83
> #9 tcp_sysctl+0x546
> #10 sys_sysctl+0x17b
> #11 syscall+0x561
> #12 Xsyscall+0x128
>
>
> Index: kern/subr_witness.c
> ===================================================================
> RCS file: src/sys/kern/subr_witness.c,v
> retrieving revision 1.50
> diff -u -p -r1.50 subr_witness.c
> --- kern/subr_witness.c 30 May 2023 08:30:01 -0000 1.50
> +++ kern/subr_witness.c 27 Apr 2024 13:08:43 -0000
> @@ -369,6 +369,13 @@ static struct witness_lock_order_data *w
> struct witness *child);
> static void witness_list_lock(struct lock_instance *instance,
> int (*prnt)(const char *fmt, ...));
> +static void witness_print_cycle(int(*prnt)(const char *fmt, ...),
> + struct witness *parent, struct witness *child);
> +static void witness_print_cycle_edge(int(*prnt)(const char *fmt, ...),
> + struct witness *parent, struct witness *child,
> + int step, int last);
> +static int witness_search(struct witness *w, struct witness *target,
> + struct witness **path, int depth, int *remaining);
> static void witness_setflag(struct lock_object *lock, int flag, int set);
>
> /*
> @@ -652,8 +659,9 @@ witness_ddb_display_descendants(int(*prn
>
> for (i = 0; i < indent; i++)
> prnt(" ");
> - prnt("%s (type: %s, depth: %d)",
> - w->w_type->lt_name, w->w_class->lc_name, w->w_ddb_level);
> + prnt("%s (%s) (type: %s, depth: %d)",
> + w->w_subtype, w->w_type->lt_name,
> + w->w_class->lc_name, w->w_ddb_level);
> if (w->w_displayed) {
> prnt(" -- (already displayed)\n");
> return;
> @@ -719,7 +727,8 @@ witness_ddb_display(int(*prnt)(const cha
> SLIST_FOREACH(w, &w_all, w_list) {
> if (w->w_acquired)
> continue;
> - prnt("%s (type: %s, depth: %d)\n", w->w_type->lt_name,
> + prnt("%s (%s) (type: %s, depth: %d)\n",
> + w->w_subtype, w->w_type->lt_name,
> w->w_class->lc_name, w->w_ddb_level);
> }
> }
> @@ -1066,47 +1075,8 @@ witness_checkorder(struct lock_object *l
> printf(" 3rd %p %s (%s)\n", lock,
> lock->lo_name, w->w_type->lt_name);
> }
> - if (witness_watch > 1) {
> - struct witness_lock_order_data *wlod1, *wlod2;
> -
> - mtx_enter(&w_mtx);
> - wlod1 = witness_lock_order_get(w, w1);
> - wlod2 = witness_lock_order_get(w1, w);
> - mtx_leave(&w_mtx);
> -
> - /*
> - * It is safe to access saved stack traces,
> - * w_type, and w_class without the lock.
> - * Once written, they never change.
> - */
> -
> - if (wlod1 != NULL) {
> - printf("lock order \"%s\"(%s) -> "
> - "\"%s\"(%s) first seen at:\n",
> - w->w_type->lt_name,
> - w->w_class->lc_name,
> - w1->w_type->lt_name,
> - w1->w_class->lc_name);
> - stacktrace_print(
> - &wlod1->wlod_stack, printf);
> - } else {
> - printf("lock order data "
> - "w2 -> w1 missing\n");
> - }
> - if (wlod2 != NULL) {
> - printf("lock order \"%s\"(%s) -> "
> - "\"%s\"(%s) first seen at:\n",
> - w1->w_type->lt_name,
> - w1->w_class->lc_name,
> - w->w_type->lt_name,
> - w->w_class->lc_name);
> - stacktrace_print(
> - &wlod2->wlod_stack, printf);
> - } else {
> - printf("lock order data "
> - "w1 -> w2 missing\n");
> - }
> - }
> + if (witness_watch > 1)
> + witness_print_cycle(printf, w1, w);
> witness_debugger(0);
> goto out_splx;
> }
> @@ -1875,6 +1845,92 @@ witness_list_lock(struct lock_instance *
> stacktrace_print(&instance->li_stack->ls_stack, prnt);
> }
>
> +static int
> +witness_search(struct witness *w, struct witness *target,
> + struct witness **path, int depth, int *remaining)
> +{
> + int i, any_remaining;
> +
> + if (depth == 0) {
> + *remaining = 1;
> + return (w == target);
> + }
> +
> + any_remaining = 0;
> + for (i = 1; i <= w_max_used_index; i++) {
> + if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
> + if (witness_search(&w_data[i], target, path, depth - 1,
> + remaining)) {
> + path[depth - 1] = &w_data[i];
> + *remaining = 1;
> + return 1;
> + }
> +			if (*remaining)
> +				any_remaining = 1;
> + }
> + }
> + *remaining = any_remaining;
> + return 0;
> +}
> +
> +static void
> +witness_print_cycle_edge(int(*prnt)(const char *fmt, ...),
> + struct witness *parent, struct witness *child, int step, int last)
> +{
> + struct witness_lock_order_data *wlod;
> + int next;
> +
> + if (last)
> + next = 1;
> + else
> + next = step + 1;
> + prnt("lock order [%d] %s (%s) -> [%d] %s (%s)\n",
> + step, parent->w_subtype, parent->w_type->lt_name,
> + next, child->w_subtype, child->w_type->lt_name);
> + if (witness_watch > 1) {
> + mtx_enter(&w_mtx);
> + wlod = witness_lock_order_get(parent, child);
> + mtx_leave(&w_mtx);
> +
> + if (wlod != NULL)
> + stacktrace_print(&wlod->wlod_stack, printf);
> + else
> +			prnt("lock order data %s -> %s is missing\n",
> +			    parent->w_type->lt_name, child->w_type->lt_name);
> + }
> +}
> +
> +static void
> +witness_print_cycle(int(*prnt)(const char *fmt, ...),
> + struct witness *parent, struct witness *child)
> +{
> + struct witness *path[4];
> + struct witness *w;
> + int depth, remaining;
> + int step = 0;
> +
> + /*
> + * Use depth-limited search to find the shortest path
> + * from child to parent.
> + */
> + for (depth = 1; depth < nitems(path); depth++) {
> + if (witness_search(child, parent, path, depth, &remaining))
> + goto found;
> + if (!remaining)
> + break;
> + }
> + prnt("witness: incomplete path, depth %d\n", depth);
> + return;
> +
> +found:
> + witness_print_cycle_edge(prnt, parent, child, ++step, 0);
> + for (w = child; depth > 0; depth--) {
> + witness_print_cycle_edge(prnt, w, path[depth - 1], ++step,
> + depth == 1);
> + w = path[depth - 1];
> + }
> +}
> +
> #ifdef DDB
> static int
> witness_thread_has_locks(struct proc *p)
> @@ -2123,9 +2179,6 @@ db_witness_list_all(db_expr_t addr, int
> void
> witness_print_badstacks(void)
> {
> - static struct witness tmp_w1, tmp_w2;
> - static struct witness_lock_order_data tmp_data1, tmp_data2;
> - struct witness_lock_order_data *data1, *data2;
> struct witness *w1, *w2;
> int error, generation, i, j;
>
> @@ -2139,11 +2192,6 @@ witness_print_badstacks(void)
> }
> error = 0;
>
> - memset(&tmp_w1, 0, sizeof(tmp_w1));
> - memset(&tmp_w2, 0, sizeof(tmp_w2));
> - memset(&tmp_data1, 0, sizeof(tmp_data1));
> - memset(&tmp_data2, 0, sizeof(tmp_data2));
> -
> restart:
> mtx_enter(&w_mtx);
> generation = w_generation;
> @@ -2165,12 +2213,9 @@ restart:
> mtx_leave(&w_mtx);
> continue;
> }
> -
> - /* Copy w1 locally so we can release the spin lock. */
> - tmp_w1 = *w1;
> mtx_leave(&w_mtx);
>
> - if (tmp_w1.w_reversed == 0)
> + if (w1->w_reversed == 0)
> continue;
> for (j = 1; j < w_max_used_index; j++) {
> if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
> @@ -2187,47 +2232,13 @@ restart:
> }
>
> w2 = &w_data[j];
> - data1 = witness_lock_order_get(w1, w2);
> - data2 = witness_lock_order_get(w2, w1);
> -
> - /*
> - * Copy information locally so we can release the
> - * spin lock.
> - */
> - tmp_w2 = *w2;
> -
> - if (data1)
> - tmp_data1.wlod_stack = data1->wlod_stack;
> - if (data2 && data2 != data1)
> - tmp_data2.wlod_stack = data2->wlod_stack;
> mtx_leave(&w_mtx);
>
> db_printf("\nLock order reversal between \"%s\"(%s) "
> "and \"%s\"(%s)!\n",
> - tmp_w1.w_type->lt_name, tmp_w1.w_class->lc_name,
> - tmp_w2.w_type->lt_name, tmp_w2.w_class->lc_name);
> - if (data1) {
> - db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
> - "first seen at:\n",
> - tmp_w1.w_type->lt_name,
> - tmp_w1.w_class->lc_name,
> - tmp_w2.w_type->lt_name,
> - tmp_w2.w_class->lc_name);
> - stacktrace_print(&tmp_data1.wlod_stack,
> - db_printf);
> - db_printf("\n");
> - }
> - if (data2 && data2 != data1) {
> - db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
> - "first seen at:\n",
> - tmp_w2.w_type->lt_name,
> - tmp_w2.w_class->lc_name,
> - tmp_w1.w_type->lt_name,
> - tmp_w1.w_class->lc_name);
> - stacktrace_print(&tmp_data2.wlod_stack,
> - db_printf);
> - db_printf("\n");
> - }
> + w1->w_type->lt_name, w1->w_class->lc_name,
> + w2->w_type->lt_name, w2->w_class->lc_name);
> + witness_print_cycle(db_printf, w1, w2);
> }
> }
> mtx_enter(&w_mtx);