The branch main has been updated by jtl: URL: https://cgit.FreeBSD.org/src/commit/?id=fb4b0c91195195561560bb2fb2c1ba8da81f7ccf
commit fb4b0c91195195561560bb2fb2c1ba8da81f7ccf Author: Jonathan T. Looney <[email protected]> AuthorDate: 2026-01-17 01:20:26 +0000 Commit: Jonathan T. Looney <[email protected]> CommitDate: 2026-01-26 20:22:57 +0000 witness: Provide facility to print detailed lock tree When witness(4) detects lock order reversals (LORs), it prints information about the stack trace which caused the LOR. If available, it can also print information about the first stack trace which established the other lock ordering. However, it only does this for "simple" LORs where the two locks in question were directly locked in the opposite order. When the lock order was established through a more complex pattern of intermediate locks, WITNESS only prints the stack trace where it detected the LOR. This commit provides new functionality to provide more verbose information about the lock chain(s) which established the lock ordering. The new functionality can be disabled by setting the debug.witness.trace sysctl/tunable to 1. The new functionality is also available through the debug.witness.badstacks sysctl, which has been modified to always show the more verbose information. Reviewed by: markj, glebius (previous version), kib (previous version) Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D54785 MFC after: 1 month --- share/man/man4/witness.4 | 50 ++++++- sys/kern/subr_witness.c | 343 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 368 insertions(+), 25 deletions(-) diff --git a/share/man/man4/witness.4 b/share/man/man4/witness.4 index f382a9378727..9016da3b39c6 100644 --- a/share/man/man4/witness.4 +++ b/share/man/man4/witness.4 @@ -21,7 +21,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd March 5, 2025 +.Dd January 26, 2026 .Dt WITNESS 4 .Os .Sh NAME @@ -125,6 +125,26 @@ can be set via .Xr loader 8 . .Pp The sysctl +.Va debug.witness.trace +controls whether +.Nm +prints stack traces when it detects lock violations. +A value of 0 disables stack printing. +A value of 1 specifies that +.Nm +should print a stack trace when a lock hierarchy violation occurs or +non-sleepable locks are held when going to sleep or acquiring a sleepable lock. +A value of 2 specifies that +.Nm +should attempt to display all observed lock chains that contribute to the +established lock order, along with stack traces for when those locks were +first acquired. +The sysctl +.Va debug.witness.trace +can be set via +.Xr loader 8 . +.Pp +The sysctl .Va debug.witness.output_channel specifies the output channel used to display warnings emitted by .Nm . @@ -139,9 +159,28 @@ and This sysctl can be set via .Xr loader 8 . .Pp +The sysctl +.Va debug.witness.badstacks +displays a listing of detected lock order violations cached in the +.Nm +module's current view of the lock hierarchy. +(This means that it may not display information for locks which have been +destroyed.) +It displays a similar level of detail to the messages produced by the run-time +checks. +However, it always tries to show all observed lock chains that contribute to the +established lock order. +(In other words, it acts like +.Va debug.witness.trace +was set to 2.) +It uses '**' to mark lock orders which were attempted but not added to the +hierarchy because they violated the hierarchy the +.Nm +code had previously observed. +.Pp The .Nm -code also provides three extra +code also provides a few extra .Xr ddb 4 commands if both .Nm @@ -166,10 +205,15 @@ then the locks held by the current thread are displayed. Outputs the list of locks held by all threads in the system to the kernel console. .It Ic show witness -Dump the current order list to the kernel console. +Dumps the current order list to the kernel console. The code first displays the lock order tree for all of the sleep locks. Then it displays the lock order tree for all of the spin locks. Finally, it displays a list of locks that have not yet been acquired. +.It Ic show badstacks +Displays a listing of all WITNESS lock order violations. +This listing is identical to that produced by the +.Va debug.witness.badstacks +sysctl. .El .Sh SEE ALSO .Xr ddb 4 , diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index abed76315c34..a332d6bace2a 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -390,12 +390,16 @@ SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, ""); #if defined(DDB) || defined(KDB) /* - * When DDB or KDB is enabled and witness_trace is 1, it will cause the system - * to print a stack trace: + * When DDB or KDB is enabled and witness_trace is > 0, it will cause the system + * to print a stack trace when: * - a lock hierarchy violation occurs * - locks are held when going to sleep. + * + * Additionally, if witness_trace is 2, it will cause the system to search + * for all locks which established the known lock ordering and print + * stack traces of where the lock ordering was first established. */ -int witness_trace = 1; +int witness_trace = 2; SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, ""); #endif /* DDB || KDB */ @@ -1075,6 +1079,237 @@ witness_ddb_display(int(*prnt)(const char *fmt, ...)) } #endif /* DDB */ +#define NUM_VERBOSE_STACKS 256 +#define MAX_LOCKCHAIN_RECURSION 32 + +/* + * Struct used by the verbose witness functionality. Only sb, generation, + * pairs, pair_count, check_generation, and alloc_flags communicate data + * between multiple functions. The rest are used to pre-allocate space for + * data which would otherwise end up on the stack. + */ +struct verbose_tracker { + struct witness t_w1, t_w2; + struct stack t_stack; + struct sbuf *sb; + int generation; + int alloc_flags; + int pairs[2 * NUM_VERBOSE_STACKS]; + int pair_count; + int recursion_list[MAX_LOCKCHAIN_RECURSION]; + int found[MAX_LOCKCHAIN_RECURSION + 1]; + int iter[MAX_LOCKCHAIN_RECURSION]; + bool check_generation; +}; + +static void +init_verbose_tracker(struct verbose_tracker *t, struct sbuf *sb, + int alloc_flags, bool check_generation) +{ + + KASSERT(t != NULL, + ("%s: NULL t argument", __func__)); + KASSERT(alloc_flags == M_WAITOK || alloc_flags == M_NOWAIT, + ("%s: Unexpected alloc_flags %d", __func__, alloc_flags)); + t->sb = sb; + t->check_generation = check_generation; + t->alloc_flags = alloc_flags; +} + +static void +reset_verbose_tracker(struct verbose_tracker *t, int generation) +{ + + KASSERT(t != NULL, + ("%s: NULL t argument", __func__)); + t->pair_count = 0; + t->generation = generation; +} + +static bool +has_verbose_lockpair(const struct verbose_tracker *t, int from, int to) +{ + int i; + + /* Look for value. */ + for (i = 0; i < (2 * t->pair_count); i += 2) + if (t->pairs[i] == from && t->pairs[i + 1] == to) + return (true); + return (false); +} + +static void +add_verbose_lockpair(struct verbose_tracker *t, int from, int to) +{ + + /* Check for duplicates. */ + if (has_verbose_lockpair(t, from, to)) + return; + + /* Add a new value. */ + if (t->pair_count < NUM_VERBOSE_STACKS) { + t->pairs[t->pair_count * 2] = from; + t->pairs[(t->pair_count * 2) + 1] = to; + t->pair_count++; + } +} + +static void +sbuf_print_verbose_witness_chains(struct verbose_tracker *t, int from, int to) +{ + struct witness *w1, *w2; + int i, recursion_count; + + recursion_count = 0; + + mtx_lock_spin(&w_mtx); + if (t->check_generation && t->generation != w_generation) { + mtx_unlock_spin(&w_mtx); + + /* + * The graph has changed. Break the recursion loop. + * The calling function should figure out what happened and + * restart. + */ + return; + } + +top: + t->found[recursion_count] = 0; + + /* + * Check for a direct dependence. If so, print that here. + * However, we keep scanning just in case there are other + * locking paths between these two locks. + */ + w1 = &w_data[from]; + w2 = &w_data[to]; + if (isitmychild(w1, w2)) { + t->t_w1 = *w1; + t->t_w2 = *w2; + mtx_unlock_spin(&w_mtx); + + sbuf_printf(t->sb, "\"%s\" -> \"%s\"", + t->t_w1.w_name, t->t_w2.w_name); + + /* Add the lockchain which got us here. */ + KASSERT(recursion_count >= 0 && + recursion_count <= MAX_LOCKCHAIN_RECURSION, + ("Invalid recursion_count: %d", recursion_count)); + for (i = recursion_count - 1; i >= 0; i--) { + mtx_lock_spin(&w_mtx); + if (t->check_generation && + t->generation != w_generation) { + mtx_unlock_spin(&w_mtx); + /* The graph has changed. */ + return; + } + /* + * Make a local copy, drop the lock, and add the lock + * to the sbuf. + */ + t->t_w1 = w_data[t->recursion_list[i]]; + mtx_unlock_spin(&w_mtx); + sbuf_printf(t->sb, " -> \"%s\"", t->t_w1.w_name); + } + + sbuf_putc(t->sb, '\n'); + add_verbose_lockpair(t, from, to); + t->found[recursion_count]++; + + mtx_lock_spin(&w_mtx); + if (t->check_generation && t->generation != w_generation) { + mtx_unlock_spin(&w_mtx); + return; + } + } + + /* + * Ensure we aren't recursing too many times. We do this check + * after looking for direct dependencies so we don't fail to + * catch at least those at the limits of our recursion. + */ + if (recursion_count >= MAX_LOCKCHAIN_RECURSION) + goto end; + + /* + * Record our 'to' lock on the recursion list. We will use this + * to build successful lock chains later. + */ + t->recursion_list[recursion_count] = to; + t->iter[recursion_count] = 1; + +loop: + /* Walk all parents of 'to' to see if any have a path to 'from'. */ + for (; t->iter[recursion_count] < w_max_used_index; + t->iter[recursion_count]++) { + if (t->iter[recursion_count] == to || + t->iter[recursion_count] == from) + continue; + if (isitmychild(&w_data[t->iter[recursion_count]], + &w_data[to])) { + /* Recurse to the parent. */ + to = t->iter[recursion_count]; + recursion_count++; + goto top; + } + } +end: + if (recursion_count != 0) { + recursion_count--; + to = t->recursion_list[recursion_count]; + if (t->found[recursion_count + 1] > 0) { + add_verbose_lockpair(t, t->iter[recursion_count], to); + t->found[recursion_count]++; + } + t->iter[recursion_count]++; + goto loop; + } + mtx_unlock_spin(&w_mtx); +} + +static void +sbuf_print_verbose_witness_stacks(struct verbose_tracker *t) +{ + struct witness_lock_order_data *data; + int i; + + for (i = 0; i < (2 * t->pair_count); i += 2) { + mtx_lock_spin(&w_mtx); + if (t->check_generation && t->generation != w_generation) { + /* + * The graph has changed. Return to the calling + * function so it can restart. + */ + mtx_unlock_spin(&w_mtx); + break; + } + + /* + * Make a local copy of the data we need so we can drop + * the lock. + */ + t->t_w1 = w_data[t->pairs[i]]; + t->t_w2 = w_data[t->pairs[i + 1]]; + data = witness_lock_order_get(&t->t_w1, &t->t_w2); + if (data != NULL) + stack_copy(&data->wlod_stack, &t->t_stack); + mtx_unlock_spin(&w_mtx); + + sbuf_printf(t->sb, + "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", + t->t_w1.w_name, t->t_w1.w_class->lc_name, + t->t_w2.w_name, t->t_w2.w_class->lc_name); + if (data != NULL) + stack_sbuf_print_flags(t->sb, &t->t_stack, + t->alloc_flags, STACK_SBUF_FMT_LONG); + else + sbuf_printf(t->sb, + "(No stack trace--hardcoded relationship?)\n"); + sbuf_putc(t->sb, '\n'); + } +} + int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { @@ -1117,6 +1352,7 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file, struct witness *w, *w1; struct thread *td; int i, j; + bool print_lock_order; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL || KERNEL_PANICKED()) @@ -1279,7 +1515,8 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file, for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) { for (i = lle->ll_count - 1; i >= 0; i--, j++) { struct stack pstack; - bool pstackv, trace; + int trace; + bool pstackv; MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN); lock1 = &lle->ll_children[i]; @@ -1396,6 +1633,7 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file, /* * Ok, yell about it. */ + print_lock_order = false; if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0 && (lock1->li_flags & LI_SLEEPABLE) == 0) @@ -1405,8 +1643,10 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file, && lock == &Giant.lock_object) witness_output( "lock order reversal: (Giant after non-sleepable)\n"); - else + else { witness_output("lock order reversal:\n"); + print_lock_order = true; + } /* * Try to locate an earlier lock with @@ -1455,6 +1695,7 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file, if (trace) { char buf[64]; struct sbuf sb; + struct verbose_tracker *t; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, @@ -1466,6 +1707,37 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file, w->w_name, w1->w_name); stack_sbuf_print_flags(&sb, &pstack, M_NOWAIT, STACK_SBUF_FMT_LONG); + } else if (trace > 1 && print_lock_order && + (t = malloc(sizeof(struct verbose_tracker), + M_TEMP, M_NOWAIT | M_ZERO)) != NULL) { + /* + * We make a purposeful decision to + * ignore generation changes while + * printing. The two locks in + * question are in use, so won't be + * going away. There is a small + * chance that intermediate locks + * in a lock chain get destroyed + * while we are traversing the + * chain or printing them, but even + * then nothing "bad" should happen + * with the current code since the + * WITNESS objects are not actually + * freed and re-used. If that changes, + * we might need to reassess the + * decision to ignore generation. + */ + init_verbose_tracker(t, &sb, M_NOWAIT, + false); + reset_verbose_tracker(t, 0); + sbuf_printf(&sb, + "All lock orders from %s -> %s:\n", + w->w_name, w1->w_name); + sbuf_print_verbose_witness_chains(t, + w->w_index, w1->w_index); + sbuf_putc(&sb, '\n'); + sbuf_print_verbose_witness_stacks(t); + free(t, M_TEMP); } sbuf_printf(&sb, @@ -2645,16 +2917,14 @@ DB_SHOW_COMMAND_FLAGS(witness, db_witness_display, DB_CMD_MEMSAFE) #endif static void -sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx) +sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx, + bool check_generation) { struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2; struct witness *tmp_w1, *tmp_w2, *w1, *w2; + struct verbose_tracker *t; int generation, i, j; - - tmp_data1 = NULL; - tmp_data2 = NULL; - tmp_w1 = NULL; - tmp_w2 = NULL; + bool w1_is_parent, w2_is_parent; /* Allocate and init temporary storage space. */ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); @@ -2665,16 +2935,19 @@ sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx) M_WAITOK | M_ZERO); stack_zero(&tmp_data1->wlod_stack); stack_zero(&tmp_data2->wlod_stack); + t = malloc(sizeof(struct verbose_tracker), M_TEMP, M_WAITOK | M_ZERO); + init_verbose_tracker(t, sb, M_WAITOK, check_generation); restart: mtx_lock_spin(&w_mtx); generation = w_generation; mtx_unlock_spin(&w_mtx); + reset_verbose_tracker(t, generation); sbuf_printf(sb, "Number of known direct relationships is %d\n", w_lohash.wloh_count); for (i = 1; i < w_max_used_index; i++) { mtx_lock_spin(&w_mtx); - if (generation != w_generation) { + if (check_generation && generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ @@ -2700,7 +2973,7 @@ restart: continue; mtx_lock_spin(&w_mtx); - if (generation != w_generation) { + if (check_generation && generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ @@ -2729,6 +3002,8 @@ restart: stack_copy(&data2->wlod_stack, &tmp_data2->wlod_stack); } + w1_is_parent = isitmydescendant(w1, w2); + w2_is_parent = isitmydescendant(w2, w1); mtx_unlock_spin(&w_mtx); if (blessed(tmp_w1, tmp_w2)) @@ -2738,26 +3013,49 @@ restart: "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); - if (data1) { + if (w1_is_parent || data1 != NULL) { sbuf_printf(sb, - "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", + "All lock orders from \"%s\"(%s) -> \"%s\"(%s):\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); - stack_sbuf_print(sb, &tmp_data1->wlod_stack); + if (w1_is_parent) + sbuf_print_verbose_witness_chains(t, i, + j); + if (data1 && !has_verbose_lockpair(t, i, j)) { + sbuf_printf(t->sb, + "** \"%s\" -> \"%s\"\n", + tmp_w1->w_name, tmp_w2->w_name); + add_verbose_lockpair(t, i, j); + } + sbuf_putc(sb, '\n'); + sbuf_print_verbose_witness_stacks(t); sbuf_putc(sb, '\n'); + reset_verbose_tracker(t, generation); } - if (data2 && data2 != data1) { + if (w2_is_parent || (data2 != NULL && data2 != data1)) { sbuf_printf(sb, - "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", + "All lock orders from \"%s\"(%s) -> \"%s\"(%s):\n", tmp_w2->w_name, tmp_w2->w_class->lc_name, tmp_w1->w_name, tmp_w1->w_class->lc_name); - stack_sbuf_print(sb, &tmp_data2->wlod_stack); + if (w2_is_parent) + sbuf_print_verbose_witness_chains(t, j, + i); + if (data2 && data2 != data1 && + !has_verbose_lockpair(t, j, i)) { + sbuf_printf(t->sb, + "** \"%s\" -> \"%s\"\n", + tmp_w2->w_name, tmp_w1->w_name); + add_verbose_lockpair(t, j, i); + } + sbuf_putc(sb, '\n'); + sbuf_print_verbose_witness_stacks(t); sbuf_putc(sb, '\n'); + reset_verbose_tracker(t, generation); } } } mtx_lock_spin(&w_mtx); - if (generation != w_generation) { + if (check_generation && generation != w_generation) { mtx_unlock_spin(&w_mtx); /* @@ -2775,6 +3073,7 @@ restart: free(tmp_data2, M_TEMP); free(tmp_w1, M_TEMP); free(tmp_w2, M_TEMP); + free(t, M_TEMP); } static int @@ -2796,7 +3095,7 @@ sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) if (sb == NULL) return (ENOMEM); - sbuf_print_witness_badstacks(sb, &req->oldidx); + sbuf_print_witness_badstacks(sb, &req->oldidx, true); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); @@ -2820,7 +3119,7 @@ DB_SHOW_COMMAND_FLAGS(badstacks, db_witness_badstacks, DB_CMD_MEMSAFE) sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL); - sbuf_print_witness_badstacks(&sb, &dummy); + sbuf_print_witness_badstacks(&sb, &dummy, false); sbuf_finish(&sb); } #endif
