Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On 7/12/19 1:55 AM, Peter Zijlstra wrote: On Thu, Jul 11, 2019 at 11:53:12AM -0700, Bart Van Assche wrote: On 7/10/19 3:09 PM, Peter Zijlstra wrote: One thing I mentioned when Thomas did the unwinder API changes was trying to move lockdep over to something like stackdepot. We can't directly use stackdepot as is, because it uses locks and memory allocation, but we could maybe add a lower level API to it and use that under the graph_lock() on static storage or something. Otherwise we'll have to (re)implement something like it. I've not looked at it in detail. Hi Peter, Is something like the untested patch below perhaps what you had in mind? Most excellent, yes! Now I suppose the $64000 question is if it actually reduces the amount of storage we use for stack traces.. Seems to boot just fine.. :-) Hi Peter, On my setup after some time the space occupied by stack traces stabilizes to the following: # grep stack /proc/lockdep_stats stack-trace entries:169456 [max: 524288] number of stack traces:9073 number of stack hash chains: 6980 I think these numbers show that there are some but not too many hash collisions (the code for the hash statistics was added after I e-mailed my patch). Bart.
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On Thu, Jul 11, 2019 at 11:53:12AM -0700, Bart Van Assche wrote: > On 7/10/19 3:09 PM, Peter Zijlstra wrote: > > One thing I mentioned when Thomas did the unwinder API changes was > > trying to move lockdep over to something like stackdepot. > > > > We can't directly use stackdepot as is, because it uses locks and memory > > allocation, but we could maybe add a lower level API to it and use that > > under the graph_lock() on static storage or something. > > > > Otherwise we'll have to (re)implement something like it. > > > > I've not looked at it in detail. > > Hi Peter, > > Is something like the untested patch below perhaps what you had in mind? Most excellent, yes! Now I suppose the $64000 question is if it actually reduces the amount of storage we use for stack traces.. Seems to boot just fine.. :-)
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On 7/10/19 3:09 PM, Peter Zijlstra wrote: One thing I mentioned when Thomas did the unwinder API changes was trying to move lockdep over to something like stackdepot. We can't directly use stackdepot as is, because it uses locks and memory allocation, but we could maybe add a lower level API to it and use that under the graph_lock() on static storage or something. Otherwise we'll have to (re)implement something like it. I've not looked at it in detail. Hi Peter, Is something like the untested patch below perhaps what you had in mind? Thanks, Bart. diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 57baa27f238c..a5c7c1813d35 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -66,10 +66,7 @@ struct lock_class_key { extern struct lock_class_key __lockdep_no_validate__; -struct lock_trace { - unsigned intnr_entries; - unsigned intoffset; -}; +struct lock_trace; #define LOCKSTAT_POINTS4 @@ -105,7 +102,7 @@ struct lock_class { * IRQ/softirq usage tracking bits: */ unsigned long usage_mask; - struct lock_trace usage_traces[XXX_LOCK_USAGE_STATES]; + const struct lock_trace *usage_traces[XXX_LOCK_USAGE_STATES]; /* * Generation counter, when doing certain classes of graph walking, @@ -193,7 +190,7 @@ struct lock_list { struct list_headentry; struct lock_class *class; struct lock_class *links_to; - struct lock_trace trace; + const struct lock_trace *trace; int distance; /* diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 341f52117f88..c3022367ba2a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -449,33 +449,72 @@ static void print_lockdep_off(const char *bug_msg) unsigned long nr_stack_trace_entries; #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +/** + * struct lock_trace - single stack backtrace + * @hash_entry:Entry in a stack_trace_hash[] list. + * @hash: jhash() of @entries. + * @nr_entries:Number of entries in @entries. + * @entries: Actual stack backtrace. + */ +struct lock_trace { + struct hlist_node hash_entry; + u32 hash; + u32 nr_entries; + unsigned long entries[0] __aligned(sizeof(unsigned long)); +}; +#define LOCK_TRACE_SIZE_IN_LONGS \ + (sizeof(struct lock_trace) / sizeof(unsigned long)) /* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. + * Stack-trace: sequence of lock_trace structures. Protected by the graph_lock. */ static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE]; + +static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2) +{ + return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries && + memcmp(t1->entries, t2->entries, + t1->nr_entries * sizeof(t1->entries[0])) == 0; +} -static int save_trace(struct lock_trace *trace) +static struct lock_trace *save_trace(void) { - unsigned long *entries = stack_trace + nr_stack_trace_entries; + struct lock_trace *trace, *t2; + struct hlist_head *hash_head; + u32 hash; unsigned int max_entries; - trace->offset = nr_stack_trace_entries; - max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->nr_entries = stack_trace_save(entries, max_entries, 3); - nr_stack_trace_entries += trace->nr_entries; + BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE); + BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES); + + trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries); + max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries - + LOCK_TRACE_SIZE_IN_LONGS; + trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES - + LOCK_TRACE_SIZE_IN_LONGS - 1) { if (!debug_locks_off_graph_unlock()) - return 0; + return NULL; print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); - return 0; + return NULL; } - return 1; + hash = jhash(trace->entries, trace->nr_entries * +sizeof(trace->entries[0]), 0); + trace->hash = hash; + hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1)); + hlist_for_each_entry(t2, hash_head, hash_entry) { +
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On Wed, Jul 10, 2019 at 02:23:39PM -0700, Bart Van Assche wrote: > As one can see in remove_class_from_lock_chain() there is already code > present in lockdep for compacting the chain_hlocks[] array. Similar code > is not yet available for the stack_trace[] array because I had not > encountered any overflows of that array during my tests. One thing I mentioned when Thomas did the unwinder API changes was trying to move lockdep over to something like stackdepot. We can't directly use stackdepot as is, because it uses locks and memory allocation, but we could maybe add a lower level API to it and use that under the graph_lock() on static storage or something. Otherwise we'll have to (re)implement something like it. I've not looked at it in detail.
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On 7/10/19 1:47 PM, Eric Dumazet wrote: > > > On 7/10/19 9:09 PM, Bart Van Assche wrote: >> On 7/10/19 11:44 AM, Eric Dumazet wrote: >>> If anything using workqueues in dynamically allocated objects can turn off >>> lockdep, >>> we have a serious issue. >> >> As far as I know that issue is only hit by syzbot tests. > > > >> Anyway, I see >> two possible paths forward: >> * Revert the patch that makes workqueues use dynamic lockdep keys and >> thereby reintroduce the false positives that lockdep reports if >> different workqueues share a lockdep key. I think there is agreement >> that having to analyze lockdep false positives is annoying, time >> consuming and something nobody likes. >> * Modify lockdep such that space in its fixed size arrays that is no >> longer in use gets reused. Since the stack traces saved in the >> stack_trace[] array have a variable size that array will have to be >> compacted to avoid fragmentation. >> > > Can not destroy_workqueue() undo what alloc_workqueue() did ? destroy_workqueue() already calls lockdep_unregister_key(). If that wouldn't happen then sooner or later one of the warning statements in the lockdep code would be triggered, e.g. the WARN_ON_ONCE() statement in lockdep_register_key(). I think the root cause is that lockdep_unregister_key() does not free the stack traces recorded by the lockdep save_trace() function. save_trace() is called every time a lock is acquired and lockdep encounters a new lock dependency chain. As one can see in remove_class_from_lock_chain() there is already code present in lockdep for compacting the chain_hlocks[] array. Similar code is not yet available for the stack_trace[] array because I had not encountered any overflows of that array during my tests. Bart.
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On 7/10/19 9:09 PM, Bart Van Assche wrote: > On 7/10/19 11:44 AM, Eric Dumazet wrote: >> If anything using workqueues in dynamically allocated objects can turn off >> lockdep, >> we have a serious issue. > > As far as I know that issue is only hit by syzbot tests. > Anyway, I see > two possible paths forward: > * Revert the patch that makes workqueues use dynamic lockdep keys and > thereby reintroduce the false positives that lockdep reports if > different workqueues share a lockdep key. I think there is agreement > that having to analyze lockdep false positives is annoying, time > consuming and something nobody likes. > * Modify lockdep such that space in its fixed size arrays that is no > longer in use gets reused. Since the stack traces saved in the > stack_trace[] array have a variable size that array will have to be > compacted to avoid fragmentation. > Can not destroy_workqueue() undo what alloc_workqueue() did ?
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On 7/10/19 11:44 AM, Eric Dumazet wrote: > If anything using workqueues in dynamically allocated objects can turn off > lockdep, > we have a serious issue. As far as I know that issue is only hit by syzbot tests. Anyway, I see two possible paths forward: * Revert the patch that makes workqueues use dynamic lockdep keys and thereby reintroduce the false positives that lockdep reports if different workqueues share a lockdep key. I think there is agreement that having to analyze lockdep false positives is annoying, time consuming and something nobody likes. * Modify lockdep such that space in its fixed size arrays that is no longer in use gets reused. Since the stack traces saved in the stack_trace[] array have a variable size that array will have to be compacted to avoid fragmentation. Did I overlook anything? Bart.
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On 7/10/19 8:36 PM, Bart Van Assche wrote: > On 7/10/19 11:02 AM, Eric Biggers wrote: >> I already mentioned that io_uring triggers it too. >> >> Those are just 2 cases that syzbot happened to generate reproducers for. I >> expect there are many others too, since many places in the kernel allocate >> workqueues. AFAICS most are placed in static or global variables which >> avoids >> this issue, but there are still many cases where a workqueue is owned by some >> dynamic structure that can have a much shorter lifetime. >> >> You can also check the other syzbot reports that look similar >> (https://lore.kernel.org/lkml/20190710055838.GC2152@sol.localdomain/). >> Two of them have C reproducers too. > > As you may know lockdep cannot use dynamic memory allocation because > doing so would introduce a circular dependency between lockdep and the > memory allocator. Hence the fixed size arrays in the lockdep code. > Additionally, as far as I know lockdep works fine for human kernel > developers and only syzbot runs code that triggers the lockdep limits. > So I think it's up to the syzbot authors to come up with a solution. I > mean another solution than finger pointing at kernel developers. This is silly, syzbot simply uses kernel standard system calls. If anything using workqueues in dynamically allocated objects can turn off lockdep, we have a serious issue. Please work on this issue instead of trying to blame someone else. Thank you.
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On 7/10/19 11:02 AM, Eric Biggers wrote: > I already mentioned that io_uring triggers it too. > > Those are just 2 cases that syzbot happened to generate reproducers for. I > expect there are many others too, since many places in the kernel allocate > workqueues. AFAICS most are placed in static or global variables which avoids > this issue, but there are still many cases where a workqueue is owned by some > dynamic structure that can have a much shorter lifetime. > > You can also check the other syzbot reports that look similar > (https://lore.kernel.org/lkml/20190710055838.GC2152@sol.localdomain/). > Two of them have C reproducers too. As you may know lockdep cannot use dynamic memory allocation because doing so would introduce a circular dependency between lockdep and the memory allocator. Hence the fixed size arrays in the lockdep code. Additionally, as far as I know lockdep works fine for human kernel developers and only syzbot runs code that triggers the lockdep limits. So I think it's up to the syzbot authors to come up with a solution. I mean another solution than finger pointing at kernel developers. Bart.
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On Wed, Jul 10, 2019 at 10:46:00AM -0700, Bart Van Assche wrote: > On 7/10/19 10:21 AM, Eric Biggers wrote: > > With my simplified reproducer, on commit 669de8bda87b ("kernel/workqueue: > > Use > > dynamic lockdep keys for workqueues") I see: > > > > WARNING: CPU: 3 PID: 189 at kernel/locking/lockdep.c:747 > > register_lock_class+0x4f6/0x580 > > > > and then somewhat later: > > > > BUG: MAX_LOCKDEP_KEYS too low! > > > > If on top of that I cherry pick commit 28d49e282665 ("locking/lockdep: > > Shrink > > struct lock_class_key"), I see instead: > > > > BUG: MAX_STACK_TRACE_ENTRIES too low! > > > > I also see that on mainline. > > > > Alternatively, if I check out 669de8bda87b and revert it, I don't see > > anything. > > Hi Eric, > > Is the rdma_ucm code the only code that triggers the "BUG: > MAX_STACK_TRACE_ENTRIES too low!" complaint or is this complaint also > triggered by other kernel code? I'm asking this because I think that > fixing this would require to implement garbage collection for the > stack_trace[] array in the lockdep code. That would make the lockdep > code slower. I don't think that making the lockdep code slower would be > welcome. > > Bart. I already mentioned that io_uring triggers it too. Those are just 2 cases that syzbot happened to generate reproducers for. I expect there are many others too, since many places in the kernel allocate workqueues. AFAICS most are placed in static or global variables which avoids this issue, but there are still many cases where a workqueue is owned by some dynamic structure that can have a much shorter lifetime. You can also check the other syzbot reports that look similar (https://lore.kernel.org/lkml/20190710055838.GC2152@sol.localdomain/). Two of them have C reproducers too. - Eric
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On 7/10/19 10:21 AM, Eric Biggers wrote: > With my simplified reproducer, on commit 669de8bda87b ("kernel/workqueue: Use > dynamic lockdep keys for workqueues") I see: > > WARNING: CPU: 3 PID: 189 at kernel/locking/lockdep.c:747 > register_lock_class+0x4f6/0x580 > > and then somewhat later: > > BUG: MAX_LOCKDEP_KEYS too low! > > If on top of that I cherry pick commit 28d49e282665 ("locking/lockdep: Shrink > struct lock_class_key"), I see instead: > > BUG: MAX_STACK_TRACE_ENTRIES too low! > > I also see that on mainline. > > Alternatively, if I check out 669de8bda87b and revert it, I don't see > anything. Hi Eric, Is the rdma_ucm code the only code that triggers the "BUG: MAX_STACK_TRACE_ENTRIES too low!" complaint or is this complaint also triggered by other kernel code? I'm asking this because I think that fixing this would require to implement garbage collection for the stack_trace[] array in the lockdep code. That would make the lockdep code slower. I don't think that making the lockdep code slower would be welcome. Bart.
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On Wed, Jul 10, 2019 at 10:00:59AM -0700, Eric Biggers wrote: > On Wed, Jul 10, 2019 at 07:19:55AM -0700, Bart Van Assche wrote: > > On 7/9/19 10:30 PM, Eric Biggers wrote: > > > [Moved most people to Bcc; syzbot added way too many random people to > > > this.] > > > > > > Hi Bart, > > > > > > On Sat, Mar 30, 2019 at 07:17:09PM -0700, Bart Van Assche wrote: > > > > On 3/30/19 2:58 PM, syzbot wrote: > > > > > syzbot has bisected this bug to: > > > > > > > > > > commit 669de8bda87b92ab9a2fc663b3f5743c2ad1ae9f > > > > > Author: Bart Van Assche > > > > > Date: Thu Feb 14 23:00:54 2019 + > > > > > > > > > > kernel/workqueue: Use dynamic lockdep keys for workqueues > > > > > > > > > > bisection log: > > > > > https://syzkaller.appspot.com/x/bisect.txt?x=17f1bacd20 > > > > > start commit: 0e40da3e Merge tag 'kbuild-fixes-v5.1' of > > > > > git://git.kernel.. > > > > > git tree: upstream > > > > > final crash: > > > > > https://syzkaller.appspot.com/x/report.txt?x=1409bacd20 > > > > > console output: > > > > > https://syzkaller.appspot.com/x/log.txt?x=1009bacd20 > > > > > kernel config: > > > > > https://syzkaller.appspot.com/x/.config?x=8dcdce25ea72bedf > > > > > dashboard link: > > > > > https://syzkaller.appspot.com/bug?extid=6f39a9deb697359fe520 > > > > > syz repro: > > > > > https://syzkaller.appspot.com/x/repro.syz?x=10e1bacd20 > > > > > C reproducer: > > > > > https://syzkaller.appspot.com/x/repro.c?x=1120fe0f20 > > > > > > > > > > Reported-by: syzbot+6f39a9deb697359fe...@syzkaller.appspotmail.com > > > > > Fixes: 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for > > > > > workqueues") > > > > > > > > > > For information about bisection process see: > > > > > https://goo.gl/tpsmEJ#bisection > > > > > > > > Hi Dmitry, > > > > > > > > This bisection result doesn't make sense to me. As one can see, the > > > > message > > > > "BUG: MAX_STACK_TRACE_ENTRIES too low!" does not occur in the console > > > > output > > > > the above console output URL points at. > > > > > > > > Bart. > > > > > > This is still happening on mainline, and I think this bisection result is > > > probably correct. syzbot did start hitting something different at the > > > very end > > > of the bisection ("WARNING: CPU: 0 PID: 9153 at > > > kernel/locking/lockdep.c:747") > > > but that seems to be just because your commit had a lot of bugs in it, > > > which had > > > to be fixed by later commits. In particular, the WARNING seems to have > > > been > > > fixed by commit 28d49e282665e ("locking/lockdep: Shrink struct > > > lock_class_key"). > > > > > > What seems to still be happening is that the dynamic lockdep keys which > > > you > > > added make it possible for an unbounded number of entries to be added to > > > the > > > fixed length stack_trace[] array in kernel/locking/lockdep.c. Hence the > > > "BUG: > > > MAX_STACK_TRACE_ENTRIES too low!". > > > > > > Am I understanding it correctly? How did you intend this to work? > > > > The last two paragraphs do not make sense to me. My changes do not increase > > the number of stack traces that get recorded by the lockdep code. > > > > Bart. > > > > Interesting. How do we explain that repeatedly allocating and freeing a > workqueue is causing the number of lockdep stack trace entries to grow without > bound, though? > > This can be reproduced with the following (which I simplified from the C > reproducer that syzbot generated and used for its bisection): > > #include > #include > > int main() > { > for (;;) { > int fd = open("/dev/infiniband/rdma_cm", O_RDWR); > > close(fd); > } > } > > The workqueue is allocated in ucma_open() and freed in ucma_close(). If I run > 'grep stack-trace /proc/lockdep_stats' while reproducer is running, I can see > the number is growing continuously until it hits the limit. > > There is also a reproducer using io_uring instead of rdma_cm > (https://syzkaller.appspot.com/text?tag=ReproC=16483bf860). > In both cases the workqueue is associated with a file descriptor; the > workqueue > is allocated and freed as the file descriptor is opened and closed. > > Anyone have any ideas? > > - Eric With my simplified reproducer, on commit 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for workqueues") I see: WARNING: CPU: 3 PID: 189 at kernel/locking/lockdep.c:747 register_lock_class+0x4f6/0x580 and then somewhat later: BUG: MAX_LOCKDEP_KEYS too low! If on top of that I cherry pick commit 28d49e282665 ("locking/lockdep: Shrink struct lock_class_key"), I see instead: BUG: MAX_STACK_TRACE_ENTRIES too low! I also see that on mainline. Alternatively, if I check out 669de8bda87b and revert it, I don't see anything. - Eric
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On Wed, Jul 10, 2019 at 07:19:55AM -0700, Bart Van Assche wrote: > On 7/9/19 10:30 PM, Eric Biggers wrote: > > [Moved most people to Bcc; syzbot added way too many random people to this.] > > > > Hi Bart, > > > > On Sat, Mar 30, 2019 at 07:17:09PM -0700, Bart Van Assche wrote: > > > On 3/30/19 2:58 PM, syzbot wrote: > > > > syzbot has bisected this bug to: > > > > > > > > commit 669de8bda87b92ab9a2fc663b3f5743c2ad1ae9f > > > > Author: Bart Van Assche > > > > Date: Thu Feb 14 23:00:54 2019 + > > > > > > > > kernel/workqueue: Use dynamic lockdep keys for workqueues > > > > > > > > bisection log: > > > > https://syzkaller.appspot.com/x/bisect.txt?x=17f1bacd20 > > > > start commit: 0e40da3e Merge tag 'kbuild-fixes-v5.1' of > > > > git://git.kernel.. > > > > git tree: upstream > > > > final crash: > > > > https://syzkaller.appspot.com/x/report.txt?x=1409bacd20 > > > > console output: https://syzkaller.appspot.com/x/log.txt?x=1009bacd20 > > > > kernel config: > > > > https://syzkaller.appspot.com/x/.config?x=8dcdce25ea72bedf > > > > dashboard link: > > > > https://syzkaller.appspot.com/bug?extid=6f39a9deb697359fe520 > > > > syz repro: > > > > https://syzkaller.appspot.com/x/repro.syz?x=10e1bacd20 > > > > C reproducer: https://syzkaller.appspot.com/x/repro.c?x=1120fe0f20 > > > > > > > > Reported-by: syzbot+6f39a9deb697359fe...@syzkaller.appspotmail.com > > > > Fixes: 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for > > > > workqueues") > > > > > > > > For information about bisection process see: > > > > https://goo.gl/tpsmEJ#bisection > > > > > > Hi Dmitry, > > > > > > This bisection result doesn't make sense to me. As one can see, the > > > message > > > "BUG: MAX_STACK_TRACE_ENTRIES too low!" does not occur in the console > > > output > > > the above console output URL points at. > > > > > > Bart. > > > > This is still happening on mainline, and I think this bisection result is > > probably correct. syzbot did start hitting something different at the very > > end > > of the bisection ("WARNING: CPU: 0 PID: 9153 at > > kernel/locking/lockdep.c:747") > > but that seems to be just because your commit had a lot of bugs in it, > > which had > > to be fixed by later commits. In particular, the WARNING seems to have been > > fixed by commit 28d49e282665e ("locking/lockdep: Shrink struct > > lock_class_key"). > > > > What seems to still be happening is that the dynamic lockdep keys which you > > added make it possible for an unbounded number of entries to be added to the > > fixed length stack_trace[] array in kernel/locking/lockdep.c. Hence the > > "BUG: > > MAX_STACK_TRACE_ENTRIES too low!". > > > > Am I understanding it correctly? How did you intend this to work? > > The last two paragraphs do not make sense to me. My changes do not increase > the number of stack traces that get recorded by the lockdep code. > > Bart. > Interesting. How do we explain that repeatedly allocating and freeing a workqueue is causing the number of lockdep stack trace entries to grow without bound, though? This can be reproduced with the following (which I simplified from the C reproducer that syzbot generated and used for its bisection): #include #include int main() { for (;;) { int fd = open("/dev/infiniband/rdma_cm", O_RDWR); close(fd); } } The workqueue is allocated in ucma_open() and freed in ucma_close(). If I run 'grep stack-trace /proc/lockdep_stats' while reproducer is running, I can see the number is growing continuously until it hits the limit. There is also a reproducer using io_uring instead of rdma_cm (https://syzkaller.appspot.com/text?tag=ReproC=16483bf860). In both cases the workqueue is associated with a file descriptor; the workqueue is allocated and freed as the file descriptor is opened and closed. Anyone have any ideas? - Eric
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On 7/9/19 10:30 PM, Eric Biggers wrote: [Moved most people to Bcc; syzbot added way too many random people to this.] Hi Bart, On Sat, Mar 30, 2019 at 07:17:09PM -0700, Bart Van Assche wrote: On 3/30/19 2:58 PM, syzbot wrote: syzbot has bisected this bug to: commit 669de8bda87b92ab9a2fc663b3f5743c2ad1ae9f Author: Bart Van Assche Date: Thu Feb 14 23:00:54 2019 + kernel/workqueue: Use dynamic lockdep keys for workqueues bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=17f1bacd20 start commit: 0e40da3e Merge tag 'kbuild-fixes-v5.1' of git://git.kernel.. git tree: upstream final crash: https://syzkaller.appspot.com/x/report.txt?x=1409bacd20 console output: https://syzkaller.appspot.com/x/log.txt?x=1009bacd20 kernel config: https://syzkaller.appspot.com/x/.config?x=8dcdce25ea72bedf dashboard link: https://syzkaller.appspot.com/bug?extid=6f39a9deb697359fe520 syz repro: https://syzkaller.appspot.com/x/repro.syz?x=10e1bacd20 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=1120fe0f20 Reported-by: syzbot+6f39a9deb697359fe...@syzkaller.appspotmail.com Fixes: 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for workqueues") For information about bisection process see: https://goo.gl/tpsmEJ#bisection Hi Dmitry, This bisection result doesn't make sense to me. As one can see, the message "BUG: MAX_STACK_TRACE_ENTRIES too low!" does not occur in the console output the above console output URL points at. Bart. This is still happening on mainline, and I think this bisection result is probably correct. syzbot did start hitting something different at the very end of the bisection ("WARNING: CPU: 0 PID: 9153 at kernel/locking/lockdep.c:747") but that seems to be just because your commit had a lot of bugs in it, which had to be fixed by later commits. In particular, the WARNING seems to have been fixed by commit 28d49e282665e ("locking/lockdep: Shrink struct lock_class_key"). What seems to still be happening is that the dynamic lockdep keys which you added make it possible for an unbounded number of entries to be added to the fixed length stack_trace[] array in kernel/locking/lockdep.c. Hence the "BUG: MAX_STACK_TRACE_ENTRIES too low!". Am I understanding it correctly? How did you intend this to work? The last two paragraphs do not make sense to me. My changes do not increase the number of stack traces that get recorded by the lockdep code. Bart.
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
[Moved most people to Bcc; syzbot added way too many random people to this.] Hi Bart, On Sat, Mar 30, 2019 at 07:17:09PM -0700, Bart Van Assche wrote: > On 3/30/19 2:58 PM, syzbot wrote: > > syzbot has bisected this bug to: > > > > commit 669de8bda87b92ab9a2fc663b3f5743c2ad1ae9f > > Author: Bart Van Assche > > Date: Thu Feb 14 23:00:54 2019 + > > > > kernel/workqueue: Use dynamic lockdep keys for workqueues > > > > bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=17f1bacd20 > > start commit: 0e40da3e Merge tag 'kbuild-fixes-v5.1' of > > git://git.kernel.. > > git tree: upstream > > final crash: https://syzkaller.appspot.com/x/report.txt?x=1409bacd20 > > console output: https://syzkaller.appspot.com/x/log.txt?x=1009bacd20 > > kernel config: https://syzkaller.appspot.com/x/.config?x=8dcdce25ea72bedf > > dashboard link: > > https://syzkaller.appspot.com/bug?extid=6f39a9deb697359fe520 > > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=10e1bacd20 > > C reproducer: https://syzkaller.appspot.com/x/repro.c?x=1120fe0f20 > > > > Reported-by: syzbot+6f39a9deb697359fe...@syzkaller.appspotmail.com > > Fixes: 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for > > workqueues") > > > > For information about bisection process see: > > https://goo.gl/tpsmEJ#bisection > > Hi Dmitry, > > This bisection result doesn't make sense to me. As one can see, the message > "BUG: MAX_STACK_TRACE_ENTRIES too low!" does not occur in the console output > the above console output URL points at. > > Bart. This is still happening on mainline, and I think this bisection result is probably correct. syzbot did start hitting something different at the very end of the bisection ("WARNING: CPU: 0 PID: 9153 at kernel/locking/lockdep.c:747") but that seems to be just because your commit had a lot of bugs in it, which had to be fixed by later commits. In particular, the WARNING seems to have been fixed by commit 28d49e282665e ("locking/lockdep: Shrink struct lock_class_key"). What seems to still be happening is that the dynamic lockdep keys which you added make it possible for an unbounded number of entries to be added to the fixed length stack_trace[] array in kernel/locking/lockdep.c. Hence the "BUG: MAX_STACK_TRACE_ENTRIES too low!". Am I understanding it correctly? How did you intend this to work? - Eric
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
On 3/30/19 2:58 PM, syzbot wrote: syzbot has bisected this bug to: commit 669de8bda87b92ab9a2fc663b3f5743c2ad1ae9f Author: Bart Van Assche Date: Thu Feb 14 23:00:54 2019 + kernel/workqueue: Use dynamic lockdep keys for workqueues bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=17f1bacd20 start commit: 0e40da3e Merge tag 'kbuild-fixes-v5.1' of git://git.kernel.. git tree: upstream final crash: https://syzkaller.appspot.com/x/report.txt?x=1409bacd20 console output: https://syzkaller.appspot.com/x/log.txt?x=1009bacd20 kernel config: https://syzkaller.appspot.com/x/.config?x=8dcdce25ea72bedf dashboard link: https://syzkaller.appspot.com/bug?extid=6f39a9deb697359fe520 syz repro: https://syzkaller.appspot.com/x/repro.syz?x=10e1bacd20 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=1120fe0f20 Reported-by: syzbot+6f39a9deb697359fe...@syzkaller.appspotmail.com Fixes: 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for workqueues") For information about bisection process see: https://goo.gl/tpsmEJ#bisection Hi Dmitry, This bisection result doesn't make sense to me. As one can see, the message "BUG: MAX_STACK_TRACE_ENTRIES too low!" does not occur in the console output the above console output URL points at. Bart.
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
syzbot has bisected this bug to: commit 669de8bda87b92ab9a2fc663b3f5743c2ad1ae9f Author: Bart Van Assche Date: Thu Feb 14 23:00:54 2019 + kernel/workqueue: Use dynamic lockdep keys for workqueues bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=17f1bacd20 start commit: 0e40da3e Merge tag 'kbuild-fixes-v5.1' of git://git.kernel.. git tree: upstream final crash:https://syzkaller.appspot.com/x/report.txt?x=1409bacd20 console output: https://syzkaller.appspot.com/x/log.txt?x=1009bacd20 kernel config: https://syzkaller.appspot.com/x/.config?x=8dcdce25ea72bedf dashboard link: https://syzkaller.appspot.com/bug?extid=6f39a9deb697359fe520 syz repro: https://syzkaller.appspot.com/x/repro.syz?x=10e1bacd20 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=1120fe0f20 Reported-by: syzbot+6f39a9deb697359fe...@syzkaller.appspotmail.com Fixes: 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for workqueues") For information about bisection process see: https://goo.gl/tpsmEJ#bisection
Re: BUG: MAX_STACK_TRACE_ENTRIES too low! (2)
syzbot has found a reproducer for the following crash on: HEAD commit:0e40da3e Merge tag 'kbuild-fixes-v5.1' of git://git.kernel.. git tree: upstream console output: https://syzkaller.appspot.com/x/log.txt?x=14d9123f20 kernel config: https://syzkaller.appspot.com/x/.config?x=8dcdce25ea72bedf dashboard link: https://syzkaller.appspot.com/bug?extid=6f39a9deb697359fe520 compiler: gcc (GCC) 9.0.0 20181231 (experimental) userspace arch: i386 syz repro: https://syzkaller.appspot.com/x/repro.syz?x=1244591f20 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=12611c7320 IMPORTANT: if you fix the bug, please add the following tag to the commit: Reported-by: syzbot+6f39a9deb697359fe...@syzkaller.appspotmail.com BUG: MAX_STACK_TRACE_ENTRIES too low! turning off the locking correctness validator. CPU: 0 PID: 678 Comm: syz-executor519 Not tainted 5.1.0-rc2+ #43 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 save_trace kernel/locking/lockdep.c:467 [inline] save_trace.cold+0x14/0x19 kernel/locking/lockdep.c:437 mark_lock+0x2fb/0x1380 kernel/locking/lockdep.c:3410 __lock_acquire+0x548/0x3fb0 kernel/locking/lockdep.c:3657 lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:4211 flush_workqueue+0x126/0x14c0 kernel/workqueue.c:2774 drain_workqueue+0x1b4/0x470 kernel/workqueue.c:2939 destroy_workqueue+0x21/0x700 kernel/workqueue.c:4320 ucma_close+0x289/0x320 drivers/infiniband/core/ucma.c:1786 __fput+0x2e5/0x8d0 fs/file_table.c:278 fput+0x16/0x20 fs/file_table.c:309 task_work_run+0x14a/0x1c0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_usermode_loop+0x273/0x2c0 arch/x86/entry/common.c:166 prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline] syscall_return_slowpath arch/x86/entry/common.c:268 [inline] do_syscall_32_irqs_on arch/x86/entry/common.c:341 [inline] do_fast_syscall_32+0xa9d/0xc98 arch/x86/entry/common.c:397 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 RIP: 0023:0xf7fc4869 Code: 85 d2 74 02 89 0a 5b 5d c3 8b 04 24 c3 8b 14 24 c3 8b 3c 24 c3 90 90 90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 002b:ffc382bc EFLAGS: 0246 ORIG_RAX: 0006 RAX: RBX: 0003 RCX: 22c0 RDX: 0004 RSI: 080bc36e RDI: 22d8 RBP: ffc38308 R08: R09: R10: R11: R12: R13: R14: R15: