Hello,
It crashed again.
I've never used 7.2 with this configuration and environment.
What I simply do is feed the kernel routing table from the BGP RIB table (learn
routes from the RIB table and add them as mpath routes...).
Since OpenBGPD doesn't support multipath routing, I'm trying to manipulate it
via my Golang software.
Here it is:
uvm_fault(0xfffffd823b8b6448, 0x8, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at srp_get_locked+0x11: movq 0(%rdi),%rax
TID PID UID PRFLAGS PFLAGS CPU COMMAND
* 69167 59539 0 0 0 0 bgpd
337353 18233 0 0x2 0x4000000 3 mine-core
148193 18233 0 0x2 0x4000080 1 mine-core
186456 18233 0 0x2 0x4000000 2 mine-core
srp_get_locked(8) at srp_get_locked+0x11
rtable_mpath_reprio(0,ffff800007fdf9d0,18,30,fffffd820a457be8) at rtable_mpath_
reprio+0x175
rt_if_linkstate_change(fffffd820a457be8,ffff800001ab6000,0) at rt_if_linkstate_
change+0xcd
rtm_output(ffff800001c52a00,ffff8000224e5360,ffff8000224e52b8,30,0) at rtm_outp
ut+0x71e
route_output(fffffd8068c19b00,fffffd8237c29738) at route_output+0x3bc
route_send(fffffd8237c29738,fffffd8068c19b00,0,0) at route_send+0x57
sosend(fffffd8237c29738,0,ffff8000224e55f0,0,0,80) at sosend+0x37f
dofilewritev(ffff800022587380,6,ffff8000224e55f0,0,ffff8000224e56f0) at dofilew
ritev+0x14d
sys_writev(ffff800022587380,ffff8000224e5690,ffff8000224e56f0) at sys_writev+0x
d2
syscall(ffff8000224e5760) at syscall+0x3d4
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x776ee2e48340, count: 4
https://www.openbsd.org/ddb.html describes the minimum info required in bug
reports. Insufficient info makes it difficult to find and fix bugs.
ddb{0}> show panic
*cpu0: uvm_fault(0xfffffd823b8b6448, 0x8, 0, 1) -> e
ddb{0}> trace
srp_get_locked(8) at srp_get_locked+0x11
rtable_mpath_reprio(0,ffff800007fdf9d0,18,30,fffffd820a457be8) at rtable_mpat
h_
reprio+0x175
rt_if_linkstate_change(fffffd820a457be8,ffff800001ab6000,0) at rt_if_linkstat
e_
change+0xcd
rtm_output(ffff800001c52a00,ffff8000224e5360,ffff8000224e52b8,30,0) at rtm_ou
tp
ut+0x71e
route_output(fffffd8068c19b00,fffffd8237c29738) at route_output+0x3bc
route_send(fffffd8237c29738,fffffd8068c19b00,0,0) at route_send+0x57
sosend(fffffd8237c29738,0,ffff8000224e55f0,0,0,80) at sosend+0x37f
dofilewritev(ffff800022587380,6,ffff8000224e55f0,0,ffff8000224e56f0) at dofil
ew
ritev+0x14d
sys_writev(ffff800022587380,ffff8000224e5690,ffff8000224e56f0) at sys_writev+
0x
d2
syscall(ffff8000224e5760) at syscall+0x3d4
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x776ee2e48340, count: -11
ddb{0}> show malloc
Type InUse MemUse HighUse Limit Requests Type Lim
devbuf 12877 18897K 18897K 78643K 36405 0
pcb 13 14K 18K 78643K 21 0
rtable 443 61K 75K 78643K 6370 0
ifaddr 225 50K 51K 78643K 988 0
sysctl 3 1K 1K 78643K 3 0
counters 214 83K 83K 78643K 214 0
ioctlops 0 0K 4K 78643K 9241 0
iov 0 0K 2K 78643K 153 0
mount 8 8K 8K 78643K 9 0
log 0 0K 0K 78643K 1 0
vnodes 1220 77K 79K 78643K 3836 0
UFS quota 1 32K 32K 78643K 1 0
UFS mount 31 69K 69K 78643K 39 0
shm 2 1K 1K 78643K 2 0
VM map 2 1K 1K 78643K 2 0
sem 2 10K 10K 78643K 4 0
dirhash 138 25K 25K 78643K 180 0
ACPI 3556 426K 453K 78643K 13924 0
file desc 28 19K 23K 78643K 4162 0
sigio 1 0K 0K 78643K 1 0
proc 108 85K 93K 78643K 9005 0
MFS node 6 0K 0K 78643K 6 0
NFS srvsock 1 0K 0K 78643K 1 0
NFS daemon 1 16K 16K 78643K 1 0
in_multi 35 2K 2K 78643K 70 0
ether_multi 2 0K 0K 78643K 2 0
ISOFS mount 1 32K 32K 78643K 1 0
MSDOSFS mount 1 16K 16K 78643K 1 0
ttys 25 122K 122K 78643K 25 0
exec 0 0K 1K 78643K 9608 0
pfkey data 0 0K 0K 78643K 2 0
tdb 3 0K 0K 78643K 3 0
pagedep 1 8K 8K 78643K 1 0
inodedep 1 32K 32K 78643K 1 0
newblk 1 0K 0K 78643K 1 0
VM swap 8 1646K 1648K 78643K 10 0
UVM amap 3317 308K 311K 78643K 94660 0
UVM aobj 3 2K 2K 78643K 3 0
USB 11 10K 10K 78643K 13 0
USB device 4 0K 0K 78643K 4 0
USB HC 1 0K 0K 78643K 1 0
memdesc 1 4K 4K 78643K 1 0
crypto data 18 258K 258K 78643K 18 0
NDP 43 0K 0K 78643K 43 0
temp 187 5824K 5888K 78643K 206641 0
kqueue 71 130K 138K 78643K 397 0
SYN cache 2 16K 16K 78643K 2 0
ddb{0}> trace
srp_get_locked(8) at srp_get_locked+0x11
rtable_mpath_reprio(0,ffff800007fdf9d0,18,30,fffffd820a457be8) at rtable_mpat
h_
reprio+0x175
rt_if_linkstate_change(fffffd820a457be8,ffff800001ab6000,0) at rt_if_linkstat
e_
change+0xcd
rtm_output(ffff800001c52a00,ffff8000224e5360,ffff8000224e52b8,30,0) at rtm_ou
tp
ut+0x71e
route_output(fffffd8068c19b00,fffffd8237c29738) at route_output+0x3bc
route_send(fffffd8237c29738,fffffd8068c19b00,0,0) at route_send+0x57
sosend(fffffd8237c29738,0,ffff8000224e55f0,0,0,80) at sosend+0x37f
dofilewritev(ffff800022587380,6,ffff8000224e55f0,0,ffff8000224e56f0) at dofil
ew
ritev+0x14d
sys_writev(ffff800022587380,ffff8000224e5690,ffff8000224e56f0) at sys_writev+
0x
d2
syscall(ffff8000224e5760) at syscall+0x3d4
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x776ee2e48340, count: -11
ddb{0}> machine ddbcpu 0
Invalid cpu 0
ddb{0}> machine ddbcpu 1
P.S. It didn't respond to the machine ddbcpu commands, so I was only able to
get these outputs from ddb.
________________________________
From: Alexander Bluhm <[email protected]>
Sent: Thursday, July 6, 2023 19:17
To: Valdrin MUJA <[email protected]>
Cc: [email protected] <[email protected]>
Subject: Re: kernel diagnostic assertion "!_kernel_lock_held()" failed
On Thu, Jul 06, 2023 at 02:14:09PM +0000, Valdrin MUJA wrote:
> I've applied your patch but it crashed again. Here it is:
> ddb{1}> show panic
> *cpu1: kernel diagnostic assertion "refcnt_read(&rt->rt_refcnt) >= 2" failed:
> file "/usr/src/sys/net/rtable.c", line 828
This kassert I added seems to be wrong. I copied it from above
without thinking enough. Just remove it, updated diff below.
I compared your crash 3 and 4 output:
TEST1> uvm_fault(0xfffffd826717bcc0, 0x8, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at srp_get_locked+0x11: movq 0(%rdi),%rax
TID PID UID PRFLAGS PFLAGS CPU COMMAND
*225335 47125 0 0 0 1 bgpd
231752 78299 73 0x1100010 0 3 syslogd
344909 6421 0 0x14000 0x200 2 wg_handshake
361415 98860 0 0x14000 0x200 0 reaper
SPOKE1> uvm_fault(0xfffffd81d5995878, 0x8, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at srp_get_locked+0x11: movq 0(%rdi),%rax
TID PID UID PRFLAGS PFLAGS CPU COMMAND
448769 98731 0 0x100002 0 3 sh
350289 69698 73 0x1100010 0 0 syslogd
*114462 84824 0 0 0 1 bgpd
256495 50081 0 0x14000 0x200 2 wg_handshake
It is interesting that bgpd and wireguard are running in both cases
when it crashes. Unfortunately your mail does not include this
output for crash 1 and 2. It is printed immediately when the machine
crashes. Do you have it in some console history?
I see a lot of different workloads on your machine. That makes it
harder to identify the subsystem that has the bug. I see bgpd(8)
and wg(2) doing things with network and routing. Is there something
else?
What has changed to make these crashes happen? New workload? New
machine? Upgrade to 7.3? Was it stable with 7.2? ...
Thanks for testing.
bluhm
Index: net/rtable.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtable.c,v
retrieving revision 1.82
diff -u -p -r1.82 rtable.c
--- net/rtable.c 19 Apr 2023 17:42:47 -0000 1.82
+++ net/rtable.c 6 Jul 2023 15:56:04 -0000
@@ -604,6 +604,11 @@ rtable_insert(unsigned int rtableid, str
SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
prev = art_insert(ar, an, addr, plen);
+ if (prev == an) {
+ rw_exit_write(&ar->ar_lock);
+ /* keep the refcount for rt while it is in an_rtlist */
+ return (0);
+ }
if (prev != an) {
SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
rt_next);
@@ -689,9 +694,10 @@ rtable_delete(unsigned int rtableid, str
npaths++;
if (npaths > 1) {
- KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
+ KASSERT(refcnt_read(&rt->rt_refcnt) >= 2);
SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
rt_next);
+ rtfree(rt);
mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
if (npaths == 2)
@@ -703,8 +709,9 @@ rtable_delete(unsigned int rtableid, str
if (art_delete(ar, an, addr, plen) == NULL)
panic("art_delete failed to find node %p", an);
- KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
+ KASSERT(refcnt_read(&rt->rt_refcnt) >= 2);
SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
+ rtfree(rt);
art_put(an);
leave:
@@ -821,12 +828,10 @@ rtable_mpath_reprio(unsigned int rtablei
*/
rt->rt_priority = prio;
} else {
- rtref(rt); /* keep rt alive in between remove and insert */
SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
rt, rtentry, rt_next);
rt->rt_priority = prio;
rtable_mpath_insert(an, rt);
- rtfree(rt);
error = EAGAIN;
}
rw_exit_write(&ar->ar_lock);
@@ -839,6 +844,9 @@ rtable_mpath_insert(struct art_node *an,
{
struct rtentry *mrt, *prt = NULL;
uint8_t prio = rt->rt_priority;
+
+ /* increment the refcount for rt while it is in an_rtlist */
+ rtref(rt);
if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);