On 2021/11/13 21:49, Stuart Henderson wrote:
> On 2021/11/13 18:04, Alexander Bluhm wrote:
> > Hi,
> >
> > To make IPsec MP safe we need refcounting for the tdb. The diff
> > below is part of something bigger we have at genua. Although it
> > does not cover timeouts and the tdb reaper yet, I want to get this
> > in as a frist step.
> >
> > It passes regress but there are setups that are not covered. Bridge
> > and pfsync with IPsec and TCP signature need special care.
> >
> > When testing, please check for tdb leaks. The vmstat -m tdb in use
> > culumn must be 0. It it is not, try ipsecctl -F. If ipsecctl -sa
> > shows nothing, but vmstat -m shows a used tdb, then it is a bug.
> >
> > Name Size Requests Fail InUse Pgreq Pgrel Npage Hiwat Minpg Maxpg
> > Idle
> > tdb 904 56 0 0 7 7 0 7 0 8
> > 0
>
> Tested with a box running isakmpd at home. Config is:
>
> isakmpd_flags=-Kv -D0=29 -D1=49 -D2=10 -D3=30 -D5=20 -D6=30 -D8=30 -D9=30
> -D10=20
>
> ike passive esp \
> from xxx.xx.xxx.1 (xxx.xx.xxx.0/27) to 0.0.0.0/0 \
> local xxx.xx.xxx.1 \
> main auth hmac-sha2-256 enc aes group ecp256 \
> quick enc aes-128-gcm group ecp256 \
> srcid xxx
>
> isakmpd starts at boot, 3 devices connect in and tunnels come up,
> FLOWS and SAD show in ipsecctl -sa.
>
> isakmpd crashes shortly after starting (as described in previous mails;
> this is not new, it happens very often on the first start at boot),
> FLOWS and SAD remain showing. At this point they still show InUse:
>
> Name Size Requests Fail InUse Pgreq Pgrel Npage Hiwat Minpg Maxpg
> Idle
> tdb 1088 9 0 6 1 0 1 1 0 8
> 0
>
> After ipsecctl -F they are freed:
>
> tdb 1088 9 0 0 1 0 1 1 0 8
> 1
>
> Restart isakmpd, ipsecctl -f /etc/ipsec.conf, devices connect back in:
>
> tdb 1088 18 0 6 2 1 1 1 0 8
> 0
>
> (And at this point isakmpd is usually stable).
>
> I will leave a shell loop running vmstat -m every few minutes to check
> if it grows.
>
> (Bad timing for testing TCPMD5 because over the last couple of days I have
> just removed bgpd+ospfd from the machines that would have been good
> choices to run test diffs..I might be able to get some simple setup
> back up and running though)
Ah, that was interesting:
Sat Nov 13 21:44:17 GMT 2021
tdb 1088 18 0 6 2 1 1 1 0 8 0
Sat Nov 13 21:54:44 GMT 2021
tdb 1088 18 0 6 2 1 1 1 0 8 0
Sat Nov 13 22:04:44 GMT 2021
tdb 1088 27 0 12 2 1 1 1 0 8 0
and then shortly after, boom. Unfortunately I didn't get
much from ddb before it hanged after I tried changing cpu.
[playback]
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x7f7ffffb2d20, count: 248
End of stack trace.
splassert: tdb_free: want 2 have 0
Starting stack trace...
tdb_free(ffff800000f40b88) at tdb_free+0x47
pfkeyv2_send(fffffd80603603b8,ffff800000f44c00,170) at pfkeyv2_send+0x656
pfkeyv2_output(fffffd8064cb5e00,fffffd80603603b8,0,0) at pfkeyv2_output+0x8a
pfkeyv2_usrreq(fffffd80603603b8,9,fffffd8064cb5e00,0,0,ffff8000ffff9508) at
pfkeyv2_usrreq+0x1b0
sosend(fffffd80603603b8,0,ffff80001fd8f6f0,0,0,0) at sosend+0x3a9
dofilewritev(ffff8000ffff9508,7,ffff80001fd8f6f0,0,ffff80001fd8f7f0) at
dofilewritev+0x14d
sys_writev(ffff8000ffff9508,ffff80001fd8f790,ffff80001fd8f7f0) at
sys_writev+0xd2
syscall(ffff80001fd8f860) at syscall+0x3a9
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x7f7ffffb2d20, count: 248
End of stack trace.
splassert: tdb_free: want 2 have 0
Starting stack trace...
tdb_free(ffff800000f40308) at tdb_free+0x47
pfkeyv2_send(fffffd80603603b8,ffff800000eff300,50) at pfkeyv2_send+0x656
pfkeyv2_output(fffffd8062506e00,fffffd80603603b8,0,0) at pfkeyv2_output+0x8a
pfkeyv2_usrreq(fffffd80603603b8,9,fffffd8062506e00,0,0,ffff8000ffff9508) at
pfkeyv2_usrreq+0x1b0
sosend(fffffd80603603b8,0,ffff80001fd8f6f0,0,0,0) at sosend+0x3a9
dofilewritev(ffff8000ffff9508,7,ffff80001fd8f6f0,0,ffff80001fd8f7f0) at
dofilewritev+0x14d
sys_writev(ffff8000ffff9508,ffff80001fd8f790,ffff80001fd8f7f0) at
sys_writev+0xd2
syscall(ffff80001fd8f860) at syscall+0x3a9
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x7f7ffffc3170, count: 248
End of stack trace.
splassert: ipsp_ids_free: want 2 have 0
Starting stack trace...
ipsp_ids_free(ffff800000f30880) at ipsp_ids_free+0x4e
tdb_free(ffff800000f40308) at tdb_free+0x13c
pfkeyv2_send(fffffd80603603b8,ffff800000eff300,50) at pfkeyv2_send+0x656
pfkeyv2_output(fffffd8062506e00,fffffd80603603b8,0,0) at pfkeyv2_output+0x8a
pfkeyv2_usrreq(fffffd80603603b8,9,fffffd8062506e00,0,0,ffff8000ffff9508) at
pfkeyv2_usrreq+0x1b0
sosend(fffffd80603603b8,0,ffff80001fd8f6f0,0,0,0) at sosend+0x3a9
dopfainlice:wr iketernv(eflf ffd8i0a0gn0offsftfic9 5a08ss,7er,ftfiofnf
8"0r00ef1cfdnt8 f6!=f 0,~00",f ffaiflfe80d:0 01fiflde8 f7"f/s0ys)/ akter n/kern_
sdyofncilh.ecw"ri,t elivn+e0 x8124d4
Stopped at db_enter+0x10: popq %rbp
TID PID UID PRFLAGS PFLAGS CPU COMMAND
185954 5669 68 0x10 0 1 isakmpd
67015 91815 0 0x14000 0x200 3 softnet
*318814 6181 0 0x14000 0x40000200 0 softclock
db_enter() at db_enter+0x10
panic(ffffffff81e4a37b) at panic+0xbf
__assert(ffffffff81eb3b74,ffffffff81e1e19f,338,ffffffff81e4f078) at __assert+0x
25
refcnt_rele(ffff800000f40330) at refcnt_rele+0x6f
tdb_delete(ffff800000f40308) at tdb_delete+0x84
tdb_timeout(ffff800000f40308) at tdb_timeout+0x39
softclock_thread(ffff8000fffff500) at softclock_thread+0x16e
end trace frame: 0x0, count: 8
https://www.openbsd.org/ddb.html describes the minimum info required in bug
reports. Insufficient info makes it difficult to find and fix bugs.
ddb{0}> [-- sthen@localhost attached -- Sat Nov 13 22:23:17 2021]
tr
db_enter() at db_enter+0x10
panic(ffffffff81e4a37b) at panic+0xbf
__assert(ffffffff81eb3b74,ffffffff81e1e19f,338,ffffffff81e4f078) at __assert+0x
25
refcnt_rele(ffff800000f40330) at refcnt_rele+0x6f
tdb_delete(ffff800000f40308) at tdb_delete+0x84
tdb_timeout(ffff800000f40308) at tdb_timeout+0x39
softclock_thread(ffff8000fffff500) at softclock_thread+0x16e
end trace frame: 0x0, count: -7
ddb{0}> sh reg
rdi 0
rsi 0x14
rbp 0xffff80001fbdc6e0
rbx 0
rdx 0xde00000003ff0239
rcx 0x202
rax 0x62
r8 0x101010101010101
r9 0
r10 0
r11 0x254de177953953f9
r12 0xffffffff82196a00 cpu_info_full_primary+0x2a00
r13 0xffffffff82275110 timeout_proc
r14 0
r15 0xffffffff81e4a37b cmd0646_9_tim_udma+0x2e00b
rip 0xffffffff81dad9f0 db_enter+0x10
cs 0x8
rflags 0x206
rsp 0xffff80001fbdc6e0
ss 0x10
db_enter+0x10: popq %rbp
ddb{0}> ps
PID TID PPID UID S FLAGS WAIT COMMAND
20274 32066 93874 1000 3 0x100083 nanoslp sleep
93874 445500 10192 1000 3 0x10008b sigsusp ksh
10192 370220 36870 1000 3 0x98 kqread sshd
36870 222408 98260 0 3 0x92 kqread sshd
5669 185954 75457 68 7 0x10 isakmpd
75457 276857 1 0 3 0x80 netio isakmpd
21598 237571 1 0 3 0x100083 ttyin getty
4506 488964 1 0 3 0x100098 kqread cron
39797 52192 15626 846 3 0x100098 kqread dhcpcd
59318 229873 15626 846 3 0x100098 kqread dhcpcd
7830 517562 15626 0 3 0x88 kqread dhcpcd
15626 323417 1 846 3 0x100098 kqread dhcpcd
19237 354049 75741 720 3 0x90 kqread lldpd
75741 66119 1 0 3 0x80 netio lldpd
61311 177515 1 99 3 0x100090 kqread sndiod
11783 282500 1 110 3 0x100090 kqread sndiod
82651 136045 37762 0 3 0x100090 kqread tftp-proxy
37762 379684 1 108 3 0x90 kqread tftp-proxy
1745 324688 1 109 3 0x90 kqread ftp-proxy
39681 116848 88999 95 3 0x100092 kqread smtpd
25384 256665 88999 103 3 0x100092 kqread smtpd
92345 176419 88999 95 3 0x100092 kqread smtpd
55442 232201 88999 95 3 0x100092 kqread smtpd
77443 215878 88999 95 3 0x100092 kqread smtpd
69162 261319 88999 95 3 0x100092 kqread smtpd
88999 111572 1 0 3 0x100080 kqread smtpd
58046 516005 58609 94 3 0x100092 kqread rad
90680 269647 58609 94 3 0x100092 kqread rad
58609 188614 1 0 3 0x100080 kqread rad
3419 227743 1 77 3 0x100090 kqread dhcpd
61767 205183 1 0 3 0x100080 kqread snmpd
51987 257332 1 91 3 0x92 kqread snmpd
98260 56979 1 0 3 0x88 kqread sshd
13528 58314 1 0 3 0x100080 kqread ntpd
18801 320473 51204 83 3 0x100092 kqread ntpd
51204 405642 1 83 3 0x100092 kqread ntpd
53483 209302 30989 53 3 0x90 kqread unbound
30989 499872 1 53 3 0x90 kqread unbound
16457 234403 32406 74 3 0x100092 bpf pflogd
32406 88142 1 0 3 0x80 netio pflogd
80238 516045 79768 73 3 0x100090 kqread syslogd
79768 67472 1 0 3 0x100082 netio syslogd
43550 409362 1 0 3 0x100080 kqread resolvd
442 379277 99464 77 3 0x100092 kqread dhcpleased
47451 452762 99464 77 3 0x100092 kqread dhcpleased
99464 433528 1 0 3 0x80 kqread dhcpleased
21886 318205 0 0 3 0x14200 bored wg_crypt
97050 229388 0 0 3 0x14200 bored wg_crypt
4440 5494 0 0 3 0x14200 bored wg_crypt
72609 120590 0 0 3 0x14200 bored wg_crypt
20025 29295 0 0 3 0x14200 bored wg_handshake
43757 478463 0 0 3 0x14200 bored wg_handshake
55892 346923 51675 115 3 0x100092 kqread slaacd
76843 163407 51675 115 3 0x100092 kqread slaacd
51675 404409 1 0 3 0x100080 kqread slaacd
28362 202426 0 0 3 0x14200 bored smr
82207 376534 0 0 3 0x14200 pgzero zerothread
1524 51196 0 0 3 0x14200 aiodoned aiodoned
22770 341109 0 0 3 0x14200 syncer update
33241 337755 0 0 3 0x14200 cleaner cleaner
27893 102676 0 0 3 0x14200 reaper reaper
79368 439400 0 0 3 0x14200 pgdaemon pagedaemon
98877 12806 0 0 3 0x14200 bored sensors
71274 388091 0 0 3 0x14200 mmctsk sdmmc0
33311 377928 0 0 3 0x14200 usbtsk usbtask
83240 116348 0 0 3 0x14200 usbatsk usbatsk
45461 344150 0 0 3 0x40014200 acpi0 acpi0
94708 135825 0 0 3 0x40014200 idle3
46611 496404 0 0 7 0x40014200 idle2
26280 26936 0 0 3 0x40014200 idle1
91815 67015 0 0 7 0x14200 softnet
90110 476121 0 0 3 0x14200 bored systqmp
12475 134606 0 0 3 0x14200 bored systq
* 6181 318814 0 0 7 0x40014200 softclock
38662 5703 0 0 3 0x40014200 idle0
1 335266 0 0 3 0x82 wait init
0 0 -1 0 3 0x10200 scheduler swapper
ddb{0}> ps /o
TID PID UID PRFLAGS PFLAGS CPU COMMAND
185954 5669 68 0x10 0 1 isakmpd
67015 91815 0 0x14000 0x200 3 softnet
*318814 6181 0 0x14000 0x40000200 0 softclock
ddb{0}> tr /t 0t185954
ffff800000f40308(ffffffffffffffff,ffffffffffffffff,ffff800000f402e8,0,0,0) at 0
xffff800000f40308
end trace frame: 0x0, count: -1
ddb{0}> mach ddbcpu 1
<ddb hanged here>
However, looking at the earlier console log, there were splasserts
starting from before it went multiuser:
reordering libraries: done.^M
starting early daemons: syslogd pflogd unbound ntpd isakmpd.^M
starting RPC daemons:.^M
savecore: no core dump^M
checking quotas: done.^M
clearing /tmp^M
kern.securelevel: 0 -> 1^M
creating runtime link editor directory cache.^M
preserving editor files.^M
starting network daemons: sshd snmpd dhcpd rad smtpd ftpproxy ftpproxy6(failed)
tftpproxy sndiod.^M
starting package daemons: lldpd dhcpcdsplassert: tdb_free: want 2 have 0
^MStarting stack trace...
^Mtdb_free(ffff800000f40b80) at tdb_free+0x47
^Mpfkeyv2_send(fffffd805f7ace00,ffff800000f35c00,170) at pfkeyv2_send+0x656
^Mpfkeyv2_output(fffffd8064cb5300,fffffd805f7ace00,0,0) at pfkeyv2_output+0x8a
^Mpfkeyv2_usrreq(fffffd805f7ace00,9,fffffd8064cb5300,0,0,ffff80001fcb22a8) at
pfkeyv2_usrreq+0x1b0
^Msosend(fffffd805f7ace00,0,ffff80001fcd97f0,0,0,0) at sosend+0x3a9
^Mdofilewritev(ffff80001fcb22a8,7,ffff80001fcd97f0,0,ffff80001fcd98f0) at
dofilewritev+0x14d
^Msys_writev(ffff80001fcb22a8,ffff80001fcd9890,ffff80001fcd98f0) at
sys_writev+0xd2
^Msyscall(ffff80001fcd9960) at syscall+0x3a9
^MXsyscall() at Xsyscall+0x128
^Mend of kernel
^Mend trace frame: 0x7f7ffffb15b0, count: 248
^MEnd of stack trace.
^Msplassert: tdb_free: want 2 have 0
^MStarting stack trace...
^Mtdb_free(ffff800000f40b80) at tdb_free+0x47
^Mpfkeyv2_send(fffffd805f7ace00,ffff800000f44400,190) at pfkeyv2_send+0x656
^Mpfkeyv2_output(fffffd806285fe00,fffffd805f7ace00,0,0) at pfkeyv2_output+0x8a
^Mpfkeyv2_usrreq(fffffd805f7ace00,9,fffffd806285fe00,0,0,ffff80001fcb22a8) at
pfkeyv2_usrreq+0x1b0
^Msosend(fffffd805f7ace00,0,ffff80001fcd97f0,0,0,0) at sosend+0x3a9
^Mdofilewritev(ffff80001fcb22a8,7,ffff80001fcd97f0,0,ffff80001fcd98f0) at
dofilewritev+0x14d
^Msys_writev(ffff80001fcb22a8,ffff80001fcd9890,ffff80001fcd98f0) at
sys_writev+0xd2
^Msyscall(ffff80001fcd9960) at syscall+0x3a9
^MXsyscall() at Xsyscall+0x128
^Mend of kernel
^Mend trace frame: 0x7f7ffffb15b0, count: 248
^MEnd of stack trace.
^Msplassert: tdb_free: want 2 have 0
^MStarting stack trace...
^Mtdb_free(ffff800000f40b80) at tdb_free+0x47
^Mpfkeyv2_send(fffffd805f7ace00,ffff800000f44400,190) at pfkeyv2_send+0x656
^Mpfkeyv2_output(fffffd806285fe00,fffffd805f7ace00,0,0) at pfkeyv2_output+0x8a
^Mpfkeyv2_usrreq(fffffd805f7ace00,9,fffffd806285fe00,0,0,ffff80001fcb22a8) at
pfkeyv2_usrreq+0x1b0
^Msosend(fffffd805f7ace00,0,ffff80001fcd97f0,0,0,0) at sosend+0x3a9
^Mdofilewritev(ffff80001fcb22a8,7,ffff80001fcd97f0,0,ffff80001fcd98f0) at
dofilewritev+0x14d
^Msys_writev(ffff80001fcb22a8,ffff80001fcd9890,ffff80001fcd98f0) at
sys_writev+0xd2
^Msyscall(ffff80001fcd9960) at syscall+0x3a9
^MXsyscall() at Xsyscall+0x128
^Mend of kernel
^Mend trace frame: 0x7f7ffffb15b0, count: 248
^MEnd of stack trace.
^Msplassert: tdb_free: want 2 have 0
^MStarting stack trace...
^Mtdb_free(ffff800000f40b80) at tdb_free+0x47
^Mpfkeyv2_send(fffffd805f7ace00,ffff800000f35c00,170) at pfkeyv2_send+0x656
^Mpfkeyv2_output(fffffd8062497e00,fffffd805f7ace00,0,0) at pfkeyv2_output+0x8a
^Mpfkeyv2_usrreq(fffffd805f7ace00,9,fffffd8062497e00,0,0,ffff80001fcb22a8) at
pfkeyv2_usrreq+0x1b0
^Msosend(fffffd805f7ace00,0,ffff80001fcd97f0,0,0,0) at sosend+0x3a9
^Mdofilewritev(ffff80001fcb22a8,7,ffff80001fcd97f0,0,ffff80001fcd98f0) at
dofilewritev+0x14d
^Msys_writev(ffff80001fcb22a8,ffff80001fcd9890,ffff80001fcd98f0) at
sys_writev+0xd2
^Msyscall(ffff80001fcd9960) at syscall+0x3a9
^MXsyscall() at Xsyscall+0x128
^Mend of kernel
^Mend trace frame: 0x7f7ffffb15b0, count: 248
^MEnd of stack trace.
^M.^M
starting local daemons: cron.^M
Sat Nov 13 21:34:29 GMT 2021^M
reorder_kernel: failed -- see /usr/share/relink/kernel/GENERIC.MP/relink.log^M
^M
OpenBSD/amd64 (jodrell.spacehopper.org) (tty00)^M
^M
login: splassert: tdb_free: want 2 have 0
^MStarting stack trace...
^Mtdb_free(ffff800000f40b88) at tdb_free+0x47
^Mpfkeyv2_send(fffffd80603603b8,ffff800000f44600,170) at pfkeyv2_send+0x656
^Mpfkeyv2_output(fffffd8064cb5a00,fffffd80603603b8,0,0) at pfkeyv2_output+0x8a
^Mpfkeyv2_usrreq(fffffd80603603b8,9,fffffd8064cb5a00,0,0,ffff8000ffff9508) at
pfkeyv2_usrreq+0x1b0
^Msosend(fffffd80603603b8,0,ffff80001fd8f6f0,0,0,0) at sosend+0x3a9
^Mdofilewritev(ffff8000ffff9508,7,ffff80001fd8f6f0,0,ffff80001fd8f7f0) at
dofilewritev+0x14d
^Msys_writev(ffff8000ffff9508,ffff80001fd8f790,ffff80001fd8f7f0) at
sys_writev+0xd2
^Msyscall(ffff80001fd8f860) at syscall+0x3a9
^MXsyscall() at Xsyscall+0x128
^Mend of kernel
^Mend trace frame: 0x7f7ffffb2d20, count: 248
^MEnd of stack trace.
^Msplassert: tdb_free: want 2 have 0
^MStarting stack trace...
^Mtdb_free(ffff800000f40b88) at tdb_free+0x47
^Mpfkeyv2_send(fffffd80603603b8,ffff800000f44c00,190) at pfkeyv2_send+0x656
^Mpfkeyv2_output(fffffd8067cc9e00,fffffd80603603b8,0,0) at pfkeyv2_output+0x8a
^Mpfkeyv2_usrreq(fffffd80603603b8,9,fffffd8067cc9e00,0,0,ffff8000ffff9508) at
pfkeyv2_usrreq+0x1b0
^Msosend(fffffd80603603b8,0,ffff80001fd8f6f0,0,0,0) at sosend+0x3a9
^Mdofilewritev(ffff8000ffff9508,7,ffff80001fd8f6f0,0,ffff80001fd8f7f0) at
dofilewritev+0x14d
^Msys_writev(ffff8000ffff9508,ffff80001fd8f790,ffff80001fd8f7f0) at
sys_writev+0xd2
^Msyscall(ffff80001fd8f860) at syscall+0x3a9
^MXsyscall() at Xsyscall+0x128
^Mend of kernel
^Mend trace frame: 0x7f7ffffb2d20, count: 248
^MEnd of stack trace.
^Msplassert: tdb_free: want 2 have 0
^MStarting stack trace...
^Mtdb_free(ffff800000f40b88) at tdb_free+0x47
^Mpfkeyv2_send(fffffd80603603b8,ffff800000f44a00,170) at pfkeyv2_send+0x656
^Mpfkeyv2_output(fffffd805d1da700,fffffd80603603b8,0,0) at pfkeyv2_output+0x8a
^Mpfkeyv2_usrreq(fffffd80603603b8,9,fffffd805d1da700,0,0,ffff8000ffff9508) at
pfkeyv2_usrreq+0x1b0
^Msosend(fffffd80603603b8,0,ffff80001fd8f6f0,0,0,0) at sosend+0x3a9
^Mdofilewritev(ffff8000ffff9508,7,ffff80001fd8f6f0,0,ffff80001fd8f7f0) at
dofilewritev+0x14d
^Msys_writev(ffff8000ffff9508,ffff80001fd8f790,ffff80001fd8f7f0) at
sys_writev+0xd2
^Msyscall(ffff80001fd8f860) at syscall+0x3a9
^MXsyscall() at Xsyscall+0x128
^Mend of kernel
^Mend trace frame: 0x7f7ffffb2d20, count: 248
^MEnd of stack trace.
^M[-- MARK -- Sat Nov 13 22:00:00 2021]^M
splassert: tdb_free: want 2 have 0
[...]
I'll see if I can get something else out of it.