On Thu 08/06/2017 16:55, Martin Pieuchot wrote:
> On 07/06/17(Wed) 09:43, Björn Ketelaars wrote:
> > On Sat 03/06/2017 08:44, Björn Ketelaars wrote:
> > >
> > > Reverting back to the previous kernel fixed the issue above. Question: can
> > > someone give a hint on how to track this issue?
> >
> > After a bit of experimenting I'm able to reproduce the problem. Summary is
> > that queueing in pf and use of a current (after May 30), multi processor
> > kernel (bsd.mp from snapshots) causes these specific watchdog timeouts
> > followed by a system freeze.
> >
> > Issue is 'gone' when:
> > 1.) using an older kernel (before May 30);
> > 2.) removal of queueing statements from pf.conf. Included below the specific
> > snippet;
> > 3.) switch from MP kernel to SP kernel.
> >
> > New observation is that while queueing, using a MP kernel, the download
> > bandwidth is only a fraction of what is expected. Exchanging the MP kernel
> > with a SP kernel restores the download bandwidth to expected level.
> >
> > I'm guessing that this issue is related to recent work on PF?
>
> It's certainly a problem in, or exposed by, re(4) with the recent MP work
> in the network stack.
>
> It would help if you could build a kernel with MP_LOCKDEBUG defined and
> see if the resulting kernel enters ddb(4) instead of freezing.
>
> Thanks,
> Martin
Thanks for the hint! It helped in entering ddb. I collected a bit of output,
which you can find below. If I read the trace correctly the crash is related
to line 1750 of sys/dev/ic/re.c:
d->rl_cmdstat |= htole32(RL_TDESC_CMD_EOF);
ddb{0}> show panic
the kernel did not panic
ddb{0}> trace
db_enter(ffffffff8196acc0,ffff80002200b8b8,ffff8000ffffe6a8,10,ffff800022023c58
,282) at db_enter+0x9
x86_ipi_handler(ffffffff8196acd0,ffffffff819f26a0,14a2d1,c,ffff800022023cd0,fff
fffff819f95a0) at x86_ipi_handler+0x85
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x1c
--- interrupt ---
___mp_lock(ffffffff819f26a0,ffffffff818552f0,1,1,ffff80002200b8b8,ffff8000ffffe
6a8) at ___mp_lock+0x4a
___mp_acquire_count(ffffffff819f26a0,1,ffff80002200b8b8,ffff8000ffffe6a8,ffffff
ff81a11680,ffffffff81707182) at ___mp_acquire_count+0x33
mi_switch(0,0,ffff8000ffffe6a8,ffff800022023ed0,ffff8000ffffe6a8,ffff800022023e
c0) at mi_switch+0x22b
sleep_finish(ffff800022023ed0,1,ffffffff810b7110,ffff800022023ed0,0,0) at sleep
_finish+0xc2
softclock_thread(ffff8000ffffe6a8,2a2,0,0,29e123d3cabb9a12,ffff800022023f10) at
softclock_thread+0x94
end trace frame: 0x0, count: -8
ddb{0}> machine ddbcpu 1
Stopped at re_encap+0x24d: movl 0(%r15),%eax
ddb{1}> show panic
the kernel did not panic
ddb{1}> trace
re_encap(ffff800000084000,363,ffffff00d221cc00,ffff8000000842c0,400,ffff8000000
84000) at re_encap+0x24d
re_start(ffff8000000842c0,7,ffffff00d171d000,ffff800000084338,ffff8000000842c0,
ffffffff815fdbf0) at re_start+0x8c
ifq_serialize(ffff8000000842c0,ffff800000084360,ffff800000084090,ffffff00d2fbd0
00,ffff8000000842c0,0) at ifq_serialize+0xf8
if_enqueue(ffff800000084090,ffffff00d2fbd000,ffff80000096a240,ffffff00d2fbd000,
37,2) at if_enqueue+0x90
ether_output(ffff800000084090,ffffff00d171d000,ffff80000096a240,ffffff011b66777
8,ffff8000004d1000,ffff80000096a240) at ether_output+0x1d6
ip_output(ffffff00d171d000,0,ffff800022032cc0,1,0,0) at ip_output+0x8a1
ip_forward(ffffff00d171d000,ffff8000002a4090,0,0,8244c78a,8244c78a) at ip_forwa
rd+0x1e7
ipv4_input(ffff8000002a4090,ffffff00d171d000,800,800,ffffff000990ff80,ffff80000
02a4090) at ipv4_input+0x4f7
ether_input(ffff8000002a4090,ffffff00d171d000,0,ffffffff810afed0,ffff8000002a42
88,ffff8000002a4090) at ether_input+0xbd
if_input_process(2,ffff800022032eb0,0,0,ffff800022032eb0,ffff800000019080) at i
f_input_process+0xfa
taskq_thread(ffff800000019080,2a2,ffff800000019080,ffffffff81493240,0,ffff80002
2032f10) at taskq_thread+0x79
end trace frame: 0x0, count: -11
ddb{1}> dmesg
OpenBSD 6.1-current (GENERIC.MP) #7: Thu Jun 8 19:10:36 CEST 2017
[email protected]:/home/code/src/sys/arch/amd64/compile/GENERIC.MP
real mem = 4245995520 (4049MB)
avail mem = 4111519744 (3921MB)
mpath0 at root
scsibus0 at mpath0: 256 targets
mainbus0 at root
bios0 at mainbus0: SMBIOS rev. 2.7 @ 0xdf16d820 (6 entries)
bios0: vendor coreboot version "SageBios_PCEngines_APU-45" date 04/05/2014
bios0: PC Engines APU
acpi0 at bios0: rev 0
acpi0: sleep states S0 S1 S3 S4 S5
acpi0: tables DSDT FACP SPCR HPET APIC HEST SSDT SSDT SSDT
acpi0: wakeup devices AGPB(S4) HDMI(S4) PBR4(S4) PBR5(S4) PBR6(S4) PBR7(S4) PE2
0(S4) PE21(S4) PE22(S4) PE23(S4) PIBR(S4) UOH1(S3) UOH2(S3) UOH3(S3) UOH4(S3) U
OH5(S3) [...]
acpitimer0 at acpi0: 3579545 Hz, 32 bits
acpihpet0 at acpi0: 14318180 Hz
acpimadt0 at acpi0 addr 0xfee00000: PC-AT compat
cpu0 at mainbus0: apid 0 (boot processor)
cpu0: AMD G-T40E Processor, 1000.13 MHz
cpu0: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,C
FLUSH,MMX,FXSR,SSE,SSE2,HTT,SSE3,MWAIT,SSSE3,CX16,POPCNT,NXE,MMXX,FFXSR,PAGE1GB
,RDTSCP,LONG,LAHF,CMPLEG,SVM,EAPICSP,AMCR8,ABM,SSE4A,MASSE,3DNOWP,IBS,SKINIT,IT
SC
cpu0: 32KB 64b/line 2-way I-cache, 32KB 64b/line 8-way D-cache, 512KB 64b/line 1
6-way L2 cache
cpu0: 8 4MB entries fully associative
cpu0: DTLB 40 4KB entries fully associative, 8 4MB entries fully associative
cpu0: TSC frequency 1000129460 Hz
cpu0: smt 0, core 0, package 0
mtrr: Pentium Pro MTRR support, 8 var ranges, 88 fixed ranges
cpu0: apic clock running at 199MHz
cpu0: mwait min=64, max=64, IBE
cpu1 at mainbus0: apid 1 (application processor)
cpu1: AMD G-T40E Processor, 1000.00 MHz
cpu1: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,C
FLUSH,MMX,FXSR,SSE,SSE2,HTT,SSE3,MWAIT,SSSE3,CX16,POPCNT,NXE,MMXX,FFXSR,PAGE1GB
,RDTSCP,LONG,LAHF,CMPLEG,SVM,EAPICSP,AMCR8,ABM,SSE4A,MASSE,3DNOWP,IBS,SKINIT,IT
SC
cpu1: 32KB 64b/line 2-way I-cache, 32KB 64b/line 8-way D-cache, 512KB 64b/line 1
6-way L2 cache
cpu1: 8 4MB entries fully associative
cpu1: DTLB 40 4KB entries fully associative, 8 4MB entries fully associative
cpu1: smt 0, core 1, package 0
ioapic0 at mainbus0: apid 2 pa 0xfec00000, version 21, 24 pins
acpiprt0 at acpi0: bus -1 (AGPB)
acpiprt1 at acpi0: bus -1 (HDMI)
acpiprt2 at acpi0: bus 1 (PBR4)
acpiprt3 at acpi0: bus 2 (PBR5)
acpiprt4 at acpi0: bus 3 (PBR6)
acpiprt5 at acpi0: bus -1 (PBR7)
acpiprt6 at acpi0: bus 5 (PE20)
acpiprt7 at acpi0: bus -1 (PE21)
acpiprt8 at acpi0: bus -1 (PE22)
acpiprt9 at acpi0: bus -1 (PE23)
acpiprt10 at acpi0: bus 0 (PCI0)
acpiprt11 at acpi0: bus 4 (PIBR)
acpicpu0 at acpi0: C2(0@100 io@0x841), C1(@1 halt!), PSS
acpicpu1 at acpi0: C2(0@100 io@0x841), C1(@1 halt!), PSS
acpibtn0 at acpi0: PWRB
cpu0: 1000 MHz: speeds: 1000 800 MHz
pci0 at mainbus0 bus 0
pchb0 at pci0 dev 0 function 0 "AMD AMD64 14h Host" rev 0x00
ppb0 at pci0 dev 4 function 0 "AMD AMD64 14h PCIE" rev 0x00: msi
pci1 at ppb0 bus 1
re0 at pci1 dev 0 function 0 "Realtek 8168" rev 0x06: RTL8168E/8111E (0x2c00), m
si, address 00:0d:b9:33:8e:a4
rgephy0 at re0 phy 7: RTL8169S/8110S/8211 PHY, rev. 4
ppb1 at pci0 dev 5 function 0 "AMD AMD64 14h PCIE" rev 0x00: msi
pci2 at ppb1 bus 2
re1 at pci2 dev 0 function 0 "Realtek 8168" rev 0x06: RTL8168E/8111E (0x2c00), m
si, address 00:0d:b9:33:8e:a5
rgephy1 at re1 phy 7: RTL8169S/8110S/8211 PHY, rev. 4
ppb2 at pci0 dev 6 function 0 "AMD AMD64 14h PCIE" rev 0x00: msi
pci3 at ppb2 bus 3
re2 at pci3 dev 0 function 0 "Realtek 8168" rev 0x06: RTL8168E/8111E (0x2c00), m
si, address 00:0d:b9:33:8e:a6
rgephy2 at re2 phy 7: RTL8169S/8110S/8211 PHY, rev. 4
ahci0 at pci0 dev 17 function 0 "ATI SBx00 SATA" rev 0x40: apic 2 int 19, AHCI 1
.2
ahci0: port 0: 6.0Gb/s
scsibus1 at ahci0: 32 targets
sd0 at scsibus1 targ 0 lun 0: <ATA, SAMSUNG SSD PM83, CXM1> SCSI3 0/direct fixe
d naa.5002538043584d30
sd0: 30533MB, 512 bytes/sector, 62533296 sectors, thin
ohci0 at pci0 dev 18 function 0 "ATI SB700 USB" rev 0x00: apic 2 int 18, versio
n 1.0, legacy support
ehci0 at pci0 dev 18 function 2 "ATI SB700 USB2" rev 0x00: apic 2 int 17
usb0 at ehci0: USB revision 2.0
uhub0 at usb0 configuration 1 interface 0 "ATI EHCI root hub" rev 2.00/1.00 add
r 1
ohci1 at pci0 dev 19 function 0 "ATI SB700 USB" rev 0x00: apic 2 int 18, versio
n 1.0, legacy support
ehci1 at pci0 dev 19 function 2 "ATI SB700 USB2" rev 0x00: apic 2 int 17
usb1 at ehci1: USB revision 2.0
uhub1 at usb1 configuration 1 interface 0 "ATI EHCI root hub" rev 2.00/1.00 add
r 1
piixpm0 at pci0 dev 20 function 0 "ATI SBx00 SMBus" rev 0x42: polling
iic0 at piixpm0
pcib0 at pci0 dev 20 function 3 "ATI SB700 ISA" rev 0x40
ppb3 at pci0 dev 20 function 4 "ATI SB600 PCI" rev 0x40
pci4 at ppb3 bus 4
ohci2 at pci0 dev 20 function 5 "ATI SB700 USB" rev 0x00: apic 2 int 18, versio
n 1.0, legacy support
ppb4 at pci0 dev 21 function 0 "ATI SB800 PCIE" rev 0x00
pci5 at ppb4 bus 5
ohci3 at pci0 dev 22 function 0 "ATI SB700 USB" rev 0x00: apic 2 int 18, versio
n 1.0, legacy support
ehci2 at pci0 dev 22 function 2 "ATI SB700 USB2" rev 0x00: apic 2 int 17
usb2 at ehci2: USB revision 2.0
uhub2 at usb2 configuration 1 interface 0 "ATI EHCI root hub" rev 2.00/1.00 add
r 1
pchb1 at pci0 dev 24 function 0 "AMD AMD64 14h Link Cfg" rev 0x43
pchb2 at pci0 dev 24 function 1 "AMD AMD64 14h Address Map" rev 0x00
pchb3 at pci0 dev 24 function 2 "AMD AMD64 14h DRAM Cfg" rev 0x00
km0 at pci0 dev 24 function 3 "AMD AMD64 14h Misc Cfg" rev 0x00
pchb4 at pci0 dev 24 function 4 "AMD AMD64 14h CPU Power" rev 0x00
pchb5 at pci0 dev 24 function 5 "AMD AMD64 14h Reserved" rev 0x00
pchb6 at pci0 dev 24 function 6 "AMD AMD64 14h NB Power" rev 0x00
pchb7 at pci0 dev 24 function 7 "AMD AMD64 14h Reserved" rev 0x00
usb3 at ohci0: USB revision 1.0
uhub3 at usb3 configuration 1 interface 0 "ATI OHCI root hub" rev 1.00/1.00 add
r 1
usb4 at ohci1: USB revision 1.0
uhub4 at usb4 configuration 1 interface 0 "ATI OHCI root hub" rev 1.00/1.00 add
r 1
isa0 at pcib0
isadma0 at isa0
com0 at isa0 port 0x3f8/8 irq 4: ns16550a, 16 byte fifo
com0: console
com1 at isa0 port 0x2f8/8 irq 3: ns16550a, 16 byte fifo
pcppi0 at isa0 port 0x61
spkr0 at pcppi0
lpt0 at isa0 port 0x378/4 irq 7
wbsio0 at isa0 port 0x2e/2: NCT5104D rev 0x52
usb5 at ohci2: USB revision 1.0
uhub5 at usb5 configuration 1 interface 0 "ATI OHCI root hub" rev 1.00/1.00 add
r 1
usb6 at ohci3: USB revision 1.0
uhub6 at usb6 configuration 1 interface 0 "ATI OHCI root hub" rev 1.00/1.00 add
r 1
vmm0 at mainbus0: SVM/RVI
umass0 at uhub2 port 1 configuration 1 interface 0 "Generic Flash Card Reader/W
riter" rev 2.01/1.00 addr 2
umass0: using SCSI over Bulk-Only
scsibus2 at umass0: 2 targets, initiator 0
sd1 at scsibus2 targ 1 lun 0: <Multiple, Card Reader, 1.00> SCSI2 0/direct remo
vable serial.058f6366058F63666485
sd1: 7600MB, 512 bytes/sector, 15564800 sectors
vscsi0 at root
scsibus3 at vscsi0: 256 targets
softraid0 at root
scsibus4 at softraid0: 256 targets
root on sd0a (28fcdc10008babff.a) swap on sd0b dump on sd0b
re1: watchdog timeout
re1: watchdog timeout
re1: watchdog timeout
re1: watchdog timeout
re1: watchdog timeout
re1: watchdog timeout
re0: watchdog timeout
re1: watchdog timeout
re1: watchdog timeout
re0: watchdog timeout
re1: watchdog timeout
uvm_fault(0xffffffff81a10fc0, 0x400, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at re_encap+0x24d: movl 0(%r15),%eax
ddb{1}> ps
PID TID PPID UID S FLAGS WAIT COMMAND
82195 369179 56172 0 3 0x83 poll systat
56172 147644 79847 1000 3 0x10008b pause ksh
79847 461268 7799 1000 3 0x90 select sshd
7799 55628 13498 0 3 0x92 poll sshd
61811 278207 1 0 3 0x100083 ttyin getty
74015 14393 1 0 3 0x100098 poll cron
59960 272647 1 0 3 0x80 kqread apmd
87200 406555 1 109 3 0x90 kqread ftp-proxy
45038 23411 15230 95 3 0x100092 kqread smtpd
93110 210830 15230 103 3 0x100092 kqread smtpd
73423 112738 15230 95 3 0x100092 kqread smtpd
68375 247337 15230 95 3 0x100092 kqread smtpd
27674 140042 15230 95 3 0x100092 kqread smtpd
48015 17028 15230 95 3 0x100092 kqread smtpd
15230 242238 1 0 3 0x100080 kqread smtpd
12034 314719 1 92 3 0x100090 kqread rtadvd
18251 517215 1 77 3 0x100090 poll dhcpd
13498 267120 1 0 3 0x80 select sshd
95968 319521 1 0 3 0x100080 poll ntpd
52955 506732 85253 83 3 0x100092 poll ntpd
85253 410399 1 83 3 0x100092 poll ntpd
60302 14142 1 53 3 0x90 kqread unbound
73871 165819 5712 74 3 0x100090 bpf pflogd
5712 338263 1 0 3 0x80 netio pflogd
29839 479602 54341 73 2 0x100090 syslogd
54341 102467 1 0 3 0x100082 netio syslogd
53017 285789 1 77 3 0x100090 poll dhclient
74581 122082 1 0 3 0x80 poll dhclient
54066 116457 74298 115 3 0x100092 kqread slaacd
64846 12763 74298 115 3 0x100092 kqread slaacd
74298 260076 1 0 3 0x80 kqread slaacd
64842 392827 1 0 3 0x80 mfsidl mount_mfs
51405 232671 0 0 3 0x14200 pgzero zerothread
48871 331472 0 0 3 0x14200 aiodoned aiodoned
34037 13488 0 0 3 0x14200 syncer update
91469 431877 0 0 3 0x14200 cleaner cleaner
4505 464914 0 0 3 0x14200 reaper reaper
58806 32124 0 0 3 0x14200 pgdaemon pagedaemon
11816 283378 0 0 3 0x14200 bored crynlk
20420 416822 0 0 3 0x14200 bored crypto
42218 73014 0 0 3 0x14200 pftm pfpurge
23356 409989 0 0 3 0x14200 bored sensors
75378 434260 0 0 3 0x14200 usbtsk usbtask
42666 210924 0 0 3 0x14200 usbatsk usbatsk
3299 130961 0 0 3 0x40014200 acpi0 acpi0
24358 424982 0 0 3 0x40014200 idle1
*90228 269751 0 0 7 0x14200 softnet
11942 439335 0 0 3 0x14200 bored systqmp
42690 165546 0 0 3 0x14200 bored systq
98103 311380 0 0 7 0x40014200 softclock
75229 252289 0 0 3 0x40014200 idle0
65336 512854 0 0 3 0x14200 bored sbar
1 38293 0 0 3 0x82 wait init
0 0 -1 0 3 0x10200 scheduler swapper
ddb{1}> show uvm
Current UVM status:
pagesize=4096 (0x1000), pagemask=0xfff, pageshift=12
1005861 VM pages: 145661 active, 2430 inactive, 1 wired, 806399 free (106967 z
ero)
min 10% (25) anon, 10% (25) vnode, 5% (12) vtext
pages 0 anon, 0 vnode, 0 vtext
freemin=33528, free-target=44704, inactive-target=0, wired-max=335287
faults=524182, traps=507313, intrs=368594, ctxswitch=212365 fpuswitch=198
softint=62364, syscalls=818564, kmapent=15
fault counts:
noram=0, noanon=0, noamap=0, pgwait=0, pgrele=0
ok relocks(total)=6921(6921), anget(retries)=193190(0), amapcopy=349847
neighbor anon/obj pg=14074/136541, gets(lock/unlock)=48468/6921
cases: anon=164917, anoncow=28273, obj=41539, prcopy=6929, przero=282512
daemon and swap counts:
woke=0, revs=0, scans=0, obscans=0, anscans=0
busy=0, freed=0, reactivate=0, deactivate=0
pageouts=0, pending=0, nswget=0
nswapdev=0, nanon=0, nanonneeded=0 nfreeanon=0
swpages=0, swpginuse=0, swpgonly=0 paging=0
kernel pointers:
objs(kern)=0xffffffff819d3d80
ddb{1}> show bcstats
Current Buffer Cache status:
numbufs 6896 busymapped 1, delwri 5
kvaslots 6553 avail kva slots 6552
bufpages 27498, dmapages 27498, dirtypages 20
pendingreads 0, pendingwrites 0
highflips 0, highflops 0, dmaflips 0
ddb{1}> show registers
rdi 0xffffffff81974840 pci_bus_dma_tag
rsi 0
rbp 0xffff800022032900
rbx 0xffff8000000842c0
rdx 0
rcx 0xffff800000240800
rax 0xffffff00d221cc00
r8 0x4
r9 0
r10 0x2361a6a100
r11 0xffffffff8139e930 x86_bus_space_mem_read_4
r12 0x3630 __ALIGN_SIZE+0x2630
r13 0x363 hib_hlt_real+0x5e
r14 0xffff800000084000
r15 0x400 hib_hlt_real+0xfb
rip 0xffffffff812757fd re_encap+0x24d
cs 0x8
rflags 0x10216 mptramp_longmode+0x16e
rsp 0xffff800022032760
ss 0x10
re_encap+0x24d: movl 0(%r15),%eax
ddb{1}> boot reboot
panic: rw_enter: netlock locking against myself
Stopped at db_enter+0x9: leave
TID PID UID PRFLAGS PFLAGS CPU COMMAND
*269751 90228 0 0x14000 0x200 1 softnet
311380 98103 0 0x14000 0x40000200 0 softclock
db_enter(ffffffff8172639e,ffff800022032130,10,ffff800022032110,282,8) at db_ent
er+0x9
panic(ffffffff817ad478,14,ffff8000ffffed50,ffffffff8196a2b0,ffff8000ffffed54,ff
ff800022032240) at panic+0x102
_rw_enter(ffffffff8196a2b0,1,ffff80002202e000,0,ffffffff8197dea0,ffff8000220324
18) at _rw_enter+0x20f
if_downall(ffffffff8197dea0,ffff800022032418,4804,ffff8000220323b8,29e123d3cabb
9a12,0) at if_downall+0x2d
boot(4804,ffffffff81945120,4804,ffffffff8197dea0,ffff800022032418,4804) at boot
+0xeb
reboot(4804,5,0,ffff8000220323f0,ffff8000220323b8,d) at reboot+0x37
db_boot_reboot_cmd(ffffffff812757fd,0,ffffffffffffffff,ffff800022032420,0,29e12
3d3cabb9a12) at db_boot_reboot_cmd+0x22
db_command(ffffffff8197dea0,ffffffff8197dae0,ffff800022032690,6,0,78) at db_com
mand+0x13a
db_command_loop(29e123d3cabb9a12,0,ffffffff8165620f,ffff80002202e000,0,ffff8000
22032690) at db_command_loop+0xb0
db_trap(6,0,7,29e123d3cabb9a12,813b34c7,ffffffff813c2144) at db_trap+0xf4
db_ktrap(6,0,ffff800022032690,1,ffff800022032690,ffff8000ffffed50) at db_ktrap+
0x10f
trap() at trap+0x16d
--- trap (number 6) ---
re_encap(ffff800000084000,363,ffffff00d221cc00,ffff8000000842c0,400,ffff8000000
84000) at re_encap+0x24d
re_start(ffff8000000842c0,7,ffffff00d171d000,ffff800000084338,ffff8000000842c0,
ffffffff815fdbf0) at re_start+0x8c
end trace frame: 0xffff8000220329c0, count: 0
https://www.openbsd.org/ddb.html describes the minimum info required in bug
reports. Insufficient info makes it difficult to find and fix bugs.
--
Björn Ketelaars
GPG key: 0x4F0E5F21