Re: Cron running at 99% CPU for seemingly no reason

2022-06-27 Thread Claudio Jeker
On Sun, Jun 19, 2022 at 01:26:27PM +0200, Stephan Mending wrote:
> Hi, 
> it crashed again. 
> Here is the dmesg, this time the kernel had debugging symbols enabled. 
> 
> [...]
> ic0 at ichiic0
> spdmem0 at iic0 addr 0x50: 2GB DDR3 SDRAM PC3-12800 SO-DIMM
> isa0 at pcib0
> isadma0 at isa0
> vga0 at isa0 port 0x3b0/48 iomem 0xa/131072
> wsdisplay at vga0 not configured
> pcppi0 at isa0 port 0x61
> spkr0 at pcppi0
> wbsio0 at isa0 port 0x2e/2: NCT5104D rev 0x53
> wbsio0 port 0xa10/2 not configured
> vmm0 at mainbus0: VMX/EPT
> run0 at uhub0 port 4 configuration 1 interface 0 "Ralink 802.11 n WLAN" rev 
> 2.0
> 0/1.01 addr 2
> run0: MAC/BBP RT5592 (rev 0x0222), RF RT5592 (MIMO 2T2R), address 
> d8:61:62:37:5
> 6:c8   
> uhub2 at uhub1 port 1 configuration 1 interface 0 "Intel Rate Matching Hub" 
> rev
>  2.00/0.04 addr 2
>  vscsi0 at root
>  scsibus2 at vscsi0: 256 targets
>  softraid0 at root
>  scsibus3 at softraid0: 256 targets
>  root on sd0a (7ec83d15890e2a71.a) swap on sd0b dump on sd0b
>  inteldrm0: 1024x768, 32bpp
>  wsdisplay0 at inteldrm0 mux 1
>  wsdisplay0: screen 0-5 added (std, vt100 emulation)
>  kernel: protection fault trap, code=0
>  Stopped at  icmp_mtudisc_timeout+0x77 
> [/usr/src/sys/netinet/ip_icmp.c:1072]
>  :   movq0(%rax),%rcx
>  ddb{0}> ddb{0}>
>  ddb{0}> bt  
>  icmp_mtudisc_timeout(fd807a4e0620,0) at icmp_mtudisc_timeout+0x77 
> [/usr/src
>  /sys/netinet/ip_icmp.c:1072]
>  rt_timer_timer(82324248) at rt_timer_timer+0x1cc 
> [/usr/src/sys/net/rout
>  e.c:1551]
>  softclock_thread(8000f260) at softclock_thread+0x13b 
> [/usr/src/sys/kern
>  /kern_timeout.c:681]
>  end trace frame: 0x0, count: -3
>  ddb{0}> call db_show_rtentry(fd807a4e0620, 0, 0)  
>  Symbol not found
> 
> I'd love to know whats going wrong here.
> 

This is a race condition in the rttimer code that was introduced by bluhm@
when he added the mutex around the global list.
Can you try the diff below which is a refactor I did some time ago which
changes this and uses a per route timeout instead of the global one. With
this we should not have this use after free anymore.

-- 
:wq Claudio

Index: net/route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.410
diff -u -p -r1.410 route.c
--- net/route.c 5 May 2022 13:57:40 -   1.410
+++ net/route.c 13 May 2022 11:49:00 -
@@ -1361,7 +1361,16 @@ rt_ifa_purge_walker(struct rtentry *rt, 
  */
 
 struct mutex   rttimer_mtx;
-LIST_HEAD(, rttimer_queue) rttimer_queue_head; /* [T] */
+
+struct rttimer {
+   TAILQ_ENTRY(rttimer)rtt_next;   /* [T] entry on timer queue */
+   LIST_ENTRY(rttimer) rtt_link;   /* [T] timers per rtentry */
+   struct timeout  rtt_timeout;/* [I] timeout for this entry */
+   struct rttimer_queue*rtt_queue; /* [I] back pointer to queue */
+   struct rtentry  *rtt_rt;/* [T] back pointer to route */
+   time_t  rtt_expire; /* [I] rt expire time */
+   u_int   rtt_tableid;/* [I] rtable id of rtt_rt */
+};
 
 #define RTTIMER_CALLOUT(r) {   \
if (r->rtt_queue->rtq_func != NULL) {   \
@@ -1388,15 +1397,9 @@ LIST_HEAD(, rttimer_queue)   rttimer_queue
 void
 rt_timer_init(void)
 {
-   static struct timeout   rt_timer_timeout;
-
pool_init(_pool, sizeof(struct rttimer), 0,
IPL_MPFLOOR, 0, "rttmr", NULL);
-
mtx_init(_mtx, IPL_MPFLOOR);
-   LIST_INIT(_queue_head);
-   timeout_set_proc(_timer_timeout, rt_timer_timer, _timer_timeout);
-   timeout_add_sec(_timer_timeout, 1);
 }
 
 void
@@ -1407,10 +1410,6 @@ rt_timer_queue_init(struct rttimer_queue
rtq->rtq_count = 0;
rtq->rtq_func = func;
TAILQ_INIT(>rtq_head);
-
-   mtx_enter(_mtx);
-   LIST_INSERT_HEAD(_queue_head, rtq, rtq_link);
-   mtx_leave(_mtx);
 }
 
 void
@@ -1453,6 +1452,25 @@ rt_timer_queue_count(struct rttimer_queu
return (rtq->rtq_count);
 }
 
+static inline struct rttimer *
+rt_timer_unlink(struct rttimer *r)
+{
+   MUTEX_ASSERT_LOCKED(_mtx);
+
+   LIST_REMOVE(r, rtt_link);
+   r->rtt_rt = NULL;
+
+   if (timeout_del(>rtt_timeout) == 0) {
+   /* timeout fired, so rt_timer_timer will do the cleanup */
+   return NULL;
+   }
+
+   TAILQ_REMOVE(>rtt_queue->rtq_head, r, rtt_next);
+   KASSERT(r->rtt_queue->rtq_count > 0);
+   r->rtt_queue->rtq_count--;
+   return r;
+}
+
 void
 rt_timer_remove_all(struct rtentry *rt)
 {
@@ -1462,11 +1480,9 @@ rt_timer_remove_all(struct rtentry *rt)
TAILQ_INIT();
mtx_enter(_mtx);
while ((r = LIST_FIRST(>rt_timer)) != NULL) {
-   LIST_REMOVE(r, rtt_link);
-   TAILQ_REMOVE(>rtt_queue->rtq_head, r, rtt_next);
-   

Re: Cron running at 99% CPU for seemingly no reason

2022-06-19 Thread Stephan Mending
Hi, 
it crashed again. 
Here is the dmesg, this time the kernel had debugging symbols enabled. 

[...]
ic0 at ichiic0
spdmem0 at iic0 addr 0x50: 2GB DDR3 SDRAM PC3-12800 SO-DIMM
isa0 at pcib0
isadma0 at isa0
vga0 at isa0 port 0x3b0/48 iomem 0xa/131072
wsdisplay at vga0 not configured
pcppi0 at isa0 port 0x61
spkr0 at pcppi0
wbsio0 at isa0 port 0x2e/2: NCT5104D rev 0x53
wbsio0 port 0xa10/2 not configured
vmm0 at mainbus0: VMX/EPT
run0 at uhub0 port 4 configuration 1 interface 0 "Ralink 802.11 n WLAN" rev 2.0
0/1.01 addr 2
run0: MAC/BBP RT5592 (rev 0x0222), RF RT5592 (MIMO 2T2R), address d8:61:62:37:5
6:c8   
uhub2 at uhub1 port 1 configuration 1 interface 0 "Intel Rate Matching Hub" rev
 2.00/0.04 addr 2
 vscsi0 at root
 scsibus2 at vscsi0: 256 targets
 softraid0 at root
 scsibus3 at softraid0: 256 targets
 root on sd0a (7ec83d15890e2a71.a) swap on sd0b dump on sd0b
 inteldrm0: 1024x768, 32bpp
 wsdisplay0 at inteldrm0 mux 1
 wsdisplay0: screen 0-5 added (std, vt100 emulation)
 kernel: protection fault trap, code=0
 Stopped at  icmp_mtudisc_timeout+0x77 [/usr/src/sys/netinet/ip_icmp.c:1072]
 :   movq0(%rax),%rcx
 ddb{0}> ddb{0}>
 ddb{0}> bt  
 icmp_mtudisc_timeout(fd807a4e0620,0) at icmp_mtudisc_timeout+0x77 [/usr/src
 /sys/netinet/ip_icmp.c:1072]
 rt_timer_timer(82324248) at rt_timer_timer+0x1cc [/usr/src/sys/net/rout
 e.c:1551]
 softclock_thread(8000f260) at softclock_thread+0x13b [/usr/src/sys/kern
 /kern_timeout.c:681]
 end trace frame: 0x0, count: -3
 ddb{0}> call db_show_rtentry(fd807a4e0620, 0, 0)  
 Symbol not found

I'd love to know whats going wrong here.

Best regards, 
Stephan



Re: Cron running at 99% CPU for seemingly no reason

2022-05-28 Thread Stephan Mending
>From l...@md5collisions.eu Thu May 26 19:51:47 2022
Date: Thu, 26 May 2022 19:51:47 +0200
From: Stephan Mending 
To: misc@openbsd.org
Subject: Re: Cron running at 99% CPU for seemingly no reason
Message-ID: 
Mail-Followup-To: Stephan Mending , misc@openbsd.org
References: 
 
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: 
X-Operating-System: not-disclosed
Status: RO
Content-Length: 10166
Lines: 244

On Sun, May 15, 2022 at 12:25:24PM +0200, Claudio Jeker wrote:
> On Sun, May 15, 2022 at 12:06:33PM +0200, Stephan Mending wrote:
> > Hi *, 
> > I've got a system running -current that keeps crashing on me every couple 
> > of days. 
> > Output of ddb: 
> > 
> > Connected to /dev/cuaU0 (speed 115200)
> > 
> > ddb{0}> show panic
> > the kernel did not panic
> > ddb{0}> show uvm
> > Current UVM status:
> >   pagesize=4096 (0x1000), pagemask=0xfff, pageshift=12
> >   482451 VM pages: 43158 active, 132795 inactive, 35 wired, 192336 free 
> > (24054 z
> > ero)
> >   min  10% (25) anon, 10% (25) vnode, 5% (12) vtext
> >   freemin=16081, free-target=21441, inactive-target=0, wired-max=160817
> >   faults=2487210, traps=2404140, intrs=211883, ctxswitch=1960560 fpuswitch=0
> >   softint=3499069, syscalls=2015497, kmapent=9
> >   fault counts:
> > noram=0, noanon=0, noamap=0, pgwait=0, pgrele=0
> > ok relocks(total)=192470(193514), anget(retries)=603205(0), 
> > amapcopy=177151
> > 
> > neighbor anon/obj pg=82033/639788, gets(lock/unlock)=415897/193548
> > cases: anon=570367, anoncow=32838, obj=347149, prcopy=67670, 
> > przero=1469152
> > 
> >   daemon and swap counts:
> > woke=0, revs=0, scans=0, obscans=0, anscans=0
> > busy=0, freed=0, reactivate=0, deactivate=0
> > pageouts=0, pending=0, nswget=0
> > nswapdev=1
> > swpages=526020, swpginuse=0, swpgonly=0 paging=0
> >   kernel pointers:
> > objs(kern)=0x8238a038
> > ddb{0}> show trace
> > No such command
> > ddb{0}> trace
> > icmp_mtudisc_timeout(fd807a50b070,0) at icmp_mtudisc_timeout+0x77
> > rt_timer_timer(8235d668) at rt_timer_timer+0x1cc
> > softclock_thread(8000f260) at softclock_thread+0x13b
> > end trace frame: 0x0, count: -3
> > ddb{0}> 
> > 
> 
> This looks like some bad memory access. This is a fault and not really a
> panic this is why 'show panic' returns 'the kernel did not panic'.
> 
> The crash in icmp_mtudisc_timeout() points to some error in the rttimer
> code refactor I made. Please try a newer snapshot and include the dmesg.
> 
> If it happens again a call to db_show_rtentry in ddb may help better
> understand what is going on:
> 
> call db_show_rtentry(fd807a50b070, 0, 0)
> Where the first argument is derived from the first argument of
> icmp_mtudisc_timeout from the trace.
> 
> -- 
> :wq Claudio

@Claudio

The error seems to persist as attached occurred a couple of days back. (This 
time I've got the full dmesg for you)
I tried your "call db_show_rtentry<...>". That somehow returned an error.

Just reporting right here. I'll try and run a kernel with symbols and wait for 
the next crash. 

'Til then. 

Best regards, 
Stephan


dmesg:
OpenBSD 7.1-current (GENERIC.MP) #525: Sat May 14 23:04:32 MDT 2022
dera...@amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC.MP
real mem = 2046992384 (1952MB)
avail mem = 1967644672 (1876MB)
random: good seed from bootblocks
mpath0 at root
scsibus0 at mpath0: 256 targets
mainbus0 at root
bios0 at mainbus0: SMBIOS rev. 2.8 @ 0x7bef9000 (81 entries)
bios0: vendor American Megatrends Inc. version "5.011" date 10/10/2016
bios0: INTEL Corporation CRESCENTBAY
acpi0 at bios0: ACPI 5.0
acpi0: sleep states S0 S3 S4 S5
acpi0: tables DSDT FACP APIC FPDT FIDT MCFG HPET SSDT UEFI SSDT ASF! SSDT SSDT
acpi0: wakeup devices PEG0(S4) PEGP(S4) PEG1(S4) PEGP(S4) PEG2(S4) PEGP(S4) RP0
1(S4) PXSX(S4) RP02(S4) PXSX(S4) RP03(S4) PXSX(S4) RP04(S4) PXSX(S4) RP05(S4) P
XSX(S4) [...]
acpitimer0 at acpi0: 3579545 Hz, 24 bits
acpimadt0 at acpi0 addr 0xfee0: PC-AT compat
cpu0 at mainbus0: apid 0 (boot processor)
cpu0: Intel(R) Pentium(R) 3558U @ 1.70GHz, 1696.36 MHz, 06-45-01
cpu0: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,C
FLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,V
MX,EST,TM2,SSSE3,SDBG,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,MOVBE,POPCNT,DEADLINE,X
SAVE,RDRAND,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,PERF,ITSC,FSGSBASE,TSC_ADJUST,ERMS
,INVPCID,SRBDS_CTRL,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,SENSOR,ARAT,XSAVEOPT,MEL
TDOWN  
cpu0: 256KB 64b/line 8-way L2 cache
cpu0: smt 0, core 0, package 0
mtrr: Pentium Pro MTRR support, 10 var ranges, 88 fixed ranges
cpu0: apic clock running at 99MHz
cpu0: mwait min=64, max=64, C-substates=0.2.1.2.4.1.1.1, IBE
cpu1 at mainbus0: apid 2 (application processor)
cpu1: Intel(R) Pentium(R) 3558U @ 1.70GHz, 1696.08 MHz, 06-45-01
cpu1: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,C

Re: Cron running at 99% CPU for seemingly no reason

2022-05-15 Thread Theo de Raadt
This is a bug in a diff I put into snapshots.



Re: Cron running at 99% CPU for seemingly no reason

2022-05-15 Thread Hrvoje Popovski
On 15.5.2022. 16:56, Todd C. Miller wrote:
> On Sun, 15 May 2022 16:02:03 +0200, Hrvoje Popovski wrote:
> 
>> I know how to rebuild cron
>>
>> cd /usr/src/usr.sbin/cron/
>> make obj && make depend && make && make install
>>
>> but i don't know how to enabled debug symbols ..
> 
> Easiest would be to do:
> 
> cd /usr/src/usr.sbin/cron/
> make obj && make depend && make DEBUG=-g && make install
> 
>  - todd
> 

Thank you... I will remember this ..



Re: Cron running at 99% CPU for seemingly no reason

2022-05-15 Thread Todd C . Miller
On Sun, 15 May 2022 16:02:03 +0200, Hrvoje Popovski wrote:

> I know how to rebuild cron
>
> cd /usr/src/usr.sbin/cron/
> make obj && make depend && make && make install
>
> but i don't know how to enabled debug symbols ..

Easiest would be to do:

cd /usr/src/usr.sbin/cron/
make obj && make depend && make DEBUG=-g && make install

 - todd



Re: Cron running at 99% CPU for seemingly no reason

2022-05-15 Thread Hrvoje Popovski
On 15.5.2022. 15:38, Todd C. Miller wrote:
> On Sun, 15 May 2022 14:29:28 +0200, Hrvoje Popovski wrote:
> 
>> I'm seeing same as Stephan on few servers in lab.
>> I've killed cron and did ktrace -i cron. Is this ok?
>> In attachment you can find kdump -f ktrace.out output.
> 
> That's very odd.  It looks like cron parses root's crontab and then
> somehow gets into a cpu loop.  I don't see how that can happen from
> code inspection.  What would be most useful is to get a stack trace
> of cron when this occurs but that will require rebuilding cron from
> source with debug symbols.

Hi,

I know how to rebuild cron

cd /usr/src/usr.sbin/cron/
make obj && make depend && make && make install

but i don't know how to enabled debug symbols ..






Re: Cron running at 99% CPU for seemingly no reason

2022-05-15 Thread Todd C . Miller
On Sun, 15 May 2022 14:29:28 +0200, Hrvoje Popovski wrote:

> I'm seeing same as Stephan on few servers in lab.
> I've killed cron and did ktrace -i cron. Is this ok?
> In attachment you can find kdump -f ktrace.out output.

That's very odd.  It looks like cron parses root's crontab and then
somehow gets into a cpu loop.  I don't see how that can happen from
code inspection.  What would be most useful is to get a stack trace
of cron when this occurs but that will require rebuilding cron from
source with debug symbols.

 - todd



Re: Cron running at 99% CPU for seemingly no reason

2022-05-15 Thread Hrvoje Popovski
On 15.5.2022. 14:39, Hrvoje Popovski wrote:
> On 15.5.2022. 14:29, Hrvoje Popovski wrote:
>> On 15.5.2022. 12:32, Claudio Jeker wrote:
>>> Also for cron, please attach ktrace to the cron process for a few seconds
>>> and look at the kdump of that. Most probably it is constantly woken up for
>>> some reasons.
>>
>> Hi,
>>
>> I'm seeing same as Stephan on few servers in lab.
>> I've killed cron and did ktrace -i cron. Is this ok?
>> In attachment you can find kdump -f ktrace.out output.
> 
> In attachment you can find kdump from server where cron is ok and from
> server where cron is at 99% ...
> 

it seems that last mail didn't pass so here's kdump output
https://kosjenka.srce.hr/~hrvoje/openbsd/cron-kdump-ok.txt
https://kosjenka.srce.hr/~hrvoje/openbsd/cron-kdump-not-ok.txt




Re: Cron running at 99% CPU for seemingly no reason

2022-05-15 Thread Hrvoje Popovski
On 15.5.2022. 12:32, Claudio Jeker wrote:
> Also for cron, please attach ktrace to the cron process for a few seconds
> and look at the kdump of that. Most probably it is constantly woken up for
> some reasons.

Hi,

I'm seeing same as Stephan on few servers in lab.
I've killed cron and did ktrace -i cron. Is this ok?
In attachment you can find kdump -f ktrace.out output.

x3550m4# kdump -f ktrace.out
 64517 ktrace   RET   ktrace 0
 64517 ktrace   CALL  
mmap(0,0x4c,0x3,0x1002,-1,0)
 64517 ktrace   RET   mmap 15937734819840/0xe7ecb05c000
 64517 ktrace   CALL  execve(0x7f7dea50,0x7f7df048,0x7f7df058)
 64517 ktrace   NAMI  "/sbin/cron"
 64517 ktrace   RET   execve -1 errno 2 No such file or directory
 64517 ktrace   CALL  execve(0x7f7dea50,0x7f7df048,0x7f7df058)
 64517 ktrace   NAMI  "/usr/sbin/cron"
 64517 ktrace   ARGS
[0] = "cron"
 64517 cron NAMI  "/usr/libexec/ld.so"
 64517 cron RET   execve 0
 64517 cron CALL  getentropy(0x7f7cfdf0,40)
 64517 cron RET   getentropy 0
 64517 cron CALL  getentropy(0x7f7cfdf0,40)
 64517 cron RET   getentropy 0
 64517 cron CALL  
mmap(0,0x4000,0,0x1002,-1,0)
 64517 cron RET   mmap 13386418999296/0xc2cc4bfd000
 64517 cron CALL  mprotect(0xc2cc4bfe000,0x2000,0x3)
 64517 cron RET   mprotect 0
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13387667914752/0xc2d0f30c000
 64517 cron CALL  issetugid()
 64517 cron RET   issetugid 0
 64517 cron CALL  mprotect(0xc2cb555f000,0x1000,0x1)
 64517 cron RET   mprotect 0
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13386366095360/0xc2cc1989000
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13387417739264/0xc2d00476000
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13387366080512/0xc2cfd332000
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13384896667648/0xc2c6a02e000
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13388899835904/0xc2d589e6000
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13388106907648/0xc2d295b4000
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13388940087296/0xc2d5b049000
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13386492366848/0xc2cc91f5000
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13385890013184/0xc2ca5382000
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13386167992320/0xc2cb5c9c000
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13388150337536/0xc2d2bf1f000
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13388628615168/0xc2d4873e000
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13388490608640/0xc2d403a1000
 64517 cron CALL  open(0xc2cb535f526,0x1)
 64517 cron NAMI  "/var/run/ld.so.hints"
 64517 cron RET   open 3
 64517 cron CALL  fstat(3,0x7f7cfc20)
 64517 cron STRU  struct stat { dev=1028, ino=5560594, mode=-r--r--r-- , 
nlink=1, uid=0<"root">, gid=0<"wheel">, rdev=22186032, atime=1652606555<"May 15 
11:22:35 2022">.291841835, mtime=1652606555<"May 15 11:22:35 2022">.291841835, 
ctime=1652606555<"May 15 11:22:35 2022">.291841835, size=13459, blocks=28, 
blksize=16384, flags=0x0, gen=0xa9be6c86 }
 64517 cron RET   fstat 0
 64517 cron CALL  mmap(0,0x3493,0x1,0x2,3,0)
 64517 cron RET   mmap 13388976291840/0xc2d5d2d
 64517 cron CALL  
mmap(0,0x1000,0x3,0x1002,-1,0)
 64517 cron RET   mmap 13387006468096/0xc2ce7c3e000
 64517 cron CALL  close(3)
 64517 cron RET   close 0
 64517 cron CALL  open(0xc2d5d2d1dab,0x1)
 64517 cron NAMI  "/usr/lib/libc.so.96.1"
 64517 cron RET   open 3
 64517 cron CALL  fstat(3,0x7f7cfcf0)
 64517 cron STRU  struct stat { dev=1029, ino=181998, mode=-r--r--r-- , 
nlink=1, uid=0<"root">, gid=7<"bin">, rdev=812000, atime=1652606551<"May 15 
11:22:31 2022">.831851254, mtime=1652606551<"May 15 11:22:31 2022">.861850623, 
ctime=1652606553<"May 15 11:22:33 2022">.101848106, size=3618608, blocks=7104, 
blksize=16384, flags=0x0, gen=0x5a285317 }
 64517 cron RET   fstat 0
 64517 cron CALL  read(3,0x7f7cecf0,0x1000)
 64517 cron GIO   fd 3 read 4096 bytes
   
"\^?ELF\^B\^A\^A\0\0\0\0\0\0\0\0\0\^C\0>\0\^A\0\0\w\^C\0\0\0\0\0@\0\0\0\0\0\0\0\M-0+7\0\0\0\0\0\0\0\0\0@\08\0\v\0@\0.\0,\0\^F\0\0\0\^D\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0h\^B\0\0\0\0\0\0h\^B\0\0\0\0\0\0\b\0\0\

\0\0\0\0\0\^A\0\0\0\^D\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0,g\^C\0\0\0\0\0,g\^C\0\0\0\0\0\0\^P\0\0\0\0\0\0\^A\0\0\0\^E\0\0\g\^C\0\0\0\0\w\^C\0\0\0\0\w\^C\0\0\0\0\0\M-PK
\0\0\0\0\0\M-PK


Re: Cron running at 99% CPU for seemingly no reason

2022-05-15 Thread Stuart Henderson
On 2022-05-15, Stephan Mending  wrote:
> Especially the line stating "the kernel did not panic" surprises me, as I am 
> greeted by the kernel debugger. Not sure how to interpret that.

ddb is entered for panics (which are explicit calls from kernel
code) and for other exceptions (which are not) - only the former
have anything to show in "show panic".

To catch the reason for entering ddb in the latter case, either
leave serial console connected and logged, or type "dmesg" _in ddb_
which should also show it at the bottom.

You will likely get a better trace from ddb if you boot a kernel
with debug symbols. Build a kernel from source yourself and you'll have
"bsd.gdb" in the obj directory - either copy that to /bsd, or copy it to
/bsd.gdb and type "boot bsd.gdb" in the boot loader .




Re: Cron running at 99% CPU for seemingly no reason

2022-05-15 Thread Claudio Jeker
On Sun, May 15, 2022 at 12:06:33PM +0200, Stephan Mending wrote:
> Hi *, 
> I've got a system running -current that keeps crashing on me every couple of 
> days. 
> Output of ddb: 
> 
> Connected to /dev/cuaU0 (speed 115200)
> 
> ddb{0}> show panic
> the kernel did not panic
> ddb{0}> show uvm
> Current UVM status:
>   pagesize=4096 (0x1000), pagemask=0xfff, pageshift=12
>   482451 VM pages: 43158 active, 132795 inactive, 35 wired, 192336 free 
> (24054 z
> ero)
>   min  10% (25) anon, 10% (25) vnode, 5% (12) vtext
>   freemin=16081, free-target=21441, inactive-target=0, wired-max=160817
>   faults=2487210, traps=2404140, intrs=211883, ctxswitch=1960560 fpuswitch=0
>   softint=3499069, syscalls=2015497, kmapent=9
>   fault counts:
> noram=0, noanon=0, noamap=0, pgwait=0, pgrele=0
> ok relocks(total)=192470(193514), anget(retries)=603205(0), 
> amapcopy=177151
> 
> neighbor anon/obj pg=82033/639788, gets(lock/unlock)=415897/193548
> cases: anon=570367, anoncow=32838, obj=347149, prcopy=67670, 
> przero=1469152
> 
>   daemon and swap counts:
> woke=0, revs=0, scans=0, obscans=0, anscans=0
> busy=0, freed=0, reactivate=0, deactivate=0
> pageouts=0, pending=0, nswget=0
> nswapdev=1
> swpages=526020, swpginuse=0, swpgonly=0 paging=0
>   kernel pointers:
> objs(kern)=0x8238a038
> ddb{0}> show trace
> No such command
> ddb{0}> trace
> icmp_mtudisc_timeout(fd807a50b070,0) at icmp_mtudisc_timeout+0x77
> rt_timer_timer(8235d668) at rt_timer_timer+0x1cc
> softclock_thread(8000f260) at softclock_thread+0x13b
> end trace frame: 0x0, count: -3
> ddb{0}> 
> 
> Output of a second crash: 
> 
> ddb{0}> show panic
> the kernel did not panic
> ddb{0}> trace
> icmp_mtudisc_timeout(fd8069f9f700,0) at icmp_mtudisc_timeout+0x77
> rt_timer_timer(8231bfc8) at rt_timer_timer+0x1cc
> softclock_thread(8000f500) at softclock_thread+0x13b
> end trace frame: 0x0, count: -3
> ddb{0}> show uvm
> Current UVM status:
>   pagesize=4096 (0x1000), pagemask=0xfff, pageshift=12
>   482457 VM pages: 29240 active, 133535 inactive, 35 wired, 205028 free 
> (25630 z
> ero)
>   min  10% (25) anon, 10% (25) vnode, 5% (12) vtext
>   freemin=16081, free-target=21441, inactive-target=0, wired-max=160819
>   faults=687274, traps=693441, intrs=75204, ctxswitch=381252 fpuswitch=0
>   softint=615411, syscalls=607703, kmapent=9
>   fault counts:
> noram=0, noanon=0, noamap=0, pgwait=0, pgrele=0
> ok relocks(total)=185433(186477), anget(retries)=141598(0), amapcopy=75047
> neighbor anon/obj pg=69895/201703, gets(lock/unlock)=256502/186509
> cases: anon=114948, anoncow=26650, obj=237702, prcopy=17724, przero=290216
>   daemon and swap counts:
> woke=0, revs=0, scans=0, obscans=0, anscans=0
> busy=0, freed=0, reactivate=0, deactivate=0
> pageouts=0, pending=0, nswget=0
> nswapdev=1
> swpages=526020, swpginuse=0, swpgonly=0 paging=0
>   kernel pointers:
> objs(kern)=0x82317458
> ddb{0}> show bcstats
> Current Buffer Cache status:
> numbufs 24114 busymapped 0, delwri 5
> kvaslots 6030 avail kva slots 6030
> bufpages 96426, dmapages 96426, dirtypages 20
> pendingreads 0, pendingwrites 0
> highflips 0, highflops 0, dmaflips 0
> ddb{0}> mount
> No such command
> ddb{0}> trace
> icmp_mtudisc_timeout(fd8069f9f700,0) at icmp_mtudisc_timeout+0x77
> rt_timer_timer(8231bfc8) at rt_timer_timer+0x1cc
> softclock_thread(8000f500) at softclock_thread+0x13b
> end trace frame: 0x0, count: -3
> 
> 
> 
> Especially the line stating "the kernel did not panic" surprises me, as I am 
> greeted by the kernel debugger. Not sure how to interpret that.
> While looking for the reason behind these "crashes", I noticed that cron is 
> constantly running at 99% cpu. 
> 
> As a first measure I commented out all cronjobs in place (except for daily 
> weekly monthly as I figured these shouldnt
> pose a problem). But that did not remedy the problem. Right after startup 
> cron starts eating away at the cpu. Does 
> anybody have an idea how to further analyze the issue (apart from giving it a 
> go by recompiling cron and using gdb) ? 
> 

Also for cron, please attach ktrace to the cron process for a few seconds
and look at the kdump of that. Most probably it is constantly woken up for
some reasons.

-- 
:wq Claudio



Re: Cron running at 99% CPU for seemingly no reason

2022-05-15 Thread Claudio Jeker
On Sun, May 15, 2022 at 12:06:33PM +0200, Stephan Mending wrote:
> Hi *, 
> I've got a system running -current that keeps crashing on me every couple of 
> days. 
> Output of ddb: 
> 
> Connected to /dev/cuaU0 (speed 115200)
> 
> ddb{0}> show panic
> the kernel did not panic
> ddb{0}> show uvm
> Current UVM status:
>   pagesize=4096 (0x1000), pagemask=0xfff, pageshift=12
>   482451 VM pages: 43158 active, 132795 inactive, 35 wired, 192336 free 
> (24054 z
> ero)
>   min  10% (25) anon, 10% (25) vnode, 5% (12) vtext
>   freemin=16081, free-target=21441, inactive-target=0, wired-max=160817
>   faults=2487210, traps=2404140, intrs=211883, ctxswitch=1960560 fpuswitch=0
>   softint=3499069, syscalls=2015497, kmapent=9
>   fault counts:
> noram=0, noanon=0, noamap=0, pgwait=0, pgrele=0
> ok relocks(total)=192470(193514), anget(retries)=603205(0), 
> amapcopy=177151
> 
> neighbor anon/obj pg=82033/639788, gets(lock/unlock)=415897/193548
> cases: anon=570367, anoncow=32838, obj=347149, prcopy=67670, 
> przero=1469152
> 
>   daemon and swap counts:
> woke=0, revs=0, scans=0, obscans=0, anscans=0
> busy=0, freed=0, reactivate=0, deactivate=0
> pageouts=0, pending=0, nswget=0
> nswapdev=1
> swpages=526020, swpginuse=0, swpgonly=0 paging=0
>   kernel pointers:
> objs(kern)=0x8238a038
> ddb{0}> show trace
> No such command
> ddb{0}> trace
> icmp_mtudisc_timeout(fd807a50b070,0) at icmp_mtudisc_timeout+0x77
> rt_timer_timer(8235d668) at rt_timer_timer+0x1cc
> softclock_thread(8000f260) at softclock_thread+0x13b
> end trace frame: 0x0, count: -3
> ddb{0}> 
> 

This looks like some bad memory access. This is a fault and not really a
panic this is why 'show panic' returns 'the kernel did not panic'.

The crash in icmp_mtudisc_timeout() points to some error in the rttimer
code refactor I made. Please try a newer snapshot and include the dmesg.

If it happens again a call to db_show_rtentry in ddb may help better
understand what is going on:

call db_show_rtentry(fd807a50b070, 0, 0)
Where the first argument is derived from the first argument of
icmp_mtudisc_timeout from the trace.

-- 
:wq Claudio



Cron running at 99% CPU for seemingly no reason

2022-05-15 Thread Stephan Mending
Hi *, 
I've got a system running -current that keeps crashing on me every couple of 
days. 
Output of ddb: 

Connected to /dev/cuaU0 (speed 115200)

ddb{0}> show panic
the kernel did not panic
ddb{0}> show uvm
Current UVM status:
  pagesize=4096 (0x1000), pagemask=0xfff, pageshift=12
  482451 VM pages: 43158 active, 132795 inactive, 35 wired, 192336 free (24054 z
ero)
  min  10% (25) anon, 10% (25) vnode, 5% (12) vtext
  freemin=16081, free-target=21441, inactive-target=0, wired-max=160817
  faults=2487210, traps=2404140, intrs=211883, ctxswitch=1960560 fpuswitch=0
  softint=3499069, syscalls=2015497, kmapent=9
  fault counts:
noram=0, noanon=0, noamap=0, pgwait=0, pgrele=0
ok relocks(total)=192470(193514), anget(retries)=603205(0), amapcopy=177151

neighbor anon/obj pg=82033/639788, gets(lock/unlock)=415897/193548
cases: anon=570367, anoncow=32838, obj=347149, prcopy=67670, przero=1469152

  daemon and swap counts:
woke=0, revs=0, scans=0, obscans=0, anscans=0
busy=0, freed=0, reactivate=0, deactivate=0
pageouts=0, pending=0, nswget=0
nswapdev=1
swpages=526020, swpginuse=0, swpgonly=0 paging=0
  kernel pointers:
objs(kern)=0x8238a038
ddb{0}> show trace
No such command
ddb{0}> trace
icmp_mtudisc_timeout(fd807a50b070,0) at icmp_mtudisc_timeout+0x77
rt_timer_timer(8235d668) at rt_timer_timer+0x1cc
softclock_thread(8000f260) at softclock_thread+0x13b
end trace frame: 0x0, count: -3
ddb{0}> 

Output of a second crash: 

ddb{0}> show panic
the kernel did not panic
ddb{0}> trace
icmp_mtudisc_timeout(fd8069f9f700,0) at icmp_mtudisc_timeout+0x77
rt_timer_timer(8231bfc8) at rt_timer_timer+0x1cc
softclock_thread(8000f500) at softclock_thread+0x13b
end trace frame: 0x0, count: -3
ddb{0}> show uvm
Current UVM status:
  pagesize=4096 (0x1000), pagemask=0xfff, pageshift=12
  482457 VM pages: 29240 active, 133535 inactive, 35 wired, 205028 free (25630 z
ero)
  min  10% (25) anon, 10% (25) vnode, 5% (12) vtext
  freemin=16081, free-target=21441, inactive-target=0, wired-max=160819
  faults=687274, traps=693441, intrs=75204, ctxswitch=381252 fpuswitch=0
  softint=615411, syscalls=607703, kmapent=9
  fault counts:
noram=0, noanon=0, noamap=0, pgwait=0, pgrele=0
ok relocks(total)=185433(186477), anget(retries)=141598(0), amapcopy=75047
neighbor anon/obj pg=69895/201703, gets(lock/unlock)=256502/186509
cases: anon=114948, anoncow=26650, obj=237702, prcopy=17724, przero=290216
  daemon and swap counts:
woke=0, revs=0, scans=0, obscans=0, anscans=0
busy=0, freed=0, reactivate=0, deactivate=0
pageouts=0, pending=0, nswget=0
nswapdev=1
swpages=526020, swpginuse=0, swpgonly=0 paging=0
  kernel pointers:
objs(kern)=0x82317458
ddb{0}> show bcstats
Current Buffer Cache status:
numbufs 24114 busymapped 0, delwri 5
kvaslots 6030 avail kva slots 6030
bufpages 96426, dmapages 96426, dirtypages 20
pendingreads 0, pendingwrites 0
highflips 0, highflops 0, dmaflips 0
ddb{0}> mount
No such command
ddb{0}> trace
icmp_mtudisc_timeout(fd8069f9f700,0) at icmp_mtudisc_timeout+0x77
rt_timer_timer(8231bfc8) at rt_timer_timer+0x1cc
softclock_thread(8000f500) at softclock_thread+0x13b
end trace frame: 0x0, count: -3



Especially the line stating "the kernel did not panic" surprises me, as I am 
greeted by the kernel debugger. Not sure how to interpret that.
While looking for the reason behind these "crashes", I noticed that cron is 
constantly running at 99% cpu. 

As a first measure I commented out all cronjobs in place (except for daily 
weekly monthly as I figured these shouldnt
pose a problem). But that did not remedy the problem. Right after startup cron 
starts eating away at the cpu. Does 
anybody have an idea how to further analyze the issue (apart from giving it a 
go by recompiling cron and using gdb) ? 

Best regards, 
Stephan