Re: macppc panic: vref used where vget required

2022-04-13 Thread Sebastien Marie
Andrew,

Here is a new diff which records a bit more information.

It keeps track of the last 10 v_usecount changes with:
 - pid and ps_comm
 - usecount value (before and after)
 - stacktrace
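
Since the hunk with the actual implementation got truncated below, here is a
rough sketch of what the recording side is intended to do, reconstructed from
the declarations in the diff; the process-field accesses and the kernel-lock
assumption are mine, and the real code may differ:

/*
 * sketch only: reconstructed from the struct vhistory / vnode_history
 * declarations below; assumes the kernel lock serializes updates (the
 * [K] annotations) and that callers record *after* changing
 * v_usecount, as the vget()/vref() hunks show.
 */
void
vnode_history_record(struct vnode *vp, int dir)
{
	struct vnode_history *vh = vp->v_history;
	struct vhistory *vhh = &vh->vh_ring[vh->vh_next];
	struct process *pr = curproc->p_p;

	vhh->vhh_pid = pr->ps_pid;
	strlcpy(vhh->vhh_comm, pr->ps_comm, sizeof(vhh->vhh_comm));
	vhh->vhh_new_usecount = vp->v_usecount;
	vhh->vhh_old_usecount = (dir == VH_INCR) ?
	    vp->v_usecount - 1 : vp->v_usecount + 1;
	stacktrace_save_at(&vhh->vhh_st, 0);
	vh->vh_next = (vh->vh_next + 1) % VH_MAXST;
}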

We will need:
- the vnode_history (without offsets)
- the panic message and its backtrace

but also keep photos.
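
For reference, the printing side might look roughly like this (again a sketch
reconstructed from the declarations, not the actual hunk; it leans on the
stock stacktrace_print() helper):

/*
 * sketch only: dump the ring oldest-first starting from vh_next,
 * letting the caller supply the printer (printf or ddb's db_printf).
 */
void
vnode_history_print(struct vnode *vp,
    int (*pr)(const char *, ...))
{
	struct vnode_history *vh = vp->v_history;
	struct vhistory *vhh;
	u_int i, idx;

	(*pr)("=> vnode_history_print %p, next=%u\n", vp, vh->vh_next);
	for (i = 0; i < VH_MAXST; i++) {
		idx = (vh->vh_next + i) % VH_MAXST;
		vhh = &vh->vh_ring[idx];
		(*pr)(" [%u] pid %d (%s): usecount %u -> %u\n", idx,
		    vhh->vhh_pid, vhh->vhh_comm,
		    vhh->vhh_old_usecount, vhh->vhh_new_usecount);
		stacktrace_print(&vhh->vhh_st, pr);
	}
}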


On Wed, Apr 13, 2022 at 11:42:22AM +0200, Martin Pieuchot wrote:
> On 12/04/22(Tue) 14:58, Sebastien Marie wrote:
> > [...] 
> > uvn_io: start: 0x8000ffab9688, type VREG, use 0, write 0, hold 0, flags (VBIOONFREELIST)
> > tag VT_UFS, ino 14802284, on dev 4, 30 flags 0x100, effnlink 1, nlink 1
> > mode 0100644, owner 858, group 1000, size 345603015
> > => vnode_history_print 0x8000ffab9688, next=1
> 
> So we see the same crash in the pdaemon when trying to sync, at least,
> an inactive page to disk...
> 
> [...]
> The vnode is mmaped here. 
> 
> [...] 
> The process exits here; it seems single-threaded.
> 
> [...]
> The reaper unmaps all the memory and releases the reference grabbed
> during mmap.
> 
> [...]
> Who is doing this?  Could you print the pid of the process?  It seems
> the vnode has been recycled at this point.
> 
> [...]
> Same for this, has the vnode been recycled?
> 
> 
> > panic: vn_lock: v_usecount == 0
> > Stopped at   db_enter+0x10: popq %rbp
> >    TID    PID    UID   PRFLAGS   PFLAGS  CPU  COMMAND
> >  448838  87877 858   0  0x400  3  qbittorrent-nox
> > *281933  50305   0 0x14000  0x200  1K pagedaemon
> > db_enter() at ...
> > panic(...)
> > vn_lock(800ffab9688,81)
> > uvn_io(fd837a66bee8,...,1,90,1)
> > uvm_pager_put(fd837a66bee8, )
> > uvmpd_scan_inactive(...)
> > uvmpd_scan()
> > uvm_pageout(80005270)
> 
> This indicates a page hasn't been freed during uvn_detach(), which is
> called as part of uvm_unmap_detach() by the reaper... Weird.
> 

-- 
Sebastien Marie

diff 153abd98269720429ee0b19133ca24d5c39a7f36 /home/semarie/repos/openbsd/src
blob - 13c70890ccc11f131013d14b853f4f06941924cc
file + sys/kern/vfs_subr.c
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -66,6 +66,7 @@
 #include 
 #include 
 #include 
+#include <sys/stacktrace.h>
 
 #include 
 
@@ -126,8 +127,27 @@ void vfs_unmountall(void);
 void printlockedvnodes(void);
 #endif
 
+#define VH_MAXST   10  /* ring size */
+#define VH_INCR	0
+#define VH_DECR	1
+struct vhistory {
+   pid_t   vhh_pid;
+   char    vhh_comm[_MAXCOMLEN]; /* incl NUL */
+   u_int   vhh_old_usecount;
+   u_int   vhh_new_usecount;
+   struct stacktrace vhh_st;
+};
+struct vnode_history {
+   u_int   vh_next;/* [K] next free slot in vh_ring */
+   struct vhistory vh_ring[VH_MAXST]; /* [K] ring for holding the vnode history */
+};
+void vnode_history_init(struct vnode *);
+void vnode_history_record(struct vnode *, int);
+void vnode_history_print(struct vnode *, int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));
+
 struct pool vnode_pool;
 struct pool uvm_vnode_pool;
+struct pool vnode_history_pool;
 
 static inline int rb_buf_compare(const struct buf *b1, const struct buf *b2);
 RBT_GENERATE(buf_rb_bufs, buf, b_rbbufs, rb_buf_compare);
@@ -154,6 +174,8 @@ vntblinit(void)
PR_WAITOK, "vnodes", NULL);
	pool_init(&uvm_vnode_pool, sizeof(struct uvm_vnode), 0, IPL_NONE,
	    PR_WAITOK, "uvmvnodes", NULL);
+	pool_init(&vnode_history_pool, sizeof(struct vnode_history), 0,
+	    IPL_NONE, PR_WAITOK, "vnodehis", NULL);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&mountlist);
@@ -410,6 +432,7 @@ getnewvnode(enum vtagtype tag, struct mount *mp, const
splx(s);
	vp = pool_get(&vnode_pool, PR_WAITOK | PR_ZERO);
	vp->v_uvm = pool_get(&uvm_vnode_pool, PR_WAITOK | PR_ZERO);
+	vp->v_history = pool_get(&vnode_history_pool, PR_WAITOK);
	vp->v_uvm->u_vnode = vp;
	uvm_obj_init(&vp->v_uvm->u_obj, &uvm_vnodeops, 0);
	RBT_INIT(buf_rb_bufs, &vp->v_bufs_tree);
@@ -460,6 +483,7 @@ getnewvnode(enum vtagtype tag, struct mount *mp, const
vp->v_socket = NULL;
}
cache_purge(vp);
+   vnode_history_init(vp);
vp->v_type = VNON;
vp->v_tag = tag;
vp->v_op = vops;
@@ -674,9 +698,13 @@ vget(struct vnode *vp, int flags)
}
 
vp->v_usecount++;
+   vnode_history_record(vp, VH_INCR);
+
if (flags & LK_TYPE_MASK) {
if ((error = vn_lock(vp, flags)) != 0) {
vp->v_usecount--;
+   vnode_history_record(vp, VH_DECR);
+
if (vp->v_usecount == 0 && onfreelist)
vputonfreelist(vp);
}
@@ -700,6 +728,7 @@ vref(struct vnode *vp)
panic("vref on a VNON vnode");
 #endif
vp->v_usecount++;
+   vnode_history_record(vp, VH_INCR);
 }
 
 void
@@ -762,6 +791,7 @@ vput(struct vnode *vp)
}
 #endif

Re: macppc panic: vref used where vget required

2022-04-13 Thread Martin Pieuchot
On 12/04/22(Tue) 14:58, Sebastien Marie wrote:
> [...] 
> uvn_io: start: 0x8000ffab9688, type VREG, use 0, write 0, hold 0, flags (VBIOONFREELIST)
>   tag VT_UFS, ino 14802284, on dev 4, 30 flags 0x100, effnlink 1, nlink 1
>   mode 0100644, owner 858, group 1000, size 345603015
> => vnode_history_print 0x8000ffab9688, next=1

So we see the same crash in the pdaemon when trying to sync, at least,
an inactive page to disk...

>  [0]
> #0 vput
> #1 dofstatat
> #2 syscall
> #3 Xsyscall
>  [1]
> #0 vput
> #1 dofstatat
> #2 syscall
> #3 Xsyscall
>  [2]
> #0 vget
> #1 ufs_ihashget
> #2 ffs_vget
> #3 ufs_lookup
> #4 VOP_LOOKUP
> #5 vfs_lookup
> #6 namei
> #7 vn_open
> #8 doopenat
> #9 syscall
> #10 Xsyscall
>  [3]
> #0 uvn_attach
> #1 uvm_mmapfile
> #2 sys_mmap
> #3 syscall
> #4 Xsyscall

The vnode is mmaped here. 

>  [4]
> #0 vput
> #1 vn_closefile
> #2 fdrop
> #3 closef
> #4 fdfree
> #5 exit1
> #6 single_thread_check_locked
> #7 userret
> #8 intr_user_exit

The process exits here; it seems single-threaded.

>  [5]
> #0 vrele
> #1 uvm_unmap_detach
> #2 uvm_map_teardown
> #3 uvmspace_free
> #4 reaper
> #5 proc_trampoline

The reaper unmaps all the memory and releases the reference grabbed
during mmap.

>  [6]
> #0 vget
> #1 ufs_ihashget
> #2 ffs_vget
> #3 ufs_lookup
> #4 VOP_LOOKUP
> #5 vfs_lookup
> #6 namei
> #7 dofstatat
> #8 syscall
> #9 Xsyscall

Who is doing this?  Could you print the pid of the process?  It seems
the vnode has been recycled at this point.

>  [7]
> #0 vput
> #1 dofstatat
> #2 syscall
> #3 Xsyscall
>  [8]
> #0 vget
> #1 ufs_ihashget
> #2 ffs_vget
> #3 ufs_lookup
> #4 VOP_LOOKUP
> #5 vfs_lookup
> #6 namei
> #7 dofstatat
> #8 syscall
> #9 Xsyscall

Same for this, has the vnode been recycled?


> panic: vn_lock: v_usecount == 0
> Stopped at   db_enter+0x10: popq %rbp
>    TID    PID    UID   PRFLAGS   PFLAGS  CPU  COMMAND
>  448838  87877 858   0  0x400  3  qbittorrent-nox
> *281933  50305   0 0x14000  0x200  1K pagedaemon
> db_enter() at ...
> panic(...)
> vn_lock(800ffab9688,81)
> uvn_io(fd837a66bee8,...,1,90,1)
> uvm_pager_put(fd837a66bee8, )
> uvmpd_scan_inactive(...)
> uvmpd_scan()
> uvm_pageout(80005270)

This indicates a page hasn't been freed during uvn_detach(), which is
called as part of uvm_unmap_detach() by the reaper... Weird.

> > But just in case, here is also the output of the 'show vnode'
> > command itself:
> > http://46.23.91.227:8098/show_vnode0.jpg
> > http://46.23.91.227:8098/show_vnode1.jpg
> > http://46.23.91.227:8098/show_vnode2.jpg
> > http://46.23.91.227:8098/show_vnode3.jpg
> > 
> > show mount:
> > http://46.23.91.227:8098/show_mount.jpg
> > 
> > show uvm:
> > http://46.23.91.227:8098/show_uvm.jpg
> > 
> > After I rebooted into OS from ddb and got distracted by some
> > personal stuff, qbittorrent continued rechecking from where it
> > stopped during the panic (about half of 27Gb). I was searching for
> > something in firefox at the time.
> > Very soon I saw the kernel panic again, now with a slightly different
> > content. I'm not sure if these things are related, but just in case,
> > I'll leave the data from the console for this panic as well:
> > 
> > http://46.23.91.227:8098/second_panic0.jpg
> > http://46.23.91.227:8098/second_panic_bcstats.jpg
> > http://46.23.91.227:8098/second_panic_precpu0.jpg
> > http://46.23.91.227:8098/second_panic_precpu1.jpg
> > http://46.23.91.227:8098/second_panic_uvm.jpg
> > 
> > (I suspect I should open a separate bug report for this problem?)
> 
> at this point, I dunno if it is related.
> 
> just transcribing the main elements:
> 
> panic: kernel diagnostic assertion "rv" failed: file "/sys/uvm/uvm_glue.c", line 428
> 
> current process: aiodoned
> (others: 3 firefox processes)
> 
> trace:
> __assert(...)
> uvm_atopg(...)
> uvm_aio_aiodone(...)
> uvm_aiodone_daemon(...)
> 
> the assert itself is:
>    417  /*
>    418   * uvm_atopg: convert KVAs back to their page structures.
>    419   */
>    420  struct vm_page *
>    421  uvm_atopg(vaddr_t kva)
>    422  {
>    423          struct vm_page *pg;
>    424          paddr_t pa;
>    425          boolean_t rv;
>    426
>    427          rv = pmap_extract(pmap_kernel(), kva, &pa);
>    428          KASSERT(rv);
>    429          pg = PHYS_TO_VM_PAGE(pa);
>    430          KASSERT(pg != NULL);
>    431          return (pg);
>    432  }
> 
> Thanks.
> -- 
> Sebastien Marie
>