On Sunday, Oct 2 2011, at 3:00 PM, Juergen Hannken-Illjes wrote:

> Module Name:    src
> Committed By:   hannken
> Date:           Sun Oct 2 13:00:07 UTC 2011
>
> Modified Files:
>         src/sys/kern: vfs_vnode.c
>
> Log Message:
> The path getnewvnode()->getcleanvnode()->vclean()->VOP_LOCK() will panic
> if the vnode we want to clean is a layered vnode and the caller already
> locked its lower vnode.
>
> Change getnewvnode() to always allocate a fresh vnode and add a helper
> thread (vdrain) to keep the number of allocated vnodes within desiredvnodes.
>
> Rename getcleanvnode() to cleanvnode() and let it take a vnode from the
> lists, clean and free it.
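An aside for anyone skimming the thread: the change boils down to "getnewvnode()
always allocates a fresh vnode, and a helper thread trims the total back under
desiredvnodes when allocations overshoot". Below is a minimal user-space sketch
of that shape, assuming pthreads instead of the kernel's condvar/kthread API;
the names node_alloc, drain_thread, numnodes and desirednodes are made up for
illustration, and this is not the committed code (see the diff further down
for that).

/* Toy model: always allocate fresh, let a background drainer trim the excess. */
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t count_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drain_cv   = PTHREAD_COND_INITIALIZER;
static int numnodes;			/* stands in for numvnodes */
static const int desirednodes = 100;	/* stands in for desiredvnodes */

static void
node_alloc(void)
{
	pthread_mutex_lock(&count_lock);
	numnodes++;			/* always a fresh allocation, nothing recycled here */
	if (numnodes > desirednodes + desirednodes / 10)
		pthread_cond_signal(&drain_cv);	/* poke the drainer */
	pthread_mutex_unlock(&count_lock);
}

static void *
drain_thread(void *cookie)
{
	struct timespec ts;

	(void)cookie;
	pthread_mutex_lock(&count_lock);
	for (;;) {
		/* Like cv_timedwait(..., hz): wake up at least once a second. */
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += 1;
		pthread_cond_timedwait(&drain_cv, &count_lock, &ts);
		while (numnodes > desirednodes)
			numnodes--;	/* "clean and free" one surplus node */
	}
	return NULL;			/* not reached */
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, drain_thread, NULL);
	for (int i = 0; i < 500; i++)
		node_alloc();		/* overshoot the target on purpose */
	sleep(2);			/* give the drainer time to run */

	pthread_mutex_lock(&count_lock);
	printf("settled at %d nodes (target %d)\n", numnodes, desirednodes);
	pthread_mutex_unlock(&count_lock);
	return 0;
}

The real vdrain differs in the details (cleanvnode() drops and retakes
vnode_free_list_lock, and a vnode that cannot be cleaned causes a
kpause-and-retry rather than a plain decrement), but the control flow has the
same shape.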
Thanks for doing this. It should help zfs too; I saw a couple of panics
with zfs that were caused by this.

Have you done any benchmarks? When I proposed a solution like this, the
main objection was that vnodes were previously not allocated from the
storage pool every time (sometimes we reused an already allocated one),
and that allocating a fresh one every time may affect performance.

>
> Reviewed by: David Holland <dholl...@netbsd.org>
>
> Should fix:
> PR #19110 (nullfs mounts over NFS cause lock manager problems)
> PR #34102 (ffs panic in NetBSD 3.0_STABLE)
> PR #45115 (lock error panic when build.sh*3 and daily script is running)
> PR #45355 (Reader/writer lock error: rw_vector_enter: locking against myself)
>
>
> To generate a diff of this commit:
> cvs rdiff -u -r1.11 -r1.12 src/sys/kern/vfs_vnode.c
>
> Please note that diffs are not public domain; they are subject to the
> copyright notices on the relevant files.
>
> Modified files:
>
> Index: src/sys/kern/vfs_vnode.c
> diff -u src/sys/kern/vfs_vnode.c:1.11 src/sys/kern/vfs_vnode.c:1.12
> --- src/sys/kern/vfs_vnode.c:1.11	Thu Sep 29 20:51:38 2011
> +++ src/sys/kern/vfs_vnode.c	Sun Oct 2 13:00:06 2011
> @@ -1,4 +1,4 @@
> -/*	$NetBSD: vfs_vnode.c,v 1.11 2011/09/29 20:51:38 christos Exp $	*/
> +/*	$NetBSD: vfs_vnode.c,v 1.12 2011/10/02 13:00:06 hannken Exp $	*/
>
>  /*-
>   * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
> @@ -76,7 +76,6 @@
>   *	starts in one of the following ways:
>   *
>   *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
> - *	- Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9).
>   *	- Reclamation of inactive vnode, via vget(9).
>   *
>   *	The life-cycle ends when the last reference is dropped, usually
> @@ -121,7 +120,7 @@
>   */
>
>  #include <sys/cdefs.h>
> -__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.11 2011/09/29 20:51:38 christos Exp $");
> +__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.12 2011/10/02 13:00:06 hannken Exp $");
>
>  #include <sys/param.h>
>  #include <sys/kernel.h>
> @@ -159,8 +158,10 @@ static kcondvar_t vrele_cv __cacheline_
>  static lwp_t *		vrele_lwp		__cacheline_aligned;
>  static int		vrele_pending		__cacheline_aligned;
>  static int		vrele_gen		__cacheline_aligned;
> +static kcondvar_t	vdrain_cv		__cacheline_aligned;
>
> -static vnode_t *	getcleanvnode(void);
> +static int		cleanvnode(void);
> +static void		vdrain_thread(void *);
>  static void		vrele_thread(void *);
>  static void		vnpanic(vnode_t *, const char *, ...)
>      __attribute__((__format__(__printf__, 2, 3)));
> @@ -183,7 +184,11 @@ vfs_vnode_sysinit(void)
>  	TAILQ_INIT(&vrele_list);
>
>  	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
> +	cv_init(&vdrain_cv, "vdrain");
>  	cv_init(&vrele_cv, "vrele");
> +	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
> +	    NULL, NULL, "vdrain");
> +	KASSERT(error == 0);
>  	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
>  	    NULL, &vrele_lwp, "vrele");
>  	KASSERT(error == 0);
> @@ -249,13 +254,12 @@ vnfree(vnode_t *vp)
>  }
>
>  /*
> - * getcleanvnode: grab a vnode from freelist and clean it.
> + * cleanvnode: grab a vnode from freelist, clean and free it.
>   *
>   * => Releases vnode_free_list_lock.
> - * => Returns referenced vnode on success.
>   */
> -static vnode_t *
> -getcleanvnode(void)
> +static int
> +cleanvnode(void)
>  {
>  	vnode_t *vp;
>  	vnodelst_t *listhd;
> @@ -288,7 +292,7 @@ try_nextlist:
>  			goto try_nextlist;
>  		}
>  		mutex_exit(&vnode_free_list_lock);
> -		return NULL;
> +		return EBUSY;
>  	}
>
>  	/* Remove it from the freelist. */
> @@ -300,7 +304,7 @@ try_nextlist:
>
>  	/*
>  	 * The vnode is still associated with a file system, so we must
> -	 * clean it out before reusing it. We need to add a reference
> +	 * clean it out before freeing it. We need to add a reference
>  	 * before doing this. If the vnode gains another reference while
>  	 * being cleaned out then we lose - retry.
>  	 */
> @@ -308,15 +312,7 @@ try_nextlist:
>  	vclean(vp, DOCLOSE);
>  	KASSERT(vp->v_usecount >= 1 + VC_XLOCK);
>  	atomic_add_int(&vp->v_usecount, -VC_XLOCK);
> -	if (vp->v_usecount == 1) {
> -		/* We're about to dirty it. */
> -		vp->v_iflag &= ~VI_CLEAN;
> -		mutex_exit(vp->v_interlock);
> -		if (vp->v_type == VBLK || vp->v_type == VCHR) {
> -			spec_node_destroy(vp);
> -		}
> -		vp->v_type = VNON;
> -	} else {
> +	if (vp->v_usecount > 1) {
>  		/*
>  		 * Don't return to freelist - the holder of the last
>  		 * reference will destroy it.
> @@ -326,17 +322,26 @@ try_nextlist:
>  		goto retry;
>  	}
>
> +	KASSERT((vp->v_iflag & VI_CLEAN) == VI_CLEAN);
> +	mutex_exit(vp->v_interlock);
> +	if (vp->v_type == VBLK || vp->v_type == VCHR) {
> +		spec_node_destroy(vp);
> +	}
> +	vp->v_type = VNON;
> +
>  	KASSERT(vp->v_data == NULL);
>  	KASSERT(vp->v_uobj.uo_npages == 0);
>  	KASSERT(TAILQ_EMPTY(&vp->v_uobj.memq));
>  	KASSERT(vp->v_numoutput == 0);
>  	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
>
> -	return vp;
> +	vrele(vp);
> +
> +	return 0;
>  }
>
>  /*
> - * getnewvnode: return the next vnode from the free list.
> + * getnewvnode: return a fresh vnode.
>   *
>   * => Returns referenced vnode, moved into the mount queue.
>   * => Shares the interlock specified by 'slock', if it is not NULL.
> @@ -346,9 +351,8 @@ getnewvnode(enum vtagtype tag, struct mo
>      kmutex_t *slock, vnode_t **vpp)
>  {
>  	struct uvm_object *uobj;
> -	static int toggle;
>  	vnode_t *vp;
> -	int error = 0, tryalloc;
> +	int error = 0;
>
>  try_again:
>  	if (mp != NULL) {
> @@ -361,80 +365,28 @@ try_again:
>  		return error;
>  	}
>
> -	/*
> -	 * We must choose whether to allocate a new vnode or recycle an
> -	 * existing one. The criterion for allocating a new one is that
> -	 * the total number of vnodes is less than the number desired or
> -	 * there are no vnodes on either free list. Generally we only
> -	 * want to recycle vnodes that have no buffers associated with
> -	 * them, so we look first on the vnode_free_list. If it is empty,
> -	 * we next consider vnodes with referencing buffers on the
> -	 * vnode_hold_list. The toggle ensures that half the time we
> -	 * will use a buffer from the vnode_hold_list, and half the time
> -	 * we will allocate a new one unless the list has grown to twice
> -	 * the desired size. We are reticent to recycle vnodes from the
> -	 * vnode_hold_list because we will lose the identity of all its
> -	 * referencing buffers.
> -	 */
> -
>  	vp = NULL;
>
> +	/* Allocate a new vnode. */
>  	mutex_enter(&vnode_free_list_lock);
> -
> -	toggle ^= 1;
> -	if (numvnodes > 2 * desiredvnodes)
> -		toggle = 0;
> -
> -	tryalloc = numvnodes < desiredvnodes ||
> -	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
> -	    (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
> -
> -	if (tryalloc) {
> -		/* Allocate a new vnode. */
> -		numvnodes++;
> +	numvnodes++;
> +	if (numvnodes > desiredvnodes + desiredvnodes / 10)
> +		cv_signal(&vdrain_cv);
> +	mutex_exit(&vnode_free_list_lock);
> +	if ((vp = vnalloc(NULL)) == NULL) {
> +		mutex_enter(&vnode_free_list_lock);
> +		numvnodes--;
>  		mutex_exit(&vnode_free_list_lock);
> -		if ((vp = vnalloc(NULL)) == NULL) {
> -			mutex_enter(&vnode_free_list_lock);
> -			numvnodes--;
> -		} else
> -			vp->v_usecount = 1;
> -	}
> -
> -	if (vp == NULL) {
> -		/* Recycle and get vnode clean. */
> -		vp = getcleanvnode();
> -		if (vp == NULL) {
> -			if (mp != NULL) {
> -				vfs_unbusy(mp, false, NULL);
> -			}
> -			if (tryalloc) {
> -				printf("WARNING: unable to allocate new "
> -				    "vnode, retrying...\n");
> -				kpause("newvn", false, hz, NULL);
> -				goto try_again;
> -			}
> -			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
> -			*vpp = 0;
> -			return ENFILE;
> -		}
> -		if ((vp->v_iflag & VI_LOCKSHARE) != 0 || slock) {
> -			/* We must remove vnode from the old mount point. */
> -			if (vp->v_mount) {
> -				vfs_insmntque(vp, NULL);
> -			}
> -			/* Allocate a new interlock, if it was shared. */
> -			if (vp->v_iflag & VI_LOCKSHARE) {
> -				uvm_obj_setlock(&vp->v_uobj, NULL);
> -				vp->v_iflag &= ~VI_LOCKSHARE;
> -			}
> +		if (mp != NULL) {
> +			vfs_unbusy(mp, false, NULL);
>  		}
> -		vp->v_iflag = 0;
> -		vp->v_vflag = 0;
> -		vp->v_uflag = 0;
> -		vp->v_socket = NULL;
> +		printf("WARNING: unable to allocate new vnode, retrying...\n");
> +		kpause("newvn", false, hz, NULL);
> +		goto try_again;
>  	}
>
> -	KASSERT(vp->v_usecount == 1);
> +	vp->v_usecount = 1;
> +
>  	KASSERT(vp->v_freelisthd == NULL);
>  	KASSERT(LIST_EMPTY(&vp->v_nclist));
>  	KASSERT(LIST_EMPTY(&vp->v_dnclist));
> @@ -493,6 +445,29 @@ ungetnewvnode(vnode_t *vp)
>  }
>
>  /*
> + * Helper thread to keep the number of vnodes below desiredvnodes.
> + */
> +static void
> +vdrain_thread(void *cookie)
> +{
> +	int error;
> +
> +	mutex_enter(&vnode_free_list_lock);
> +
> +	for (;;) {
> +		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
> +		while (numvnodes > desiredvnodes) {
> +			error = cleanvnode();
> +			if (error)
> +				kpause("vndsbusy", false, hz, NULL);
> +			mutex_enter(&vnode_free_list_lock);
> +			if (error)
> +				break;
> +		}
> +	}
> +}
> +
> +/*
>   * Remove a vnode from its freelist.
>   */
>  void
> @@ -1204,17 +1179,19 @@ vwait(vnode_t *vp, int flags)
>  int
>  vfs_drainvnodes(long target)
>  {
> +	int error;
>
> -	while (numvnodes > target) {
> -		vnode_t *vp;
> +	mutex_enter(&vnode_free_list_lock);
>
> +	while (numvnodes > target) {
> +		error = cleanvnode();
> +		if (error != 0)
> +			return error;
>  		mutex_enter(&vnode_free_list_lock);
> -		vp = getcleanvnode();
> -		if (vp == NULL) {
> -			return EBUSY;
> -		}
> -		ungetnewvnode(vp);
>  	}
> +
> +	mutex_exit(&vnode_free_list_lock);
> +
>  	return 0;
>  }
>

Regards,
Adam.