Author: kmacy
Date: Thu May  7 20:28:06 2009
New Revision: 191900
URL: http://svn.freebsd.org/changeset/base/191900

Log:
  Asynchronously release vnodes to avoid blocking on range locks when calling
  back into ZFS.
  This is based on a fix that went into OpenSolaris on March 9th. However, it
  uses a dedicated thread instead of a Solaris taskq to avoid doing a blocking
  memory allocation with the vnode interlock held.
  
  This fixes a long-standing deadlock in ZFS. This is not, strictly speaking,
  an LOR (lock order reversal). The spa_zio thread releases a vnode, which
  calls into vn_reclaim, which in turn needs to acquire range locks to sync
  dirty data out to disk. The range locks are already held by a user-level
  process that is waiting on a condition variable which it expects a spa_zio
  thread to signal. The process could not be signalled because the spa_zio
  thread could not proceed.
  
  The nature of this problem was not apparent because ZFS locks opt out of
  witness, which meant that DDB did not know about the locks held by ZFS.
  
  Reviewed by:  pjd
  MFC after:    7 days

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
  head/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c     Thu May  7 
19:57:14 2009        (r191899)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c     Thu May  7 
20:28:06 2009        (r191900)
@@ -41,6 +41,7 @@
 
 #include <sys/types.h>
 #include <sys/param.h>
+#include <sys/proc.h>
 #include <sys/vnode.h>
 
 /* Extensible attribute (xva) routines. */
@@ -72,3 +73,139 @@ xva_getxoptattr(xvattr_t *xvap)
                xoap = &xvap->xva_xoptattrs;
        return (xoap);
 }
+
+static STAILQ_HEAD(, vnode) vn_rele_async_list;
+static struct mtx vn_rele_async_lock;
+static struct cv vn_rele_async_cv;
+static int vn_rele_list_length;
+static int vn_rele_async_thread_exit;
+
+typedef struct  {
+       struct vnode *stqe_next;
+} vnode_link_t;
+
+/*
+ * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
+ * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
+ * the file system as a result of releasing the vnode. Note, file systems
+ * already have to handle the race where the vnode is incremented before the
+ * inactive routine is called and does its locking.
+ *
+ * Warning: Excessive use of this routine can lead to performance problems.
+ * This is because taskqs throttle back allocation if too many are created.
+ */
+void
+vn_rele_async(vnode_t *vp, taskq_t *taskq /* unused */)
+{
+       
+       KASSERT(vp != NULL, ("vrele: null vp"));
+       VFS_ASSERT_GIANT(vp->v_mount);
+       VI_LOCK(vp);
+
+       if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
+           vp->v_usecount == 1)) {
+               vp->v_usecount--;
+               vdropl(vp);
+               return;
+       }       
+       if (vp->v_usecount != 1) {
+#ifdef DIAGNOSTIC
+               vprint("vrele: negative ref count", vp);
+#endif
+               VI_UNLOCK(vp);
+               panic("vrele: negative ref cnt");
+       }
+       /*
+        * We are exiting
+        */
+       if (vn_rele_async_thread_exit != 0) {
+               vrele(vp);
+               return;
+       }
+       
+       mtx_lock(&vn_rele_async_lock);
+
+       /*  STAILQ_INSERT_TAIL                  */
+       (*(vnode_link_t *)&vp->v_cstart).stqe_next = NULL;
+       *vn_rele_async_list.stqh_last = vp;
+       vn_rele_async_list.stqh_last =
+           &((vnode_link_t *)&vp->v_cstart)->stqe_next;
+
+       /****************************************/
+       vn_rele_list_length++;
+       if ((vn_rele_list_length % 100) == 0)
+               cv_signal(&vn_rele_async_cv);
+       mtx_unlock(&vn_rele_async_lock);
+       VI_UNLOCK(vp);
+}
+
+static void
+vn_rele_async_init(void *arg)
+{
+
+       mtx_init(&vn_rele_async_lock, "valock", NULL, MTX_DEF);
+       STAILQ_INIT(&vn_rele_async_list);
+
+       /* cv_init(&vn_rele_async_cv, "vacv"); */
+       vn_rele_async_cv.cv_description = "vacv";
+       vn_rele_async_cv.cv_waiters = 0;
+}
+
+void
+vn_rele_async_fini(void)
+{
+
+       mtx_lock(&vn_rele_async_lock);
+       vn_rele_async_thread_exit = 1;
+       cv_signal(&vn_rele_async_cv);
+       while (vn_rele_async_thread_exit != 0)
+               cv_wait(&vn_rele_async_cv, &vn_rele_async_lock);
+       mtx_unlock(&vn_rele_async_lock);
+       mtx_destroy(&vn_rele_async_lock);
+}
+
+
+static void
+vn_rele_async_cleaner(void)
+{
+       STAILQ_HEAD(, vnode) vn_tmp_list;
+       struct vnode *curvnode;
+
+       STAILQ_INIT(&vn_tmp_list);
+       mtx_lock(&vn_rele_async_lock);
+       while (vn_rele_async_thread_exit == 0) {
+               STAILQ_CONCAT(&vn_tmp_list, &vn_rele_async_list);
+               vn_rele_list_length = 0;
+               mtx_unlock(&vn_rele_async_lock);
+               
+               while (!STAILQ_EMPTY(&vn_tmp_list)) {
+                       curvnode = STAILQ_FIRST(&vn_tmp_list);
+
+                       /*   STAILQ_REMOVE_HEAD */
+                       STAILQ_FIRST(&vn_tmp_list) =
+                           ((vnode_link_t *)&curvnode->v_cstart)->stqe_next;
+                       if (STAILQ_FIRST(&vn_tmp_list) == NULL)
+                                        vn_tmp_list.stqh_last = 
&STAILQ_FIRST(&vn_tmp_list);
+                       /***********************/
+                       vrele(curvnode);
+               }
+               mtx_lock(&vn_rele_async_lock);
+               if (vn_rele_list_length == 0)
+                       cv_timedwait(&vn_rele_async_cv, &vn_rele_async_lock,
+                           hz/10);
+       }
+
+       vn_rele_async_thread_exit = 0;
+       cv_broadcast(&vn_rele_async_cv);
+       mtx_unlock(&vn_rele_async_lock);
+       thread_exit();
+}
+
+static struct proc *vn_rele_async_proc;
+static struct kproc_desc up_kp = {
+       "vaclean",
+       vn_rele_async_cleaner,
+       &vn_rele_async_proc
+};
+SYSINIT(vaclean, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
+SYSINIT(vn_rele_async_setup, SI_SUB_VFS, SI_ORDER_FIRST, vn_rele_async_init, 
NULL);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c   Thu May  7 
19:57:14 2009        (r191899)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c   Thu May  7 
20:28:06 2009        (r191900)
@@ -1199,6 +1199,7 @@ dmu_init(void)
 void
 dmu_fini(void)
 {
+       vn_rele_async_fini();
        arc_fini();
        dnode_fini();
        dbuf_fini();

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c     Thu May 
 7 19:57:14 2009        (r191899)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c     Thu May 
 7 20:28:06 2009        (r191900)
@@ -93,6 +93,7 @@
  *     pushing cached pages (which acquires range locks) and syncing out
  *     cached atime changes.  Third, zfs_zinactive() may require a new tx,
  *     which could deadlock the system if you were already holding one.
+ *     If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
  *
  *  (3)        All range locks must be grabbed before calling dmu_tx_assign(),
  *     as they can span dmu_tx_assign() calls.
@@ -928,7 +929,11 @@ zfs_get_done(dmu_buf_t *db, void *vzgd)
        vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
        dmu_buf_rele(db, vzgd);
        zfs_range_unlock(rl);
-       VN_RELE(vp);
+       /*
+        * Release the vnode asynchronously as we currently have the
+        * txg stopped from syncing.
+        */
+       VN_RELE_ASYNC(vp, NULL);
        zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
        kmem_free(zgd, sizeof (zgd_t));
        VFS_UNLOCK_GIANT(vfslocked);
@@ -959,7 +964,12 @@ zfs_get_data(void *arg, lr_write_t *lr, 
        if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
                return (ENOENT);
        if (zp->z_unlinked) {
-               VN_RELE(ZTOV(zp));
+               /*
+                * Release the vnode asynchronously as we currently have the
+                * txg stopped from syncing.
+                */
+               VN_RELE_ASYNC(ZTOV(zp), NULL);
+
                return (ENOENT);
        }
 
@@ -1031,7 +1041,11 @@ zfs_get_data(void *arg, lr_write_t *lr, 
        }
 out:
        zfs_range_unlock(rl);
-       VN_RELE(ZTOV(zp));
+       /*
+        * Release the vnode asynchronously as we currently have the
+        * txg stopped from syncing.
+        */
+       VN_RELE_ASYNC(ZTOV(zp), NULL);
        return (error);
 }
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c   Thu May  7 
19:57:14 2009        (r191899)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c   Thu May  7 
20:28:06 2009        (r191900)
@@ -1041,7 +1041,7 @@ zil_clean(zilog_t *zilog)
        if ((itx != NULL) &&
            (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
                (void) taskq_dispatch(zilog->zl_clean_taskq,
-                   (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
+                   (task_func_t *)zil_itx_clean, zilog, TQ_SLEEP);
        }
        mutex_exit(&zilog->zl_lock);
 }

Modified: head/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h    Thu May  7 
19:57:14 2009        (r191899)
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h    Thu May  7 
20:28:06 2009        (r191900)
@@ -377,6 +377,15 @@ typedef struct caller_context {
 void           xva_init(xvattr_t *);
 xoptattr_t     *xva_getxoptattr(xvattr_t *);   /* Get ptr to xoptattr_t */
 
+struct taskq;
+void   vn_rele_async(struct vnode *vp, struct taskq *taskq);
+void   vn_rele_async_fini(void);
+       
+       
+#define        VN_RELE_ASYNC(vp, taskq)        { \
+       vn_rele_async(vp, taskq); \
+}
+
 /*
  * Flags to VOP_SETATTR/VOP_GETATTR.
  */
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to