Author: jhb
Date: Tue Jan 20 19:01:59 2009
New Revision: 187478
URL: http://svn.freebsd.org/changeset/base/187478

Log:
  MFC: Close several races with using shared vnode locks for pathname lookups
  with UFS and enable shared lookups for UFS.
  - Change the name cache to fail lookups with EBADF if a directory vnode
    is recycled while it waits for a lock upgrade.
  - Rework the locking of the dirhash to use an sx lock and reference count
    on each hash structure.  Using an sx lock instead of a mutex allows the
    lock to be held across disk I/O closing a number of races when using
    shared vnode locks that were previously handled by exclusive vnode
    locks.
  - Remove the 'i_ino' and 'i_reclen' fields from the i-node.  i_ino is now
    a local variable in ufs_lookup(), and i_reclen is not needed since
    ufs_dirremove() always has the entire block holding the directory
    entry in memory when it updates the directory.
  - 'i_diroff' and 'i_offset' are now local variables in ufs_lookup().
    'i_diroff' is updated after a successful lookup.
  - Only set i_offset in the parent directory's i-node during a lookup for
    non-LOOKUP operations.
  - Remove the LOOKUP_SHARED option.  One can set vfs.lookup_shared to 1
    in either loader.conf or sysctl.conf instead.  The default setting for
    vfs.lookup_shared is not changed and remains off by default.

Modified:
  stable/7/sys/   (props changed)
  stable/7/sys/conf/options
  stable/7/sys/contrib/pf/   (props changed)
  stable/7/sys/dev/ath/ath_hal/   (props changed)
  stable/7/sys/dev/cxgb/   (props changed)
  stable/7/sys/kern/vfs_cache.c
  stable/7/sys/kern/vfs_lookup.c
  stable/7/sys/nfsclient/nfs_vnops.c
  stable/7/sys/ufs/ffs/ffs_vfsops.c
  stable/7/sys/ufs/ufs/dirhash.h
  stable/7/sys/ufs/ufs/inode.h
  stable/7/sys/ufs/ufs/ufs_dirhash.c
  stable/7/sys/ufs/ufs/ufs_lookup.c

Modified: stable/7/sys/conf/options
==============================================================================
--- stable/7/sys/conf/options   Tue Jan 20 18:16:31 2009        (r187477)
+++ stable/7/sys/conf/options   Tue Jan 20 19:01:59 2009        (r187478)
@@ -741,9 +741,6 @@ NI4BTEL                     opt_i4b.h
 #XXXBZ#NI4BING                 opt_i4b.h
 #XXXBZ#NI4BISPPP               opt_i4b.h
 
-# VFS options
-LOOKUP_SHARED          opt_vfs.h
-
 # HWPMC options
 HWPMC_HOOKS
 

Modified: stable/7/sys/kern/vfs_cache.c
==============================================================================
--- stable/7/sys/kern/vfs_cache.c       Tue Jan 20 18:16:31 2009        
(r187477)
+++ stable/7/sys/kern/vfs_cache.c       Tue Jan 20 19:01:59 2009        
(r187478)
@@ -300,7 +300,9 @@ cache_zap(ncp)
  * succeeds, the vnode is returned in *vpp, and a status of -1 is
  * returned. If the lookup determines that the name does not exist
  * (negative cacheing), a status of ENOENT is returned. If the lookup
- * fails, a status of zero is returned.
+ * fails, a status of zero is returned.  If the directory vnode is
+ * recycled out from under us due to a forced unmount, a status of
+ * EBADF is returned.
  *
  * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
  * unlocked.  If we're looking up . an extra ref is taken, but the lock is
@@ -425,11 +427,19 @@ success:
                 * When we lookup "." we still can be asked to lock it
                 * differently...
                 */
-               ltype = cnp->cn_lkflags & (LK_SHARED | LK_EXCLUSIVE);
-               if (ltype == VOP_ISLOCKED(*vpp, td))
-                       return (-1);
-               else if (ltype == LK_EXCLUSIVE)
-                       vn_lock(*vpp, LK_UPGRADE | LK_RETRY, td);
+               ltype = cnp->cn_lkflags & LK_TYPE_MASK;
+               if (ltype != VOP_ISLOCKED(*vpp, td)) {
+                       if (ltype == LK_EXCLUSIVE) {
+                               vn_lock(*vpp, LK_UPGRADE | LK_RETRY, td);
+                               if ((*vpp)->v_iflag & VI_DOOMED) {
+                                       /* forced unmount */
+                                       vrele(*vpp);
+                                       *vpp = NULL;
+                                       return (EBADF);
+                               }
+                       } else
+                               vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY, td);
+               }
                return (-1);
        }
        ltype = 0;      /* silence gcc warning */
@@ -442,12 +452,14 @@ success:
        error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, td);
        if (cnp->cn_flags & ISDOTDOT)
                vn_lock(dvp, ltype | LK_RETRY, td);
-       if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_EXCLUSIVE))
-               ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
        if (error) {
                *vpp = NULL;
                goto retry;
        }
+       if ((cnp->cn_flags & ISLASTCN) &&
+           (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
+               ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
+       }
        return (-1);
 }
 
@@ -663,9 +675,9 @@ vfs_cache_lookup(ap)
        error = cache_lookup(dvp, vpp, cnp);
        if (error == 0)
                return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
-       if (error == ENOENT)
-               return (error);
-       return (0);
+       if (error == -1)
+               return (0);
+       return (error);
 }
 
 

Modified: stable/7/sys/kern/vfs_lookup.c
==============================================================================
--- stable/7/sys/kern/vfs_lookup.c      Tue Jan 20 18:16:31 2009        
(r187477)
+++ stable/7/sys/kern/vfs_lookup.c      Tue Jan 20 19:01:59 2009        
(r187478)
@@ -39,7 +39,6 @@ __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_mac.h"
-#include "opt_vfs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -88,13 +87,10 @@ nameiinit(void *dummy __unused)
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
 
-#ifdef LOOKUP_SHARED
-static int lookup_shared = 1;
-#else
 static int lookup_shared = 0;
-#endif
 SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0,
     "Enables/Disables shared locks for path name translation");
+TUNABLE_INT("vfs.lookup_shared", &lookup_shared);
 
 /*
  * Convert a pathname into a pointer to a locked vnode.

Modified: stable/7/sys/nfsclient/nfs_vnops.c
==============================================================================
--- stable/7/sys/nfsclient/nfs_vnops.c  Tue Jan 20 18:16:31 2009        
(r187477)
+++ stable/7/sys/nfsclient/nfs_vnops.c  Tue Jan 20 19:01:59 2009        
(r187478)
@@ -868,7 +868,10 @@ nfs_lookup(struct vop_lookup_args *ap)
                *vpp = NULLVP;
                return (error);
        }
-       if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) {
+       error = cache_lookup(dvp, vpp, cnp);
+       if (error > 0 && error != ENOENT)
+               return (error);
+       if (error == -1) {
                struct vattr vattr;
 
                newvp = *vpp;

Modified: stable/7/sys/ufs/ffs/ffs_vfsops.c
==============================================================================
--- stable/7/sys/ufs/ffs/ffs_vfsops.c   Tue Jan 20 18:16:31 2009        
(r187477)
+++ stable/7/sys/ufs/ffs/ffs_vfsops.c   Tue Jan 20 19:01:59 2009        
(r187478)
@@ -852,7 +852,7 @@ ffs_mountfs(devvp, mp, td)
         * Initialize filesystem stat information in mount struct.
         */
        MNT_ILOCK(mp);
-       mp->mnt_kern_flag |= MNTK_MPSAFE;
+       mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED;
        MNT_IUNLOCK(mp);
 #ifdef UFS_EXTATTR
 #ifdef UFS_EXTATTR_AUTOSTART

Modified: stable/7/sys/ufs/ufs/dirhash.h
==============================================================================
--- stable/7/sys/ufs/ufs/dirhash.h      Tue Jan 20 18:16:31 2009        
(r187477)
+++ stable/7/sys/ufs/ufs/dirhash.h      Tue Jan 20 19:01:59 2009        
(r187478)
@@ -28,6 +28,9 @@
 #ifndef _UFS_UFS_DIRHASH_H_
 #define _UFS_UFS_DIRHASH_H_
 
+#include <sys/_lock.h>
+#include <sys/_sx.h>
+
 /*
  * For fast operations on large directories, we maintain a hash
  * that maps the file name to the offset of the directory entry within
@@ -80,12 +83,14 @@
     ((dh)->dh_hash[(slot) >> DH_BLKOFFSHIFT][(slot) & DH_BLKOFFMASK])
 
 struct dirhash {
-       struct mtx dh_mtx;      /* protects all fields except dh_list */
+       struct sx dh_lock;      /* protects all fields except list & score */
+       int     dh_refcount;
 
        doff_t  **dh_hash;      /* the hash array (2-level) */
        int     dh_narrays;     /* number of entries in dh_hash */
        int     dh_hlen;        /* total slots in the 2-level hash array */
        int     dh_hused;       /* entries in use */
+       int     dh_memreq;      /* Memory used. */
 
        /* Free space statistics. XXX assumes DIRBLKSIZ is 512. */
        u_int8_t *dh_blkfree;   /* free DIRALIGN words in each dir block */

Modified: stable/7/sys/ufs/ufs/inode.h
==============================================================================
--- stable/7/sys/ufs/ufs/inode.h        Tue Jan 20 18:16:31 2009        
(r187477)
+++ stable/7/sys/ufs/ufs/inode.h        Tue Jan 20 19:01:59 2009        
(r187478)
@@ -82,8 +82,6 @@ struct inode {
        doff_t    i_endoff;     /* End of useful stuff in directory. */
        doff_t    i_diroff;     /* Offset in dir, where we found last entry. */
        doff_t    i_offset;     /* Offset of free space in directory. */
-       ino_t     i_ino;        /* Inode number of found directory. */
-       u_int32_t i_reclen;     /* Size of found directory entry. */
 
        union {
                struct dirhash *dirhash; /* Hashing for large directories. */

Modified: stable/7/sys/ufs/ufs/ufs_dirhash.c
==============================================================================
--- stable/7/sys/ufs/ufs/ufs_dirhash.c  Tue Jan 20 18:16:31 2009        
(r187477)
+++ stable/7/sys/ufs/ufs/ufs_dirhash.c  Tue Jan 20 19:01:59 2009        
(r187478)
@@ -46,7 +46,9 @@ __FBSDID("$FreeBSD$");
 #include <sys/buf.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
+#include <sys/refcount.h>
 #include <sys/sysctl.h>
+#include <sys/sx.h>
 #include <vm/uma.h>
 
 #include <ufs/ufs/quota.h>
@@ -88,15 +90,16 @@ static int ufsdirhash_findslot(struct di
           doff_t offset);
 static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
 static int ufsdirhash_recycle(int wanted);
+static void ufsdirhash_free_locked(struct inode *ip);
 
 static uma_zone_t      ufsdirhash_zone;
 
 #define DIRHASHLIST_LOCK()             mtx_lock(&ufsdirhash_mtx)
 #define DIRHASHLIST_UNLOCK()           mtx_unlock(&ufsdirhash_mtx)
-#define DIRHASH_LOCK(dh)               mtx_lock(&(dh)->dh_mtx)
-#define DIRHASH_UNLOCK(dh)             mtx_unlock(&(dh)->dh_mtx)
 #define DIRHASH_BLKALLOC_WAITOK()      uma_zalloc(ufsdirhash_zone, M_WAITOK)
 #define DIRHASH_BLKFREE(ptr)           uma_zfree(ufsdirhash_zone, (ptr))
+#define        DIRHASH_ASSERT_LOCKED(dh)                                       
\
+    sx_assert(&(dh)->dh_lock, SA_LOCKED)
 
 /* Dirhash list; recently-used entries are near the tail. */
 static TAILQ_HEAD(, dirhash) ufsdirhash_list;
@@ -105,14 +108,199 @@ static TAILQ_HEAD(, dirhash) ufsdirhash_
 static struct mtx      ufsdirhash_mtx;
 
 /*
- * Locking order:
- *     ufsdirhash_mtx
- *     dh_mtx
+ * Locking:
  *
- * The dh_mtx mutex should be acquired either via the inode lock, or via
- * ufsdirhash_mtx. Only the owner of the inode may free the associated
- * dirhash, but anything can steal its memory and set dh_hash to NULL.
+ * The relationship between inode and dirhash is protected either by an
+ * exclusive vnode lock or the vnode interlock where a shared vnode lock
+ * may be used.  The dirhash_mtx is acquired after the dirhash lock.  To
+ * handle teardown races, code wishing to lock the dirhash for an inode
+ * when using a shared vnode lock must obtain a private reference on the
+ * dirhash while holding the vnode interlock.  They can drop it once they
+ * have obtained the dirhash lock and verified that the dirhash wasn't
+ * recycled while they waited for the dirhash lock.
+ *
+ * ufsdirhash_build() acquires a shared lock on the dirhash when it is
+ * successful.  This lock is released after a call to ufsdirhash_lookup().
+ *
+ * Functions requiring exclusive access use ufsdirhash_acquire() which may
+ * free a dirhash structure that was recycled by ufsdirhash_recycle().
+ *
+ * The dirhash lock may be held across io operations.
+ */
+
+static void
+ufsdirhash_hold(struct dirhash *dh)
+{
+
+       refcount_acquire(&dh->dh_refcount);
+}
+
+static void
+ufsdirhash_drop(struct dirhash *dh)
+{
+
+       if (refcount_release(&dh->dh_refcount)) {
+               sx_destroy(&dh->dh_lock);
+               free(dh, M_DIRHASH);
+       }
+}
+
+/*
+ * Release the lock on a dirhash.
+ */
+static void
+ufsdirhash_release(struct dirhash *dh)
+{
+
+       sx_unlock(&dh->dh_lock);
+}
+
+/*
+ * Either acquire an existing hash locked shared or create a new hash and
+ * return it exclusively locked.  May return NULL if the allocation fails.
+ *
+ * The vnode interlock is used to protect the i_dirhash pointer from
+ * simultaneous access while only a shared vnode lock is held.
+ */
+static struct dirhash *
+ufsdirhash_create(struct inode *ip)
+{
+       struct dirhash *ndh;
+       struct dirhash *dh;
+       struct vnode *vp;
+       int error;
+
+       error = 0;
+       ndh = dh = NULL;
+       vp = ip->i_vnode;
+       for (;;) {
+               /* Racy check for i_dirhash to prefetch an dirhash structure. */
+               if (ip->i_dirhash == NULL && ndh == NULL) {
+                       MALLOC(ndh, struct dirhash *, sizeof *dh, M_DIRHASH,
+                           M_NOWAIT | M_ZERO);
+                       if (ndh == NULL)
+                               return (NULL);
+                       refcount_init(&ndh->dh_refcount, 1);
+                       sx_init(&ndh->dh_lock, "dirhash");
+                       sx_xlock(&ndh->dh_lock);
+               }
+               /*
+                * Check i_dirhash.  If it's NULL just try to use a
+                * preallocated structure.  If none exists loop and try again.
+                */
+               VI_LOCK(vp);
+               dh = ip->i_dirhash;
+               if (dh == NULL) {
+                       ip->i_dirhash = ndh;
+                       VI_UNLOCK(vp);
+                       if (ndh == NULL)
+                               continue;
+                       return (ndh);
+               }
+               ufsdirhash_hold(dh);
+               VI_UNLOCK(vp);
+
+               /* Acquire a shared lock on existing hashes. */
+               sx_slock(&dh->dh_lock);
+
+               /* The hash could've been recycled while we were waiting. */
+               VI_LOCK(vp);
+               if (ip->i_dirhash != dh) {
+                       VI_UNLOCK(vp);
+                       ufsdirhash_release(dh);
+                       ufsdirhash_drop(dh);
+                       continue;
+               }
+               VI_UNLOCK(vp);
+               ufsdirhash_drop(dh);
+
+               /* If the hash is still valid we've succeeded. */
+               if (dh->dh_hash != NULL)
+                       break;
+               /*
+                * If the hash is NULL it has been recycled.  Try to upgrade
+                * so we can recreate it.  If we fail the upgrade, drop our
+                * lock and try again.
+                */
+               if (sx_try_upgrade(&dh->dh_lock))
+                       break;
+               sx_sunlock(&dh->dh_lock);
+       }
+       /* Free the preallocated structure if it was not necessary. */
+       if (ndh) {
+               ufsdirhash_release(ndh);
+               ufsdirhash_drop(ndh);
+       }
+       return (dh);
+}
+
+/*
+ * Acquire an exclusive lock on an existing hash.  Requires an exclusive
+ * vnode lock to protect the i_dirhash pointer.  hashes that have been
+ * recycled are reclaimed here and NULL is returned.
+ */
+static struct dirhash *
+ufsdirhash_acquire(struct inode *ip)
+{
+       struct dirhash *dh;
+       struct vnode *vp;
+
+       ASSERT_VOP_ELOCKED(ip->i_vnode, __FUNCTION__);
+
+       vp = ip->i_vnode;
+       dh = ip->i_dirhash;
+       if (dh == NULL)
+               return (NULL);
+       sx_xlock(&dh->dh_lock);
+       if (dh->dh_hash != NULL)
+               return (dh);
+       ufsdirhash_free_locked(ip);
+       return (NULL);
+}
+
+/*
+ * Acquire exclusively and free the hash pointed to by ip.  Works with a
+ * shared or exclusive vnode lock.
  */
+void
+ufsdirhash_free(struct inode *ip)
+{
+       struct dirhash *dh;
+       struct vnode *vp;
+
+       vp = ip->i_vnode;
+       for (;;) {
+               /* Grab a reference on this inode's dirhash if it has one. */
+               VI_LOCK(vp);
+               dh = ip->i_dirhash;
+               if (dh == NULL) {
+                       VI_UNLOCK(vp);
+                       return;
+               }
+               ufsdirhash_hold(dh);
+               VI_UNLOCK(vp);
+
+               /* Exclusively lock the dirhash. */
+               sx_xlock(&dh->dh_lock);
+
+               /* If this dirhash still belongs to this inode, then free it. */
+               VI_LOCK(vp);
+               if (ip->i_dirhash == dh) {
+                       VI_UNLOCK(vp);
+                       ufsdirhash_drop(dh);
+                       break;
+               }
+               VI_UNLOCK(vp);
+
+               /*
+                * This inode's dirhash has changed while we were
+                * waiting for the dirhash lock, so try again.
+                */
+               ufsdirhash_release(dh);
+               ufsdirhash_drop(dh);
+       }
+       ufsdirhash_free_locked(ip);
+}
 
 /*
  * Attempt to build up a hash table for the directory contents in
@@ -128,27 +316,23 @@ ufsdirhash_build(struct inode *ip)
        doff_t bmask, pos;
        int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
 
-       /* Check if we can/should use dirhash. */
-       if (ip->i_dirhash == NULL) {
-               if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode))
+       /* Take care of a decreased sysctl value. */
+       while (ufs_dirhashmem > ufs_dirhashmaxmem)
+               if (ufsdirhash_recycle(0) != 0)
                        return (-1);
-       } else {
-               /* Hash exists, but sysctls could have changed. */
-               if (ip->i_size < ufs_mindirhashsize ||
-                   ufs_dirhashmem > ufs_dirhashmaxmem) {
+
+       /* Check if we can/should use dirhash. */
+       if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode) ||
+           ip->i_effnlink == 0) {
+               if (ip->i_dirhash)
                        ufsdirhash_free(ip);
-                       return (-1);
-               }
-               /* Check if hash exists and is intact (note: unlocked read). */
-               if (ip->i_dirhash->dh_hash != NULL)
-                       return (0);
-               /* Free the old, recycled hash and build a new one. */
-               ufsdirhash_free(ip);
+               return (-1);
        }
-
-       /* Don't hash removed directories. */
-       if (ip->i_effnlink == 0)
+       dh = ufsdirhash_create(ip);
+       if (dh == NULL)
                return (-1);
+       if (dh->dh_hash != NULL)
+               return (0);
 
        vp = ip->i_vnode;
        /* Allocate 50% more entries than this dir size could ever need. */
@@ -159,7 +343,6 @@ ufsdirhash_build(struct inode *ip)
        nslots = narrays * DH_NBLKOFF;
        dirblocks = howmany(ip->i_size, DIRBLKSIZ);
        nblocks = (dirblocks * 3 + 1) / 2;
-
        memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
            narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
            nblocks * sizeof(*dh->dh_blkfree);
@@ -167,33 +350,40 @@ ufsdirhash_build(struct inode *ip)
        if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
                DIRHASHLIST_UNLOCK();
                if (memreqd > ufs_dirhashmaxmem / 2)
-                       return (-1);
-
+                       goto fail;
                /* Try to free some space. */
                if (ufsdirhash_recycle(memreqd) != 0)
-                       return (-1);
+                       goto fail;
                /* Enough was freed, and list has been locked. */
        }
        ufs_dirhashmem += memreqd;
        DIRHASHLIST_UNLOCK();
 
+       /* Initialise the hash table and block statistics. */
+       dh->dh_memreq = memreqd;
+       dh->dh_narrays = narrays;
+       dh->dh_hlen = nslots;
+       dh->dh_nblk = nblocks;
+       dh->dh_dirblks = dirblocks;
+       for (i = 0; i < DH_NFSTATS; i++)
+               dh->dh_firstfree[i] = -1;
+       dh->dh_firstfree[DH_NFSTATS] = 0;
+       dh->dh_hused = 0;
+       dh->dh_seqopt = 0;
+       dh->dh_seqoff = 0;
+       dh->dh_score = DH_SCOREINIT;
+
        /*
         * Use non-blocking mallocs so that we will revert to a linear
         * lookup on failure rather than potentially blocking forever.
         */
-       MALLOC(dh, struct dirhash *, sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO);
-       if (dh == NULL) {
-               DIRHASHLIST_LOCK();
-               ufs_dirhashmem -= memreqd;
-               DIRHASHLIST_UNLOCK();
-               return (-1);
-       }
-       mtx_init(&dh->dh_mtx, "dirhash", NULL, MTX_DEF);
        MALLOC(dh->dh_hash, doff_t **, narrays * sizeof(dh->dh_hash[0]),
            M_DIRHASH, M_NOWAIT | M_ZERO);
+       if (dh->dh_hash == NULL)
+               goto fail;
        MALLOC(dh->dh_blkfree, u_int8_t *, nblocks * sizeof(dh->dh_blkfree[0]),
            M_DIRHASH, M_NOWAIT);
-       if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
+       if (dh->dh_blkfree == NULL)
                goto fail;
        for (i = 0; i < narrays; i++) {
                if ((dh->dh_hash[i] = DIRHASH_BLKALLOC_WAITOK()) == NULL)
@@ -201,22 +391,8 @@ ufsdirhash_build(struct inode *ip)
                for (j = 0; j < DH_NBLKOFF; j++)
                        dh->dh_hash[i][j] = DIRHASH_EMPTY;
        }
-
-       /* Initialise the hash table and block statistics. */
-       dh->dh_narrays = narrays;
-       dh->dh_hlen = nslots;
-       dh->dh_nblk = nblocks;
-       dh->dh_dirblks = dirblocks;
        for (i = 0; i < dirblocks; i++)
                dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
-       for (i = 0; i < DH_NFSTATS; i++)
-               dh->dh_firstfree[i] = -1;
-       dh->dh_firstfree[DH_NFSTATS] = 0;
-       dh->dh_seqopt = 0;
-       dh->dh_seqoff = 0;
-       dh->dh_score = DH_SCOREINIT;
-       ip->i_dirhash = dh;
-
        bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
        pos = 0;
        while (pos < ip->i_size) {
@@ -254,63 +430,70 @@ ufsdirhash_build(struct inode *ip)
        TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
        dh->dh_onlist = 1;
        DIRHASHLIST_UNLOCK();
+       sx_downgrade(&dh->dh_lock);
        return (0);
 
 fail:
-       if (dh->dh_hash != NULL) {
-               for (i = 0; i < narrays; i++)
-                       if (dh->dh_hash[i] != NULL)
-                               DIRHASH_BLKFREE(dh->dh_hash[i]);
-               FREE(dh->dh_hash, M_DIRHASH);
-       }
-       if (dh->dh_blkfree != NULL)
-               FREE(dh->dh_blkfree, M_DIRHASH);
-       mtx_destroy(&dh->dh_mtx);
-       FREE(dh, M_DIRHASH);
-       ip->i_dirhash = NULL;
-       DIRHASHLIST_LOCK();
-       ufs_dirhashmem -= memreqd;
-       DIRHASHLIST_UNLOCK();
+       ufsdirhash_free_locked(ip);
        return (-1);
 }
 
 /*
  * Free any hash table associated with inode 'ip'.
  */
-void
-ufsdirhash_free(struct inode *ip)
+static void
+ufsdirhash_free_locked(struct inode *ip)
 {
        struct dirhash *dh;
-       int i, mem;
+       struct vnode *vp;
+       int i;
 
-       if ((dh = ip->i_dirhash) == NULL)
-               return;
+       DIRHASH_ASSERT_LOCKED(ip->i_dirhash);
+
+       /*
+        * Clear the pointer in the inode to prevent new threads from
+        * finding the dead structure.
+        */
+       vp = ip->i_vnode;
+       VI_LOCK(vp);
+       dh = ip->i_dirhash;
+       ip->i_dirhash = NULL;
+       VI_UNLOCK(vp);
+
+       /*
+        * Remove the hash from the list since we are going to free its
+        * memory.
+        */
        DIRHASHLIST_LOCK();
-       DIRHASH_LOCK(dh);
        if (dh->dh_onlist)
                TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
-       DIRHASH_UNLOCK(dh);
+       ufs_dirhashmem -= dh->dh_memreq;
        DIRHASHLIST_UNLOCK();
 
-       /* The dirhash pointed to by 'dh' is exclusively ours now. */
+       /*
+        * At this point, any waiters for the lock should hold their
+        * own reference on the dirhash structure.  They will drop
+        * that reference once they grab the vnode interlock and see
+        * that ip->i_dirhash is NULL.
+        */
+       sx_xunlock(&dh->dh_lock);
 
-       mem = sizeof(*dh);
+       /*
+        * Handle partially recycled as well as fully constructed hashes.
+        */
        if (dh->dh_hash != NULL) {
                for (i = 0; i < dh->dh_narrays; i++)
-                       DIRHASH_BLKFREE(dh->dh_hash[i]);
+                       if (dh->dh_hash[i] != NULL)
+                               DIRHASH_BLKFREE(dh->dh_hash[i]);
                FREE(dh->dh_hash, M_DIRHASH);
-               FREE(dh->dh_blkfree, M_DIRHASH);
-               mem += dh->dh_narrays * sizeof(*dh->dh_hash) +
-                   dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
-                   dh->dh_nblk * sizeof(*dh->dh_blkfree);
+               if (dh->dh_blkfree != NULL)
+                       FREE(dh->dh_blkfree, M_DIRHASH);
        }
-       mtx_destroy(&dh->dh_mtx);
-       FREE(dh, M_DIRHASH);
-       ip->i_dirhash = NULL;
 
-       DIRHASHLIST_LOCK();
-       ufs_dirhashmem -= mem;
-       DIRHASHLIST_UNLOCK();
+       /*
+        * Drop the inode's reference to the data structure.
+        */
+       ufsdirhash_drop(dh);
 }
 
 /*
@@ -323,6 +506,8 @@ ufsdirhash_free(struct inode *ip)
  * prevoffp is non-NULL, the offset of the previous entry within
  * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
  * is the first in a block, the start of the block is used).
+ *
+ * Must be called with the hash locked.  Returns with the hash unlocked.
  */
 int
 ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
@@ -334,48 +519,36 @@ ufsdirhash_lookup(struct inode *ip, char
        struct buf *bp;
        doff_t blkoff, bmask, offset, prevoff;
        int i, slot;
+       int error;
 
-       if ((dh = ip->i_dirhash) == NULL)
-               return (EJUSTRETURN);
+       dh = ip->i_dirhash;
+       KASSERT(dh != NULL && dh->dh_hash != NULL,
+           ("ufsdirhash_lookup: Invalid dirhash %p\n", dh));
+       DIRHASH_ASSERT_LOCKED(dh);
        /*
         * Move this dirhash towards the end of the list if it has a
-        * score higher than the next entry, and acquire the dh_mtx.
-        * Optimise the case where it's already the last by performing
-        * an unlocked read of the TAILQ_NEXT pointer.
-        *
-        * In both cases, end up holding just dh_mtx.
+        * score higher than the next entry, and acquire the dh_lock.
         */
+       DIRHASHLIST_LOCK();
        if (TAILQ_NEXT(dh, dh_list) != NULL) {
-               DIRHASHLIST_LOCK();
-               DIRHASH_LOCK(dh);
                /*
                 * If the new score will be greater than that of the next
                 * entry, then move this entry past it. With both mutexes
                 * held, dh_next won't go away, but its dh_score could
                 * change; that's not important since it is just a hint.
                 */
-               if (dh->dh_hash != NULL &&
-                   (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
+               if ((dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
                    dh->dh_score >= dh_next->dh_score) {
                        KASSERT(dh->dh_onlist, ("dirhash: not on list"));
                        TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
                        TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
                            dh_list);
                }
-               DIRHASHLIST_UNLOCK();
-       } else {
-               /* Already the last, though that could change as we wait. */
-               DIRHASH_LOCK(dh);
-       }
-       if (dh->dh_hash == NULL) {
-               DIRHASH_UNLOCK(dh);
-               ufsdirhash_free(ip);
-               return (EJUSTRETURN);
        }
-
        /* Update the score. */
        if (dh->dh_score < DH_SCOREMAX)
                dh->dh_score++;
+       DIRHASHLIST_UNLOCK();
 
        vp = ip->i_vnode;
        bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
@@ -410,23 +583,23 @@ restart:
            slot = WRAPINCR(slot, dh->dh_hlen)) {
                if (offset == DIRHASH_DEL)
                        continue;
-               DIRHASH_UNLOCK(dh);
-
                if (offset < 0 || offset >= ip->i_size)
                        panic("ufsdirhash_lookup: bad offset in hash array");
                if ((offset & ~bmask) != blkoff) {
                        if (bp != NULL)
                                brelse(bp);
                        blkoff = offset & ~bmask;
-                       if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0)
-                               return (EJUSTRETURN);
+                       if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0) {
+                               error = EJUSTRETURN;
+                               goto fail;
+                       }
                }
                dp = (struct direct *)(bp->b_data + (offset & bmask));
                if (dp->d_reclen == 0 || dp->d_reclen >
                    DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
                        /* Corrupted directory. */
-                       brelse(bp);
-                       return (EJUSTRETURN);
+                       error = EJUSTRETURN;
+                       goto fail;
                }
                if (dp->d_namlen == namelen &&
                    bcmp(dp->d_name, name, namelen) == 0) {
@@ -436,8 +609,8 @@ restart:
                                        prevoff = ufsdirhash_getprev(dp,
                                            offset);
                                        if (prevoff == -1) {
-                                               brelse(bp);
-                                               return (EJUSTRETURN);
+                                               error = EJUSTRETURN;
+                                               goto fail;
                                        }
                                } else
                                        prevoff = offset;
@@ -448,20 +621,12 @@ restart:
                        if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
                                dh->dh_seqopt = 1;
                        dh->dh_seqoff = offset + DIRSIZ(0, dp);
-
                        *bpp = bp;
                        *offp = offset;
+                       ufsdirhash_release(dh);
                        return (0);
                }
 
-               DIRHASH_LOCK(dh);
-               if (dh->dh_hash == NULL) {
-                       DIRHASH_UNLOCK(dh);
-                       if (bp != NULL)
-                               brelse(bp);
-                       ufsdirhash_free(ip);
-                       return (EJUSTRETURN);
-               }
                /*
                 * When the name doesn't match in the seqopt case, go back
                 * and search normally.
@@ -471,10 +636,12 @@ restart:
                        goto restart;
                }
        }
-       DIRHASH_UNLOCK(dh);
+       error = ENOENT;
+fail:
+       ufsdirhash_release(dh);
        if (bp != NULL)
                brelse(bp);
-       return (ENOENT);
+       return (error);
 }
 
 /*
@@ -502,29 +669,22 @@ ufsdirhash_findfree(struct inode *ip, in
        doff_t pos, slotstart;
        int dirblock, error, freebytes, i;
 
-       if ((dh = ip->i_dirhash) == NULL)
-               return (-1);
-       DIRHASH_LOCK(dh);
-       if (dh->dh_hash == NULL) {
-               DIRHASH_UNLOCK(dh);
-               ufsdirhash_free(ip);
-               return (-1);
-       }
+       dh = ip->i_dirhash;
+       KASSERT(dh != NULL && dh->dh_hash != NULL,
+           ("ufsdirhash_findfree: Invalid dirhash %p\n", dh));
+       DIRHASH_ASSERT_LOCKED(dh);
 
        /* Find a directory block with the desired free space. */
        dirblock = -1;
        for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
                if ((dirblock = dh->dh_firstfree[i]) != -1)
                        break;
-       if (dirblock == -1) {
-               DIRHASH_UNLOCK(dh);
+       if (dirblock == -1)
                return (-1);
-       }
 
        KASSERT(dirblock < dh->dh_nblk &&
            dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
            ("ufsdirhash_findfree: bad stats"));
-       DIRHASH_UNLOCK(dh);
        pos = dirblock * DIRBLKSIZ;
        error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
        if (error)
@@ -582,24 +742,18 @@ ufsdirhash_enduseful(struct inode *ip)
        struct dirhash *dh;
        int i;
 
-       if ((dh = ip->i_dirhash) == NULL)
-               return (-1);
-       DIRHASH_LOCK(dh);
-       if (dh->dh_hash == NULL) {
-               DIRHASH_UNLOCK(dh);
-               ufsdirhash_free(ip);
-               return (-1);
-       }
+       dh = ip->i_dirhash;
+       DIRHASH_ASSERT_LOCKED(dh);
+       KASSERT(dh != NULL && dh->dh_hash != NULL,
+           ("ufsdirhash_enduseful: Invalid dirhash %p\n", dh));
 
-       if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) {
-               DIRHASH_UNLOCK(dh);
+       if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN)
                return (-1);
-       }
 
        for (i = dh->dh_dirblks - 1; i >= 0; i--)
                if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
                        break;
-       DIRHASH_UNLOCK(dh);
+
        return ((doff_t)(i + 1) * DIRBLKSIZ);
 }
 
@@ -614,15 +768,9 @@ ufsdirhash_add(struct inode *ip, struct 
        struct dirhash *dh;
        int slot;
 
-       if ((dh = ip->i_dirhash) == NULL)
+       if ((dh = ufsdirhash_acquire(ip)) == NULL)
                return;
-       DIRHASH_LOCK(dh);
-       if (dh->dh_hash == NULL) {
-               DIRHASH_UNLOCK(dh);
-               ufsdirhash_free(ip);
-               return;
-       }
-
+       
        KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
            ("ufsdirhash_add: bad offset"));
        /*
@@ -630,8 +778,7 @@ ufsdirhash_add(struct inode *ip, struct 
         * remove the hash entirely and let it be rebuilt later.
         */
        if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
-               DIRHASH_UNLOCK(dh);
-               ufsdirhash_free(ip);
+               ufsdirhash_free_locked(ip);
                return;
        }
 
@@ -645,7 +792,7 @@ ufsdirhash_add(struct inode *ip, struct 
 
        /* Update the per-block summary info. */
        ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
-       DIRHASH_UNLOCK(dh);
+       ufsdirhash_release(dh);
 }
 
 /*
@@ -659,14 +806,8 @@ ufsdirhash_remove(struct inode *ip, stru
        struct dirhash *dh;
        int slot;
 
-       if ((dh = ip->i_dirhash) == NULL)
-               return;
-       DIRHASH_LOCK(dh);
-       if (dh->dh_hash == NULL) {
-               DIRHASH_UNLOCK(dh);
-               ufsdirhash_free(ip);
+       if ((dh = ufsdirhash_acquire(ip)) == NULL)
                return;
-       }
 
        KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
            ("ufsdirhash_remove: bad offset"));
@@ -678,7 +819,7 @@ ufsdirhash_remove(struct inode *ip, stru
 
        /* Update the per-block summary info. */
        ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
-       DIRHASH_UNLOCK(dh);
+       ufsdirhash_release(dh);
 }
 
 /*
@@ -692,14 +833,8 @@ ufsdirhash_move(struct inode *ip, struct
        struct dirhash *dh;
        int slot;
 
-       if ((dh = ip->i_dirhash) == NULL)
+       if ((dh = ufsdirhash_acquire(ip)) == NULL)
                return;
-       DIRHASH_LOCK(dh);
-       if (dh->dh_hash == NULL) {
-               DIRHASH_UNLOCK(dh);
-               ufsdirhash_free(ip);
-               return;
-       }
 
        KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
            newoff < dh->dh_dirblks * DIRBLKSIZ,
@@ -707,7 +842,7 @@ ufsdirhash_move(struct inode *ip, struct
        /* Find the entry, and update the offset. */
        slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
        DH_ENTRY(dh, slot) = newoff;
-       DIRHASH_UNLOCK(dh);
+       ufsdirhash_release(dh);
 }
 
 /*
@@ -720,22 +855,15 @@ ufsdirhash_newblk(struct inode *ip, doff
        struct dirhash *dh;
        int block;
 
-       if ((dh = ip->i_dirhash) == NULL)
-               return;
-       DIRHASH_LOCK(dh);
-       if (dh->dh_hash == NULL) {
-               DIRHASH_UNLOCK(dh);
-               ufsdirhash_free(ip);
+       if ((dh = ufsdirhash_acquire(ip)) == NULL)
                return;
-       }
 
        KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
            ("ufsdirhash_newblk: bad offset"));
        block = offset / DIRBLKSIZ;
        if (block >= dh->dh_nblk) {
                /* Out of space; must rebuild. */
-               DIRHASH_UNLOCK(dh);
-               ufsdirhash_free(ip);
+               ufsdirhash_free_locked(ip);
                return;
        }
        dh->dh_dirblks = block + 1;
@@ -744,7 +872,7 @@ ufsdirhash_newblk(struct inode *ip, doff
        dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
        if (dh->dh_firstfree[DH_NFSTATS] == -1)
                dh->dh_firstfree[DH_NFSTATS] = block;
-       DIRHASH_UNLOCK(dh);
+       ufsdirhash_release(dh);
 }
 
 /*
@@ -756,14 +884,8 @@ ufsdirhash_dirtrunc(struct inode *ip, do
        struct dirhash *dh;
        int block, i;
 
-       if ((dh = ip->i_dirhash) == NULL)
-               return;
-       DIRHASH_LOCK(dh);
-       if (dh->dh_hash == NULL) {
-               DIRHASH_UNLOCK(dh);
-               ufsdirhash_free(ip);
+       if ((dh = ufsdirhash_acquire(ip)) == NULL)
                return;
-       }
 
        KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
            ("ufsdirhash_dirtrunc: bad offset"));
@@ -775,8 +897,7 @@ ufsdirhash_dirtrunc(struct inode *ip, do
         * if necessary.
         */
        if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
-               DIRHASH_UNLOCK(dh);
-               ufsdirhash_free(ip);
+               ufsdirhash_free_locked(ip);
                return;
        }
 
@@ -794,7 +915,7 @@ ufsdirhash_dirtrunc(struct inode *ip, do
                if (dh->dh_firstfree[i] >= block)
                        panic("ufsdirhash_dirtrunc: first free corrupt");
        dh->dh_dirblks = block;
-       DIRHASH_UNLOCK(dh);
+       ufsdirhash_release(dh);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to