Sorry for the arbitrary indexing; there would be valid reasons for
calling it the 3rd or 4th (or something else) as well. It's at least
the second such changeset from me.

See explanation on [EMAIL PROTECTED]

Csaba
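
For quick orientation: namecache entries tied together by shadowing (as
in a nullfs stack) form a "shadow group", and the whole group is locked
through one designated member, recorded in the new nc_lockby field. The
pointer is kept "flat": it refers directly to the group's lock owner
instead of forming a chain to follow. A minimal sketch of the invariant
(shadow_group_lock_owner is a made-up name, not part of the patch):

    /* Locking any member means locking the whole group. */
    static struct namecache *
    shadow_group_lock_owner(struct namecache *ncp)
    {
            /* nc_lockby points directly at the group's lock owner */
            KKASSERT(ncp->nc_lockby == ncp->nc_lockby->nc_lockby);
            return (ncp->nc_lockby);
    }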

diff -r b30705ca4860 sys/kern/vfs_cache.c
--- a/sys/kern/vfs_cache.c      Thu Feb  2 19:50:43 2006 +0100
+++ b/sys/kern/vfs_cache.c      Tue Feb  7 19:55:06 2006 +0100
@@ -108,6 +108,16 @@
 #define NCHHASH(hash)  (&nchashtbl[(hash) & nchash])
 #define MINNEG         1024
 
+/* Direction markers for shadow tree traversal. */
+#define STREE_AWAY     0x1
+#define STREE_BACK     0x2
+
+/* Shadow tree updater function with metadata (direction). */
+struct stree_updater {
+       int (*updater)(struct namecache *ncp, void *param);
+       int flags;
+};
+
 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
 static LIST_HEAD(nchashhead, namecache) *nchashtbl;    /* Hash Table */
@@ -225,7 +235,7 @@ _cache_drop(struct namecache *ncp)
            (ncp->nc_flag & NCF_UNRESOLVED) && 
            TAILQ_EMPTY(&ncp->nc_list)
        ) {
-               KKASSERT(ncp->nc_exlocks == 0);
+               KKASSERT(ncp->nc_lockby->nc_exlocks == 0);
                cache_lock(ncp);
                cache_zap(ncp);
        } else {
@@ -295,7 +305,9 @@ cache_alloc(int nlen)
        ncp->nc_error = ENOTCONN;       /* needs to be resolved */
        ncp->nc_refs = 1;
        ncp->nc_fsmid = 1;
+       ncp->nc_lockby = ncp;
        TAILQ_INIT(&ncp->nc_list);
+       LIST_INIT(&ncp->nc_shadow_list);
        cache_lock(ncp);
        return(ncp);
 }
@@ -323,6 +335,277 @@ cache_drop(struct namecache *ncp)
 {
        _cache_drop(ncp);
 }
+
+/*
+ * Iterative tree traversal routine with more features than we actually
+ * need: it supports making callbacks in either the "away" or the "back"
+ * direction, pruning the traversal at certain nodes (i.e., skipping the
+ * respective subtree), and safe leaf cutting.
+ *
+ * For now it makes little difference whether callbacks are made on the
+ * way away or on the way back, and we never prune or cut off leaves...
+ *
+ * In fact, all we need is the ability to traverse a subtree identified
+ * by its root.
+ */
+static void
+cache_group_walk(struct namecache *ncp, struct stree_updater *sup, 
+                 void *param)
+{
+       struct namecache *pncp = ncp; /* previous ncp during traversal */
+       struct namecache *qncp;       /* current ncp during traversal */
+       struct namecache *xncp;       /* auxiliary ncp */
+       int prune;
+#ifdef INVARIANTS
+       int height = 0, turns = 0, pruned = 0;
+
+#define STAWAY_ASSERT(ncp) do {                                \
+       KKASSERT(((ncp)->nc_flag & NCF_TREEPASS) == 0); \
+       (ncp)->nc_flag |= NCF_TREEPASS;                 \
+} while (0)
+#define STBACK_ASSERT(ncp) do {                                \
+       KKASSERT((ncp)->nc_flag & NCF_TREEPASS);        \
+       (ncp)->nc_flag &= ~NCF_TREEPASS;                \
+} while (0)
+#else
+#define STAWAY_ASSERT(ncp)
+#define STBACK_ASSERT(ncp)
+#endif
+
+       STAWAY_ASSERT(ncp);
+       prune = sup->flags & STREE_AWAY ? sup->updater(ncp, param) : 0;
+       if (prune || ! (qncp = LIST_FIRST(&ncp->nc_shadow_list)))
+               goto out;
+
+       for (;;) {
+#ifdef INVARIANTS
+               turns++;
+#endif
+               if (qncp->nc_shadowed == pncp) {
+                       /* heading toward the leaves */
+                       KKASSERT(height++ >= 0);
+                       STAWAY_ASSERT(qncp);
+                       prune = sup->flags & STREE_AWAY ?
+                               sup->updater(qncp, param) :
+                               0;
+#ifdef INVARIANTS
+                       if (prune)
+                               pruned = 1;
+#endif
+                       if (prune || LIST_EMPTY(&qncp->nc_shadow_list)) {
+                              /*
+                               * we hit a leaf or were asked to prune the
+                               * branch, turn back
+                               */
+                               xncp = pncp;
+                               pncp = qncp;
+                               qncp = xncp;
+                       } else {
+                               /* keep going on */
+                               pncp = qncp;
+                               qncp = LIST_FIRST(&qncp->nc_shadow_list);
+                       }
+               } else {
+                       /* heading back toward root */
+                       KKASSERT(--height >= 0);
+                       STBACK_ASSERT(pncp);
+                       if (LIST_NEXT(pncp, nc_shadow_entry))
+                               /* turn to leaf direction using neighbour */
+                               qncp = LIST_NEXT(pncp, nc_shadow_entry);
+                       else
+                               /*
+                                * No neighbour,
+                                * try to keep going on to root direction.
+                                */
+                               qncp = qncp == ncp ? NULL : qncp->nc_shadowed;
+                       xncp = pncp->nc_shadowed;
+                       /*
+                        * We have to take care to find the next pncp
+                        * candidate before calling the updater, because the
+                        * updater might remove it from the tree, rendering
+                        * pncp unusable for traversal purposes.
+                        */
+                       if (sup->flags & STREE_BACK)
+                               sup->updater(pncp, param);
+#ifdef INVARIANTS
+                       if (! pncp->nc_shadowed)
+                               turns -= 2;
+#endif
+                       if (! qncp)
+                               break;
+                       pncp = xncp;
+               }
+       }
+
+out:
+       STBACK_ASSERT(ncp);
+       KASSERT(height == 0, ("shadow tree left with height %d\n", height));
+       KASSERT(turns == 2 * ncp->nc_treesize ||
+               (pruned && turns < 2 * ncp->nc_treesize),
+               ("inconsistent traversal turn count: turns %d, "
+                "ncp->nc_treesize %d, pruned %d",
+                turns, ncp->nc_treesize, pruned));
+
+       if (sup->flags & STREE_BACK)
+               sup->updater(ncp, param);
+}
+
+static int
+migrate_updater(struct namecache *ncp, void *param)
+{
+       struct namecache *lncp = ncp->nc_lockby;
+
+       ncp->nc_lockby = param;
+
+       if (lncp->nc_flag & NCF_LOCKREQ) {
+               lncp->nc_flag &= ~NCF_LOCKREQ;
+               wakeup(lncp);
+       }
+
+       return(0);
+}
+
+static struct stree_updater stree_migrate_updater = {
+       .updater = migrate_updater,
+       .flags = STREE_BACK
+};
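
To illustrate how these compose (group_root and new_owner are made-up
names, not identifiers from the patch): migrating lock ownership of a
whole group is a single walk,

    cache_group_walk(group_root, &stree_migrate_updater, new_owner);

where the STREE_BACK flag makes migrate_updater fire on the way back
toward the root, re-pointing each member's nc_lockby at new_owner and
waking up any threads sleeping on the old lock owner.
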
+
+/*
+ * Join ncp into the shadow group of sncp.
+ * 
+ * ncp must be unlocked on entry, while sncp must be locked on entry. Caller
+ * also has to hold a dedicated reference of sncp.
+ *
+ * The routine will fail and return ELOOP if the intended shadowing
+ * association would yield a loop in the shadow chain. It will fail with
+ * EEXIST if ncp gets resolved or acquires a shadow association from
+ * elsewhere during the attach attempt (which is, at least theoretically,
+ * possible due to the fact that ncp is unlocked).
+ *
+ * - On success ncp will be a representative of the joint shadow group, which
+ *   then will be locked (both via ncp and sncp).
+ * - On failure the namecache entries will exist separately just as they did
+ *   before; both entries will be locked.
+ */
+int
+cache_shadow_attach(struct namecache *ncp, struct namecache *sncp)
+{
+#ifdef INVARIANTS
+       struct namecache *xncp;
+#endif
+
+       if (ncp == sncp)
+               return(ELOOP);
+
+       KKASSERT(ncp->nc_lockby->nc_locktd != curthread);
+       KKASSERT(sncp->nc_lockby->nc_locktd == curthread);
+
+       cache_lock_two(ncp, sncp);
+
+       if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 || ncp->nc_shadowed)
+               return(EEXIST);
+
+       KKASSERT(ncp->nc_lockby == ncp);
+       KKASSERT(sncp->nc_lockby == sncp->nc_lockby->nc_lockby);
+
+       if (sncp->nc_lockby == ncp)
+               return(ELOOP);
+
+       ncp->nc_shadowed = sncp;
+       LIST_INSERT_HEAD(&sncp->nc_shadow_list, ncp, nc_shadow_entry);
+       cache_group_walk(ncp, &stree_migrate_updater, sncp->nc_lockby);
+
+#ifdef INVARIANTS
+       xncp = sncp;
+       for (;;) {
+               KKASSERT(xncp != ncp);
+               KKASSERT(xncp->nc_treesize++ >= 0);
+               xncp->nc_treesize += ncp->nc_treesize;
+               if (! xncp->nc_shadowed)
+                       break;
+               xncp = xncp->nc_shadowed;
+       }
+       KKASSERT(xncp == ncp->nc_lockby);
+#endif
+
+       return(0);
+}
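
The expected usage pattern is sketched below, modeled on nullfs_start()
further down (sncp standing for a freshly looked-up lower entry, locked
and ref'd):

    /*
     * On failure ncp and sncp remain separate groups and both stay
     * locked, so the caller must dispose of its sncp reference.
     */
    if ((error = cache_shadow_attach(ncp, sncp)))
            cache_put(sncp);
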
+
+static __inline
+void
+delete_shadow(struct namecache *ncp)
+{
+#ifdef INVARIANTS
+       struct namecache *xncp = ncp->nc_shadowed;
+
+       KKASSERT(ncp->nc_shadowed);
+       for (;;) {
+               KKASSERT(xncp != ncp);
+               xncp->nc_treesize -= ncp->nc_treesize;
+               KKASSERT(--xncp->nc_treesize >= 0);
+               if (! xncp->nc_shadowed)
+                       break;
+               xncp = xncp->nc_shadowed;
+       }
+#endif
+
+       LIST_REMOVE(ncp, nc_shadow_entry);
+       ncp->nc_shadowed = NULL;
+}
+
+/*
+ * Take out namecache entry from its shadow group.
+ *
+ * ncp must really shadow someone, and the shadow group must be locked
+ * upon entry.
+ *
+ * After the routine returns, ncp will be the head of a new (possibly
+ * singleton) shadow group. The routine returns the former successor of ncp
+ * in the original shadow group in a locked+ref'd state.
+ */
+struct namecache *
+cache_shadow_detach(struct namecache *ncp)
+{
+       struct namecache *sncp = ncp->nc_shadowed;
+
+       KKASSERT(ncp->nc_shadowed);
+       KKASSERT(sncp->nc_lockby == ncp->nc_lockby);
+       KKASSERT(ncp->nc_lockby == ncp->nc_lockby->nc_lockby);
+
+       delete_shadow(ncp);
+       ncp->nc_locktd = curthread;
+       cache_group_walk(ncp, &stree_migrate_updater, ncp);
+
+       return(sncp);
+}
+
+static int
+vhold_updater(struct namecache *ncp, void *param)
+{
+       
+       if (ncp->nc_vp)
+               vhold(ncp->nc_vp);
+
+       return(0);
+}
+
+static struct stree_updater stree_vhold_updater = {
+       .updater = vhold_updater,
+       .flags = STREE_AWAY
+};
+
+static int
+vdrop_updater(struct namecache *ncp, void *param)
+{
+       
+       if (ncp->nc_vp)
+               vdrop(ncp->nc_vp);
+
+       return(0);
+}
+
+static struct stree_updater stree_vdrop_updater = {
+       .updater = vdrop_updater,
+       .flags = STREE_AWAY
+};
 
 /*
  * Namespace locking.  The caller must already hold a reference to the
@@ -349,15 +632,19 @@ cache_lock(struct namecache *ncp)
 {
        thread_t td;
        int didwarn;
+       struct namecache *lncp;
 
        KKASSERT(ncp->nc_refs != 0);
        didwarn = 0;
        td = curthread;
 
        for (;;) {
-               if (ncp->nc_exlocks == 0) {
-                       ncp->nc_exlocks = 1;
-                       ncp->nc_locktd = td;
+               lncp = ncp->nc_lockby;
+               KKASSERT(lncp);
+               KKASSERT(lncp->nc_refs != 0);
+               if (lncp->nc_exlocks == 0) {
+                       lncp->nc_exlocks = 1;
+                       lncp->nc_locktd = td;
                        /* 
                         * The vp associated with a locked ncp must be held
                         * to prevent it from being recycled (which would
@@ -365,16 +652,15 @@ cache_lock(struct namecache *ncp)
                         *
                         * XXX loop on race for later MPSAFE work.
                         */
-                       if (ncp->nc_vp)
-                               vhold(ncp->nc_vp);
+                       cache_group_walk(lncp, &stree_vhold_updater, NULL);
                        break;
                }
-               if (ncp->nc_locktd == td) {
-                       ++ncp->nc_exlocks;
+               if (lncp->nc_locktd == td) {
+                       ++lncp->nc_exlocks;
                        break;
                }
-               ncp->nc_flag |= NCF_LOCKREQ;
-               if (tsleep(ncp, 0, "clock", nclockwarn) == EWOULDBLOCK) {
+               lncp->nc_flag |= NCF_LOCKREQ;
+               if (tsleep(lncp, 0, "clock", nclockwarn) == EWOULDBLOCK) {
                        if (didwarn)
                                continue;
                        didwarn = 1;
@@ -398,12 +684,15 @@ cache_lock_nonblock(struct namecache *nc
 cache_lock_nonblock(struct namecache *ncp)
 {
        thread_t td;
+       struct namecache *lncp = ncp->nc_lockby;
 
        KKASSERT(ncp->nc_refs != 0);
+       KKASSERT(lncp);
+       KKASSERT(lncp->nc_refs != 0);
        td = curthread;
-       if (ncp->nc_exlocks == 0) {
-               ncp->nc_exlocks = 1;
-               ncp->nc_locktd = td;
+       if (lncp->nc_exlocks == 0) {
+               lncp->nc_exlocks = 1;
+               lncp->nc_locktd = td;
                /* 
                 * The vp associated with a locked ncp must be held
                 * to prevent it from being recycled (which would
@@ -411,8 +700,7 @@ cache_lock_nonblock(struct namecache *nc
                 *
                 * XXX loop on race for later MPSAFE work.
                 */
-               if (ncp->nc_vp)
-                       vhold(ncp->nc_vp);
+               cache_group_walk(lncp, &stree_vhold_updater, NULL);
                return(0);
        } else {
                return(EWOULDBLOCK);
@@ -423,17 +711,43 @@ cache_unlock(struct namecache *ncp)
 cache_unlock(struct namecache *ncp)
 {
        thread_t td = curthread;
+       struct namecache *lncp = ncp->nc_lockby;
 
        KKASSERT(ncp->nc_refs > 0);
-       KKASSERT(ncp->nc_exlocks > 0);
-       KKASSERT(ncp->nc_locktd == td);
-       if (--ncp->nc_exlocks == 0) {
-               if (ncp->nc_vp)
-                       vdrop(ncp->nc_vp);
-               ncp->nc_locktd = NULL;
-               if (ncp->nc_flag & NCF_LOCKREQ) {
-                       ncp->nc_flag &= ~NCF_LOCKREQ;
-                       wakeup(ncp);
+       KKASSERT(lncp);
+       KKASSERT(lncp->nc_refs > 0);
+       KKASSERT(lncp->nc_exlocks > 0);
+       KKASSERT(lncp->nc_locktd == td);
+       if (lncp->nc_exlocks == 1)
+               cache_group_walk(lncp, &stree_vdrop_updater, NULL);
+       if (--lncp->nc_exlocks == 0) {
+               lncp->nc_locktd = NULL;
+               if (lncp->nc_flag & NCF_LOCKREQ) {
+                       lncp->nc_flag &= ~NCF_LOCKREQ;
+                       wakeup(lncp);
+               }
+       }
+}
+
+/*
+ * Obtain locks on both uncp and lncp.
+ *
+ * On entry, uncp is assumed to be unlocked, and lncp is assumed to be
+ * locked.
+ *
+ * After this function returns, the caller is responsible for checking
+ * the state of lncp, which might have been unlocked temporarily.
+ */
+void
+cache_lock_two(struct namecache *uncp, struct namecache *lncp)
+{
+       if (cache_lock_nonblock(uncp) != 0) {
+               if (uncp > lncp)
+                       cache_lock(uncp);
+               else {
+                       cache_unlock(lncp);
+                       cache_lock(uncp);
+                       cache_lock(lncp);
                }
        }
 }
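
Since lncp may be unlocked temporarily inside cache_lock_two(), a caller
has to re-validate its assumptions afterwards; cache_shadow_attach()
above is the model (a sketch of that code path):

    cache_lock_two(ncp, sncp);
    /* ncp was unlocked while we blocked: re-check its state */
    if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 || ncp->nc_shadowed)
            return (EEXIST);
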
@@ -453,7 +767,8 @@ cache_get_nonblock(struct namecache *ncp
 cache_get_nonblock(struct namecache *ncp)
 {
        /* XXX MP */
-       if (ncp->nc_exlocks == 0 || ncp->nc_locktd == curthread) {
+       if (ncp->nc_lockby->nc_exlocks == 0 ||
+           ncp->nc_lockby->nc_locktd == curthread) {
                _cache_hold(ncp);
                cache_lock(ncp);
                return(0);
@@ -521,6 +836,13 @@ cache_settimeout(struct namecache *ncp, 
                ncp->nc_timeout = 1;
 }
 
+static int unresolver_updater(struct namecache *ncp, void *param); 
+
+static struct stree_updater stree_unresolver_updater = {
+       .updater = unresolver_updater,
+       .flags = STREE_BACK
+};
+
 /*
  * Disassociate the vnode or negative-cache association and mark a
  * namecache entry as unresolved again.  Note that the ncp is still
@@ -541,10 +863,17 @@ void
 void
 cache_setunresolved(struct namecache *ncp)
 {
+       cache_group_walk(ncp, &stree_unresolver_updater, NULL);
+}
+
+static int
+unresolver_updater(struct namecache *ncp, void *param) 
+{
        struct vnode *vp;
 
-       if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
+       if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && ncp != param) {
                ncp->nc_flag |= NCF_UNRESOLVED;
+               /* XXX Why don't we reset NCF_DESTROYED ? */
                ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK|
                                  NCF_FSMID);
                ncp->nc_timeout = 0;
@@ -563,13 +892,15 @@ cache_setunresolved(struct namecache *nc
                         */
                        if (!TAILQ_EMPTY(&ncp->nc_list))
                                vdrop(vp);
-                       if (ncp->nc_exlocks)
+                       if (ncp->nc_lockby->nc_exlocks)
                                vdrop(vp);
                } else {
                        TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
                        --numneg;
                }
        }
+
+       return(0);
 }
 
 /*
@@ -619,7 +950,7 @@ cache_inval(struct namecache *ncp, int f
        struct namecache *nextkid;
        int rcnt = 0;
 
-       KKASSERT(ncp->nc_exlocks);
+       KKASSERT(ncp->nc_lockby->nc_exlocks);
 
        cache_setunresolved(ncp);
        if (flags & CINV_DESTROY)
@@ -715,6 +1046,7 @@ restart:
  * XXX the disconnection could pose a problem, check code paths to make
  * sure any code that blocks can handle the parent being changed out from
  * under it.  Maybe we should lock the children (watch out for deadlocks) ?
+ * [UPDATE: attempt made to lock children, see in situ explanation]
  *
  * After we return the caller has the option of calling cache_setvp() if
  * the vnode of the new target ncp is known.
@@ -726,26 +1058,62 @@ cache_rename(struct namecache *fncp, str
 cache_rename(struct namecache *fncp, struct namecache *tncp)
 {
        struct namecache *scan;
-       int didwarn = 0;
-
+       int didwarn[] = { 0, 0 };
+
+       /* XXX should we rather assert non-equality here? */
+       if (fncp == tncp)
+               return;
+
+again:
        cache_setunresolved(fncp);
        cache_setunresolved(tncp);
+
+       /*
+        * It seems we need to unlock fncp before calling cache_inval():
+        * cache_inval() does a lot of lock/unlock/relock-ing (with tncp
+        * and its children), therefore keeping fncp locked might invite
+        * deadlocks...
+        */
+       cache_unlock(fncp);
+       
        while (cache_inval(tncp, CINV_CHILDREN) != 0) {
-               if (didwarn++ % 10 == 0) {
-                       printf("Warning: cache_rename: race during "
+               if (didwarn[0]++ % 10 == 0) {
+                       printf("Warning: cache_rename: race #1 during "
                                "rename %s->%s\n",
                                fncp->nc_name, tncp->nc_name);
                }
                tsleep(tncp, 0, "mvrace", hz / 10);
                cache_setunresolved(tncp);
        }
+
+       cache_unlock(tncp);
+       cache_lock(fncp);
+
        while ((scan = TAILQ_FIRST(&fncp->nc_list)) != NULL) {
-               cache_hold(scan);
+               cache_unlock(fncp);
+               /*
+                * We have to lock fncp's kids in order to unresolve
+                * their shadow kids...
+                */
+               cache_get(scan);
                cache_unlink_parent(scan);
+               cache_group_walk(scan, &stree_unresolver_updater, scan);
                cache_link_parent(scan, tncp);
                if (scan->nc_flag & NCF_HASHED)
                        cache_rehash(scan);
-               cache_drop(scan);
+               cache_put(scan);
+               cache_lock(fncp);
+       }
+
+       cache_lock_two(tncp, fncp);
+
+       if ((fncp->nc_flag & tncp->nc_flag & NCF_UNRESOLVED) == 0) {
+               if (didwarn[1]++ % 10 == 0) {
+                       printf("Warning: cache_rename: race #2 during "
+                               "rename %s->%s\n",
+                               fncp->nc_name, tncp->nc_name);
+               }
+               goto again;
        }
 }
 
@@ -1207,6 +1575,9 @@ cache_zap(struct namecache *ncp)
                                vdrop(par->nc_vp);
                }
 
+               if (ncp->nc_shadowed)
+                       cache_put(cache_shadow_detach(ncp));
+
                /*
                 * ncp should not have picked up any refs.  Physically
                 * destroy the ncp.
diff -r b30705ca4860 sys/sys/namecache.h
--- a/sys/sys/namecache.h       Thu Feb  2 19:50:43 2006 +0100
+++ b/sys/sys/namecache.h       Tue Feb  7 19:55:06 2006 +0100
@@ -71,6 +71,7 @@ struct vnode;
 struct vnode;
 
 TAILQ_HEAD(namecache_list, namecache);
+LIST_HEAD(namecache_shadow_list, namecache);
 
 /*
  * The namecache structure is used to manage the filesystem namespace.  Most
@@ -100,8 +101,12 @@ struct namecache {
     LIST_ENTRY(namecache) nc_hash;     /* hash chain (nc_parent,name) */
     TAILQ_ENTRY(namecache) nc_entry;   /* scan via nc_parent->nc_list */
     TAILQ_ENTRY(namecache) nc_vnode;   /* scan via vnode->v_namecache */
+    LIST_ENTRY(namecache) nc_shadow_entry; /* scan via nc_shadowed->nc_shadow_list */
     struct namecache_list  nc_list;    /* list of children */
+    struct namecache_shadow_list nc_shadow_list; /* list of shadow overlays */
     struct namecache *nc_parent;       /* namecache entry for parent */
+    struct namecache *nc_shadowed;     /* lower layer entry in layered fs */
+    struct namecache *nc_lockby;       /* entry via which this one can be locked */
     struct     vnode *nc_vp;           /* vnode representing name or NULL */
     int                nc_refs;                /* ref count prevents deletion */
     u_short    nc_flag;
@@ -114,6 +119,9 @@ struct namecache {
     struct thread *nc_locktd;          /* namespace locking */
     struct mount *nc_mount;            /* associated mount for vopops */
     int64_t    nc_fsmid;               /* filesystem modified id */
+#ifdef INVARIANTS
+    int nc_treesize;                   /* shadow tree size under ncp */
+#endif
 };
 
 typedef struct namecache *namecache_t;
@@ -133,6 +141,9 @@ typedef struct namecache *namecache_t;
 #define NCF_ISDIR      0x0200  /* represents a directory */
 #define NCF_DESTROYED  0x0400  /* name association is considered destroyed */
 #define NCF_FSMID      0x0800  /* FSMID updated */
+#ifdef INVARIANTS
+#define NCF_TREEPASS   0x2000
+#endif
 
 /*
  * cache_inval[_vp]() flags
@@ -150,6 +161,9 @@ void        cache_lock(struct namecache *ncp);
 void   cache_lock(struct namecache *ncp);
 int    cache_lock_nonblock(struct namecache *ncp);
 void   cache_unlock(struct namecache *ncp);
+void   cache_lock_two(struct namecache *uncp, struct namecache *lncp);
+int    cache_shadow_attach(struct namecache *ncp, struct namecache *sncp);
+struct namecache *cache_shadow_detach(struct namecache *ncp);
 void   cache_setvp(struct namecache *ncp, struct vnode *vp);
 void   cache_settimeout(struct namecache *ncp, int nticks);
 void   cache_setunresolved(struct namecache *ncp);
diff -r b30705ca4860 sys/vfs/nullfs/null.h
--- a/sys/vfs/nullfs/null.h     Thu Feb  2 19:50:43 2006 +0100
+++ b/sys/vfs/nullfs/null.h     Tue Feb  7 19:55:06 2006 +0100
@@ -43,18 +43,21 @@ struct null_args {
        char            *target;        /* Target of loopback  */
 };
 
-struct null_mount {
-       struct mount    *nullm_vfs;
-       struct vnode    *nullm_rootvp;  /* Reference to root null_node */
-};
-
 #ifdef _KERNEL
-#define        MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data))
 
 #ifdef NULLFS_DEBUG
-#define NULLFSDEBUG(format, args...) printf(format ,## args)
+#define NULLFSDEBUG(format, args...) \
+       printf(" [nullfs] %s:%d: " format, __func__, __LINE__, ## args)
+#define        NULLNCDEBUG(ncp)                                        \
+       NULLFSDEBUG(#ncp " %p: name %s, refs %d, exlocks %d, nc_flag 0x%x, " \
+                   "nc_mount %p, nc_shadowed %p, nc_lockby %p, nc_vp %p\n", \
+                   (ncp), (ncp)->nc_name, (ncp)->nc_refs,              \
+                   (ncp)->nc_lockby->nc_exlocks, (ncp)->nc_flag,       \
+                   (ncp)->nc_mount, (ncp)->nc_shadowed,                \
+                   (ncp)->nc_lockby, (ncp)->nc_vp)
 #else
 #define NULLFSDEBUG(format, args...)
+#define NULLNCDEBUG(ncp)
 #endif /* NULLFS_DEBUG */
 
 #endif /* _KERNEL */
diff -r b30705ca4860 sys/vfs/nullfs/null_vfsops.c
--- a/sys/vfs/nullfs/null_vfsops.c      Thu Feb  2 19:50:43 2006 +0100
+++ b/sys/vfs/nullfs/null_vfsops.c      Tue Feb  7 19:55:06 2006 +0100
@@ -53,6 +53,7 @@
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/nlookup.h>
+#include <sys/namecache.h>
 #include "null.h"
 
 extern struct vnodeopv_entry_desc null_vnodeop_entries[];
@@ -80,12 +81,10 @@ nullfs_mount(struct mount *mp, char *pat
 {
        int error = 0;
        struct null_args args;
-       struct vnode *rootvp;
-       struct null_mount *xmp;
        u_int size;
        struct nlookupdata nd;
 
-       NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp);
+       NULLFSDEBUG("mp %p\n", (void *)mp);
 
        /*
         * Update is a no-op
@@ -98,118 +97,113 @@ nullfs_mount(struct mount *mp, char *pat
         * Get argument
         */
        error = copyin(data, (caddr_t)&args, sizeof(struct null_args));
-       if (error)
-               return (error);
-
-       /*
-        * Find lower node
-        */
-       rootvp = NULL;
-       error = nlookup_init(&nd, args.target, UIO_USERSPACE, NLC_FOLLOW);
+
+       /*
+        * Do a lookup just to see if things are not fundamentally broken...
+        * but it's too early to make proper use of the result.
+        */
+       if (error == 0)
+               error = nlookup_init(&nd, args.target, UIO_USERSPACE,
+                                    NLC_FOLLOW);
        if (error == 0)
                error = nlookup(&nd);
-       if (error == 0) {
-               error = cache_vget(nd.nl_ncp, nd.nl_cred, LK_EXCLUSIVE, 
-                                       &rootvp);
-       }
-
-       xmp = (struct null_mount *) malloc(sizeof(struct null_mount),
-                               M_NULLFSMNT, M_WAITOK); /* XXX */
-
-       /*
-        * Save reference to underlying FS
-        */
-        /*
-         * As lite stacking enters the scene, the old way of doing this
-        * -- via the vnode -- is not good enough anymore...
-        */
-       xmp->nullm_vfs = nd.nl_ncp->nc_mount;
+       if (error)
+               return(error);
+
        nlookup_done(&nd);
 
-       vfs_add_vnodeops(mp, &mp->mnt_vn_norm_ops, 
-                        null_vnodeop_entries, 0);
-
-       VOP_UNLOCK(rootvp, 0, td);
-
-       /*
-        * Keep a held reference to the root vnode.
-        * It is vrele'd in nullfs_unmount.
-        */
-       xmp->nullm_rootvp = rootvp;
-       /*
-        * XXX What's the proper safety condition for querying
-        * the underlying mount? Is this flag tuning necessary
-        * at all?
-        */
-       if (xmp->nullm_vfs->mnt_flag & MNT_LOCAL)
-               mp->mnt_flag |= MNT_LOCAL;
-       mp->mnt_data = (qaddr_t) xmp;
+       vfs_add_vnodeops(mp, &mp->mnt_vn_norm_ops, null_vnodeop_entries, 0);
+
+       /*
+        * Heck it, let it just be local. I bet I'd need only five minutes
+        * to come up with a sound-sounding meaning of "local" by which
+        * null mounts are always local.
+        */
+       mp->mnt_flag |= MNT_LOCAL;
        vfs_getnewfsid(mp);
 
        (void) copyinstr(args.target, mp->mnt_stat.f_mntfromname, MNAMELEN - 1,
            &size);
        bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
-       (void)nullfs_statfs(mp, &mp->mnt_stat, td);
-       NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n",
-               mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntfromname);
+       NULLFSDEBUG("lower %s, alias at %s\n",
+                   mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
        return (0);
 }
 
-/*
- * Free reference to null layer
- */
 static int
 nullfs_unmount(struct mount *mp, int mntflags, struct thread *td)
 {
-       void *mntdata;
-       int flags = 0;
-
-       NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp);
-
-       if (mntflags & MNT_FORCE)
-               flags |= FORCECLOSE;
-
-       /*
-        * Finally, throw away the null_mount structure
-        */
-       mntdata = mp->mnt_data;
-       mp->mnt_data = 0;
-       free(mntdata, M_NULLFSMNT);
+       NULLNCDEBUG(mp->mnt_ncp);
+
+       cache_lock(mp->mnt_ncp);
+       cache_put(cache_shadow_detach(mp->mnt_ncp));
+       cache_unlock(mp->mnt_ncp);
+
        return 0;
 }
 
 static int
+nullfs_start(struct mount *mp, int flags, struct thread *td)
+{
+       int error;
+       struct namecache *sncp;
+
+       NULLFSDEBUG("nlookup %s\n", mp->mnt_stat.f_mntfromname);
+
+       sncp = nlookup_simple(mp->mnt_stat.f_mntfromname,
+                             UIO_SYSSPACE, NLC_FOLLOW, &error);
+
+       if (! sncp)
+               return (error);
+
+       if ((error = cache_shadow_attach(mp->mnt_ncp, sncp)))
+               cache_put(sncp);
+
+       NULLNCDEBUG(mp->mnt_ncp);
+#ifdef NULLFS_DEBUG
+       if (mp->mnt_ncp->nc_shadowed)
+               NULLNCDEBUG(mp->mnt_ncp->nc_shadowed);
+#endif
+
+       cache_unlock(mp->mnt_ncp);
+       return (error);
+}      
+
+/*
+ * As the mount won't get aborted if VFS_START fails, we have to check in each 
+ * VFS call whether it has succeeded...
+ */ 
+
+static int
 nullfs_root(struct mount *mp, struct vnode **vpp)
 {
-       struct thread *td = curthread;  /* XXX */
-       struct vnode *vp;
-
-       NULLFSDEBUG("nullfs_root(mp = %p, vp = %p)\n", (void *)mp,
-           (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp);
-
-       /*
-        * Return locked reference to root.
-        */
-       vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp;
-       vref(vp);
-
-#ifdef NULLFS_DEBUG
-       if (VOP_ISLOCKED(vp, NULL)) {
-               Debugger("root vnode is locked.\n");
-               vrele(vp);
-               return (EDEADLK);
-       }
-#endif
-       vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-       *vpp = vp;
-       return 0;
+       int error;
+
+       if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+               return (ENXIO);
+
+       error = cache_vget(mp->mnt_ncp->nc_shadowed, crhold(proc0.p_ucred),
+                          LK_EXCLUSIVE | LK_RETRY, vpp);
+       crfree(proc0.p_ucred);
+
+       return (error);
+}
+
+static __inline
+struct mount *
+nullfs_lowermount_0(struct mount *mp)
+{
+       return (mp->mnt_ncp->nc_shadowed->nc_mount);
 }
 
 static int
 nullfs_quotactl(struct mount *mp, int cmd, uid_t uid, caddr_t arg,
                struct thread *td)
 {
-       return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg, td);
+       if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+               return (ENXIO);
+
+       return VFS_QUOTACTL(nullfs_lowermount_0(mp), cmd, uid, arg, td);
 }
 
 static int
@@ -218,12 +212,15 @@ nullfs_statfs(struct mount *mp, struct s
        int error;
        struct statfs mstat;
 
-       NULLFSDEBUG("nullfs_statfs(mp = %p, vp = %p)\n", (void *)mp,
-           (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp);
+       if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+               return (ENXIO);
+
+       NULLFSDEBUG("mp %p, ncp %p, lower mp %p\n",
+                   mp, mp->mnt_ncp, nullfs_lowermount_0(mp));
 
        bzero(&mstat, sizeof(mstat));
 
-       error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, td);
+       error = VFS_STATFS(nullfs_lowermount_0(mp), &mstat, td);
        if (error)
                return (error);
 
@@ -248,23 +245,27 @@ nullfs_checkexp(struct mount *mp, struct
 nullfs_checkexp(struct mount *mp, struct sockaddr *nam, int *extflagsp,
                struct ucred **credanonp)
 {
-
-       return VFS_CHECKEXP(MOUNTTONULLMOUNT(mp)->nullm_vfs, nam, 
-               extflagsp, credanonp);
+       if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+               return (ENXIO);
+
+       return VFS_CHECKEXP(nullfs_lowermount_0(mp), nam, extflagsp, credanonp);
 }
 
 static int                        
 nullfs_extattrctl(struct mount *mp, int cmd, const char *attrname, caddr_t arg,
                  struct thread *td)
 {
-       return VFS_EXTATTRCTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, attrname,
-           arg, td);
+       if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+               return (ENXIO);
+
+       return VFS_EXTATTRCTL(nullfs_lowermount_0(mp), cmd, attrname, arg, td);
 }
 
 
 static struct vfsops null_vfsops = {
        .vfs_mount =            nullfs_mount,
        .vfs_unmount =          nullfs_unmount,
+       .vfs_start =            nullfs_start,
        .vfs_root =             nullfs_root,
        .vfs_quotactl =         nullfs_quotactl,
        .vfs_statfs =           nullfs_statfs,
diff -r b30705ca4860 sys/vfs/nullfs/null_vnops.c
--- a/sys/vfs/nullfs/null_vnops.c       Thu Feb  2 19:50:43 2006 +0100
+++ b/sys/vfs/nullfs/null_vnops.c       Tue Feb  7 19:55:06 2006 +0100
@@ -109,6 +109,8 @@
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/buf.h>
+#include <sys/namecache.h>
+#include <sys/nlookup.h>
 #include "null.h"
 
 static int     null_nresolve(struct vop_nresolve_args *ap);
@@ -122,18 +124,114 @@ static int       null_nrmdir(struct vop_nrmdir
 static int     null_nrmdir(struct vop_nrmdir_args *ap);
 static int     null_nrename(struct vop_nrename_args *ap);
 
+static __inline
+struct mount *
+nullfs_lowermount_l(struct namecache *ncp)
+{
+       /*
+        * The code in use below allows passing through lower mounts.
+        * If we didn't want to do that, we could use
+        *
+        *   ncp->nc_mount->mnt_ncp->nc_shadowed->nc_mount
+        *
+        * Eventually, the choice might be configurable.
+        *
+        *                  -  -  -
+        *
+        * Matt says in
+        * http://leaf.dragonflybsd.org/mailarchive/kernel/2006-01/msg00023.html
+        * :
+
+    The ncp->nc_mount field was never meant to be used by the VFS code...
+    only to be used internally by cache_*().  It looks like I broke my own
+    rule... I have two references in NFS, but that's for later.
+
+        * Note that both approaches still use nc_mount:
+        *
+        * - If we want to pass through lower mounts, we do have to find
+        *   the lower fs ncp-wise; we simply have no choice.
+        *
+        * - If we just work with a fixed lower fs, we can access it if we
+        *   are willing to use nc_mount. Hence it just seems stupid to
+        *   keep around a direct reference to the lower fs, though that is
+        *   of course feasible.
+        */
+       return (ncp->nc_shadowed->nc_mount);
+}
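
The fixed-lower variant mentioned in the comment above would look
roughly like this (a sketch, not part of the patch):

    static __inline struct mount *
    nullfs_lowermount_fixed(struct namecache *ncp)
    {
            /* resolve via our own mount point's shadow link,
             * never passing through lower mounts */
            return (ncp->nc_mount->mnt_ncp->nc_shadowed->nc_mount);
    }
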
+
 static int
 null_nresolve(struct vop_nresolve_args *ap)
 {
-       ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-       return vop_nresolve_ap(ap);
+       struct namecache *ncp = ap->a_ncp;
+       struct nlcomponent nlc;
+       struct namecache *sncp;
+       struct mount *lmp;
+       int error = 0;
+
+       if (ncp->nc_shadowed)
+               cache_put(cache_shadow_detach(ncp));
+
+       nlc.nlc_nameptr = ncp->nc_name;
+       nlc.nlc_namelen = ncp->nc_nlen;
+
+       KKASSERT(ncp->nc_parent->nc_shadowed);
+       cache_unlock(ncp);
+       sncp = cache_nlookup(ncp->nc_parent->nc_shadowed, &nlc);
+
+       if ((error = cache_shadow_attach(ncp, sncp))) {
+               cache_put(sncp);
+               if (ncp->nc_flag & NCF_UNRESOLVED)
+                       cache_setvp(ncp, NULL);
+               return (error);
+       }
+
+       NULLNCDEBUG(ncp);
+       NULLNCDEBUG(ncp->nc_shadowed);
+
+checkready:
+
+       if ((ncp->nc_shadowed->nc_flag & NCF_UNRESOLVED) == 0) {
+               cache_setvp(ncp, ncp->nc_shadowed->nc_vp);
+               return (ncp->nc_shadowed->nc_error);
+       }
+
+       /*
+        * XXX Querying/ensuring the usability of the lower fs is still not
+        * handled right. As a quick hack, we do a simple test here, which
+        * will do for avoiding the most obvious fallacies.
+        */
+       if ((lmp = nullfs_lowermount_l(ncp)) &&
+           (ap->a_head.a_ops = lmp->mnt_vn_use_ops)) {
+               /*
+                * Moving down the shadow chain here avoids a recursive
+                * loop (which would end up exhausting the kernel stack).
+                *
+                * Otherwise it's the same whether we use ncp or
+                * ncp->nc_shadowed -- we go for group-shared ncp attributes.
+                */
+               ap->a_ncp = ncp->nc_shadowed;
+               /*
+                * According to cache_resolve(), the primary place for
+                * VOP_NRESOLVE calls, the caller of the nresolve method
+                * is the one who should take care of ncp->nc_error.
+                */
+               ap->a_ncp->nc_error = vop_nresolve_ap(ap);
+
+               goto checkready;
+       }
+
+       return (ENXIO);
 }
 
 static int
 null_ncreate(struct vop_ncreate_args *ap)
 {
-       ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+       struct namecache *ncp = ap->a_ncp;
+
+       NULLNCDEBUG(ap->a_ncp);
+       NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+       ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+       ap->a_ncp = ncp->nc_shadowed;
 
        return vop_ncreate_ap(ap);
 }
@@ -141,7 +239,12 @@ static int
 static int
 null_nmkdir(struct vop_nmkdir_args *ap)
 {
-       ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+       struct namecache *ncp = ap->a_ncp;
+
+       NULLNCDEBUG(ap->a_ncp);
+       NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+       ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+       ap->a_ncp = ncp->nc_shadowed;
 
        return vop_nmkdir_ap(ap);
 }
@@ -149,7 +252,12 @@ static int
 static int
 null_nmknod(struct vop_nmknod_args *ap)
 {
-       ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+       struct namecache *ncp = ap->a_ncp;
+
+       NULLNCDEBUG(ap->a_ncp);
+       NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+       ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+       ap->a_ncp = ncp->nc_shadowed;
 
        return vop_nmknod_ap(ap);
 }
@@ -157,7 +265,12 @@ static int
 static int
 null_nlink(struct vop_nlink_args *ap)
 {
-       ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+       struct namecache *ncp = ap->a_ncp;
+
+       NULLNCDEBUG(ap->a_ncp);
+       NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+       ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+       ap->a_ncp = ncp->nc_shadowed;
 
        return vop_nlink_ap(ap);
 }
@@ -165,7 +278,12 @@ static int
 static int
 null_nsymlink(struct vop_nsymlink_args *ap)
 {
-       ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+       struct namecache *ncp = ap->a_ncp;
+
+       NULLNCDEBUG(ap->a_ncp);
+       NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+       ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+       ap->a_ncp = ncp->nc_shadowed;
 
        return vop_nsymlink_ap(ap);
 }
@@ -173,7 +291,12 @@ static int
 static int
 null_nwhiteout(struct vop_nwhiteout_args *ap)
 {
-       ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+       struct namecache *ncp = ap->a_ncp;
+
+       NULLNCDEBUG(ap->a_ncp);
+       NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+       ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+       ap->a_ncp = ncp->nc_shadowed;
 
        return vop_nwhiteout_ap(ap);
 }
@@ -181,7 +304,12 @@ static int
 static int
 null_nremove(struct vop_nremove_args *ap)
 {
-       ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+       struct namecache *ncp = ap->a_ncp;
+
+       NULLNCDEBUG(ap->a_ncp);
+       NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+       ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+       ap->a_ncp = ncp->nc_shadowed;
 
        return vop_nremove_ap(ap);
 }
@@ -189,7 +317,12 @@ static int
 static int
 null_nrmdir(struct vop_nrmdir_args *ap)
 {
-       ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+       struct namecache *ncp = ap->a_ncp;
+
+       NULLNCDEBUG(ap->a_ncp);
+       NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+       ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+       ap->a_ncp = ncp->nc_shadowed;
 
        return vop_nrmdir_ap(ap);
 }
@@ -197,13 +330,21 @@ static int
 static int
 null_nrename(struct vop_nrename_args *ap)
 {
+       struct namecache *fncp = ap->a_fncp;
+       struct namecache *tncp = ap->a_tncp;
        struct mount *lmp;
 
-       lmp = MOUNTTONULLMOUNT(ap->a_fncp->nc_mount)->nullm_vfs;
-       if (lmp != MOUNTTONULLMOUNT(ap->a_tncp->nc_mount)->nullm_vfs)
-               return (EINVAL);
-
-       ap->a_head.a_ops = lmp->mnt_vn_norm_ops;
+       lmp = nullfs_lowermount_l(fncp);
+       if (lmp != nullfs_lowermount_l(tncp))
+               return (EXDEV);
+
+       NULLNCDEBUG(ap->a_fncp);
+       NULLNCDEBUG(ap->a_fncp->nc_shadowed);
+       NULLNCDEBUG(ap->a_tncp);
+       NULLNCDEBUG(ap->a_tncp->nc_shadowed);
+       ap->a_head.a_ops = lmp->mnt_vn_use_ops;
+       ap->a_fncp = fncp->nc_shadowed;
+       ap->a_tncp = tncp->nc_shadowed;
 
        return vop_nrename_ap(ap);
 }
@@ -224,4 +365,3 @@ struct vnodeopv_entry_desc null_vnodeop_
        { &vop_nrename_desc,            (vnodeopv_entry_t) null_nrename },
        { NULL, NULL }
 };
-
