Hi!

This time the following changes were made:

 - Namecache locks are kept by shadowinfo structures which
   can be embedded into the namecache structure itself (singleton
   groups) or fetched from a pool (for non-singleton groups).

 - Shadow group data structure: I ended up with what Matt suggested
   earlier: shadow group entries form a circular list, doubly linked
   in order to support O(1) node deletion, and enhanced with a height
   counter to retain tree semantics.

 - cache_setunresolved(ncp) now unresolves and detaches the whole
   subtree above ncp, in order to dismantle broken topologies.

 - The deadlock avoidance techniques of the previous patch have been kept.

 - The "struct namecache *nc_shadowed" field of namecache structures
   still exists, but the cache code no longer refers to it anywhere.
   Now it's sort of a private field, almost like the "void *" fields
   in vnodes, specinfo structures, etc. It could easily be ditched and
   replaced by a per-mount hash for those filesystems that would use
   it. I just kept it as is -- I didn't want to do anything about it
   without having a consensus.

 - Nullfs has been adjusted to this API.

Regards,
Csaba
diff -r e1d135a8f666 sys/kern/vfs_cache.c
--- a/sys/kern/vfs_cache.c      Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/kern/vfs_cache.c      Wed Mar 29 10:23:26 2006 +0200
@@ -108,6 +108,10 @@
 #define NCHHASH(hash)  (&nchashtbl[(hash) & nchash])
 #define MINNEG         1024
 
+/* Modes for shadow group traversal */
+#define SG_ALL     0 /* traverse whole group */
+#define SG_SUBTREE 1 /* traverse only subtree */
+
 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
 static LIST_HEAD(nchashhead, namecache) *nchashtbl;    /* Hash Table */
@@ -170,6 +174,15 @@ static u_long numneghits; STATNODE(CTLFL
 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
 
 struct nchstats nchstats[SMP_MAXCPU];
+
+static STAILQ_HEAD(, shadowinfo) shadowinfo_freeq;
+static u_long numshadowinfo = 0;
+STATNODE(CTLFLAG_RD, numshadowinfo, &numshadowinfo);
+static long maxnumshadowinfo = -1;
+SYSCTL_LONG(_vfs_cache, OID_AUTO, maxnumshadowinfo, CTLFLAG_RW,
+            &maxnumshadowinfo, 0, "");
+MALLOC_DEFINE(M_SHADOWINFO, "shadowinfo", "VFS name cache shadowinfo");
+
 /*
  * Export VFS cache effectiveness statistics to user-land.
  *
@@ -196,6 +209,62 @@ SYSCTL_PROC(_vfs_cache, OID_AUTO, nchsta
 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
 
+/* XXX stubs for later MPSAFE work */
+#define shadowinfo_freeq_lock()
+#define shadowinfo_freeq_unlock()
+
+static struct shadowinfo *
+shadowinfo_fetch(void)
+{
+       struct shadowinfo *shinf = STAILQ_FIRST(&shadowinfo_freeq);
+
+       if (! shinf)
+               goto alloc;
+
+       shadowinfo_freeq_lock();
+       if ((shinf = STAILQ_FIRST(&shadowinfo_freeq)))
+               STAILQ_REMOVE_HEAD(&shadowinfo_freeq, sh_entry);
+       shadowinfo_freeq_unlock();
+
+       if (shinf)
+               return (shinf);
+
+alloc:
+       shinf = malloc(sizeof(*shinf), M_SHADOWINFO, M_WAITOK|M_ZERO);
+       numshadowinfo++;
+
+       return (shinf);
+}
+
+static __inline
+struct shadowinfo*
+shadowinfo_ref(struct shadowinfo *shinf)
+{
+       shinf->sh_refs++;
+
+       return (shinf);
+}
+
+static void 
+shadowinfo_put(struct shadowinfo *shinf)
+{
+       if (--shinf->sh_refs > 0)
+               return;
+
+       if (maxnumshadowinfo >= 0 && numshadowinfo > maxnumshadowinfo) {
+               free(shinf, M_SHADOWINFO);
+               numshadowinfo--;
+               return;
+       }
+
+       shinf->sh_exlocks = 0;
+       shinf->sh_locktd = NULL;
+
+       shadowinfo_freeq_lock();
+       STAILQ_INSERT_TAIL(&shadowinfo_freeq, shinf, sh_entry);
+       shadowinfo_freeq_unlock();
+}
+
 static void cache_zap(struct namecache *ncp);
 
 /*
@@ -225,7 +294,7 @@ _cache_drop(struct namecache *ncp)
            (ncp->nc_flag & NCF_UNRESOLVED) && 
            TAILQ_EMPTY(&ncp->nc_list)
        ) {
-               KKASSERT(ncp->nc_exlocks == 0);
+               KKASSERT(ncp->nc_shadowinfo->sh_exlocks == 0);
                cache_lock(ncp);
                cache_zap(ncp);
        } else {
@@ -295,6 +364,10 @@ cache_alloc(int nlen)
        ncp->nc_error = ENOTCONN;       /* needs to be resolved */
        ncp->nc_refs = 1;
        ncp->nc_fsmid = 1;
+       ncp->nc_shadowinfo = &ncp->nc_shadowinfo_internal;
+       ncp->nc_shadowinfo_internal.sh_refs = 2;
+       ncp->nc_shadow_prev = NULL;
+       ncp->nc_shadow_next = NULL;
        TAILQ_INIT(&ncp->nc_list);
        cache_lock(ncp);
        return(ncp);
@@ -303,7 +376,7 @@ static void
 static void
 cache_free(struct namecache *ncp)
 {
-       KKASSERT(ncp->nc_refs == 1 && ncp->nc_exlocks == 1);
+       KKASSERT(ncp->nc_refs == 1 && ncp->nc_shadowinfo->sh_exlocks == 1);
        if (ncp->nc_name)
                free(ncp->nc_name, M_VFSCACHE);
        free(ncp, M_VFSCACHE);
@@ -322,6 +395,188 @@ cache_drop(struct namecache *ncp)
 cache_drop(struct namecache *ncp)
 {
        _cache_drop(ncp);
+}
+
+/*
+ * Iterate an "updater" function over a shadow group.
+ * All-group and subtree-only traversals are supported.
+ */
+static struct namecache *
+cache_group_walk(struct namecache *ncp,
+                 int (*updater)(struct namecache *xncp, void *param),
+                 int flags, void *param)
+{
+       struct namecache *xncp = ncp, *yncp;
+
+       for (;;) {
+               yncp = xncp->nc_shadow_next;
+               if (updater(xncp, param))
+                       break;
+               if (! yncp || yncp == ncp ||
+                   (flags & SG_SUBTREE &&
+                    yncp->nc_shadowheight <= ncp->nc_shadowheight))
+                       break;
+               xncp = yncp;
+       }
+
+       return(xncp);
+}
+
+struct migrate_param {
+       int heightdelta;
+       int exlocks;
+       struct shadowinfo *shadowinfo;
+};
+
+static int 
+migrate_updater(struct namecache *ncp, void *param)
+{
+       struct migrate_param *mpm = param;
+       struct shadowinfo *shinf = mpm->shadowinfo;
+       struct shadowinfo *oldshinf = ncp->nc_shadowinfo;
+
+       if (! shinf)
+               shinf = &ncp->nc_shadowinfo_internal;
+
+       if (shinf == oldshinf)
+               goto out;
+
+       shinf->sh_locktd = oldshinf->sh_locktd;
+
+       ncp->nc_shadowinfo = shadowinfo_ref(shinf);
+       shadowinfo_put(oldshinf);
+
+out:
+       ncp->nc_shadowheight += mpm->heightdelta;
+       if (mpm->exlocks >= 0)
+               shinf->sh_exlocks = mpm->exlocks;
+
+       return (0);
+}
+
+static __inline
+void
+cache_shadow_link(struct namecache *sncp, struct namecache *ncp)
+{
+       struct namecache *pncp;
+       struct namecache *nsncp;
+
+       pncp = ncp->nc_shadow_prev ?: ncp;
+       nsncp = sncp->nc_shadow_next ?: sncp;
+
+       pncp->nc_shadow_next = nsncp;
+       nsncp->nc_shadow_prev = pncp;
+
+       sncp->nc_shadow_next = ncp;
+       ncp->nc_shadow_prev = sncp;
+}
+
+static __inline
+void
+cache_shadow_unlink(struct namecache *ncp)
+{
+       if (! ncp->nc_shadow_next)
+               return;
+
+       KKASSERT(ncp->nc_shadow_prev);
+
+       if (ncp->nc_shadow_prev == ncp->nc_shadow_next) {
+               ncp->nc_shadow_prev->nc_shadow_next = NULL;
+               ncp->nc_shadow_next->nc_shadow_prev = NULL;
+       } else {
+               ncp->nc_shadow_prev->nc_shadow_next = ncp->nc_shadow_next;
+               ncp->nc_shadow_next->nc_shadow_prev = ncp->nc_shadow_prev;
+       }
+
+       ncp->nc_shadow_prev = ncp->nc_shadow_next = NULL;
+}
+
+/*
+ * Join ncp into the shadow group of sncp.
+ * 
+ * ncp must be unlocked on entry, while sncp must be locked on entry.
+ *
+ * The routine will fail and return ELOOP if the intended shadowing association
+ * doesn't make sense (currently this boils down to ncp being the same as
+ * sncp).
+ * It will fail with EEXIST if ncp gets resolved or acquires a shadow
+ * association from elsewhere during the attach attempt (this is possible due
+ * to the fact that ncp is unlocked).
+ *
+ * - On success ncp will be a representative of the joint shadow group, which
+ *   then will be locked.
+ * - On failure the namecache entries will exist separately just as they did
+ *   before; both entries will be locked.
+ */
+int
+cache_shadow_attach(struct namecache *ncp, struct namecache *sncp)
+{
+       struct migrate_param mpm;
+
+       if (ncp == sncp)
+               return(ELOOP);
+
+       KKASSERT(ncp->nc_shadowinfo->sh_locktd != curthread);
+       KKASSERT(sncp->nc_shadowinfo->sh_locktd == curthread);
+
+       cache_lock_two(ncp, sncp);
+
+       if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 || ncp->nc_shadowheight != 0)
+               return(EEXIST);
+
+       if (sncp->nc_shadowinfo == &sncp->nc_shadowinfo_internal) {
+               mpm.heightdelta = 0;
+               mpm.shadowinfo = shadowinfo_fetch();
+               mpm.exlocks = sncp->nc_shadowinfo->sh_exlocks;
+               migrate_updater(sncp, &mpm);
+       }
+
+       mpm.heightdelta = sncp->nc_shadowheight + 1;
+       mpm.shadowinfo = sncp->nc_shadowinfo;
+       mpm.exlocks = -1;
+
+       cache_group_walk(ncp, &migrate_updater, SG_ALL, &mpm);
+       cache_shadow_link(sncp, ncp);
+       
+       return(0);
+}
+
+/*
+ * Take out namecache entry from its shadow group.
+ *
+ * The shadow group must be locked upon entry.
+ *
+ * On return both the entry and its former group remain locked.
+ */
+void
+cache_shadow_detach(struct namecache *ncp)
+{
+       struct namecache *pncp, *nncp;
+       struct migrate_param mpm;
+
+       mpm.shadowinfo = NULL;
+again:
+       mpm.heightdelta = -ncp->nc_shadowheight;
+       mpm.exlocks = ncp->nc_shadowinfo->sh_exlocks;
+       pncp = ncp->nc_shadow_prev;
+       nncp = ncp->nc_shadow_next;
+
+       migrate_updater(ncp, &mpm);
+       cache_shadow_unlink(ncp);
+
+       if (nncp && nncp == pncp) {
+               ncp = nncp;
+               goto again;
+       }
+}
+
+static int
+vref_updater(struct namecache *ncp, void *param)
+{
+       if (ncp->nc_vp)
+               *(int *)param > 0 ? vhold(ncp->nc_vp) : vdrop(ncp->nc_vp);
+
+       return(0);
 }
 
 /*
@@ -349,15 +604,21 @@ cache_lock(struct namecache *ncp)
 {
        thread_t td;
        int didwarn;
+       struct shadowinfo *shinf;
 
        KKASSERT(ncp->nc_refs != 0);
        didwarn = 0;
        td = curthread;
 
        for (;;) {
-               if (ncp->nc_exlocks == 0) {
-                       ncp->nc_exlocks = 1;
-                       ncp->nc_locktd = td;
+               shinf = ncp->nc_shadowinfo;
+               KKASSERT(shinf);
+               KKASSERT(shinf->sh_refs != 0);
+               if (shinf->sh_exlocks == 0) {
+                       int ref = 1;
+
+                       shinf->sh_exlocks = 1;
+                       shinf->sh_locktd = td;
                        /* 
                         * The vp associated with a locked ncp must be held
                         * to prevent it from being recycled (which would
@@ -365,16 +626,15 @@ cache_lock(struct namecache *ncp)
                         *
                         * XXX loop on race for later MPSAFE work.
                         */
-                       if (ncp->nc_vp)
-                               vhold(ncp->nc_vp);
+                       cache_group_walk(ncp, &vref_updater, SG_ALL, &ref);
                        break;
                }
-               if (ncp->nc_locktd == td) {
-                       ++ncp->nc_exlocks;
+               if (shinf->sh_locktd == td) {
+                       ++shinf->sh_exlocks;
                        break;
                }
-               ncp->nc_flag |= NCF_LOCKREQ;
-               if (tsleep(ncp, 0, "clock", nclockwarn) == EWOULDBLOCK) {
+               shinf->sh_lockreq = 1;
+               if (tsleep(shinf, 0, "clock", nclockwarn) == EWOULDBLOCK) {
                        if (didwarn)
                                continue;
                        didwarn = 1;
@@ -398,12 +658,17 @@ cache_lock_nonblock(struct namecache *nc
 cache_lock_nonblock(struct namecache *ncp)
 {
        thread_t td;
+       struct shadowinfo *shinf = ncp->nc_shadowinfo;
 
        KKASSERT(ncp->nc_refs != 0);
+       KKASSERT(shinf);
+       KKASSERT(shinf->sh_refs != 0);
        td = curthread;
-       if (ncp->nc_exlocks == 0) {
-               ncp->nc_exlocks = 1;
-               ncp->nc_locktd = td;
+       if (shinf->sh_exlocks == 0) {
+               int ref = 1;
+
+               shinf->sh_exlocks = 1;
+               shinf->sh_locktd = td;
                /* 
                 * The vp associated with a locked ncp must be held
                 * to prevent it from being recycled (which would
@@ -411,8 +676,7 @@ cache_lock_nonblock(struct namecache *nc
                 *
                 * XXX loop on race for later MPSAFE work.
                 */
-               if (ncp->nc_vp)
-                       vhold(ncp->nc_vp);
+               cache_group_walk(ncp, &vref_updater, SG_ALL, &ref);
                return(0);
        } else {
                return(EWOULDBLOCK);
@@ -423,17 +687,45 @@ cache_unlock(struct namecache *ncp)
 cache_unlock(struct namecache *ncp)
 {
        thread_t td = curthread;
+       struct shadowinfo *shinf = ncp->nc_shadowinfo;
 
        KKASSERT(ncp->nc_refs > 0);
-       KKASSERT(ncp->nc_exlocks > 0);
-       KKASSERT(ncp->nc_locktd == td);
-       if (--ncp->nc_exlocks == 0) {
-               if (ncp->nc_vp)
-                       vdrop(ncp->nc_vp);
-               ncp->nc_locktd = NULL;
-               if (ncp->nc_flag & NCF_LOCKREQ) {
-                       ncp->nc_flag &= ~NCF_LOCKREQ;
-                       wakeup(ncp);
+       KKASSERT(shinf);
+       KKASSERT(shinf->sh_refs > 0);
+       KKASSERT(shinf->sh_exlocks > 0);
+       KKASSERT(shinf->sh_locktd == td);
+       if (shinf->sh_exlocks == 1) {
+               int ref = -1;
+               cache_group_walk(ncp, &vref_updater, SG_ALL, &ref);
+       }
+       if (--shinf->sh_exlocks == 0) {
+               shinf->sh_locktd = NULL;
+               if (shinf->sh_lockreq) {
+                       shinf->sh_lockreq = 0;
+                       wakeup(shinf);
+               }
+       }
+}
+
+/*
+ * Obtain lock on both of uncp and lncp.
+ *
+ * On entry, uncp is assumed to be unlocked, and lncp is assumed to be
+ * locked.
+ *
+ * After this function returns, caller is responsible for checking
+ * the state of lncp which might have got unlocked temporarily.
+ */
+void
+cache_lock_two(struct namecache *uncp, struct namecache *lncp)
+{
+       if (cache_lock_nonblock(uncp) != 0) {
+               if (uncp > lncp)
+                       cache_lock(uncp);
+               else {
+                       cache_unlock(lncp);
+                       cache_lock(uncp);
+                       cache_lock(lncp);
                }
        }
 }
@@ -453,7 +745,8 @@ cache_get_nonblock(struct namecache *ncp
 cache_get_nonblock(struct namecache *ncp)
 {
        /* XXX MP */
-       if (ncp->nc_exlocks == 0 || ncp->nc_locktd == curthread) {
+       if (ncp->nc_shadowinfo->sh_exlocks == 0 ||
+           ncp->nc_shadowinfo->sh_locktd == curthread) {
                _cache_hold(ncp);
                cache_lock(ncp);
                return(0);
@@ -487,7 +780,7 @@ cache_setvp(struct namecache *ncp, struc
                if (!TAILQ_EMPTY(&ncp->nc_list))
                        vhold(vp);
                TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
-               if (ncp->nc_exlocks)
+               if (ncp->nc_shadowinfo->sh_exlocks)
                        vhold(vp);
 
                /*
@@ -521,6 +814,8 @@ cache_settimeout(struct namecache *ncp, 
                ncp->nc_timeout = 1;
 }
 
+static int unresolver_updater(struct namecache *ncp, void *param); 
+
 /*
  * Disassociate the vnode or negative-cache association and mark a
  * namecache entry as unresolved again.  Note that the ncp is still
@@ -541,7 +836,25 @@ void
 void
 cache_setunresolved(struct namecache *ncp)
 {
+       struct namecache *nncp;
+
+       cache_group_walk(ncp, &unresolver_updater, SG_SUBTREE, ncp);
+
+       nncp = ncp->nc_shadow_next;
+       if (nncp)
+               cache_hold(nncp);
+       unresolver_updater(ncp, NULL);
+       if (nncp)
+               cache_put(nncp);
+}
+
+static int
+unresolver_updater(struct namecache *ncp, void *param) 
+{
        struct vnode *vp;
+
+       if (ncp == param)
+               return(0);
 
        if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
                ncp->nc_flag |= NCF_UNRESOLVED;
@@ -563,13 +876,23 @@ cache_setunresolved(struct namecache *nc
                         */
                        if (!TAILQ_EMPTY(&ncp->nc_list))
                                vdrop(vp);
-                       if (ncp->nc_exlocks)
+                       if (ncp->nc_shadowinfo->sh_exlocks)
                                vdrop(vp);
                } else {
                        TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
                        --numneg;
                }
-       }
+
+               cache_shadow_detach(ncp);
+       }
+
+       if (ncp->nc_refs == 0) {
+               cache_hold(ncp);
+               cache_put(ncp);
+       }
+
+
+       return(0);
 }
 
 /*
@@ -619,7 +942,7 @@ cache_inval(struct namecache *ncp, int f
        struct namecache *nextkid;
        int rcnt = 0;
 
-       KKASSERT(ncp->nc_exlocks);
+       KKASSERT(ncp->nc_shadowinfo->sh_exlocks);
 
        cache_setunresolved(ncp);
        if (flags & CINV_DESTROY)
@@ -715,6 +1038,7 @@ restart:
  * XXX the disconnection could pose a problem, check code paths to make
  * sure any code that blocks can handle the parent being changed out from
  * under it.  Maybe we should lock the children (watch out for deadlocks) ?
+ * [UPDATE: attempt made to lock children, see in situ explanation]
  *
  * After we return the caller has the option of calling cache_setvp() if
  * the vnode of the new target ncp is known.
@@ -726,26 +1050,62 @@ cache_rename(struct namecache *fncp, str
 cache_rename(struct namecache *fncp, struct namecache *tncp)
 {
        struct namecache *scan;
-       int didwarn = 0;
-
+       int didwarn[] = { 0, 0 };
+
+       /* XXX should this rather be an assertion that fncp != tncp? */
+       if (fncp == tncp)
+               return;
+
+again:
        cache_setunresolved(fncp);
        cache_setunresolved(tncp);
+
+       /*
+        * It seems we need to unlock fncp before calling cache_inval():
+        * cache_inval() does a lot of lock/unlock/relock-ing (with tncp
+        * and its children), therefore keeping fncp locked might be
+        * deadlocky...
+        */
+       cache_unlock(fncp);
+       
        while (cache_inval(tncp, CINV_CHILDREN) != 0) {
-               if (didwarn++ % 10 == 0) {
-                       printf("Warning: cache_rename: race during "
+               if (didwarn[0]++ % 10 == 0) {
+                       printf("Warning: cache_rename: race #1 during "
                                "rename %s->%s\n",
                                fncp->nc_name, tncp->nc_name);
                }
                tsleep(tncp, 0, "mvrace", hz / 10);
                cache_setunresolved(tncp);
        }
+
+       cache_unlock(tncp);
+       cache_lock(fncp);
+
        while ((scan = TAILQ_FIRST(&fncp->nc_list)) != NULL) {
-               cache_hold(scan);
+               cache_unlock(fncp);
+               /*
+                * We have to lock fncp's kids in order to unresolve
+                * their shadow kids...
+                */
+               cache_get(scan);
                cache_unlink_parent(scan);
+               cache_group_walk(scan, &unresolver_updater, SG_SUBTREE, scan);
                cache_link_parent(scan, tncp);
                if (scan->nc_flag & NCF_HASHED)
                        cache_rehash(scan);
-               cache_drop(scan);
+               cache_put(scan);
+               cache_lock(fncp);
+       }
+
+       cache_lock_two(tncp, fncp);
+
+       if ((fncp->nc_flag & tncp->nc_flag & NCF_UNRESOLVED) == 0) {
+               if (didwarn[1]++ % 10 == 0) {
+                       printf("Warning: cache_rename: race #2 during "
+                               "rename %s->%s\n",
+                               fncp->nc_name, tncp->nc_name);
+               }
+               goto again;
        }
 }
 
@@ -1321,7 +1681,7 @@ cache_zap(struct namecache *ncp)
                        cache_drop(ncp);
                        return;
                }
-               KKASSERT(par->nc_exlocks == 0);
+               KKASSERT(par->nc_shadowinfo->sh_exlocks == 0);
                cache_lock(ncp);
        }
 done:
@@ -1417,7 +1777,7 @@ restart:
                if (ncp->nc_timeout && 
                    (int)(ncp->nc_timeout - ticks) < 0 &&
                    (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
-                   ncp->nc_exlocks == 0
+                   ncp->nc_shadowinfo->sh_exlocks == 0
                ) {
                        cache_zap(cache_get(ncp));
                        goto restart;
@@ -1738,6 +2098,7 @@ nchinit(void)
                gd->gd_nchstats = &nchstats[i];
        }
        TAILQ_INIT(&ncneglist);
+       STAILQ_INIT(&shadowinfo_freeq);
        nchashtbl = hashinit(desiredvnodes*2, M_VFSCACHE, &nchash);
        nclockwarn = 1 * hz;
 }
diff -r e1d135a8f666 sys/sys/namecache.h
--- a/sys/sys/namecache.h       Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/sys/namecache.h       Wed Mar 29 10:23:26 2006 +0200
@@ -70,7 +70,20 @@
 
 struct vnode;
 
+/*
+ * Auxiliary structure for locking namecache entries,
+ * either on their own or grouped into "shadow groups".
+ */
+struct shadowinfo {
+    STAILQ_ENTRY(shadowinfo) sh_entry;  /* entry for free list */
+    int                   sh_exlocks;          /* namespace locking */
+    struct thread *sh_locktd;          /* namespace locking */
+    int            sh_refs;            /* reference count */
+    uint8_t        sh_lockreq :1;      /* lock intent sign */ 
+};
+
 TAILQ_HEAD(namecache_list, namecache);
+LIST_HEAD(namecache_shadow_list, namecache);
 
 /*
  * The namecache structure is used to manage the filesystem namespace.  Most
@@ -110,8 +123,12 @@ struct namecache {
     char       *nc_name;               /* Separately allocated seg name */
     int                nc_error;
     int                nc_timeout;             /* compared against ticks, or 0 */
-    int                nc_exlocks;             /* namespace locking */
-    struct thread *nc_locktd;          /* namespace locking */
+    struct shadowinfo *nc_shadowinfo;         /* namespace locking */
+    struct shadowinfo nc_shadowinfo_internal; /* private locking information */
+    struct namecache *nc_shadow_prev;   /* previous entry in shadow group */
+    struct namecache *nc_shadow_next;   /* next entry in shadow group */
+    int         nc_shadowheight;        /* measure within shadow group */
+    struct namecache *nc_shadowed;     /* lower layer entry in layered fs */
     struct mount *nc_mount;            /* associated mount for vopops */
     int64_t    nc_fsmid;               /* filesystem modified id */
 };
@@ -127,7 +144,7 @@ typedef struct namecache *namecache_t;
 #define NCF_MOUNTPT    0x0008  /* mount point */
 #define NCF_ROOT       0x0010  /* namecache root (static) */
 #define NCF_HASHED     0x0020  /* namecache entry in hash table */
-#define NCF_LOCKREQ    0x0040
+#define NCF_UNUSED040  0x0040
 #define NCF_UNUSED080  0x0080
 #define NCF_ISSYMLINK  0x0100  /* represents a symlink */
 #define NCF_ISDIR      0x0200  /* represents a directory */
@@ -150,6 +167,9 @@ void        cache_lock(struct namecache *ncp);
 void   cache_lock(struct namecache *ncp);
 int    cache_lock_nonblock(struct namecache *ncp);
 void   cache_unlock(struct namecache *ncp);
+void   cache_lock_two(struct namecache *uncp, struct namecache *lncp);
+int    cache_shadow_attach(struct namecache *ncp, struct namecache *sncp);
+void   cache_shadow_detach(struct namecache *ncp);
 void   cache_setvp(struct namecache *ncp, struct vnode *vp);
 void   cache_settimeout(struct namecache *ncp, int nticks);
 void   cache_setunresolved(struct namecache *ncp);
diff -r e1d135a8f666 sys/vfs/nullfs/null.h
--- a/sys/vfs/nullfs/null.h     Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/vfs/nullfs/null.h     Wed Mar 29 10:29:08 2006 +0200
@@ -44,17 +44,26 @@ struct null_args {
 };
 
 struct null_mount {
-       struct mount    *nullm_vfs;
-       struct vnode    *nullm_rootvp;  /* Reference to root null_node */
+       struct namecache *nullm_ncp;
 };
 
 #ifdef _KERNEL
+
 #define        MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data))
-
 #ifdef NULLFS_DEBUG
-#define NULLFSDEBUG(format, args...) printf(format ,## args)
+#define NULLFSDEBUG(format, args...) \
+       printf(" [nullfs] %s:%d: " format, __func__, __LINE__, ## args)
+#define        NULLNCDEBUG(ncp)                                                \
+        NULLFSDEBUG(#ncp " %p: name %s, refs %d, exlocks %d, nc_flag 0x%x, "   \
+                   "nc_mount %p, nc_shadowed %p, nc_shadowinfo %p, "           \
+                   "nc_shadowheight %d, nc_vp %p\n",                           \
+                   (ncp), (ncp)->nc_name, (ncp)->nc_refs,                      \
+                   (ncp)->nc_shadowinfo->sh_exlocks, (ncp)->nc_flag,           \
+                   (ncp)->nc_mount, (ncp)->nc_shadowed,                        \
+                   (ncp)->nc_shadowinfo, (ncp)->nc_shadowheight, (ncp)->nc_vp)
 #else
 #define NULLFSDEBUG(format, args...)
+#define NULLNCDEBUG(ncp)
 #endif /* NULLFS_DEBUG */
 
 #endif /* _KERNEL */
diff -r e1d135a8f666 sys/vfs/nullfs/null_vfsops.c
--- a/sys/vfs/nullfs/null_vfsops.c      Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/vfs/nullfs/null_vfsops.c      Wed Mar 29 10:29:08 2006 +0200
@@ -53,6 +53,7 @@
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/nlookup.h>
+#include <sys/namecache.h>
 #include "null.h"
 
 extern struct vnodeopv_entry_desc null_vnodeop_entries[];
@@ -80,12 +81,10 @@ nullfs_mount(struct mount *mp, char *pat
 {
        int error = 0;
        struct null_args args;
-       struct vnode *rootvp;
        struct null_mount *xmp;
        u_int size;
-       struct nlookupdata nd;
-
-       NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp);
+
+       NULLFSDEBUG("mp %p\n", (void *)mp);
 
        /*
         * Update is a no-op
@@ -98,118 +97,86 @@ nullfs_mount(struct mount *mp, char *pat
         * Get argument
         */
        error = copyin(data, (caddr_t)&args, sizeof(struct null_args));
        if (error)
                return (error);
+
+       xmp = malloc(sizeof(*xmp), M_NULLFSMNT, M_WAITOK | M_ZERO);
+
+       /*
+        * args.target is still a user-space pointer (it is fetched with
+        * copyinstr() further down), so it must neither be printed as a
+        * kernel string nor looked up as UIO_SYSSPACE.
+        */
+       NULLFSDEBUG("nlookup %p\n", (void *)args.target);
+
+       xmp->nullm_ncp = nlookup_simple(args.target,
+                             UIO_USERSPACE, NLC_FOLLOW, &error);
+
+       if (! xmp->nullm_ncp) {
+               free(xmp, M_NULLFSMNT);
+               return (error);
-
-       /*
-        * Find lower node
-        */
-       rootvp = NULL;
-       error = nlookup_init(&nd, args.target, UIO_USERSPACE, NLC_FOLLOW);
-       if (error == 0)
-               error = nlookup(&nd);
-       if (error == 0) {
-               error = cache_vget(nd.nl_ncp, nd.nl_cred, LK_EXCLUSIVE, 
-                                       &rootvp);
-       }
-
-       xmp = (struct null_mount *) malloc(sizeof(struct null_mount),
-                               M_NULLFSMNT, M_WAITOK); /* XXX */
-
-       /*
-        * Save reference to underlying FS
-        */
-        /*
-         * As lite stacking enters the scene, the old way of doing this
-        * -- via the vnode -- is not good enough anymore...
-        */
-       xmp->nullm_vfs = nd.nl_ncp->nc_mount;
-       nlookup_done(&nd);
-
-       vfs_add_vnodeops(mp, &mp->mnt_vn_norm_ops, 
-                        null_vnodeop_entries, 0);
-
-       VOP_UNLOCK(rootvp, 0, td);
-
-       /*
-        * Keep a held reference to the root vnode.
-        * It is vrele'd in nullfs_unmount.
-        */
-       xmp->nullm_rootvp = rootvp;
-       /*
-        * XXX What's the proper safety condition for querying
-        * the underlying mount? Is this flag tuning necessary
-        * at all?
-        */
-       if (xmp->nullm_vfs->mnt_flag & MNT_LOCAL)
+       }
+
+       cache_unlock(xmp->nullm_ncp);
+
+       vfs_add_vnodeops(mp, &mp->mnt_vn_norm_ops, null_vnodeop_entries, 0);
+
+       if (xmp->nullm_ncp->nc_mount->mnt_flag & MNT_LOCAL)
                mp->mnt_flag |= MNT_LOCAL;
-       mp->mnt_data = (qaddr_t) xmp;
+       mp->mnt_data = (void *)xmp;
        vfs_getnewfsid(mp);
 
        (void) copyinstr(args.target, mp->mnt_stat.f_mntfromname, MNAMELEN - 1,
            &size);
        bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
-       (void)nullfs_statfs(mp, &mp->mnt_stat, td);
-       NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n",
-               mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntfromname);
-       return (0);
-}
-
-/*
- * Free reference to null layer
- */
+       NULLFSDEBUG("lower %s, alias at %s\n",
+                   mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
+       return (0);
+}
+
 static int
 nullfs_unmount(struct mount *mp, int mntflags, struct thread *td)
 {
-       void *mntdata;
-       int flags = 0;
-
-       NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp);
-
-       if (mntflags & MNT_FORCE)
-               flags |= FORCECLOSE;
-
-       /*
-        * Finally, throw away the null_mount structure
-        */
-       mntdata = mp->mnt_data;
-       mp->mnt_data = 0;
-       free(mntdata, M_NULLFSMNT);
-       return 0;
+       NULLNCDEBUG(mp->mnt_ncp);
+
+       cache_drop(MOUNTTONULLMOUNT(mp)->nullm_ncp);
+       free(mp->mnt_data, M_NULLFSMNT);
+
+       return (0);
+}
+
+static int
+nullfs_start(struct mount *mp, int flags, struct thread *td)
+{
+       mp->mnt_ncp->nc_shadowed = MOUNTTONULLMOUNT(mp)->nullm_ncp;
+
+       return (0);
 }
 
 static int
 nullfs_root(struct mount *mp, struct vnode **vpp)
 {
-       struct thread *td = curthread;  /* XXX */
-       struct vnode *vp;
-
-       NULLFSDEBUG("nullfs_root(mp = %p, vp = %p)\n", (void *)mp,
-           (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp);
-
-       /*
-        * Return locked reference to root.
-        */
-       vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp;
-       vref(vp);
-
-#ifdef NULLFS_DEBUG
-       if (VOP_ISLOCKED(vp, NULL)) {
-               Debugger("root vnode is locked.\n");
-               vrele(vp);
-               return (EDEADLK);
-       }
-#endif
-       vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-       *vpp = vp;
-       return 0;
+       int error;
+
+       error = cache_vget(MOUNTTONULLMOUNT(mp)->nullm_ncp,
+                          crhold(proc0.p_ucred), LK_EXCLUSIVE | LK_RETRY, vpp);
+       crfree(proc0.p_ucred);
+
+       return (error);
+}
+
+static __inline
+struct mount *
+nullfs_lowermount_0(struct mount *mp)
+{
+       return (MOUNTTONULLMOUNT(mp)->nullm_ncp->nc_mount);
 }
 
 static int
 nullfs_quotactl(struct mount *mp, int cmd, uid_t uid, caddr_t arg,
                struct thread *td)
 {
-       return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg, td);
+       return VFS_QUOTACTL(nullfs_lowermount_0(mp), cmd, uid, arg, td);
 }
 
 static int
@@ -218,12 +178,12 @@ nullfs_statfs(struct mount *mp, struct s
        int error;
        struct statfs mstat;
 
-       NULLFSDEBUG("nullfs_statfs(mp = %p, vp = %p)\n", (void *)mp,
-           (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp);
+       NULLFSDEBUG("mp %p, ncp %p, lower mp %p\n",
+                   mp, mp->mnt_ncp, nullfs_lowermount_0(mp));
 
        bzero(&mstat, sizeof(mstat));
 
-       error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, td);
+       error = VFS_STATFS(nullfs_lowermount_0(mp), &mstat, td);
        if (error)
                return (error);
 
@@ -248,23 +208,21 @@ nullfs_checkexp(struct mount *mp, struct
 nullfs_checkexp(struct mount *mp, struct sockaddr *nam, int *extflagsp,
                struct ucred **credanonp)
 {
-
-       return VFS_CHECKEXP(MOUNTTONULLMOUNT(mp)->nullm_vfs, nam, 
-               extflagsp, credanonp);
+       return VFS_CHECKEXP(nullfs_lowermount_0(mp), nam, extflagsp, credanonp);
 }
 
 static int                        
 nullfs_extattrctl(struct mount *mp, int cmd, const char *attrname, caddr_t arg,
                  struct thread *td)
 {
-       return VFS_EXTATTRCTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, attrname,
-           arg, td);
+       return VFS_EXTATTRCTL(nullfs_lowermount_0(mp), cmd, attrname, arg, td);
 }
 
 
 static struct vfsops null_vfsops = {
        .vfs_mount =            nullfs_mount,
        .vfs_unmount =          nullfs_unmount,
+       .vfs_start =            nullfs_start,
        .vfs_root =             nullfs_root,
        .vfs_quotactl =         nullfs_quotactl,
        .vfs_statfs =           nullfs_statfs,
diff -r e1d135a8f666 sys/vfs/nullfs/null_vnops.c
--- a/sys/vfs/nullfs/null_vnops.c       Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/vfs/nullfs/null_vnops.c       Wed Mar 29 10:29:08 2006 +0200
@@ -98,7 +98,7 @@
  * might be able to get on with a hybrid solution: overlay some vnodes, and rely
  * on namecache API for the rest.
  */
- 
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -109,6 +109,8 @@
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/buf.h>
+#include <sys/namecache.h>
+#include <sys/nlookup.h>
 #include "null.h"
 
 static int     null_nresolve(struct vop_nresolve_args *ap);
@@ -122,90 +124,190 @@ static int       null_nrmdir(struct vop_nrmdir
 static int     null_nrmdir(struct vop_nrmdir_args *ap);
 static int     null_nrename(struct vop_nrename_args *ap);
 
+static __inline
+struct mount *
+nullfs_lowermount_l(struct namecache *ncp)
+{
+       /*
+        * The code in use below allows passing through lower mounts.
+        * If we didn't want to do that, we could use
+        *
+        *   MOUNTTONULLMOUNT(ncp->nc_mount)->nullm_ncp->nc_mount
+        *
+        * Eventually, the choice might be configurable.
+        */
+       return (ncp->nc_shadowed->nc_mount);
+}
+
+
+static __inline
+int
+nullfs_check(struct namecache *ncp)
+{
+       if (ncp->nc_mount->mnt_ncp == ncp)
+               return (EPERM);
+
+       if (!ncp->nc_shadowed)
+               return (ENOENT);
+
+       if (ncp->nc_shadowheight == 0)
+               return (EINVAL);
+
+       return (0);
+}
+
 static int
 null_nresolve(struct vop_nresolve_args *ap)
 {
-       ap->a_head.a_ops = 
MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-       return vop_nresolve_ap(ap);
-}
-
-static int
-null_ncreate(struct vop_ncreate_args *ap)
-{
-       ap->a_head.a_ops = 
MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-       return vop_ncreate_ap(ap);
-}
-
-static int
-null_nmkdir(struct vop_nmkdir_args *ap)
-{
-       ap->a_head.a_ops = 
MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-       return vop_nmkdir_ap(ap);
-}
-
-static int
-null_nmknod(struct vop_nmknod_args *ap)
-{
-       ap->a_head.a_ops = 
MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-       return vop_nmknod_ap(ap);
-}
-
-static int
-null_nlink(struct vop_nlink_args *ap)
-{
-       ap->a_head.a_ops = 
MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-       return vop_nlink_ap(ap);
-}
-
-static int
-null_nsymlink(struct vop_nsymlink_args *ap)
-{
-       ap->a_head.a_ops = 
MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-       return vop_nsymlink_ap(ap);
-}
-
-static int
-null_nwhiteout(struct vop_nwhiteout_args *ap)
-{
-       ap->a_head.a_ops = 
MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-       return vop_nwhiteout_ap(ap);
-}
-
-static int
-null_nremove(struct vop_nremove_args *ap)
-{
-       ap->a_head.a_ops = 
MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-       return vop_nremove_ap(ap);
-}
-
-static int
-null_nrmdir(struct vop_nrmdir_args *ap)
-{
-       ap->a_head.a_ops = 
MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-       return vop_nrmdir_ap(ap);
-}
+       struct namecache *ncp = ap->a_ncp;
+       struct nlcomponent nlc;
+       struct namecache *sncp, *psncp;
+       int error = 0;
+
+       sncp = ncp->nc_shadow_next;
+       if (sncp) {
+               cache_hold(sncp);
+               cache_setunresolved(sncp);
+               cache_put(sncp);
+       }
+
+       cache_unlock(ncp);
+       cache_lock(ncp->nc_parent);
+       psncp = ncp->nc_parent->nc_shadowed;
+       if (psncp)
+               cache_hold(psncp);
+       cache_unlock(ncp->nc_parent);
+
+       if (! psncp) {
+               cache_lock(ncp);
+               if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
+                       cache_setvp(ncp, NULL);
+               return (ncp->nc_error);
+       }
+
+       nlc.nlc_nameptr = ncp->nc_name;
+       nlc.nlc_namelen = ncp->nc_nlen;
+       sncp = cache_nlookup(psncp, &nlc);
+       cache_drop(psncp);
+
+       if ((sncp->nc_flag & NCF_UNRESOLVED) == 0)
+               goto postdowncall;
+
+       ap->a_head.a_ops = sncp->nc_mount->mnt_vn_use_ops;
+       ap->a_ncp = sncp;
+       /*
+        * According to cache_resolve(), the primary place for
+        * VOP_NRESOLVE calls, the caller of the nresolve method
+        * is the one who should take care of ncp->nc_error.
+        */
+       ap->a_ncp->nc_error = vop_nresolve_ap(ap);
+
+postdowncall:
+
+       error = cache_shadow_attach(ncp, sncp);
+
+       NULLNCDEBUG(ncp);
+       NULLNCDEBUG(sncp);
+       NULLFSDEBUG("attach error %d\n", error);
+
+       if (error) {
+               cache_put(sncp);
+               if (ncp->nc_flag & NCF_UNRESOLVED) {
+                       cache_setvp(ncp, NULL);
+                       error = ENOENT;
+               } else if (error == EEXIST)
+                       error = ncp->nc_error;
+       } else {
+               error = sncp->nc_error;
+               cache_setvp(ncp, sncp->nc_vp);
+               ncp->nc_shadowed = sncp;
+               cache_drop(sncp);
+       }
+
+       NULLFSDEBUG("error %d\n", error);
+       return (error);
+}
+
+#define NULL_NVOP_TEMPLATE(OP)                                                 
\
+static int                                                                     
\
+null_ ## OP(struct vop_ ## OP ## _args *ap)                                    
\
+{                                                                              
\
+       struct namecache *ncp = ap->a_ncp;                                      
\
+       struct namecache *sncp = ncp->nc_shadowed;                              
\
+       int error;                                                              
\
+                                                                               
\
+       NULLNCDEBUG(ap->a_ncp);                                                 
\
+                                                                               
\
+       if ((error = nullfs_check(ncp)))                                        
\
+               return (error);                                                 
\
+       cache_hold(sncp);                                                       
\
+                                                                               
\
+       NULLNCDEBUG(ap->a_ncp->nc_shadowed);                                    
\
+                                                                               
\
+       ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;      
\
+       ap->a_ncp = ncp->nc_shadowed;                                           
\
+                                                                               
\
+       error = vop_ ## OP ## _ap(ap);                                          
\
+       NULLNCDEBUG(ncp);                                                       
\
+       NULLNCDEBUG(sncp);                                                      
\
+       sncp->nc_shadowinfo == ncp->nc_shadowinfo ?                             
\
+           cache_drop(sncp) :                                                  
\
+           cache_put(sncp);                                                    
\
+                                                                               
\
+       return (error);                                                         
\
+}
+
+NULL_NVOP_TEMPLATE(ncreate)
+NULL_NVOP_TEMPLATE(nmkdir)
+NULL_NVOP_TEMPLATE(nmknod)
+NULL_NVOP_TEMPLATE(nlink)
+NULL_NVOP_TEMPLATE(nsymlink)
+NULL_NVOP_TEMPLATE(nwhiteout)
+NULL_NVOP_TEMPLATE(nremove)
+NULL_NVOP_TEMPLATE(nrmdir)
 
 static int
 null_nrename(struct vop_nrename_args *ap)
 {
+       struct namecache *fncp = ap->a_fncp;
+       struct namecache *tncp = ap->a_tncp;
+       struct namecache *sfncp = fncp->nc_shadowed;
+       struct namecache *stncp = tncp->nc_shadowed;
        struct mount *lmp;
-
-       lmp = MOUNTTONULLMOUNT(ap->a_fncp->nc_mount)->nullm_vfs;
-       if (lmp != MOUNTTONULLMOUNT(ap->a_tncp->nc_mount)->nullm_vfs)
-               return (EINVAL);
-
-       ap->a_head.a_ops = lmp->mnt_vn_norm_ops;
-
-       return vop_nrename_ap(ap);
+       int error;
+
+       NULLNCDEBUG(ap->a_fncp);
+       NULLNCDEBUG(ap->a_tncp);
+
+       if ((error = nullfs_check(fncp)))
+               return (error);
+       if ((error = nullfs_check(tncp)))
+               return (error);
+
+       lmp = nullfs_lowermount_l(fncp);
+       if (lmp != nullfs_lowermount_l(tncp))
+               return (EXDEV);
+
+       cache_hold(sfncp);
+       cache_hold(stncp);
+
+       NULLNCDEBUG(ap->a_fncp->nc_shadowed);
+       NULLNCDEBUG(ap->a_tncp->nc_shadowed);
+
+       ap->a_head.a_ops = lmp->mnt_vn_use_ops;
+       ap->a_fncp = fncp->nc_shadowed;
+       ap->a_tncp = tncp->nc_shadowed;
+
+       error = vop_nrename_ap(ap);
+
+       sfncp->nc_shadowinfo == fncp->nc_shadowinfo ?
+           cache_drop(sfncp) :
+           cache_put(sfncp);
+       stncp->nc_shadowinfo == tncp->nc_shadowinfo ?
+           cache_drop(stncp) :
+           cache_put(stncp);
+
+       return (error);
 }
 
 /*
@@ -224,4 +326,3 @@ struct vnodeopv_entry_desc null_vnodeop_
        { &vop_nrename_desc,            (vnodeopv_entry_t) null_nrename },
        { NULL, NULL }
 };
-

Reply via email to