Author: mjg
Date: Sat Sep 10 16:29:53 2016
New Revision: 305684
URL: https://svnweb.freebsd.org/changeset/base/305684

Log:
  cache: improve scalability by introducing bucket locks
  
  An array of bucket locks is added.
  
  All modifications still require the global cache_lock to be held for
  writing. However, most readers only need the relevant bucket lock and in
  effect can run concurrently to the writer as long as they use a
  different lock. See the added comment for more details.
  
  This is an intermediate step towards removal of the global lock.
  
  Reviewed by:  kib
  Tested by:    pho

Modified:
  head/sys/kern/subr_witness.c
  head/sys/kern/vfs_cache.c

Modified: head/sys/kern/subr_witness.c
==============================================================================
--- head/sys/kern/subr_witness.c        Sat Sep 10 16:11:42 2016        
(r305683)
+++ head/sys/kern/subr_witness.c        Sat Sep 10 16:29:53 2016        
(r305684)
@@ -623,6 +623,14 @@ static struct witness_order_list_entry o
        { "vnode interlock", &lock_class_mtx_sleep },
        { NULL, NULL },
        /*
+        * VFS namecache
+        */
+       { "ncglobal", &lock_class_rw },
+       { "ncbuc", &lock_class_rw },
+       { "vnode interlock", &lock_class_mtx_sleep },
+       { "ncneg", &lock_class_mtx_sleep },
+       { NULL, NULL },
+       /*
         * ZFS locking
         */
        { "dn->dn_mtx", &lock_class_sx },

Modified: head/sys/kern/vfs_cache.c
==============================================================================
--- head/sys/kern/vfs_cache.c   Sat Sep 10 16:11:42 2016        (r305683)
+++ head/sys/kern/vfs_cache.c   Sat Sep 10 16:29:53 2016        (r305684)
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
+#include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
@@ -148,6 +149,23 @@ struct     namecache_ts {
  * Upon reaching the last segment of a path, if the reference
  * is for DELETE, or NOCACHE is set (rewrite), and the
  * name is located in the cache, it will be dropped.
+ *
+ * These locks are used (in the order in which they can be taken):
+ * NAME         TYPE    ROLE
+ * cache_lock   rwlock  global, needed for all modifications
+ * bucketlock   rwlock  for access to given hash bucket
+ * ncneg_mtx    mtx     negative entry LRU management
+ *
+ * A name -> vnode lookup can be safely performed by either locking cache_lock
+ * or the relevant hash bucket.
+ *
+ * ".." and vnode -> name lookups require cache_lock.
+ *
+ * Modifications require both cache_lock and relevant bucketlock taken for
+ * writing.
+ *
+ * Negative entry LRU management requires ncneg_mtx taken on top of either
+ * cache_lock or bucketlock.
  */
 
 /*
@@ -179,8 +197,9 @@ SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor
 struct nchstats        nchstats;               /* cache effectiveness 
statistics */
 
 static struct rwlock cache_lock;
-RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
+RW_SYSINIT(vfscache, &cache_lock, "ncglobal");
 
+#define        CACHE_TRY_WLOCK()       rw_try_wlock(&cache_lock)
 #define        CACHE_UPGRADE_LOCK()    rw_try_upgrade(&cache_lock)
 #define        CACHE_RLOCK()           rw_rlock(&cache_lock)
 #define        CACHE_RUNLOCK()         rw_runlock(&cache_lock)
@@ -188,7 +207,12 @@ RW_SYSINIT(vfscache, &cache_lock, "Name 
 #define        CACHE_WUNLOCK()         rw_wunlock(&cache_lock)
 
 static struct mtx_padalign ncneg_mtx;
-MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "Name Cache neg", MTX_DEF);
+MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "ncneg", MTX_DEF);
+
+static u_int   numbucketlocks;
+static struct rwlock_padalign  *bucketlocks;
+#define        HASH2BUCKETLOCK(hash) \
+       ((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)]))
 
 /*
  * UMA zones for the VFS cache.
@@ -307,6 +331,8 @@ STATNODE_COUNTER(numfullpathfail4, "Numb
 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
 static long numupgrades; STATNODE_ULONG(numupgrades,
     "Number of updates of the cache after lookup (write lock + retry)");
+static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
+    "Number of times bucketlocked zap_and_exit case failed to writelock");
 
 static void cache_zap(struct namecache *ncp);
 static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
@@ -326,6 +352,39 @@ cache_get_hash(char *name, u_char len, s
        return (hash);
 }
 
+#ifdef INVARIANTS
+static void
+cache_assert_bucket_locked(struct namecache *ncp, int mode)
+{
+       struct rwlock *bucketlock;
+       uint32_t hash;
+
+       hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+       bucketlock = HASH2BUCKETLOCK(hash);
+       rw_assert(bucketlock, mode);
+}
+#else
+#define cache_assert_bucket_locked(x, y) do { } while (0)
+#endif
+
+static void
+cache_lock_all_buckets(void)
+{
+       u_int i;
+
+       for (i = 0; i < numbucketlocks; i++)
+               rw_wlock(&bucketlocks[i]);
+}
+
+static void
+cache_unlock_all_buckets(void)
+{
+       u_int i;
+
+       for (i = 0; i < numbucketlocks; i++)
+               rw_wunlock(&bucketlocks[i]);
+}
+
 static int
 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 {
@@ -442,21 +501,13 @@ SYSCTL_PROC(_debug_hashstat, OID_AUTO, n
  * Negative entries management
  */
 static void
-cache_negative_hit(struct namecache *ncp, int wlocked)
+cache_negative_hit(struct namecache *ncp)
 {
 
-       if (!wlocked) {
-               rw_assert(&cache_lock, RA_RLOCKED);
-               mtx_lock(&ncneg_mtx);
-       } else {
-               rw_assert(&cache_lock, RA_WLOCKED);
-       }
-
+       mtx_lock(&ncneg_mtx);
        TAILQ_REMOVE(&ncneg, ncp, nc_dst);
        TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
-
-       if (!wlocked)
-               mtx_unlock(&ncneg_mtx);
+       mtx_unlock(&ncneg_mtx);
 }
 
 static void
@@ -464,9 +515,12 @@ cache_negative_insert(struct namecache *
 {
 
        rw_assert(&cache_lock, RA_WLOCKED);
+       cache_assert_bucket_locked(ncp, RA_WLOCKED);
        MPASS(ncp->nc_vp == NULL);
+       mtx_lock(&ncneg_mtx);
        TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
        numneg++;
+       mtx_unlock(&ncneg_mtx);
 }
 
 static void
@@ -474,9 +528,12 @@ cache_negative_remove(struct namecache *
 {
 
        rw_assert(&cache_lock, RA_WLOCKED);
+       cache_assert_bucket_locked(ncp, RA_WLOCKED);
        MPASS(ncp->nc_vp == NULL);
+       mtx_lock(&ncneg_mtx);
        TAILQ_REMOVE(&ncneg, ncp, nc_dst);
        numneg--;
+       mtx_unlock(&ncneg_mtx);
 }
 
 static struct namecache *
@@ -499,10 +556,11 @@ cache_negative_zap_one(void)
  *   pointer to a vnode or if it is just a negative cache entry.
  */
 static void
-cache_zap(struct namecache *ncp)
+cache_zap_locked(struct namecache *ncp)
 {
 
        rw_assert(&cache_lock, RA_WLOCKED);
+       cache_assert_bucket_locked(ncp, RA_WLOCKED);
        CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
        if (ncp->nc_vp != NULL) {
                SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
@@ -532,6 +590,21 @@ cache_zap(struct namecache *ncp)
        numcache--;
 }
 
+static void
+cache_zap(struct namecache *ncp)
+{
+       struct rwlock *bucketlock;
+       uint32_t hash;
+
+       rw_assert(&cache_lock, RA_WLOCKED);
+
+       hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+       bucketlock = HASH2BUCKETLOCK(hash);
+       rw_wlock(bucketlock);
+       cache_zap_locked(ncp);
+       rw_wunlock(bucketlock);
+}
+
 /*
  * Lookup an entry in the cache
  *
@@ -549,22 +622,42 @@ cache_zap(struct namecache *ncp)
  * not recursively acquired.
  */
 
+enum { UNLOCKED, WLOCKED, RLOCKED };
+
+static void
+cache_unlock(int cache_locked)
+{
+
+       switch (cache_locked) {
+       case UNLOCKED:
+               break;
+       case WLOCKED:
+               CACHE_WUNLOCK();
+               break;
+       case RLOCKED:
+               CACHE_RUNLOCK();
+               break;
+       }
+}
+
 int
 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct timespec *tsp, int *ticksp)
 {
+       struct rwlock *bucketlock;
        struct namecache *ncp;
        uint32_t hash;
-       int error, ltype, wlocked;
+       int error, ltype, cache_locked;
 
        if (!doingcache) {
                cnp->cn_flags &= ~MAKEENTRY;
                return (0);
        }
 retry:
-       wlocked = 0;
-       counter_u64_add(numcalls, 1);
+       bucketlock = NULL;
+       cache_locked = UNLOCKED;
        error = 0;
+       counter_u64_add(numcalls, 1);
 
 retry_wlocked:
        if (cnp->cn_nameptr[0] == '.') {
@@ -598,17 +691,21 @@ retry_wlocked:
                        }
                        return (-1);
                }
-               if (!wlocked)
-                       CACHE_RLOCK();
                if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
                        counter_u64_add(dotdothits, 1);
+                       if (cache_locked == UNLOCKED) {
+                               CACHE_RLOCK();
+                               cache_locked = RLOCKED;
+                       }
+
                        if (dvp->v_cache_dd == NULL) {
                                SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
                                    "..", NULL);
                                goto unlock;
                        }
                        if ((cnp->cn_flags & MAKEENTRY) == 0) {
-                               if (!wlocked && !CACHE_UPGRADE_LOCK())
+                               if (cache_locked != WLOCKED &&
+                                   !CACHE_UPGRADE_LOCK())
                                        goto wlock;
                                ncp = NULL;
                                if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) {
@@ -639,10 +736,14 @@ retry_wlocked:
                                    nc_dotdottime;
                        goto success;
                }
-       } else if (!wlocked)
-               CACHE_RLOCK();
+       }
 
        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+       if (cache_locked == UNLOCKED) {
+               bucketlock = HASH2BUCKETLOCK(hash);
+               rw_rlock(bucketlock);
+       }
+
        LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                counter_u64_add(numchecks, 1);
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
@@ -665,12 +766,7 @@ retry_wlocked:
        /* We don't want to have an entry, so dump it */
        if ((cnp->cn_flags & MAKEENTRY) == 0) {
                counter_u64_add(numposzaps, 1);
-               if (!wlocked && !CACHE_UPGRADE_LOCK())
-                       goto wlock;
-               cache_zap(ncp);
-               CACHE_WUNLOCK();
-               cache_free(ncp);
-               return (0);
+               goto zap_and_exit;
        }
 
        /* We found a "positive" match, return the vnode */
@@ -689,25 +785,20 @@ negative_success:
        /* We found a negative match, and want to create it, so purge */
        if (cnp->cn_nameiop == CREATE) {
                counter_u64_add(numnegzaps, 1);
-               if (!wlocked && !CACHE_UPGRADE_LOCK())
-                       goto wlock;
-               cache_zap(ncp);
-               CACHE_WUNLOCK();
-               cache_free(ncp);
-               return (0);
+               goto zap_and_exit;
        }
 
        counter_u64_add(numneghits, 1);
-       cache_negative_hit(ncp, wlocked);
+       cache_negative_hit(ncp);
        if (ncp->nc_flag & NCF_WHITE)
                cnp->cn_flags |= ISWHITEOUT;
        SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
            nc_get_name(ncp));
        cache_out_ts(ncp, tsp, ticksp);
-       if (wlocked)
-               CACHE_WUNLOCK();
-       else
-               CACHE_RUNLOCK();
+       MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+       if (bucketlock != NULL)
+               rw_runlock(bucketlock);
+       cache_unlock(cache_locked);
        return (ENOENT);
 
 wlock:
@@ -716,9 +807,10 @@ wlock:
         * a write lock and retry the operation.
         */
        CACHE_RUNLOCK();
+wlock_unlocked:
        CACHE_WLOCK();
        numupgrades++;
-       wlocked = 1;
+       cache_locked = WLOCKED;
        goto retry_wlocked;
 
 success:
@@ -733,10 +825,10 @@ success:
                VOP_UNLOCK(dvp, 0);
        }
        vhold(*vpp);
-       if (wlocked)
-               CACHE_WUNLOCK();
-       else
-               CACHE_RUNLOCK();
+       MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+       if (bucketlock != NULL)
+               rw_runlock(bucketlock);
+       cache_unlock(cache_locked);
        error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
        if (cnp->cn_flags & ISDOTDOT) {
                vn_lock(dvp, ltype | LK_RETRY);
@@ -758,10 +850,29 @@ success:
        return (-1);
 
 unlock:
-       if (wlocked)
-               CACHE_WUNLOCK();
-       else
-               CACHE_RUNLOCK();
+       MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+       if (bucketlock != NULL)
+               rw_runlock(bucketlock);
+       cache_unlock(cache_locked);
+       return (0);
+
+zap_and_exit:
+       if (bucketlock != NULL) {
+               rw_assert(&cache_lock, RA_UNLOCKED);
+               if (!CACHE_TRY_WLOCK()) {
+                       rw_runlock(bucketlock);
+                       bucketlock = NULL;
+                       zap_and_exit_bucket_fail++;
+                       goto wlock_unlocked;
+               }
+               cache_locked = WLOCKED;
+               rw_runlock(bucketlock);
+               bucketlock = NULL;
+       } else if (cache_locked != WLOCKED && !CACHE_UPGRADE_LOCK())
+               goto wlock;
+       cache_zap(ncp);
+       CACHE_WUNLOCK();
+       cache_free(ncp);
        return (0);
 }
 
@@ -772,6 +883,7 @@ void
 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname 
*cnp,
     struct timespec *tsp, struct timespec *dtsp)
 {
+       struct rwlock *bucketlock;
        struct namecache *ncp, *n2, *ndd, *nneg;
        struct namecache_ts *n3;
        struct nchashhead *ncpp;
@@ -924,11 +1036,6 @@ cache_enter_time(struct vnode *dvp, stru
                }
        }
 
-       /*
-        * Insert the new namecache entry into the appropriate chain
-        * within the cache entries table.
-        */
-       LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
        if (flag != NCF_ISDOTDOT) {
                if (LIST_EMPTY(&dvp->v_cache_src)) {
                        vhold(dvp);
@@ -937,6 +1044,15 @@ cache_enter_time(struct vnode *dvp, stru
                LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
        }
 
+       bucketlock = HASH2BUCKETLOCK(hash);
+       rw_wlock(bucketlock);
+
+       /*
+        * Insert the new namecache entry into the appropriate chain
+        * within the cache entries table.
+        */
+       LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+
        /*
         * If the entry is "negative", we place it into the
         * "negative" cache queue, otherwise, we place it into the
@@ -953,6 +1069,7 @@ cache_enter_time(struct vnode *dvp, stru
                SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
                    nc_get_name(ncp));
        }
+       rw_wunlock(bucketlock);
        if (numneg * ncnegfactor > numcache)
                nneg = cache_negative_zap_one();
        CACHE_WUNLOCK();
@@ -960,12 +1077,24 @@ cache_enter_time(struct vnode *dvp, stru
        cache_free(nneg);
 }
 
+static u_int
+cache_roundup_2(u_int val)
+{
+       u_int res;
+
+       for (res = 1; res <= val; res <<= 1)
+               continue;
+
+       return (res);
+}
+
 /*
  * Name cache initialization, from vfs_init() when we are booting
  */
 static void
 nchinit(void *dummy __unused)
 {
+       u_int i;
 
        TAILQ_INIT(&ncneg);
 
@@ -983,6 +1112,13 @@ nchinit(void *dummy __unused)
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 
        nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
+       numbucketlocks = cache_roundup_2(mp_ncpus * 16);
+       if (numbucketlocks > nchash)
+               numbucketlocks = nchash;
+       bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
+           M_WAITOK | M_ZERO);
+       for (i = 0; i < numbucketlocks; i++)
+               rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK);
 
        numcalls = counter_u64_alloc(M_WAITOK);
        dothits = counter_u64_alloc(M_WAITOK);
@@ -1023,6 +1159,7 @@ cache_changesize(int newmaxvnodes)
         * because to do so, they have to be removed from the hash table.
         */
        CACHE_WLOCK();
+       cache_lock_all_buckets();
        old_nchashtbl = nchashtbl;
        old_nchash = nchash;
        nchashtbl = new_nchashtbl;
@@ -1035,6 +1172,7 @@ cache_changesize(int newmaxvnodes)
                        LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
                }
        }
+       cache_unlock_all_buckets();
        CACHE_WUNLOCK();
        free(old_nchashtbl, M_VFSCACHE);
 }
@@ -1108,20 +1246,30 @@ void
 cache_purgevfs(struct mount *mp)
 {
        TAILQ_HEAD(, namecache) ncps;
-       struct nchashhead *ncpp;
+       struct rwlock *bucketlock;
+       struct nchashhead *bucket;
        struct namecache *ncp, *nnp;
+       u_long i, j, n_nchash;
 
        /* Scan hash tables for applicable entries */
        SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
        TAILQ_INIT(&ncps);
        CACHE_WLOCK();
-       for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
-               LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
-                       if (ncp->nc_dvp->v_mount != mp)
-                               continue;
-                       cache_zap(ncp);
-                       TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
+       n_nchash = nchash + 1;
+       for (i = 0; i < numbucketlocks; i++) {
+               bucketlock = (struct rwlock *)&bucketlocks[i];
+               rw_wlock(bucketlock);
+               for (j = i; j < n_nchash; j += numbucketlocks) {
+                       bucket = &nchashtbl[j];
+                       LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
+                               cache_assert_bucket_locked(ncp, RA_WLOCKED);
+                               if (ncp->nc_dvp->v_mount != mp)
+                                       continue;
+                               cache_zap_locked(ncp);
+                               TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
+                       }
                }
+               rw_wunlock(bucketlock);
        }
        CACHE_WUNLOCK();
        TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
_______________________________________________
[email protected] mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "[email protected]"

Reply via email to