Module Name:	src
Committed By:	dennis
Date:		Wed Dec 24 20:01:22 UTC 2014
Modified Files:
	src/sys/kern: vfs_cache.c
	src/usr.bin/systat: vmstat.c
	src/usr.bin/vmstat: vmstat.c

Log Message:
Update stats-keeping in sys/kern/vfs_cache.c to remove (most) races while
allowing consistent lockless sampling of the per-cpu statistics without
atomic operations.  Update comment describing the locking protocol to
include this.

These files were fumble-fingered out of the last commit.


To generate a diff of this commit:
cvs rdiff -u -r1.102 -r1.103 src/sys/kern/vfs_cache.c
cvs rdiff -u -r1.80 -r1.81 src/usr.bin/systat/vmstat.c
cvs rdiff -u -r1.205 -r1.206 src/usr.bin/vmstat/vmstat.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
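For readers new to the scheme the log message describes, the following is a
minimal userland sketch (illustrative only, not code from the tree; the names
pcpu_stat, count_one and harvest_one are invented for this example): each CPU
owns a 32 bit counter that is bumped without atomics on its own CPU, and a
periodic pass folds the increment since the previous sample into a 64 bit
system-wide total, so the fast path never needs locked or atomic operations.

#include <stdint.h>
#include <stdio.h>

struct pcpu_stat {
	uint32_t cur;	/* bumped only by the owning CPU, no atomics */
	uint32_t last;	/* value recorded at the previous harvest */
};

static uint64_t total;	/* 64 bit system-wide count */

/* Per-CPU fast path: a plain, non-atomic increment. */
static void
count_one(struct pcpu_stat *ps)
{
	ps->cur++;
}

/*
 * Periodic harvest: run often enough that "cur" cannot wrap more than
 * once between samples; unsigned subtraction then makes a single
 * 32 bit wrap harmless.
 */
static void
harvest_one(struct pcpu_stat *ps)
{
	uint32_t snap = *(volatile uint32_t *)&ps->cur;

	total += (uint32_t)(snap - ps->last);
	ps->last = snap;
}

int
main(void)
{
	struct pcpu_stat st = { 0, 0 };

	for (int i = 0; i < 1000; i++)
		count_one(&st);
	harvest_one(&st);
	printf("total = %llu\n", (unsigned long long)total);
	return 0;
}

The kernel change below follows the same pattern, with COUNT()/COUNT_UNL()
playing the role of the increment and the UPDATE() macro doing the delta
harvest from cache_reclaim().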
Modified files:

Index: src/sys/kern/vfs_cache.c
diff -u src/sys/kern/vfs_cache.c:1.102 src/sys/kern/vfs_cache.c:1.103
--- src/sys/kern/vfs_cache.c:1.102	Sun Dec  7 22:23:38 2014
+++ src/sys/kern/vfs_cache.c	Wed Dec 24 20:01:21 2014
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_cache.c,v 1.102 2014/12/07 22:23:38 dennis Exp $	*/
+/*	$NetBSD: vfs_cache.c,v 1.103 2014/12/24 20:01:21 dennis Exp $	*/
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -58,7 +58,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.102 2014/12/07 22:23:38 dennis Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.103 2014/12/24 20:01:21 dennis Exp $");
 
 #include "opt_ddb.h"
 #include "opt_revcache.h"
@@ -126,19 +126,52 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,
  * locks essentially idles the subsystem, ensuring there are no
  * concurrent references to the cache entries being freed.
  *
- * As a side effect while running cache_reclaim(), once the per-cpu
- * locks are held the per-cpu stats are sampled, added to the
- * subsystem total and zeroed.  Only some of the per-cpu stats are
- * incremented with the per-cpu lock held, however, and attempting
- * to add locking around the remaining counters currently
- * incremented without a lock can cause deadlock, so we don't
- * do that.  XXX Fix this up in a later revision.
+ * 32 bit per-cpu statistic counters (struct nchstats_percpu) are
+ * incremented when the operations they count are performed while
+ * running on the corresponding CPU.  Frequently individual counters
+ * are incremented while holding a lock (either a per-cpu lock or
+ * namecache_lock) sufficient to preclude concurrent increments
+ * being done to the same counter, so non-atomic increments are
+ * done using the COUNT() macro.  Counters which are incremented
+ * when one of these locks is not held use the COUNT_UNL() macro
+ * instead.  COUNT_UNL() could be defined to do atomic increments
+ * but currently just does what COUNT() does, on the theory that
+ * it is unlikely the non-atomic increment will be interrupted
+ * by something on the same CPU that increments the same counter,
+ * but even if it does happen the consequences aren't serious.
+ *
+ * N.B.: Attempting to protect COUNT_UNL() increments by taking
+ * a per-cpu lock in the namecache_count_*() functions causes
+ * a deadlock.  Don't do that, use atomic increments instead if
+ * the imperfections here bug you.
+ *
+ * The 64 bit system-wide statistic counts (struct nchstats) are
+ * maintained by sampling the per-cpu counters periodically, adding
+ * in the deltas since the last samples and recording the current
+ * samples to use to compute the next delta.  The sampling is done
+ * as a side effect of cache_reclaim() which is run periodically,
+ * for its own purposes, often enough to avoid overflow of the 32
+ * bit counters.  While sampling in this fashion requires no locking
+ * it is never-the-less done only after all locks have been taken by
+ * cache_reclaim() to allow cache_stat_sysctl() to hold off
+ * cache_reclaim() with minimal locking.
+ *
+ * cache_stat_sysctl() takes its CPU's per-cpu lock to hold off
+ * cache_reclaim() so that it can copy the subsystem total stats
+ * without them being concurrently modified.  If CACHE_STATS_CURRENT
+ * is defined it also harvests the per-cpu increments into the total,
+ * which again requires cache_reclaim() to be held off.
  *
- * Per-cpu namecache data is defined next.
+ * The per-cpu data (a lock and the per-cpu stats structures)
+ * are defined next.
  */
+struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
+
 struct nchcpu {
-	kmutex_t cpu_lock;
-	struct nchstats cpu_stats;
+	kmutex_t	cpu_lock;
+	struct nchstats_percpu	cpu_stats;
+	/* XXX maybe __cacheline_aligned would improve this? */
+	struct nchstats_percpu	cpu_stats_last;	/* from last sample */
 };
 
 /*
@@ -177,9 +210,32 @@ static long numcache __cacheline_aligned
 static void *cache_gcqueue;
 static u_int cache_gcpend;
 
-/* Cache effectiveness statistics. */
+/* Cache effectiveness statistics.  This holds total from per-cpu stats */
 struct nchstats nchstats __cacheline_aligned;
-#define	COUNT(c,x)	(c.x++)
+
+/*
+ * Macros to count an event, update the central stats with per-cpu
+ * values and add current per-cpu increments to the subsystem total
+ * last collected by cache_reclaim().
+ */
+#define	CACHE_STATS_CURRENT	/* nothing */
+
+#define	COUNT(cpup, f)	((cpup)->cpu_stats.f++)
+
+#define	UPDATE(cpup, f) do { \
+	struct nchcpu *Xcpup = (cpup); \
+	uint32_t Xcnt = (volatile uint32_t) Xcpup->cpu_stats.f; \
+	nchstats.f += Xcnt - Xcpup->cpu_stats_last.f; \
+	Xcpup->cpu_stats_last.f = Xcnt; \
+} while (/* CONSTCOND */ 0)
+
+#define	ADD(stats, cpup, f) do { \
+	struct nchcpu *Xcpup = (cpup); \
+	stats.f += Xcpup->cpu_stats.f - Xcpup->cpu_stats_last.f; \
+} while (/* CONSTCOND */ 0)
+
+/* Do unlocked stats the same way.  Use a different name to allow mind changes */
+#define	COUNT_UNL(cpup, f)	COUNT((cpup), f)
 
 static const int cache_lowat = 95;
 static const int cache_hiwat = 98;
@@ -219,6 +275,8 @@ cache_hash(const char *name, size_t name
 
 /*
  * Invalidate a cache entry and enqueue it for garbage collection.
+ * The caller needs to hold namecache_lock or a per-cpu lock to hold
+ * off cache_reclaim().
  */
 static void
 cache_invalidate(struct namecache *ncp)
@@ -271,11 +329,6 @@ cache_disassociate(struct namecache *ncp
  * Lock all CPUs to prevent any cache lookup activity.  Conceptually,
  * this locks out all "readers".
  */
-#define	UPDATE(f) do { \
-	nchstats.f += cpup->cpu_stats.f; \
-	cpup->cpu_stats.f = 0; \
-} while (/* CONSTCOND */ 0)
-
 static void
 cache_lock_cpus(void)
 {
@@ -283,24 +336,31 @@ cache_lock_cpus(void)
 	struct cpu_info *ci;
 	struct nchcpu *cpup;
 
+	/*
+	 * Lock out all CPUs first, then harvest per-cpu stats.  This
+	 * is probably not quite as cache-efficient as doing the lock
+	 * and harvest at the same time, but allows cache_stat_sysctl()
+	 * to make do with a per-cpu lock.
+	 */
 	for (CPU_INFO_FOREACH(cii, ci)) {
 		cpup = ci->ci_data.cpu_nch;
 		mutex_enter(&cpup->cpu_lock);
-		UPDATE(ncs_goodhits);
-		UPDATE(ncs_neghits);
-		UPDATE(ncs_badhits);
-		UPDATE(ncs_falsehits);
-		UPDATE(ncs_miss);
-		UPDATE(ncs_long);
-		UPDATE(ncs_pass2);
-		UPDATE(ncs_2passes);
-		UPDATE(ncs_revhits);
-		UPDATE(ncs_revmiss);
+	}
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		cpup = ci->ci_data.cpu_nch;
+		UPDATE(cpup, ncs_goodhits);
+		UPDATE(cpup, ncs_neghits);
+		UPDATE(cpup, ncs_badhits);
+		UPDATE(cpup, ncs_falsehits);
+		UPDATE(cpup, ncs_miss);
+		UPDATE(cpup, ncs_long);
+		UPDATE(cpup, ncs_pass2);
+		UPDATE(cpup, ncs_2passes);
+		UPDATE(cpup, ncs_revhits);
+		UPDATE(cpup, ncs_revmiss);
 	}
 }
 
-#undef UPDATE
-
 /*
  * Release all CPU locks.
  */
@@ -318,8 +378,9 @@ cache_unlock_cpus(void)
 }
 
 /*
- * Find a single cache entry and return it locked.  'namecache_lock' or
- * at least one of the per-CPU locks must be held.
+ * Find a single cache entry and return it locked.
+ * The caller needs to hold namecache_lock or a per-cpu lock to hold
+ * off cache_reclaim().
  */
 static struct namecache *
 cache_lookup_entry(const struct vnode *dvp, const char *name, size_t namelen)
@@ -333,6 +394,7 @@ cache_lookup_entry(const struct vnode *d
 	ncpp = &nchashtbl[NCHASH2(hash, dvp)];
 
 	LIST_FOREACH(ncp, ncpp, nc_hash) {
+		/* XXX Needs barrier for Alpha here */
 		if (ncp->nc_dvp != dvp ||
 		    ncp->nc_nlen != namelen ||
 		    memcmp(ncp->nc_name, name, (u_int)ncp->nc_nlen))
@@ -407,7 +469,8 @@ cache_lookup(struct vnode *dvp, const ch
 	struct namecache *ncp;
 	struct vnode *vp;
 	struct nchcpu *cpup;
-	int error;
+	int error, ret_value;
+
 
 	/* Establish default result values */
 	if (iswht_ret != NULL) {
@@ -422,20 +485,21 @@ cache_lookup(struct vnode *dvp, const ch
 	cpup = curcpu()->ci_data.cpu_nch;
 	mutex_enter(&cpup->cpu_lock);
 	if (__predict_false(namelen > NCHNAMLEN)) {
-		COUNT(cpup->cpu_stats, ncs_long);
+		COUNT(cpup, ncs_long);
 		mutex_exit(&cpup->cpu_lock);
 		/* found nothing */
 		return 0;
 	}
+
 	ncp = cache_lookup_entry(dvp, name, namelen);
 	if (__predict_false(ncp == NULL)) {
-		COUNT(cpup->cpu_stats, ncs_miss);
+		COUNT(cpup, ncs_miss);
 		mutex_exit(&cpup->cpu_lock);
 		/* found nothing */
 		return 0;
 	}
 	if ((cnflags & MAKEENTRY) == 0) {
-		COUNT(cpup->cpu_stats, ncs_badhits);
+		COUNT(cpup, ncs_badhits);
 		/*
 		 * Last component and we are renaming or deleting,
 		 * the cache entry is invalid, or otherwise don't
@@ -460,13 +524,11 @@ cache_lookup(struct vnode *dvp, const ch
 
 		if (__predict_true(nameiop != CREATE ||
 		    (cnflags & ISLASTCN) == 0)) {
-			COUNT(cpup->cpu_stats, ncs_neghits);
-			mutex_exit(&ncp->nc_lock);
-			mutex_exit(&cpup->cpu_lock);
+			COUNT(cpup, ncs_neghits);
 			/* found neg entry; vn is already null from above */
-			return 1;
+			ret_value = 1;
 		} else {
-			COUNT(cpup->cpu_stats, ncs_badhits);
+			COUNT(cpup, ncs_badhits);
 			/*
 			 * Last component and we are renaming or
 			 * deleting, the cache entry is invalid,
@@ -474,17 +536,22 @@ cache_lookup(struct vnode *dvp, const ch
 			 * exist.
 			 */
 			cache_invalidate(ncp);
-			mutex_exit(&ncp->nc_lock);
-			mutex_exit(&cpup->cpu_lock);
 			/* found nothing */
-			return 0;
+			ret_value = 0;
 		}
+		mutex_exit(&ncp->nc_lock);
+		mutex_exit(&cpup->cpu_lock);
+		return ret_value;
 	}
 
 	vp = ncp->nc_vp;
 	mutex_enter(vp->v_interlock);
 	mutex_exit(&ncp->nc_lock);
 	mutex_exit(&cpup->cpu_lock);
+
+	/*
+	 * Unlocked except for the vnode interlock.  Call vget().
+	 */
 	error = vget(vp, LK_NOWAIT);
 	if (error) {
 		KASSERT(error == EBUSY);
@@ -492,27 +559,21 @@ cache_lookup(struct vnode *dvp, const ch
 		 * This vnode is being cleaned out.
 		 * XXX badhits?
 		 */
-		COUNT(cpup->cpu_stats, ncs_falsehits);
+		COUNT_UNL(cpup, ncs_falsehits);
 		/* found nothing */
 		return 0;
 	}
 
-#ifdef DEBUG
-	/*
-	 * since we released nb->nb_lock,
-	 * we can't use this pointer any more.
-	 */
-	ncp = NULL;
-#endif /* DEBUG */
-
-	/* We don't have the right lock, but this is only for stats. */
-	COUNT(cpup->cpu_stats, ncs_goodhits);
-
+	COUNT_UNL(cpup, ncs_goodhits);
 	/* found it */
 	*vn_ret = vp;
 	return 1;
 }
 
+
+/*
+ * Cut-'n-pasted version of the above without the nameiop argument.
+ */
 int
 cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
 		 uint32_t cnflags,
@@ -537,14 +598,14 @@ cache_lookup_raw(struct vnode *dvp, cons
 	cpup = curcpu()->ci_data.cpu_nch;
 	mutex_enter(&cpup->cpu_lock);
 	if (__predict_false(namelen > NCHNAMLEN)) {
-		COUNT(cpup->cpu_stats, ncs_long);
+		COUNT(cpup, ncs_long);
 		mutex_exit(&cpup->cpu_lock);
 		/* found nothing */
 		return 0;
 	}
 	ncp = cache_lookup_entry(dvp, name, namelen);
 	if (__predict_false(ncp == NULL)) {
-		COUNT(cpup->cpu_stats, ncs_miss);
+		COUNT(cpup, ncs_miss);
 		mutex_exit(&cpup->cpu_lock);
 		/* found nothing */
 		return 0;
@@ -559,7 +620,7 @@ cache_lookup_raw(struct vnode *dvp, cons
 			/*cnp->cn_flags |= ncp->nc_flags;*/
 			*iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0;
 		}
-		COUNT(cpup->cpu_stats, ncs_neghits);
+		COUNT(cpup, ncs_neghits);
 		mutex_exit(&ncp->nc_lock);
 		mutex_exit(&cpup->cpu_lock);
 		/* found negative entry; vn is already null from above */
@@ -568,6 +629,10 @@ cache_lookup_raw(struct vnode *dvp, cons
 	mutex_enter(vp->v_interlock);
 	mutex_exit(&ncp->nc_lock);
 	mutex_exit(&cpup->cpu_lock);
+
+	/*
+	 * Unlocked except for the vnode interlock.  Call vget().
+	 */
 	error = vget(vp, LK_NOWAIT);
 	if (error) {
 		KASSERT(error == EBUSY);
@@ -575,14 +640,12 @@ cache_lookup_raw(struct vnode *dvp, cons
 		 * This vnode is being cleaned out.
 		 * XXX badhits?
 		 */
-		COUNT(cpup->cpu_stats, ncs_falsehits);
+		COUNT_UNL(cpup, ncs_falsehits);
 		/* found nothing */
 		return 0;
 	}
 
-	/* Unlocked, but only for stats. */
-	COUNT(cpup->cpu_stats, ncs_goodhits); /* XXX can be "badhits" */
-
+	COUNT_UNL(cpup, ncs_goodhits); /* XXX can be "badhits" */
 	/* found it */
 	*vn_ret = vp;
 	return 1;
@@ -605,8 +668,8 @@ cache_revlookup(struct vnode *vp, struct
 {
 	struct namecache *ncp;
 	struct vnode *dvp;
-	struct nchcpu *cpup;
 	struct ncvhashhead *nvcpp;
+	struct nchcpu *cpup;
 	char *bp;
 	int error, nlen;
 
@@ -614,8 +677,14 @@ cache_revlookup(struct vnode *vp, struct
 		goto out;
 
 	nvcpp = &ncvhashtbl[NCVHASH(vp)];
-	cpup = curcpu()->ci_data.cpu_nch;
 
+	/*
+	 * We increment counters in the local CPU's per-cpu stats.
+	 * We don't take the per-cpu lock, however, since this function
+	 * is the only place these counters are incremented so no one
+	 * will be racing with us to increment them.
+	 */
+	cpup = curcpu()->ci_data.cpu_nch;
 	mutex_enter(namecache_lock);
 	LIST_FOREACH(ncp, nvcpp, nc_vhash) {
 		mutex_enter(&ncp->nc_lock);
@@ -633,7 +702,7 @@ cache_revlookup(struct vnode *vp, struct
 			    ncp->nc_name[1] == '.')
 				panic("cache_revlookup: found entry for ..");
 #endif
-			COUNT(cpup->cpu_stats, ncs_revhits);
+			COUNT(cpup, ncs_revhits);
 			nlen = ncp->nc_nlen;
 
 			if (bufp) {
@@ -665,7 +734,7 @@ cache_revlookup(struct vnode *vp, struct
 		}
 		mutex_exit(&ncp->nc_lock);
 	}
-	COUNT(cpup->cpu_stats, ncs_revmiss);
+	COUNT(cpup, ncs_revmiss);
 	mutex_exit(namecache_lock);
  out:
 	*dvpp = NULL;
@@ -1113,7 +1182,7 @@ namecache_count_pass2(void)
 {
 	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;
 
-	COUNT(cpup->cpu_stats, ncs_pass2);
+	COUNT_UNL(cpup, ncs_pass2);
 }
 
 void
@@ -1121,13 +1190,23 @@ namecache_count_2passes(void)
 {
 	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;
 
-	COUNT(cpup->cpu_stats, ncs_2passes);
+	COUNT_UNL(cpup, ncs_2passes);
 }
 
+/*
+ * Fetch the current values of the stats.  We return the most
+ * recent values harvested into nchstats by cache_reclaim(), which
+ * will be less than a second old.
+ */
 static int
 cache_stat_sysctl(SYSCTLFN_ARGS)
 {
-	struct nchstats_sysctl stats;
+	struct nchstats stats;
+	struct nchcpu *my_cpup;
+#ifdef CACHE_STATS_CURRENT
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+#endif	/* CACHE_STATS_CURRENT */
 
 	if (oldp == NULL) {
 		*oldlenp = sizeof(stats);
@@ -1139,21 +1218,32 @@ cache_stat_sysctl(SYSCTLFN_ARGS)
 		return 0;
 	}
 
-	memset(&stats, 0, sizeof(stats));
-
+	/*
+	 * Take this CPU's per-cpu lock to hold off cache_reclaim()
+	 * from doing a stats update while doing minimal damage to
+	 * concurrent operations.
+	 */
 	sysctl_unlock();
-	cache_lock_cpus();
-	stats.ncs_goodhits = nchstats.ncs_goodhits;
-	stats.ncs_neghits = nchstats.ncs_neghits;
-	stats.ncs_badhits = nchstats.ncs_badhits;
-	stats.ncs_falsehits = nchstats.ncs_falsehits;
-	stats.ncs_miss = nchstats.ncs_miss;
-	stats.ncs_long = nchstats.ncs_long;
-	stats.ncs_pass2 = nchstats.ncs_pass2;
-	stats.ncs_2passes = nchstats.ncs_2passes;
-	stats.ncs_revhits = nchstats.ncs_revhits;
-	stats.ncs_revmiss = nchstats.ncs_revmiss;
-	cache_unlock_cpus();
+	my_cpup = curcpu()->ci_data.cpu_nch;
+	mutex_enter(&my_cpup->cpu_lock);
+	stats = nchstats;
+#ifdef CACHE_STATS_CURRENT
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		struct nchcpu *cpup = ci->ci_data.cpu_nch;
+
+		ADD(stats, cpup, ncs_goodhits);
+		ADD(stats, cpup, ncs_neghits);
+		ADD(stats, cpup, ncs_badhits);
+		ADD(stats, cpup, ncs_falsehits);
+		ADD(stats, cpup, ncs_miss);
+		ADD(stats, cpup, ncs_long);
+		ADD(stats, cpup, ncs_pass2);
+		ADD(stats, cpup, ncs_2passes);
+		ADD(stats, cpup, ncs_revhits);
+		ADD(stats, cpup, ncs_revmiss);
+	}
+#endif	/* CACHE_STATS_CURRENT */
+	mutex_exit(&my_cpup->cpu_lock);
 	sysctl_relock();
 
 	*oldlenp = sizeof(stats);

Index: src/usr.bin/systat/vmstat.c
diff -u src/usr.bin/systat/vmstat.c:1.80 src/usr.bin/systat/vmstat.c:1.81
--- src/usr.bin/systat/vmstat.c:1.80	Fri Jun 20 07:08:15 2014
+++ src/usr.bin/systat/vmstat.c	Wed Dec 24 20:01:22 2014
@@ -1,4 +1,4 @@
-/*	$NetBSD: vmstat.c,v 1.80 2014/06/20 07:08:15 njoly Exp $	*/
+/*	$NetBSD: vmstat.c,v 1.81 2014/12/24 20:01:22 dennis Exp $	*/
 
 /*-
  * Copyright (c) 1983, 1989, 1992, 1993
@@ -34,7 +34,7 @@
 #if 0
 static char sccsid[] = "@(#)vmstat.c	8.2 (Berkeley) 1/12/94";
 #endif
-__RCSID("$NetBSD: vmstat.c,v 1.80 2014/06/20 07:08:15 njoly Exp $");
+__RCSID("$NetBSD: vmstat.c,v 1.81 2014/12/24 20:01:22 dennis Exp $");
 #endif /* not lint */
 
 /*
@@ -63,7 +63,7 @@ __RCSID("$NetBSD: vmstat.c,v 1.80 2014/0
 static struct Info {
 	struct uvmexp_sysctl uvmexp;
 	struct vmtotal Total;
-	struct nchstats_sysctl nchstats;
+	struct nchstats nchstats;
 	long nchcount;
 	long *intrcnt;
 	u_int64_t *evcnt;

Index: src/usr.bin/vmstat/vmstat.c
diff -u src/usr.bin/vmstat/vmstat.c:1.205 src/usr.bin/vmstat/vmstat.c:1.206
--- src/usr.bin/vmstat/vmstat.c:1.205	Fri Sep 12 16:25:55 2014
+++ src/usr.bin/vmstat/vmstat.c	Wed Dec 24 20:01:22 2014
@@ -1,4 +1,4 @@
-/*	$NetBSD: vmstat.c,v 1.205 2014/09/12 16:25:55 skrll Exp $	*/
+/*	$NetBSD: vmstat.c,v 1.206 2014/12/24 20:01:22 dennis Exp $	*/
 
 /*-
  * Copyright (c) 1998, 2000, 2001, 2007 The NetBSD Foundation, Inc.
@@ -70,7 +70,7 @@ __COPYRIGHT("@(#) Copyright (c) 1980, 19
 #if 0
 static char sccsid[] = "@(#)vmstat.c	8.2 (Berkeley) 3/1/95";
 #else
-__RCSID("$NetBSD: vmstat.c,v 1.205 2014/09/12 16:25:55 skrll Exp $");
+__RCSID("$NetBSD: vmstat.c,v 1.206 2014/12/24 20:01:22 dennis Exp $");
 #endif
 #endif /* not lint */
 
@@ -867,7 +867,7 @@ pct(u_long top, u_long bot)
 void
 dosum(void)
 {
-	struct nchstats_sysctl nch_stats;
+	struct nchstats nch_stats;
 	uint64_t nchtotal;
 	size_t ssize;
 	int active_kernel;
@@ -1055,20 +1055,7 @@ dosum(void)
 			memset(&nch_stats, 0, sizeof(nch_stats));
 		}
 	} else {
-		struct nchstats nch_stats_kvm;
-
-		kread(namelist, X_NCHSTATS, &nch_stats_kvm,
-		    sizeof(nch_stats_kvm));
-		nch_stats.ncs_goodhits = nch_stats_kvm.ncs_goodhits;
-		nch_stats.ncs_neghits = nch_stats_kvm.ncs_neghits;
-		nch_stats.ncs_badhits = nch_stats_kvm.ncs_badhits;
-		nch_stats.ncs_falsehits = nch_stats_kvm.ncs_falsehits;
-		nch_stats.ncs_miss = nch_stats_kvm.ncs_miss;
-		nch_stats.ncs_long = nch_stats_kvm.ncs_long;
-		nch_stats.ncs_pass2 = nch_stats_kvm.ncs_pass2;
-		nch_stats.ncs_2passes = nch_stats_kvm.ncs_2passes;
-		nch_stats.ncs_revhits = nch_stats_kvm.ncs_revhits;
-		nch_stats.ncs_revmiss = nch_stats_kvm.ncs_revmiss;
+		kread(namelist, X_NCHSTATS, &nch_stats, sizeof(nch_stats));
 	}
 
 	nchtotal = nch_stats.ncs_goodhits + nch_stats.ncs_neghits +