Index: sys/sys/namei.src
===================================================================
RCS file: /cvsroot/src/sys/sys/namei.src,v
retrieving revision 1.33
diff -u -r1.33 namei.src
--- sys/sys/namei.src	3 Jun 2014 21:16:15 -0000	1.33
+++ sys/sys/namei.src	1 Dec 2014 03:17:48 -0000
@@ -288,36 +288,29 @@
 #endif
 
 /*
- * Stats on usefulness of namei caches.
- * XXX: should be 64-bit counters.
+ * Stats on usefulness of namei caches.  Two structures are used for
+ * counting, with members that have the same names but different
+ * types.  Generate the member list with the preprocessor to avoid
+ * cut-'n'-paste.  A U: in a member's comment marks values that may
+ * be incremented without a lock held; we may treat these specially.
  */
-struct	nchstats {
-	long	ncs_goodhits;		/* hits that we can really use */
-	long	ncs_neghits;		/* negative hits that we can use */
-	long	ncs_badhits;		/* hits we must drop */
-	long	ncs_falsehits;		/* hits with id mismatch */
-	long	ncs_miss;		/* misses */
-	long	ncs_long;		/* long names that ignore cache */
-	long	ncs_pass2;		/* names found with passes == 2 */
-	long	ncs_2passes;		/* number of times we attempt it */
-	long	ncs_revhits;		/* reverse-cache hits */
-	long	ncs_revmiss;		/* reverse-cache misses */
-};
+#define	_NAMEI_CACHE_STATS(type) {					\
+	type	ncs_goodhits;	/* U: hits that we can really use */	\
+	type	ncs_neghits;	/* negative hits that we can use */	\
+	type	ncs_badhits;	/* hits we must drop */			\
+	type	ncs_falsehits;	/* U: hits with id mismatch */		\
+	type	ncs_miss;	/* misses */				\
+	type	ncs_long;	/* long names that ignore cache */	\
+	type	ncs_pass2;	/* U: names found with passes == 2 */	\
+	type	ncs_2passes;	/* U: number of times we attempt it */	\
+	type	ncs_revhits;	/* reverse-cache hits */		\
+	type	ncs_revmiss;	/* reverse-cache misses */		\
+}
 
-struct	nchstats_sysctl {
-	uint64_t ncs_goodhits;		/* hits that we can really use */
-	uint64_t ncs_neghits;		/* negative hits that we can use */
-	uint64_t ncs_badhits;		/* hits we must drop */
-	uint64_t ncs_falsehits;		/* hits with id mismatch */
-	uint64_t ncs_miss;		/* misses */
-	uint64_t ncs_long;		/* long names that ignore cache */
-	uint64_t ncs_pass2;		/* names found with passes == 2 */
-	uint64_t ncs_2passes;		/* number of times we attempt it */
-	uint64_t ncs_revhits;		/* reverse-cache hits */
-	uint64_t ncs_revmiss;		/* reverse-cache misses */
-};
+/*
+ * Sysctl deals with a uint64_t version of the stats, and the
+ * subsystem-wide summary totals are kept in that form.
+ */
+struct	nchstats _NAMEI_CACHE_STATS(uint64_t);
 
-#ifdef _KERNEL
-extern struct nchstats nchstats;
-#endif
 /* #endif !_SYS_NAMEI_H_ (generated by gennameih.awk) */
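
To see what the preprocessor change above buys, here is a minimal standalone
sketch (not part of the patch; most members are elided and the test program is
invented for illustration) of how the single member list expands into both the
uint64_t totals structure and the uint32_t per-cpu variant that vfs_cache.c
instantiates below:

#include <stdint.h>
#include <stdio.h>

/* Same shape as the macro in namei.src, with most members elided. */
#define	_NAMEI_CACHE_STATS(type) {					\
	type	ncs_goodhits;	/* hits that we can really use */	\
	type	ncs_miss;	/* misses */				\
}

struct nchstats _NAMEI_CACHE_STATS(uint64_t);		/* subsystem totals */
struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);	/* per-cpu counters */

int
main(void)
{
	/* One member list, two differently sized structures. */
	printf("nchstats: %zu bytes, nchstats_percpu: %zu bytes\n",
	    sizeof(struct nchstats), sizeof(struct nchstats_percpu));
	return 0;
}
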
Index: sys/kern/vfs_cache.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_cache.c,v
retrieving revision 1.100
diff -u -r1.100 vfs_cache.c
--- sys/kern/vfs_cache.c	30 Nov 2014 04:11:03 -0000	1.100
+++ sys/kern/vfs_cache.c	1 Dec 2014 03:17:48 -0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_cache.c,v 1.100 2014/11/30 04:11:03 dennis Exp $	*/
+/*	$NetBSD: vfs_cache.c,v 1.99 2014/06/16 12:28:10 joerg Exp $	*/
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -58,7 +58,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.100 2014/11/30 04:11:03 dennis Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.99 2014/06/16 12:28:10 joerg Exp $");
 
 #include "opt_ddb.h"
 #include "opt_revcache.h"
@@ -113,9 +113,8 @@
  * When a lookup is done in the cache, via cache_lookup() or
  * cache_lookup_raw(), the per-cpu lock below is taken.  This
  * protects calls to cache_lookup_entry() and cache_invalidate()
- * against cache_reclaim(), and protects the per-cpu stats against
- * modification in both cache_reclaim() and cache_stat_sysctl(),
- * but allows lookups to continue in parallel with cache_enter().
+ * against cache_reclaim() but allows lookups to continue in
+ * parallel with cache_enter().
  *
  * cache_revlookup() takes namecache_lock to exclude cache_enter()
  * and cache_reclaim() since the list it operates on is not
@@ -126,19 +125,43 @@
  * taken to hold off lookups.  Holding all these locks essentially
  * idles the subsystem, ensuring there are no concurrent references
  * to the cache entries being freed.  As a side effect, once the
- * per-cpu locks are held, the per-cpu stats are added to the
- * subsystem totals and then zeroed.  cache_stat_sysctl() similarly
- * takes all locks to collect the per-cpu stats (though it perhaps
- * could avoid this by living with stats that were a second out
- * of date?).
+ * per-cpu locks are held, the per-cpu stats are sampled and the
+ * increments since the last run are added to the subsystem totals.
+ *
+ * cache_stat_sysctl() takes its CPU's per-cpu lock to hold off
+ * cache_reclaim() so that it can copy the subsystem total stats
+ * without them being modified.  If CACHE_STATS_CURRENT is defined
+ * it also adds the per-cpu increments pending since the last harvest
+ * to the copy it returns, so cache_reclaim() must likewise be held off.
  *
  * The per-cpu namecache data is defined below.  cpu_lock is used
- * to protect cpu_stats updates and to exclude cache_reclaim()
- * during lookups.
+ * to exclude cache_reclaim() during lookups.  Only about a second's
+ * worth of activity on one CPU has to fit between harvests, so the
+ * per-cpu stats can be a 32 bit type to save memory and permit
+ * atomic writes on most machines.  To allow stats collection without
+ * atomic operations, even for counters incremented unlocked, the
+ * per-cpu stats are incremented continuously and never reset; the
+ * cpu_stats_last structure holds the values observed during the last
+ * run of cache_reclaim() so the next run can compute the differences.
+ *
+ * We prefer to increment cpu_stats members while a lock is held.
+ * When we don't hold one, however, we do a non-atomic increment
+ * anyway.  Since a CPU's per-cpu stats are only incremented by
+ * threads running on that CPU, the only way a non-atomic increment
+ * can lose an update is if the thread is interrupted by something
+ * on the same CPU which goes on to increment the same counter.  I
+ * believe such preemption does not happen, and even if it did the
+ * consequence would be an occasional lost increment.  We use a
+ * different macro name for unlocked increments, however, so that
+ * anyone bothered by this can replace it with an atomic increment.
  */
+struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
+
 struct nchcpu {
-	kmutex_t	cpu_lock;
-	struct nchstats	cpu_stats;
+	kmutex_t		cpu_lock;
+	struct nchstats_percpu	cpu_stats;
+	/* XXX aligning this to a different cache line might be better? */
+	struct nchstats_percpu	cpu_stats_last;
 };
 
 /*
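
The never-reset counter scheme described in the comment above can be seen in
isolation in this small userland sketch (not kernel code; one counter, no
locking, names invented for illustration).  The per-cpu counter is only ever
incremented, the harvester remembers the value it saw last time, and the
unsigned 32-bit subtraction yields the increment even across wraparound:

#include <stdint.h>
#include <stdio.h>

static uint32_t cpu_counter;		/* incremented continuously, never reset */
static uint32_t cpu_counter_last;	/* value seen at the previous harvest */
static uint64_t total;			/* 64-bit running total */

static void
harvest(void)
{
	uint32_t cur = cpu_counter;		/* read the counter just once */

	total += cur - cpu_counter_last;	/* modulo-2^32 difference */
	cpu_counter_last = cur;
}

int
main(void)
{
	cpu_counter = UINT32_MAX - 1;	/* pretend a lot has already happened */
	harvest();
	cpu_counter += 5;		/* five more events; the counter wraps */
	harvest();
	printf("%llu\n", (unsigned long long)total);	/* 4294967299, nothing lost */
	return 0;
}
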
@@ -177,9 +200,34 @@
 static void	*cache_gcqueue;
 static u_int	cache_gcpend;
 
-/* Cache effectiveness statistics. */
+/* Cache effectiveness statistics.  This holds totals of the per-cpu stats. */
 struct nchstats	nchstats __cacheline_aligned;
-#define	COUNT(c,x)	(c.x++)
+
+/*
+ * Macros to count an event, to fold a CPU's increments into the
+ * central stats, and to add the increments accumulated since the
+ * last cache_reclaim() harvest to a copy of the subsystem totals.
+ */
+#define	CACHE_STATS_CURRENT	/* cache_stat_sysctl() returns latest values */
+
+#define	COUNT(cpup, f)	((cpup)->cpu_stats.f++)
+
+/* The __insn_barrier() is to ensure Xcpup->cpu_stats.f is read just once */
+#define	UPDATE(cpup, f) do { \
+	struct nchcpu *Xcpup = (cpup); \
+	uint32_t Xcnt = Xcpup->cpu_stats.f; \
+	__insn_barrier(); \
+	nchstats.f += Xcnt - Xcpup->cpu_stats_last.f; \
+	Xcpup->cpu_stats_last.f = Xcnt; \
+} while (/* CONSTCOND */ 0)
+
+#define	ADD(stats, cpup, f) do { \
+	struct nchcpu *Xcpup = (cpup); \
+	stats.f += Xcpup->cpu_stats.f - Xcpup->cpu_stats_last.f; \
+} while (/* CONSTCOND */ 0)
+
+/* Do unlocked stats the same way. atomic_inc_32() is another option. */
+#define	COUNT_UNL(cpup, f)	COUNT(cpup, f)
 
 static const int cache_lowat = 95;
 static const int cache_hiwat = 98;
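
To make the division of labour between these macros concrete, here is a
compressed single-CPU, single-member userland sketch; the structure tags and
macro bodies below are simplified stand-ins without the locking and the
__insn_barrier().  COUNT() runs on the lookup path, UPDATE() is what
cache_reclaim() uses to fold a CPU's delta into the totals, and ADD() folds
the still-pending delta into a reader's private copy.  (For the unlocked
variant, COUNT_UNL() could instead be defined as
atomic_inc_32(&(cpup)->cpu_stats.f), as the comment above notes.)

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel structures, one member only. */
struct demo_totals { uint64_t ncs_goodhits; };
struct demo_percpu { uint32_t ncs_goodhits; };
struct demo_cpu { struct demo_percpu cpu_stats, cpu_stats_last; };

static struct demo_totals nchstats;		/* subsystem totals */

#define	COUNT(cpup, f)		((cpup)->cpu_stats.f++)
#define	UPDATE(cpup, f) do {					\
	uint32_t Xcnt = (cpup)->cpu_stats.f;			\
	nchstats.f += Xcnt - (cpup)->cpu_stats_last.f;		\
	(cpup)->cpu_stats_last.f = Xcnt;			\
} while (0)
#define	ADD(stats, cpup, f)					\
	((stats).f += (cpup)->cpu_stats.f - (cpup)->cpu_stats_last.f)

int
main(void)
{
	struct demo_cpu cpu = { { 0 }, { 0 } };
	struct demo_totals snap;

	COUNT(&cpu, ncs_goodhits);	/* two hits counted on the hot path */
	COUNT(&cpu, ncs_goodhits);
	UPDATE(&cpu, ncs_goodhits);	/* cache_reclaim() folds the delta in */
	COUNT(&cpu, ncs_goodhits);	/* one more hit since that harvest */

	snap = nchstats;		/* totals as of the harvest */
	ADD(snap, &cpu, ncs_goodhits);	/* plus the pending increment */
	printf("totals %llu, snapshot %llu\n",
	    (unsigned long long)nchstats.ncs_goodhits,
	    (unsigned long long)snap.ncs_goodhits);	/* totals 2, snapshot 3 */
	return 0;
}
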
@@ -273,11 +321,6 @@
  * Lock all CPUs to prevent any cache lookup activity.  Conceptually,
  * this locks out all "readers".
  */
-#define	UPDATE(f) do { \
-	nchstats.f += cpup->cpu_stats.f; \
-	cpup->cpu_stats.f = 0; \
-} while (/* CONSTCOND */ 0)
-
 static void
 cache_lock_cpus(void)
 {
@@ -285,24 +328,31 @@
 	struct cpu_info *ci;
 	struct nchcpu *cpup;
 
+	/*
+	 * Lock out all CPUs first, then harvest per-cpu stats.  This is
+	 * probably not quite as cache-efficient as harvesting as we lock,
+	 * but it means the totals are only written with all per-cpu locks
+	 * held, which lets cache_stat_sysctl() make do with just one.
+	 */
 	for (CPU_INFO_FOREACH(cii, ci)) {
 		cpup = ci->ci_data.cpu_nch;
 		mutex_enter(&cpup->cpu_lock);
-		UPDATE(ncs_goodhits);
-		UPDATE(ncs_neghits);
-		UPDATE(ncs_badhits);
-		UPDATE(ncs_falsehits);
-		UPDATE(ncs_miss);
-		UPDATE(ncs_long);
-		UPDATE(ncs_pass2);
-		UPDATE(ncs_2passes);
-		UPDATE(ncs_revhits);
-		UPDATE(ncs_revmiss);
+	}
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		cpup = ci->ci_data.cpu_nch;
+		UPDATE(cpup, ncs_goodhits);
+		UPDATE(cpup, ncs_neghits);
+		UPDATE(cpup, ncs_badhits);
+		UPDATE(cpup, ncs_falsehits);
+		UPDATE(cpup, ncs_miss);
+		UPDATE(cpup, ncs_long);
+		UPDATE(cpup, ncs_pass2);
+		UPDATE(cpup, ncs_2passes);
+		UPDATE(cpup, ncs_revhits);
+		UPDATE(cpup, ncs_revmiss);
 	}
 }
 
-#undef UPDATE
-
 /*
  * Release all CPU locks.
  */
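
The lock-all-then-harvest order above is what makes the single-lock read in
cache_stat_sysctl() below safe: the totals are only ever written while every
per-cpu lock is held, so a reader holding any one of them cannot observe a
half-finished update.  A standalone sketch of just that protocol, with plain
pthread mutexes and arrays standing in for the kernel's per-cpu locks and
structures (names invented for illustration):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define	NCPU	4

static pthread_mutex_t cpu_lock[NCPU] = {	/* one "per-cpu" lock per slot */
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};
static uint32_t cpu_count[NCPU];	/* per-cpu counters, never reset */
static uint32_t cpu_count_last[NCPU];	/* values seen at the last harvest */
static uint64_t total;			/* written only with all locks held */

static void
reclaim_harvest(void)
{
	int i;

	for (i = 0; i < NCPU; i++)		/* lock out every "CPU" first */
		pthread_mutex_lock(&cpu_lock[i]);
	for (i = 0; i < NCPU; i++) {		/* then fold the deltas in */
		uint32_t cur = cpu_count[i];

		total += cur - cpu_count_last[i];
		cpu_count_last[i] = cur;
	}
	for (i = 0; i < NCPU; i++)
		pthread_mutex_unlock(&cpu_lock[i]);
}

static uint64_t
stat_read(int mycpu)
{
	uint64_t snap;

	/* Any single per-cpu lock is enough to hold off reclaim_harvest(). */
	pthread_mutex_lock(&cpu_lock[mycpu]);
	snap = total;
	pthread_mutex_unlock(&cpu_lock[mycpu]);
	return snap;
}

int
main(void)
{
	cpu_count[0] = 3;
	cpu_count[2] = 4;
	reclaim_harvest();
	printf("%llu\n", (unsigned long long)stat_read(1));	/* prints 7 */
	return 0;
}
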
@@ -427,7 +477,7 @@
 	cpup = curcpu()->ci_data.cpu_nch;
 	mutex_enter(&cpup->cpu_lock);
 	if (__predict_false(namelen > NCHNAMLEN)) {
-		COUNT(cpup->cpu_stats, ncs_long);
+		COUNT(cpup, ncs_long);
 		mutex_exit(&cpup->cpu_lock);
 		/* found nothing */
 		return 0;
@@ -435,13 +485,13 @@
 
 	ncp = cache_lookup_entry(dvp, name, namelen);
 	if (__predict_false(ncp == NULL)) {
-		COUNT(cpup->cpu_stats, ncs_miss);
+		COUNT(cpup, ncs_miss);
 		mutex_exit(&cpup->cpu_lock);
 		/* found nothing */
 		return 0;
 	}
 	if ((cnflags & MAKEENTRY) == 0) {
-		COUNT(cpup->cpu_stats, ncs_badhits);
+		COUNT(cpup, ncs_badhits);
 		/*
 		 * Last component and we are renaming or deleting,
 		 * the cache entry is invalid, or otherwise don't
@@ -466,11 +516,11 @@
 
 		if (__predict_true(nameiop != CREATE ||
 		    (cnflags & ISLASTCN) == 0)) {
-			COUNT(cpup->cpu_stats, ncs_neghits);
+			COUNT(cpup, ncs_neghits);
 			/* found neg entry; vn is already null from above */
 			ret_value = 1;
 		} else {
-			COUNT(cpup->cpu_stats, ncs_badhits);
+			COUNT(cpup, ncs_badhits);
 			/*
 			 * Last component and we are renaming or
 			 * deleting, the cache entry is invalid,
@@ -489,32 +539,27 @@
 	vp = ncp->nc_vp;
 	mutex_enter(vp->v_interlock);
 	mutex_exit(&ncp->nc_lock);
+	mutex_exit(&cpup->cpu_lock);
 
 	/*
-	 * Drop per-cpu lock across the call to vget(), take it
-	 * again for the sake of the stats update.
+	 * Unlocked except for the vnode interlock.  Call vget().
 	 */
-	mutex_exit(&cpup->cpu_lock);
 	error = vget(vp, LK_NOWAIT);
-	mutex_enter(&cpup->cpu_lock);
 	if (error) {
 		KASSERT(error == EBUSY);
 		/*
 		 * This vnode is being cleaned out.
 		 * XXX badhits?
 		 */
-		COUNT(cpup->cpu_stats, ncs_falsehits);
+		COUNT_UNL(cpup, ncs_falsehits);
 		/* found nothing */
-		ret_value = 0;
-	} else {
-		COUNT(cpup->cpu_stats, ncs_goodhits);
-		/* found it */
-		*vn_ret = vp;
-		ret_value = 1;
+		return 0;
 	}
-	mutex_exit(&cpup->cpu_lock);
 
-	return ret_value;
+	COUNT_UNL(cpup, ncs_goodhits);
+	/* found it */
+	*vn_ret = vp;
+	return 1;
 }
 
 
@@ -529,7 +574,7 @@
 	struct namecache *ncp;
 	struct vnode *vp;
 	struct nchcpu *cpup;
-	int error, ret_value;
+	int error;
 
 	/* Establish default results. */
 	if (iswht_ret != NULL) {
@@ -545,14 +590,14 @@
 	cpup = curcpu()->ci_data.cpu_nch;
 	mutex_enter(&cpup->cpu_lock);
 	if (__predict_false(namelen > NCHNAMLEN)) {
-		COUNT(cpup->cpu_stats, ncs_long);
+		COUNT(cpup, ncs_long);
 		mutex_exit(&cpup->cpu_lock);
 		/* found nothing */
 		return 0;
 	}
 	ncp = cache_lookup_entry(dvp, name, namelen);
 	if (__predict_false(ncp == NULL)) {
-		COUNT(cpup->cpu_stats, ncs_miss);
+		COUNT(cpup, ncs_miss);
 		mutex_exit(&cpup->cpu_lock);
 		/* found nothing */
 		return 0;
@@ -567,7 +612,7 @@
 			/*cnp->cn_flags |= ncp->nc_flags;*/
 			*iswht_ret = (ncp->nc_flags & ISWHITEOUT) != 0;
 		}
-		COUNT(cpup->cpu_stats, ncs_neghits);
+		COUNT(cpup, ncs_neghits);
 		mutex_exit(&ncp->nc_lock);
 		mutex_exit(&cpup->cpu_lock);
 		/* found negative entry; vn is already null from above */
@@ -575,32 +620,27 @@
 	}
 	mutex_enter(vp->v_interlock);
 	mutex_exit(&ncp->nc_lock);
+	mutex_exit(&cpup->cpu_lock);
 
 	/*
-	 * Drop per-cpu lock across the call to vget(), take it
-	 * again for the sake of the stats update.
+	 * Unlocked except for the vnode interlock.  Call vget().
 	 */
-	mutex_exit(&cpup->cpu_lock);
 	error = vget(vp, LK_NOWAIT);
-	mutex_enter(&cpup->cpu_lock);
 	if (error) {
 		KASSERT(error == EBUSY);
 		/*
 		 * This vnode is being cleaned out.
 		 * XXX badhits?
 		 */
-		COUNT(cpup->cpu_stats, ncs_falsehits);
+		COUNT_UNL(cpup, ncs_falsehits);
 		/* found nothing */
-		ret_value = 0;
-	} else {
-		COUNT(cpup->cpu_stats, ncs_goodhits); /* XXX can be "badhits" */
-		/* found it */
-		*vn_ret = vp;
-		ret_value = 1;
+		return 0;
 	}
-	mutex_exit(&cpup->cpu_lock);
 
-	return ret_value;
+	COUNT_UNL(cpup, ncs_goodhits); /* XXX can be "badhits" */
+	/* found it */
+	*vn_ret = vp;
+	return 1;
 }
 
 /*
@@ -620,8 +660,8 @@
 {
 	struct namecache *ncp;
 	struct vnode *dvp;
-	struct nchcpu *cpup;
 	struct ncvhashhead *nvcpp;
+	struct nchcpu *cpup;
 	char *bp;
 	int error, nlen;
 
@@ -629,8 +669,14 @@
 		goto out;
 
 	nvcpp = &ncvhashtbl[NCVHASH(vp)];
-	cpup = curcpu()->ci_data.cpu_nch;
 
+	/*
+	 * We increment the reverse-cache counters in the local CPU's
+	 * per-cpu stats.  We don't take the per-cpu lock, however:
+	 * this function is the only place these counters are
+	 * incremented, so nothing will race with us to increment them.
+	 */
+	cpup = curcpu()->ci_data.cpu_nch;
 	mutex_enter(namecache_lock);
 	LIST_FOREACH(ncp, nvcpp, nc_vhash) {
 		mutex_enter(&ncp->nc_lock);
@@ -648,9 +694,7 @@
 			    ncp->nc_name[1] == '.')
 				panic("cache_revlookup: found entry for ..");
 #endif
-			mutex_enter(&cpup->cpu_lock);
-			COUNT(cpup->cpu_stats, ncs_revhits);
-			mutex_exit(&cpup->cpu_lock);
+			COUNT(cpup, ncs_revhits);
 			nlen = ncp->nc_nlen;
 
 			if (bufp) {
@@ -682,9 +726,7 @@
 		}
 		mutex_exit(&ncp->nc_lock);
 	}
-	mutex_enter(&cpup->cpu_lock);
-	COUNT(cpup->cpu_stats, ncs_revmiss);
-	mutex_exit(&cpup->cpu_lock);
+	COUNT(cpup, ncs_revmiss);
 	mutex_exit(namecache_lock);
  out:
 	*dvpp = NULL;
@@ -722,8 +764,11 @@
 	numcache++;
 
 	/*
-	 * Concurrent lookups in the same directory may race for a
-	 * cache entry.  if there's a duplicated entry, free it.
+	 * Concurrent lookups in the same directory may race to add a
+	 * cache entry.  If there's a duplicated entry, free it.
+	 * XXX Why not keep the entry already in the cache instead
+	 * of replacing it?  We should count this to see how often
+	 * it happens.
 	 */
 	oncp = cache_lookup_entry(dvp, name, namelen);
 	if (oncp) {
@@ -1132,9 +1177,7 @@
 {
 	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;
 
-	mutex_enter(&cpup->cpu_lock);
-	COUNT(cpup->cpu_stats, ncs_pass2);
-	mutex_exit(&cpup->cpu_lock);
+	COUNT_UNL(cpup, ncs_pass2);
 }
 
 void
@@ -1142,15 +1185,23 @@
 {
 	struct nchcpu *cpup = curcpu()->ci_data.cpu_nch;
 
-	mutex_enter(&cpup->cpu_lock);
-	COUNT(cpup->cpu_stats, ncs_2passes);
-	mutex_exit(&cpup->cpu_lock);
+	COUNT_UNL(cpup, ncs_2passes);
 }
 
+/*
+ * Fetch the current values of the stats.  We return the totals last
+ * harvested into nchstats by cache_reclaim() (less than a second old)
+ * plus, if CACHE_STATS_CURRENT is defined, the increments since then.
+ */
 static int
 cache_stat_sysctl(SYSCTLFN_ARGS)
 {
-	struct nchstats_sysctl stats;
+	struct nchstats stats;
+	struct nchcpu *my_cpup;
+#ifdef CACHE_STATS_CURRENT
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+#endif	/* CACHE_STATS_CURRENT */
 
 	if (oldp == NULL) {
 		*oldlenp = sizeof(stats);
@@ -1162,21 +1213,33 @@
 		return 0;
 	}
 
-	memset(&stats, 0, sizeof(stats));
-
+	/*
+	 * Take this CPU's per-cpu lock, which is enough to hold off
+	 * cache_reclaim()'s update of the totals while doing minimal
+	 * damage to concurrent lookups.
+	 */
 	sysctl_unlock();
-	cache_lock_cpus();
-	stats.ncs_goodhits = nchstats.ncs_goodhits;
-	stats.ncs_neghits = nchstats.ncs_neghits;
-	stats.ncs_badhits = nchstats.ncs_badhits;
-	stats.ncs_falsehits = nchstats.ncs_falsehits;
-	stats.ncs_miss = nchstats.ncs_miss;
-	stats.ncs_long = nchstats.ncs_long;
-	stats.ncs_pass2 = nchstats.ncs_pass2;
-	stats.ncs_2passes = nchstats.ncs_2passes;
-	stats.ncs_revhits = nchstats.ncs_revhits;
-	stats.ncs_revmiss = nchstats.ncs_revmiss;
-	cache_unlock_cpus();
+	my_cpup = curcpu()->ci_data.cpu_nch;
+	mutex_enter(&my_cpup->cpu_lock);
+	stats = nchstats;
+#ifdef CACHE_STATS_CURRENT
+	/* Add in per-cpu increments. */
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		struct nchcpu *cpup = ci->ci_data.cpu_nch;
+
+		ADD(stats, cpup, ncs_goodhits);
+		ADD(stats, cpup, ncs_neghits);
+		ADD(stats, cpup, ncs_badhits);
+		ADD(stats, cpup, ncs_falsehits);
+		ADD(stats, cpup, ncs_miss);
+		ADD(stats, cpup, ncs_long);
+		ADD(stats, cpup, ncs_pass2);
+		ADD(stats, cpup, ncs_2passes);
+		ADD(stats, cpup, ncs_revhits);
+		ADD(stats, cpup, ncs_revmiss);
+	}
+#endif	/* CACHE_STATS_CURRENT */
+	mutex_exit(&my_cpup->cpu_lock);
 	sysctl_relock();
 
 	*oldlenp = sizeof(stats);

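
For completeness, here is a sketch of a userland consumer of the reworked
handler.  Since the sysctl now copies out one struct nchstats (all uint64_t
members), a monitoring tool can read it directly; the sysctl node name is
taken from the command line here because the registration isn't touched by
this patch, and the program assumes <sys/namei.h> exposes struct nchstats to
userland as it did before:

#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/namei.h>

#include <err.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char **argv)
{
	struct nchstats stats;
	size_t len = sizeof(stats);

	if (argc != 2)
		errx(EXIT_FAILURE, "usage: %s <namecache-stats-sysctl-node>",
		    getprogname());

	/* The handler copies out a single struct nchstats. */
	if (sysctlbyname(argv[1], &stats, &len, NULL, 0) == -1)
		err(EXIT_FAILURE, "sysctlbyname %s", argv[1]);

	printf("goodhits %" PRIu64 " neghits %" PRIu64 " miss %" PRIu64 "\n",
	    stats.ncs_goodhits, stats.ncs_neghits, stats.ncs_miss);
	return EXIT_SUCCESS;
}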