Module Name:    src
Committed By:   ad
Date:           Thu Dec  5 18:50:41 UTC 2019

Modified Files:
        src/common/lib/libc/gen: radixtree.c

Log Message:
Delete the counter from "struct radix_tree_node", and in the one place we
need a non-zero check, substitute with a bitwise OR of all
values in the node.  The structure then becomes cache line aligned.

For each node we now need only touch 2 cache lines instead of 3, which makes
all the operations faster (measured), amortises the cost of not having a
counter, and will avoid intra-pool-page false sharing on MP.
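
As a back-of-the-envelope illustration of the layout claim (not part of
the commit; it assumes the tree's usual configuration of 16 pointers per
node, LP64 pointers, and a 64 byte coherency unit), the node shrinks from
three cache lines' worth of footprint to exactly two:

#include <stdio.h>

#define RADIX_TREE_PTR_PER_NODE	16	/* as configured in radixtree.c */

/* old layout: the counter pushes the node past two cache lines */
struct old_node {
	void *n_ptrs[RADIX_TREE_PTR_PER_NODE];
	unsigned int n_nptrs;
};

/* new layout: pointers only; 16 * 8 = 128 bytes = exactly two lines */
struct new_node {
	void *n_ptrs[RADIX_TREE_PTR_PER_NODE];
};

int
main(void)
{
	printf("old: %zu bytes, new: %zu bytes\n",
	    sizeof(struct old_node), sizeof(struct new_node));
	return 0;
}

On LP64 the old node is 136 bytes, so nodes packed into a pool page end
up straddling three 64 byte lines and sharing lines with their
neighbours; the new node is 128 bytes and, with the pool cache now
aligning nodes to coherency_unit, each one occupies two lines of its own.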


To generate a diff of this commit:
cvs rdiff -u -r1.18 -r1.19 src/common/lib/libc/gen/radixtree.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/common/lib/libc/gen/radixtree.c
diff -u src/common/lib/libc/gen/radixtree.c:1.18 src/common/lib/libc/gen/radixtree.c:1.19
--- src/common/lib/libc/gen/radixtree.c:1.18	Thu Dec  5 18:32:25 2019
+++ src/common/lib/libc/gen/radixtree.c	Thu Dec  5 18:50:41 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: radixtree.c,v 1.18 2019/12/05 18:32:25 ad Exp $	*/
+/*	$NetBSD: radixtree.c,v 1.19 2019/12/05 18:50:41 ad Exp $	*/
 
 /*-
  * Copyright (c)2011,2012,2013 YAMAMOTO Takashi,
@@ -112,7 +112,7 @@
 #include <sys/cdefs.h>
 
 #if defined(_KERNEL) || defined(_STANDALONE)
-__KERNEL_RCSID(0, "$NetBSD: radixtree.c,v 1.18 2019/12/05 18:32:25 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: radixtree.c,v 1.19 2019/12/05 18:50:41 ad Exp $");
 #include <sys/param.h>
 #include <sys/errno.h>
 #include <sys/pool.h>
@@ -122,7 +122,7 @@ __KERNEL_RCSID(0, "$NetBSD: radixtree.c,
 #include <lib/libsa/stand.h>
 #endif /* defined(_STANDALONE) */
 #else /* defined(_KERNEL) || defined(_STANDALONE) */
-__RCSID("$NetBSD: radixtree.c,v 1.18 2019/12/05 18:32:25 ad Exp $");
+__RCSID("$NetBSD: radixtree.c,v 1.19 2019/12/05 18:50:41 ad Exp $");
 #include <assert.h>
 #include <errno.h>
 #include <stdbool.h>
@@ -185,11 +185,16 @@ entry_match_p(void *p, unsigned int tagm
  * radix_tree_node: an intermediate node
  *
  * we don't care about the type of leaf nodes.  they are just void *.
+ *
+ * we used to maintain a count of non-NULL pointers in this structure, but it
+ * prevented it from being aligned to a cache line boundary; the performance
+ * benefit from being cache friendly is greater than the benefit of having
+ * a dedicated count value, especially in multi-processor situations where
+ * we need to avoid intra-pool-page false sharing.
  */
 
 struct radix_tree_node {
 	void *n_ptrs[RADIX_TREE_PTR_PER_NODE];
-	unsigned int n_nptrs;	/* # of non-NULL pointers in n_ptrs */
 };
 
 /*
@@ -340,8 +345,8 @@ radix_tree_init(void)
 {
 
 	radix_tree_node_cache = pool_cache_init(sizeof(struct radix_tree_node),
-	    0, 0, 0, "radix_tree_node", NULL, IPL_NONE, radix_tree_node_ctor,
-	    NULL, NULL);
+	    coherency_unit, 0, 0, "radixnode", NULL, IPL_NONE,
+	    radix_tree_node_ctor, NULL, NULL);
 	KASSERT(radix_tree_node_cache != NULL);
 }
 #endif /* defined(_KERNEL) */
@@ -349,17 +354,57 @@ radix_tree_init(void)
 static bool __unused
 radix_tree_node_clean_p(const struct radix_tree_node *n)
 {
+#if RADIX_TREE_PTR_PER_NODE > 16
 	unsigned int i;
 
-	if (n->n_nptrs != 0) {
-		return false;
-	}
 	for (i = 0; i < RADIX_TREE_PTR_PER_NODE; i++) {
 		if (n->n_ptrs[i] != NULL) {
 			return false;
 		}
 	}
 	return true;
+#else /* RADIX_TREE_PTR_PER_NODE > 16 */
+	uintptr_t sum;
+
+	/*
+	 * Unrolling the above is much better than a tight loop with two
+	 * test+branch pairs.  On x86 with gcc 5.5.0 this compiles into
+	 * 19 instructions including the "return" and prologue &
+	 * epilogue.
+	 */
+	sum = (uintptr_t)n->n_ptrs[0];
+	sum |= (uintptr_t)n->n_ptrs[1];
+	sum |= (uintptr_t)n->n_ptrs[2];
+	sum |= (uintptr_t)n->n_ptrs[3];
+#if RADIX_TREE_PTR_PER_NODE > 4
+	sum |= (uintptr_t)n->n_ptrs[4];
+	sum |= (uintptr_t)n->n_ptrs[5];
+	sum |= (uintptr_t)n->n_ptrs[6];
+	sum |= (uintptr_t)n->n_ptrs[7];
+#endif
+#if RADIX_TREE_PTR_PER_NODE > 8
+	sum |= (uintptr_t)n->n_ptrs[8];
+	sum |= (uintptr_t)n->n_ptrs[9];
+	sum |= (uintptr_t)n->n_ptrs[10];
+	sum |= (uintptr_t)n->n_ptrs[11];
+	sum |= (uintptr_t)n->n_ptrs[12];
+	sum |= (uintptr_t)n->n_ptrs[13];
+	sum |= (uintptr_t)n->n_ptrs[14];
+	sum |= (uintptr_t)n->n_ptrs[15];
+#endif
+	return sum == 0;
+#endif /* RADIX_TREE_PTR_PER_NODE > 16 */
+}
+
+static int __unused
+radix_tree_node_count_ptrs(const struct radix_tree_node *n)
+{
+	unsigned int i, c;
+
+	for (i = c = 0; i < RADIX_TREE_PTR_PER_NODE; i++) {
+		c += (n->n_ptrs[i] != NULL);
+	}
+	return c;
 }
 
 static struct radix_tree_node *
@@ -421,7 +466,6 @@ radix_tree_grow(struct radix_tree *t, un
 			 */
 			return ENOMEM;
 		}
-		n->n_nptrs = 1;
 		n->n_ptrs[0] = t->t_root;
 		t->t_root = entry_compose(n, tagmask);
 		t->t_height++;
@@ -516,10 +560,6 @@ radix_tree_lookup_ptr(struct radix_tree 
 				return NULL;
 			}
 			*vpp = c;
-			if (n != NULL) {
-				KASSERT(n->n_nptrs < RADIX_TREE_PTR_PER_NODE);
-				n->n_nptrs++;
-			}
 		}
 		n = c;
 		vpp = &n->n_ptrs[i];
@@ -531,10 +571,6 @@ radix_tree_lookup_ptr(struct radix_tree 
 	}
 	if (alloc) {
 		KASSERT(*vpp == NULL);
-		if (n != NULL) {
-			KASSERT(n->n_nptrs < RADIX_TREE_PTR_PER_NODE);
-			n->n_nptrs++;
-		}
 	}
 	if (path != NULL) {
 		path->p_lastidx = refs - path->p_refs;
@@ -634,9 +670,7 @@ radix_tree_remove_node(struct radix_tree
 		entry = *pptr;
 		n = entry_ptr(entry);
 		KASSERT(n != NULL);
-		KASSERT(n->n_nptrs > 0);
-		n->n_nptrs--;
-		if (n->n_nptrs > 0) {
+		if (!radix_tree_node_clean_p(n)) {
 			break;
 		}
 		radix_tree_free_node(n);
@@ -663,7 +697,7 @@ radix_tree_remove_node(struct radix_tree
 		entry = *pptr;
 		n = entry_ptr(entry);
 		KASSERT(n != NULL);
-		KASSERT(n->n_nptrs > 0);
+		KASSERT(!radix_tree_node_clean_p(n));
 		newmask = any_children_tagmask(n);
 		if (newmask == entry_tagmask(entry)) {
 			break;
@@ -702,10 +736,7 @@ gang_lookup_init(struct radix_tree *t, u
 {
 	void **vpp __unused;
 
-#if defined(DIAGNOSTIC)
-	vpp =
-#endif /* defined(DIAGNOSTIC) */
-	radix_tree_lookup_ptr(t, idx, path, false, tagmask);
+	vpp = radix_tree_lookup_ptr(t, idx, path, false, tagmask);
 	KASSERT(vpp == NULL ||
 	    vpp == path_pptr(t, path, path->p_lastidx));
 	KASSERT(&t->t_root == path_pptr(t, path, 0));
@@ -795,7 +826,15 @@ scan_siblings:
 			break;
 		}
 		n = path_node(t, path, lastidx - 1);
-		if (*vpp != NULL && n->n_nptrs == 1) {
+		/*
+		 * we used to have an integer counter in the node, and this
+		 * optimization made sense then, even though marginal.  it
+		 * no longer provides benefit with the structure cache line
+		 * aligned and the counter replaced by an unrolled sequence
+		 * testing the pointers in batch.
+		 */
+#if 0
+		if (*vpp != NULL && radix_tree_node_count_ptrs(n) == 1) {
 			/*
 			 * optimization; if the node has only a single pointer
 			 * and we've already visited it, there's no point to
@@ -803,6 +842,7 @@ scan_siblings:
 			 */
 			goto no_siblings;
 		}
+#endif /* 0 */
 		for (i = vpp - n->n_ptrs + step; i != guard; i += step) {
 			KASSERT(i < RADIX_TREE_PTR_PER_NODE);
 			if (entry_match_p(n->n_ptrs[i], tagmask)) {
@@ -986,10 +1026,7 @@ radix_tree_set_tag(struct radix_tree *t,
 	int i;
 
 	KASSERT(tagmask != 0);
-#if defined(DIAGNOSTIC)
-	vpp =
-#endif /* defined(DIAGNOSTIC) */
-	radix_tree_lookup_ptr(t, idx, &path, false, 0);
+	vpp = radix_tree_lookup_ptr(t, idx, &path, false, 0);
 	KASSERT(vpp != NULL);
 	KASSERT(*vpp != NULL);
 	KASSERT(path.p_lastidx == t->t_height);
@@ -1087,7 +1124,7 @@ radix_tree_dump_node(const struct radix_
 	}
 	n = entry_ptr(vp);
 	assert(any_children_tagmask(n) == entry_tagmask(vp));
-	printf(" (%u children)\n", n->n_nptrs);
+	printf(" (%u children)\n", radix_tree_node_count_ptrs(n));
 	for (i = 0; i < __arraycount(n->n_ptrs); i++) {
 		void *c;
 

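For anyone who wants to poke at the non-zero check outside the kernel,
here is a trimmed userland harness (illustrative only: the committed
version unrolls the OR by hand as in the diff above, while this sketch
leaves the unrolling to the compiler):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RADIX_TREE_PTR_PER_NODE	16

struct radix_tree_node {
	void *n_ptrs[RADIX_TREE_PTR_PER_NODE];
};

/* reference check: scan until the first non-NULL pointer */
static bool
clean_p_scan(const struct radix_tree_node *n)
{
	unsigned int i;

	for (i = 0; i < RADIX_TREE_PTR_PER_NODE; i++) {
		if (n->n_ptrs[i] != NULL) {
			return false;
		}
	}
	return true;
}

/* OR every slot together and test once: no counter to maintain */
static bool
clean_p_or(const struct radix_tree_node *n)
{
	uintptr_t sum = 0;
	unsigned int i;

	for (i = 0; i < RADIX_TREE_PTR_PER_NODE; i++) {
		sum |= (uintptr_t)n->n_ptrs[i];
	}
	return sum == 0;
}

int
main(void)
{
	struct radix_tree_node n = { .n_ptrs = { NULL } };
	int leaf;

	assert(clean_p_scan(&n) && clean_p_or(&n));
	n.n_ptrs[7] = &leaf;
	assert(!clean_p_scan(&n) && !clean_p_or(&n));
	printf("scan and OR agree\n");
	return 0;
}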