On Wed, May 25, 2022 at 11:48 AM Masahiko Sawada <sawada.m...@gmail.com> wrote:
>
> On Tue, May 10, 2022 at 6:58 PM John Naylor
> <john.nay...@enterprisedb.com> wrote:
> >
> > On Tue, May 10, 2022 at 8:52 AM Masahiko Sawada <sawada.m...@gmail.com> 
> > wrote:
> > >
> > > Overall, radix tree implementations have good numbers. Once we got an
> > > agreement on moving in this direction, I'll start a new thread for
> > > that and move the implementation further; there are many things to do
> > > and discuss: deletion, API design, SIMD support, more tests etc.
> >
> > +1
> >
>
> Thanks!
>
> I've attached an updated version patch. It is still WIP but I've
> implemented deletion and improved test cases and comments.

I've attached an updated version patch that changes the configure
script. I'm still studying how to support AVX2 on msvc build. Also,
added more regression tests.

The integration with lazy vacuum and parallel vacuum is missing for
now. In order to support parallel vacuum, we need to have the radix
tree support to be created on DSA.

Added this item to the next CF.

Regards,

--
Masahiko Sawada
EDB:  https://www.enterprisedb.com/
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index d3562d6fee..a56d6e89da 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -676,3 +676,27 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
+
+# PGAC_AVX2_INTRINSICS
+# --------------------
+# Check if the compiler supports the Intel AVX2 instructinos.
+#
+# If the intrinsics are supported, sets pgac_avx2_intrinsics, and CFLAGS_AVX2.
+AC_DEFUN([PGAC_AVX2_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _mm256_set_1_epi8 _mm256_cmpeq_epi8 _mm256_movemask_epi8 CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
+  [__m256i vec = _mm256_set1_epi8(0);
+   __m256i cmp = _mm256_cmpeq_epi8(vec, vec);
+   return _mm256_movemask_epi8(cmp) > 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_AVX2="$1"
+  pgac_avx2_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX2_INTRINSICS
diff --git a/configure b/configure
index 7dec6b7bf9..6ebc15a8c1 100755
--- a/configure
+++ b/configure
@@ -645,6 +645,7 @@ XGETTEXT
 MSGMERGE
 MSGFMT_FLAGS
 MSGFMT
+CFLAGS_AVX2
 PG_CRC32C_OBJS
 CFLAGS_ARMV8_CRC32C
 CFLAGS_SSE42
@@ -18829,6 +18830,82 @@ $as_echo "slicing-by-8" >&6; }
 fi
 
 
+# Check for Intel AVX2 intrinsics.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm256i CFLAGS=" >&5
+$as_echo_n "checking for _mm256i CFLAGS=... " >&6; }
+if ${pgac_cv_avx2_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+__m256i vec = _mm256_set1_epi8(0);
+   __m256i cmp = _mm256_cmpeq_epi8(vec, vec);
+   return _mm256_movemask_epi8(cmp) > 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx2_intrinsics_=yes
+else
+  pgac_cv_avx2_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_intrinsics_" >&5
+$as_echo "$pgac_cv_avx2_intrinsics_" >&6; }
+if test x"$pgac_cv_avx2_intrinsics_" = x"yes"; then
+  CFLAGS_AVX2=""
+  pgac_avx2_intrinsics=yes
+fi
+
+if test x"pgac_avx2_intrinsics" != x"yes"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm256i CFLAGS=-mavx2" >&5
+$as_echo_n "checking for _mm256i CFLAGS=-mavx2... " >&6; }
+if ${pgac_cv_avx2_intrinsics__mavx2+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -mavx2"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+__m256i vec = _mm256_set1_epi8(0);
+   __m256i cmp = _mm256_cmpeq_epi8(vec, vec);
+   return _mm256_movemask_epi8(cmp) > 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx2_intrinsics__mavx2=yes
+else
+  pgac_cv_avx2_intrinsics__mavx2=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_intrinsics__mavx2" >&5
+$as_echo "$pgac_cv_avx2_intrinsics__mavx2" >&6; }
+if test x"$pgac_cv_avx2_intrinsics__mavx2" = x"yes"; then
+  CFLAGS_AVX2="-mavx2"
+  pgac_avx2_intrinsics=yes
+fi
+
+fi
+
 
 # Select semaphore implementation type.
 if test "$PORTNAME" != "win32"; then
diff --git a/configure.ac b/configure.ac
index d093fb88dd..6b6d095306 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2300,6 +2300,12 @@ else
 fi
 AC_SUBST(PG_CRC32C_OBJS)
 
+# Check for Intel AVX2 intrinsics.
+PGAC_AVX2_INTRINSICS([])
+if test x"pgac_avx2_intrinsics" != x"yes"; then
+  PGAC_AVX2_INTRINSICS([-mavx2])
+fi
+AC_SUBST(CFLAGS_AVX2)
 
 # Select semaphore implementation type.
 if test "$PORTNAME" != "win32"; then
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 051718e4fe..9717094724 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -263,6 +263,7 @@ CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
 CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
 CFLAGS_SSE42 = @CFLAGS_SSE42@
 CFLAGS_ARMV8_CRC32C = @CFLAGS_ARMV8_CRC32C@
+CFLAGS_AVX2 = @CFLAGS_AVX2@
 PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
 CXXFLAGS = @CXXFLAGS@
 
diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile
index 9dad31398a..5e4516ca90 100644
--- a/src/backend/lib/Makefile
+++ b/src/backend/lib/Makefile
@@ -22,6 +22,10 @@ OBJS = \
 	integerset.o \
 	knapsack.o \
 	pairingheap.o \
+	radixtree.o \
 	rbtree.o \
 
+# radixtree.o need CFLAGS_AVX2
+radixtree.o: CFLAGS+=$(CFLAGS_AVX2)
+
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/lib/radixtree.c b/src/backend/lib/radixtree.c
new file mode 100644
index 0000000000..bf87f932fd
--- /dev/null
+++ b/src/backend/lib/radixtree.c
@@ -0,0 +1,1763 @@
+/*-------------------------------------------------------------------------
+ *
+ * radixtree.c
+ *		Implementation for adaptive radix tree.
+ *
+ * This module employs the idea from the paper "The Adaptive Radix Tree: ARTful
+ * Indexing for Main-Memory Databases" by Viktor Leis, Alfons Kemper, and Thomas
+ * Neumann, 2013.
+ *
+ * There are some differences from the proposed implementation.  For instance,
+ * this radix tree module utilizes AVX2 instruction, enabling us to use 256-bit
+ * width SIMD vector, whereas 128-bit width SIMD vector is used in the paper.
+ * Also, there is no support for path compression and lazy path expansion. The
+ * radix tree supports fixed length of the key so we don't expect the tree level
+ * wouldn't be high.
+ *
+ * The key is a 64-bit unsigned integer and the value is a Datum. Both internal
+ * nodes and leaf nodes have the identical structure. For internal tree nodes,
+ * shift > 0, store the pointer to its child node as the value. The leaf nodes,
+ * shift == 0, also have the Datum value that is specified by the user.
+ *
+ * XXX: the radix tree node never be shrunk.
+ *
+ * Interface
+ * ---------
+ *
+ * radix_tree_create		- Create a new, empty radix tree
+ * radix_tree_free			- Free the radix tree
+ * radix_tree_insert		- Insert a key-value pair
+ * radix_tree_delete		- Delete a key-value pair
+ * radix_tree_begin_iterate	- Begin iterating through all key-value pairs
+ * radix_tree_iterate_next	- Return next key-value pair, if any
+ * radix_tree_end_iterate	- End iteration
+ *
+ * radix_tree_create() creates an empty radix tree in the given memory context
+ * and memory contexts for all kinds of radix tree node under the memory context.
+ *
+ * radix_tree_iterate_next() ensures returning key-value pairs in the ascending
+ * order of the key.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/lib/radixtree.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "port/pg_bitutils.h"
+#include "utils/memutils.h"
+#include "lib/radixtree.h"
+#include "lib/stringinfo.h"
+
+#if defined(__AVX2__)
+#include <immintrin.h>			/* AVX2 intrinsics */
+#endif
+
+/* The number of bits encoded in one tree level */
+#define RADIX_TREE_NODE_FANOUT	8
+
+/* The number of maximum slots in the node, used in node-256 */
+#define RADIX_TREE_NODE_MAX_SLOTS (1 << RADIX_TREE_NODE_FANOUT)
+
+/*
+ * Return the number of bits required to represent nslots slots, used
+ * in node-128 and node-256.
+ */
+#define RADIX_TREE_NODE_NSLOTS_BITS(nslots) ((nslots) / (sizeof(uint8) * BITS_PER_BYTE))
+
+/* Mask for extracting a chunk from the key */
+#define RADIX_TREE_CHUNK_MASK ((1 << RADIX_TREE_NODE_FANOUT) - 1)
+
+/* Maximum shift the radix tree uses */
+#define RADIX_TREE_MAX_SHIFT	key_get_shift(UINT64_MAX)
+
+/* Tree level the radix tree uses */
+#define RADIX_TREE_MAX_LEVEL	((sizeof(uint64) * BITS_PER_BYTE) / RADIX_TREE_NODE_FANOUT)
+
+/* Get a chunk from the key */
+#define GET_KEY_CHUNK(key, shift) \
+	((uint8) (((key) >> (shift)) & RADIX_TREE_CHUNK_MASK))
+
+/* Mapping from the value to the bit in is-set bitmap in the node-128 and node-256 */
+#define NODE_BITMAP_BYTE(v) ((v) / RADIX_TREE_NODE_FANOUT)
+#define NODE_BITMAP_BIT(v) (UINT64_C(1) << ((v) % RADIX_TREE_NODE_FANOUT))
+
+/* Enum used radix_tree_node_search() */
+typedef enum
+{
+	RADIX_TREE_FIND = 0,		/* find the key-value */
+	RADIX_TREE_DELETE,			/* delete the key-value */
+} radix_tree_action;
+
+/*
+ * supported radix tree nodes.
+ *
+ * XXX: should we add KIND_16 as we can utilize SSE2 SIMD instructions?
+ */
+typedef enum radix_tree_node_kind
+{
+	RADIX_TREE_NODE_KIND_4 = 0,
+	RADIX_TREE_NODE_KIND_32,
+	RADIX_TREE_NODE_KIND_128,
+	RADIX_TREE_NODE_KIND_256
+} radix_tree_node_kind;
+#define RADIX_TREE_NODE_KIND_COUNT 4
+
+/*
+ * Base type for all nodes types.
+ */
+typedef struct radix_tree_node
+{
+	/*
+	 * Number of children.  We use uint16 to be able to indicate 256 children
+	 * at ta fanout of 8.
+	 */
+	uint16		count;
+
+	/*
+	 * Shift indicates which part of the key space is represented by this
+	 * node. That is, the key is shifted by 'shift' and the lowest
+	 * RADIX_TREE_NODE_FANOUT bits are then represented in chunk.
+	 */
+	uint8		shift;
+	uint8		chunk;
+
+	/* Size class of the node */
+	radix_tree_node_kind kind;
+} radix_tree_node;
+
+/* Macros for radix tree nodes */
+#define IS_LEAF_NODE(n) (((radix_tree_node *) (n))->shift == 0)
+#define IS_EMPTY_NODE(n) (((radix_tree_node *) (n))->count == 0)
+#define NODE_HAS_FREE_SLOT(n) \
+	(((radix_tree_node *) (n))->count < \
+	 radix_tree_node_info[((radix_tree_node *) (n))->kind].max_slots)
+
+/*
+ * To reduce memory usage compared to a simple radix tree with a fixed fanout
+ * we use adaptive node sides, with different storage methods for different
+ * numbers of elements.
+ */
+typedef struct radix_tree_node_4
+{
+	radix_tree_node n;
+
+	/* 4 children, for key chunks */
+	uint8		chunks[4];
+	Datum		slots[4];
+} radix_tree_node_4;
+
+typedef struct radix_tree_node_32
+{
+	radix_tree_node n;
+
+	/* 32 children, for key chunks */
+	uint8		chunks[32];
+	Datum		slots[32];
+} radix_tree_node_32;
+
+#define RADIX_TREE_NODE_128_BITS RADIX_TREE_NODE_NSLOTS_BITS(128)
+typedef struct radix_tree_node_128
+{
+	radix_tree_node n;
+
+	/*
+	 * The index of slots for each fanout. 0 means unused whereas slots is
+	 * 0-indexed. So we can get the slot of the chunk C by slots[C] - 1.
+	 */
+	uint8		slot_idxs[RADIX_TREE_NODE_MAX_SLOTS];
+
+	/* A bitmap to track which slot is in use */
+	uint8		isset[RADIX_TREE_NODE_128_BITS];
+
+	Datum		slots[128];
+} radix_tree_node_128;
+
+#define RADIX_TREE_NODE_MAX_BITS RADIX_TREE_NODE_NSLOTS_BITS(RADIX_TREE_NODE_MAX_SLOTS)
+typedef struct radix_tree_node_256
+{
+	radix_tree_node n;
+
+	/* A bitmap to track which slot is in use */
+	uint8		isset[RADIX_TREE_NODE_MAX_BITS];
+
+	Datum		slots[RADIX_TREE_NODE_MAX_SLOTS];
+} radix_tree_node_256;
+
+/* Information of each size class */
+typedef struct radix_tree_node_info_elem
+{
+	const char *name;
+	int			max_slots;
+	Size		size;
+} radix_tree_node_info_elem;
+
+static radix_tree_node_info_elem radix_tree_node_info[] =
+{
+	{"radix tree node 4", 4, sizeof(radix_tree_node_4)},
+	{"radix tree node 32", 32, sizeof(radix_tree_node_32)},
+	{"radix tree node 128", 128, sizeof(radix_tree_node_128)},
+	{"radix tree node 256", 256, sizeof(radix_tree_node_256)},
+};
+
+/*
+ * As we descend a radix tree, we push the node to the stack. The stack is used
+ * at deletion.
+ */
+typedef struct radix_tree_stack_data
+{
+	radix_tree_node *node;
+	struct radix_tree_stack_data *parent;
+} radix_tree_stack_data;
+typedef radix_tree_stack_data *radix_tree_stack;
+
+/*
+ * Iteration support.
+ *
+ * Iterating the radix tree returns each pair of key and value in the ascending order
+ * of the key. To support this, the we iterate nodes of each level.
+ * radix_tree_iter_node_data struct is used to track the iteration within a node.
+ * radix_tree_iter has the array of this struct, stack, in order to track the iteration
+ * of every level. During the iteration, we also construct the key to return. The key
+ * is updated whenever we update the node iteration information, e.g., when advancing
+ * the current index within the node or when moving to the next node at the same level.
+ */
+typedef struct radix_tree_iter_node_data
+{
+	radix_tree_node *node;		/* current node being iterated */
+	int			current_idx;	/* current position. -1 for initial value */
+} radix_tree_iter_node_data;
+
+struct radix_tree_iter
+{
+	radix_tree *tree;
+
+	/* Track the iteration on nodes of each level */
+	radix_tree_iter_node_data stack[RADIX_TREE_MAX_LEVEL];
+	int			stack_len;
+
+	/* The key is being constructed during the iteration */
+	uint64		key;
+};
+
+/* A radix tree with nodes */
+struct radix_tree
+{
+	MemoryContext context;
+
+	radix_tree_node *root;
+	uint64		max_val;
+	uint64		num_keys;
+	MemoryContextData *slabs[RADIX_TREE_NODE_KIND_COUNT];
+
+	/* statistics */
+	uint64		mem_used;
+	int32		cnt[RADIX_TREE_NODE_KIND_COUNT];
+};
+
+static radix_tree_node *radix_tree_node_grow(radix_tree *tree, radix_tree_node *parent,
+											 radix_tree_node *node, uint64 key);
+static bool radix_tree_node_search_child(radix_tree_node *node, radix_tree_node **child_p,
+										 uint64 key);
+static bool radix_tree_node_search(radix_tree_node *node, Datum **slot_p, uint64 key,
+								   radix_tree_action action);
+static void radix_tree_extend(radix_tree *tree, uint64 key);
+static void radix_tree_new_root(radix_tree *tree, uint64 key, Datum val);
+static radix_tree_node *radix_tree_node_insert_child(radix_tree *tree,
+													 radix_tree_node *parent,
+													 radix_tree_node *node,
+													 uint64 key);
+static void radix_tree_node_insert_val(radix_tree *tree, radix_tree_node *parent,
+									   radix_tree_node *node, uint64 key, Datum val,
+									   bool *replaced_p);
+static inline void radix_tree_iter_update_key(radix_tree_iter *iter, uint8 chunk, uint8 shift);
+static Datum radix_tree_node_iterate_next(radix_tree_iter *iter, radix_tree_iter_node_data *node_iter,
+										  bool *found_p);
+static void radix_tree_store_iter_node(radix_tree_iter *iter, radix_tree_iter_node_data *node_iter,
+									   radix_tree_node *node);
+static void radix_tree_update_iter_stack(radix_tree_iter *iter, int from);
+static void radix_tree_verify_node(radix_tree_node *node);
+
+/*
+ * Helper functions for accessing each kind of nodes.
+ */
+static inline int
+node_32_search_eq(radix_tree_node_32 *node, uint8 chunk)
+{
+#ifdef __AVX2__
+	__m256i		_key = _mm256_set1_epi8(chunk);
+	__m256i		_data = _mm256_loadu_si256((__m256i_u *) node->chunks);
+	__m256i		_cmp = _mm256_cmpeq_epi8(_key, _data);
+	uint32		bitfield = _mm256_movemask_epi8(_cmp);
+
+	bitfield &= ((UINT64_C(1) << node->n.count) - 1);
+
+	return (bitfield) ? __builtin_ctz(bitfield) : -1;
+
+#else
+	for (int i = 0; i < node->n.count; i++)
+	{
+		if (node->chunks[i] > chunk)
+			return -1;
+
+		if (node->chunks[i] == chunk)
+			return i;
+	}
+
+	return -1;
+#endif							/* __AVX2__ */
+}
+
+/*
+ * This is a bit more complicated than search_chunk_array_16_eq(), because
+ * until recently no unsigned uint8 comparison instruction existed on x86. So
+ * we need to play some trickery using _mm_min_epu8() to effectively get
+ * <=. There never will be any equal elements in the current uses, but that's
+ * what we get here...
+ */
+static inline int
+node_32_search_le(radix_tree_node_32 *node, uint8 chunk)
+{
+#ifdef __AVX2__
+	__m256i		_key = _mm256_set1_epi8(chunk);
+	__m256i		_data = _mm256_loadu_si256((__m256i_u *) node->chunks);
+	__m256i		_min = _mm256_min_epu8(_key, _data);
+	__m256i		cmp = _mm256_cmpeq_epi8(_key, _min);
+	uint32_t	bitfield = _mm256_movemask_epi8(cmp);
+
+	bitfield &= ((UINT64_C(1) << node->n.count) - 1);
+
+	return (bitfield) ? __builtin_ctz(bitfield) : node->n.count;
+#else
+	int			index;
+
+	for (index = 0; index < node->n.count; index++)
+	{
+		if (node->chunks[index] >= chunk)
+			break;
+	}
+
+	return index;
+#endif							/* __AVX2__ */
+}
+
+/* Does the given chunk in the node has the value? */
+static inline bool
+node_128_is_chunk_used(radix_tree_node_128 *node, uint8 chunk)
+{
+	return (node->slot_idxs[chunk] != 0);
+}
+
+/* Is the slot in the node used? */
+static inline bool
+node_128_is_slot_used(radix_tree_node_128 *node, uint8 slot)
+{
+	return ((node->isset[NODE_BITMAP_BYTE(slot)] & NODE_BITMAP_BIT(slot)) != 0);
+}
+
+/* Set the slot at the corresponding chunk */
+static inline void
+node_128_set(radix_tree_node_128 *node, uint8 chunk, Datum val)
+{
+	int			slotpos = 0;
+
+	/* Search an unused slot */
+	while (node_128_is_slot_used(node, slotpos))
+		slotpos++;
+
+	node->slot_idxs[chunk] = slotpos + 1;
+	node->slots[slotpos] = val;
+	node->isset[NODE_BITMAP_BYTE(slotpos)] |= NODE_BITMAP_BIT(slotpos);
+}
+
+/* Delete the slot at the corresponding chunk */
+static inline void
+node_128_unset(radix_tree_node_128 *node, uint8 chunk)
+{
+	int			slotpos = node->slot_idxs[chunk] - 1;
+
+	if (!node_128_is_chunk_used(node, chunk))
+		return;
+
+	node->isset[NODE_BITMAP_BYTE(slotpos)] &= ~(NODE_BITMAP_BIT(slotpos));
+	node->slot_idxs[chunk] = 0;
+}
+
+/* Return the slot data corresponding to the chunk */
+static inline Datum
+node_128_get_chunk_slot(radix_tree_node_128 *node, uint8 chunk)
+{
+	return node->slots[node->slot_idxs[chunk] - 1];
+}
+
+/* Return true if the slot corresponding to the given chunk is in use */
+static inline bool
+node_256_is_chunk_used(radix_tree_node_256 *node, uint8 chunk)
+{
+	return (node->isset[NODE_BITMAP_BYTE(chunk)] & NODE_BITMAP_BIT(chunk)) != 0;
+}
+
+/* Set the slot at the given chunk position */
+static inline void
+node_256_set(radix_tree_node_256 *node, uint8 chunk, Datum slot)
+{
+	node->slots[chunk] = slot;
+	node->isset[NODE_BITMAP_BYTE(chunk)] |= NODE_BITMAP_BIT(chunk);
+}
+
+/* Set the slot at the given chunk position */
+static inline void
+node_256_unset(radix_tree_node_256 *node, uint8 chunk)
+{
+	node->isset[NODE_BITMAP_BYTE(chunk)] &= ~(NODE_BITMAP_BIT(chunk));
+}
+
+/*
+ * Return the shift that is satisfied to store the given key.
+ */
+inline static int
+key_get_shift(uint64 key)
+{
+	return (key == 0)
+		? 0
+		: (pg_leftmost_one_pos64(key) / RADIX_TREE_NODE_FANOUT) * RADIX_TREE_NODE_FANOUT;
+}
+
+/*
+ * Return the max value stored in a node with the given shift.
+ */
+static uint64
+shift_get_max_val(int shift)
+{
+	if (shift == RADIX_TREE_MAX_SHIFT)
+		return UINT64_MAX;
+
+	return (UINT64_C(1) << (shift + RADIX_TREE_NODE_FANOUT)) - 1;
+}
+
+/*
+ * Allocate a new node with the given node kind.
+ */
+static radix_tree_node *
+radix_tree_alloc_node(radix_tree *tree, radix_tree_node_kind kind)
+{
+	radix_tree_node *newnode;
+
+	newnode = (radix_tree_node *) MemoryContextAllocZero(tree->slabs[kind],
+														 radix_tree_node_info[kind].size);
+	newnode->kind = kind;
+
+	/* update the statistics */
+	tree->mem_used += GetMemoryChunkSpace(newnode);
+	tree->cnt[kind]++;
+
+	return newnode;
+}
+
+/* Free the given node */
+static void
+radix_tree_free_node(radix_tree *tree, radix_tree_node *node)
+{
+	/*
+	 * XXX: If we're deleting the root node, make the tree empty
+	 */
+	if (tree->root == node)
+	{
+		tree->root = NULL;
+	}
+
+	/* update the statistics */
+	tree->mem_used -= GetMemoryChunkSpace(node);
+	tree->cnt[node->kind]--;
+
+	Assert(tree->mem_used >= 0);
+	Assert(tree->cnt[node->kind] >= 0);
+
+	pfree(node);
+}
+
+/* Free a stack made by radix_tree_delete */
+static void
+radix_tree_free_stack(radix_tree_stack stack)
+{
+	radix_tree_stack ostack;
+
+	while (stack != NULL)
+	{
+		ostack = stack;
+		stack = stack->parent;
+		pfree(ostack);
+	}
+}
+
+/* Copy the common fields without the kind */
+static void
+radix_tree_copy_node_common(radix_tree_node *src, radix_tree_node *dst)
+{
+	dst->shift = src->shift;
+	dst->chunk = src->chunk;
+	dst->count = src->count;
+}
+
+/*
+ * The radix tree doesn't sufficient height. Extend the radix tree so it can
+ * store the key.
+ */
+static void
+radix_tree_extend(radix_tree *tree, uint64 key)
+{
+	int			max_shift;
+	int			shift = tree->root->shift + RADIX_TREE_NODE_FANOUT;
+
+	max_shift = key_get_shift(key);
+
+	/* Grow tree from 'shift' to 'max_shift' */
+	while (shift <= max_shift)
+	{
+		radix_tree_node_4 *node =
+		(radix_tree_node_4 *) radix_tree_alloc_node(tree, RADIX_TREE_NODE_KIND_4);
+
+		node->n.count = 1;
+		node->n.shift = shift;
+		node->chunks[0] = 0;
+		node->slots[0] = PointerGetDatum(tree->root);
+
+		tree->root->chunk = 0;
+		tree->root = (radix_tree_node *) node;
+
+		shift += RADIX_TREE_NODE_FANOUT;
+	}
+
+	tree->max_val = shift_get_max_val(max_shift);
+}
+
+/*
+ * Wrapper for radix_tree_node_search to search the pointer to the child node in the
+ * node.
+ *
+ * Return true if the corresponding child is found, otherwise return false.  On success,
+ * it sets child_p.
+ */
+static bool
+radix_tree_node_search_child(radix_tree_node *node, radix_tree_node **child_p, uint64 key)
+{
+	bool		found = false;
+	Datum	   *slot_ptr;
+
+	if (radix_tree_node_search(node, &slot_ptr, key, RADIX_TREE_FIND))
+	{
+		/* Found the pointer to the child node */
+		found = true;
+		*child_p = (radix_tree_node *) DatumGetPointer(*slot_ptr);
+	}
+
+	return found;
+}
+
+/*
+ * Return true if the corresponding slot is used, otherwise return false.  On success,
+ * sets the pointer to the slot to slot_p.
+ */
+static bool
+radix_tree_node_search(radix_tree_node *node, Datum **slot_p, uint64 key,
+					   radix_tree_action action)
+{
+	int			chunk = GET_KEY_CHUNK(key, node->shift);
+	bool		found = false;
+
+	switch (node->kind)
+	{
+		case RADIX_TREE_NODE_KIND_4:
+			{
+				radix_tree_node_4 *n4 = (radix_tree_node_4 *) node;
+
+				/* Do linear search */
+				for (int i = 0; i < n4->n.count; i++)
+				{
+					if (n4->chunks[i] > chunk)
+						break;
+
+					/*
+					 * If we find the chunk in the node, do the specified
+					 * action
+					 */
+					if (n4->chunks[i] == chunk)
+					{
+						if (action == RADIX_TREE_FIND)
+							*slot_p = &(n4->slots[i]);
+						else	/* RADIX_TREE_DELETE */
+						{
+							memmove(&(n4->chunks[i]), &(n4->chunks[i + 1]),
+									sizeof(uint8) * (n4->n.count - i - 1));
+							memmove(&(n4->slots[i]), &(n4->slots[i + 1]),
+									sizeof(radix_tree_node *) * (n4->n.count - i - 1));
+						}
+
+						found = true;
+						break;
+					}
+				}
+
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_32:
+			{
+				radix_tree_node_32 *n32 = (radix_tree_node_32 *) node;
+				int			idx;
+
+				/* Search by SIMD instructions */
+				idx = node_32_search_eq(n32, chunk);
+
+				/* If we find the chunk in the node, do the specified action */
+				if (idx >= 0)
+				{
+					if (action == RADIX_TREE_FIND)
+						*slot_p = &(n32->slots[idx]);
+					else		/* RADIX_TREE_DELETE */
+					{
+						memmove(&(n32->chunks[idx]), &(n32->chunks[idx + 1]),
+								sizeof(uint8) * (n32->n.count - idx - 1));
+						memmove(&(n32->slots[idx]), &(n32->slots[idx + 1]),
+								sizeof(radix_tree_node *) * (n32->n.count - idx - 1));
+					}
+
+					found = true;
+				}
+
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_128:
+			{
+				radix_tree_node_128 *n128 = (radix_tree_node_128 *) node;
+
+				/* If we find the chunk in the node, do the specified action */
+				if (node_128_is_chunk_used(n128, chunk))
+				{
+					if (action == RADIX_TREE_FIND)
+						*slot_p = &(n128->slots[n128->slot_idxs[chunk] - 1]);
+					else		/* RADIX_TREE_DELETE */
+						node_128_unset(n128, chunk);
+
+					found = true;
+				}
+
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_256:
+			{
+				radix_tree_node_256 *n256 = (radix_tree_node_256 *) node;
+
+				/* If we find the chunk in the node, do the specified action */
+				if (node_256_is_chunk_used(n256, chunk))
+				{
+					if (action == RADIX_TREE_FIND)
+						*slot_p = &(n256->slots[chunk]);
+					else		/* RADIX_TREE_DELETE */
+						node_256_unset(n256, chunk);
+
+					found = true;
+				}
+
+				break;
+			}
+	}
+
+	/* Update the statistics */
+	if (action == RADIX_TREE_DELETE && found)
+		node->count--;
+
+	return found;
+}
+
+/*
+ * Create a new node as the root. Subordinate nodes will be created during
+ * the insertion.
+ */
+static void
+radix_tree_new_root(radix_tree *tree, uint64 key, Datum val)
+{
+	radix_tree_node_4 *n4 =
+	(radix_tree_node_4 *) radix_tree_alloc_node(tree, RADIX_TREE_NODE_KIND_4);
+	int			shift = key_get_shift(key);
+
+	n4->n.shift = shift;
+	tree->max_val = shift_get_max_val(shift);
+	tree->root = (radix_tree_node *) n4;
+}
+
+/* Insert 'node' as a child node of 'parent' */
+static radix_tree_node *
+radix_tree_node_insert_child(radix_tree *tree, radix_tree_node *parent,
+							 radix_tree_node *node, uint64 key)
+{
+	radix_tree_node *newchild =
+	(radix_tree_node *) radix_tree_alloc_node(tree, RADIX_TREE_NODE_KIND_4);
+
+	Assert(!IS_LEAF_NODE(node));
+
+	newchild->shift = node->shift - RADIX_TREE_NODE_FANOUT;
+	newchild->chunk = GET_KEY_CHUNK(key, node->shift);
+
+	radix_tree_node_insert_val(tree, parent, node, key, PointerGetDatum(newchild), NULL);
+
+	return (radix_tree_node *) newchild;
+}
+
+/*
+ * Insert the value to the node. The node grows if it's full.
+ */
+static void
+radix_tree_node_insert_val(radix_tree *tree, radix_tree_node *parent,
+						   radix_tree_node *node, uint64 key, Datum val,
+						   bool *replaced_p)
+{
+	int			chunk = GET_KEY_CHUNK(key, node->shift);
+	bool		replaced = false;
+
+	switch (node->kind)
+	{
+		case RADIX_TREE_NODE_KIND_4:
+			{
+				radix_tree_node_4 *n4 = (radix_tree_node_4 *) node;
+				int			idx;
+
+				for (idx = 0; idx < n4->n.count; idx++)
+				{
+					if (n4->chunks[idx] >= chunk)
+						break;
+				}
+
+				if (NODE_HAS_FREE_SLOT(n4))
+				{
+					if (n4->n.count == 0)
+					{
+						/* the first key for this node, add it */
+					}
+					else if (n4->chunks[idx] == chunk)
+					{
+						/* found the key, replace it */
+						replaced = true;
+					}
+					else if (idx != n4->n.count)
+					{
+						/*
+						 * the key needs to be inserted in the middle of the
+						 * array, make space for the new key.
+						 */
+						memmove(&(n4->chunks[idx + 1]), &(n4->chunks[idx]),
+								sizeof(uint8) * (n4->n.count - idx));
+						memmove(&(n4->slots[idx + 1]), &(n4->slots[idx]),
+								sizeof(radix_tree_node *) * (n4->n.count - idx));
+					}
+
+					n4->chunks[idx] = chunk;
+					n4->slots[idx] = val;
+
+					/* Done */
+					break;
+				}
+
+				/* The node doesn't have free slot so needs to grow */
+				node = radix_tree_node_grow(tree, parent, node, key);
+				Assert(node->kind == RADIX_TREE_NODE_KIND_32);
+			}
+			/* FALLTHROUGH */
+		case RADIX_TREE_NODE_KIND_32:
+			{
+				radix_tree_node_32 *n32 = (radix_tree_node_32 *) node;
+				int			idx;
+
+				idx = node_32_search_le(n32, chunk);
+
+				if (NODE_HAS_FREE_SLOT(n32))
+				{
+					if (n32->n.count == 0)
+					{
+						/* first key for this node, add it */
+					}
+					else if (n32->chunks[idx] == chunk)
+					{
+						/* found the key, replace it */
+						replaced = true;
+					}
+					else if (idx != n32->n.count)
+					{
+						/*
+						 * the key needs to be inserted in the middle of the
+						 * array, make space for the new key.
+						 */
+						memmove(&(n32->chunks[idx + 1]), &(n32->chunks[idx]),
+								sizeof(uint8) * (n32->n.count - idx));
+						memmove(&(n32->slots[idx + 1]), &(n32->slots[idx]),
+								sizeof(radix_tree_node *) * (n32->n.count - idx));
+					}
+
+					n32->chunks[idx] = chunk;
+					n32->slots[idx] = val;
+
+					/* Done */
+					break;
+				}
+
+				/* The node doesn't have free slot so needs to grow */
+				node = radix_tree_node_grow(tree, parent, node, key);
+				Assert(node->kind == RADIX_TREE_NODE_KIND_128);
+			}
+			/* FALLTHROUGH */
+		case RADIX_TREE_NODE_KIND_128:
+			{
+				radix_tree_node_128 *n128 = (radix_tree_node_128 *) node;
+
+				if (node_128_is_chunk_used(n128, chunk))
+				{
+					/* found the existing value */
+					node_128_set(n128, chunk, val);
+					replaced = true;
+					break;
+				}
+
+				if (NODE_HAS_FREE_SLOT(n128))
+				{
+					node_128_set(n128, chunk, val);
+
+					/* Done */
+					break;
+				}
+
+				/* The node doesn't have free slot so needs to grow */
+				node = radix_tree_node_grow(tree, parent, node, key);
+				Assert(node->kind == RADIX_TREE_NODE_KIND_256);
+			}
+			/* FALLTHROUGH */
+		case RADIX_TREE_NODE_KIND_256:
+			{
+				radix_tree_node_256 *n256 = (radix_tree_node_256 *) node;
+
+				if (node_256_is_chunk_used(n256, chunk))
+					replaced = true;
+
+				node_256_set(n256, chunk, val);
+
+				break;
+			}
+	}
+
+	/* Update statistics */
+	if (!replaced)
+		node->count++;
+
+	if (replaced_p)
+		*replaced_p = replaced;
+
+	/*
+	 * Done. Finally, verify the chunk and value is inserted or replaced
+	 * properly in the node.
+	 */
+	radix_tree_verify_node(node);
+}
+
+/* Change the node type to the next larger one */
+static radix_tree_node *
+radix_tree_node_grow(radix_tree *tree, radix_tree_node *parent, radix_tree_node *node,
+					 uint64 key)
+{
+	radix_tree_node *newnode = NULL;
+
+	Assert(node->count == radix_tree_node_info[node->kind].max_slots);
+
+	switch (node->kind)
+	{
+		case RADIX_TREE_NODE_KIND_4:
+			{
+				radix_tree_node_4 *n4 = (radix_tree_node_4 *) node;
+				radix_tree_node_32 *new32 =
+				(radix_tree_node_32 *) radix_tree_alloc_node(tree, RADIX_TREE_NODE_KIND_32);
+
+				radix_tree_copy_node_common((radix_tree_node *) n4,
+											(radix_tree_node *) new32);
+
+				/* Copy both chunks and slots to the new node */
+				memcpy(&(new32->chunks), &(n4->chunks), sizeof(uint8) * 4);
+				memcpy(&(new32->slots), &(n4->slots), sizeof(Datum) * 4);
+
+				newnode = (radix_tree_node *) new32;
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_32:
+			{
+				radix_tree_node_32 *n32 = (radix_tree_node_32 *) node;
+				radix_tree_node_128 *new128 =
+				(radix_tree_node_128 *) radix_tree_alloc_node(tree, RADIX_TREE_NODE_KIND_128);
+
+				/* Copy both chunks and slots to the new node */
+				radix_tree_copy_node_common((radix_tree_node *) n32,
+											(radix_tree_node *) new128);
+
+				for (int i = 0; i < n32->n.count; i++)
+					node_128_set(new128, n32->chunks[i], n32->slots[i]);
+
+				newnode = (radix_tree_node *) new128;
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_128:
+			{
+				radix_tree_node_128 *n128 = (radix_tree_node_128 *) node;
+				radix_tree_node_256 *new256 =
+				(radix_tree_node_256 *) radix_tree_alloc_node(tree, RADIX_TREE_NODE_KIND_256);
+				int			cnt = 0;
+
+				radix_tree_copy_node_common((radix_tree_node *) n128,
+											(radix_tree_node *) new256);
+
+				for (int i = 0; i < RADIX_TREE_NODE_MAX_SLOTS && cnt < n128->n.count; i++)
+				{
+					if (!node_128_is_chunk_used(n128, i))
+						continue;
+
+					node_256_set(new256, i, node_128_get_chunk_slot(n128, i));
+					cnt++;
+				}
+
+				newnode = (radix_tree_node *) new256;
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_256:
+			elog(ERROR, "radix tree node-256 cannot grow");
+			break;
+	}
+
+	if (parent == node)
+	{
+		/* Replace the root node with the new large node */
+		tree->root = newnode;
+	}
+	else
+	{
+		Datum	   *slot_ptr = NULL;
+
+		/* Redirect from the parent to the node */
+		radix_tree_node_search(parent, &slot_ptr, key, RADIX_TREE_FIND);
+		Assert(*slot_ptr);
+		*slot_ptr = PointerGetDatum(newnode);
+	}
+
+	/* Verify the node has grown properly */
+	radix_tree_verify_node(newnode);
+
+	/* Free the old node */
+	radix_tree_free_node(tree, node);
+
+	return newnode;
+}
+
+/*
+ * Create the radix tree in the given memory context and return it.
+ */
+radix_tree *
+radix_tree_create(MemoryContext ctx)
+{
+	radix_tree *tree;
+	MemoryContext old_ctx;
+
+	old_ctx = MemoryContextSwitchTo(ctx);
+
+	tree = palloc(sizeof(radix_tree));
+	tree->max_val = 0;
+	tree->root = NULL;
+	tree->context = ctx;
+	tree->num_keys = 0;
+	tree->mem_used = 0;
+
+	/* Create the slab allocator for each size class */
+	for (int i = 0; i < RADIX_TREE_NODE_KIND_COUNT; i++)
+	{
+		tree->slabs[i] = SlabContextCreate(ctx,
+										   radix_tree_node_info[i].name,
+										   SLAB_DEFAULT_BLOCK_SIZE,
+										   radix_tree_node_info[i].size);
+		tree->cnt[i] = 0;
+	}
+
+	MemoryContextSwitchTo(old_ctx);
+
+	return tree;
+}
+
+/*
+ * Free the given radix tree.
+ */
+void
+radix_tree_free(radix_tree *tree)
+{
+	for (int i = 0; i < RADIX_TREE_NODE_KIND_COUNT; i++)
+		MemoryContextDelete(tree->slabs[i]);
+
+	pfree(tree);
+}
+
+/*
+ * Insert the key with the val.
+ *
+ * found_p is set to true if the key already present, otherwise false, if
+ * it's not NULL.
+ *
+ * XXX: do we need to support update_if_exists behavior?
+ */
+void
+radix_tree_insert(radix_tree *tree, uint64 key, Datum val, bool *found_p)
+{
+	int			shift;
+	bool		replaced;
+	radix_tree_node *node;
+	radix_tree_node *parent = tree->root;
+
+	/* Empty tree, create the root */
+	if (!tree->root)
+		radix_tree_new_root(tree, key, val);
+
+	/* Extend the tree if necessary */
+	if (key > tree->max_val)
+		radix_tree_extend(tree, key);
+
+	Assert(tree->root);
+
+	shift = tree->root->shift;
+	node = tree->root;
+	while (shift > 0)
+	{
+		radix_tree_node *child;
+
+		if (!radix_tree_node_search_child(node, &child, key))
+			child = radix_tree_node_insert_child(tree, parent, node, key);
+
+		Assert(child != NULL);
+
+		parent = node;
+		node = child;
+		shift -= RADIX_TREE_NODE_FANOUT;
+	}
+
+	/* arrived at a leaf */
+	Assert(IS_LEAF_NODE(node));
+
+	radix_tree_node_insert_val(tree, parent, node, key, val, &replaced);
+
+	/* Update the statistics */
+	if (!replaced)
+		tree->num_keys++;
+
+	if (found_p)
+		*found_p = replaced;
+}
+
+/*
+ * Search the given key in the radix tree. Return true if the key is successfully
+ * found, otherwise return false.  On success, we set the value to *val_p so
+ * it must not be NULL.
+ */
+bool
+radix_tree_search(radix_tree *tree, uint64 key, Datum *val_p)
+{
+	radix_tree_node *node;
+	Datum	   *value_ptr;
+	int			shift;
+
+	Assert(val_p);
+
+	if (!tree->root || key > tree->max_val)
+		return false;
+
+	node = tree->root;
+	shift = tree->root->shift;
+	while (shift > 0)
+	{
+		radix_tree_node *child;
+
+		if (!radix_tree_node_search_child(node, &child, key))
+			return false;
+
+		node = child;
+		shift -= RADIX_TREE_NODE_FANOUT;
+	}
+
+	/* We reached at a leaf node, search the corresponding slot */
+	Assert(IS_LEAF_NODE(node));
+
+	if (!radix_tree_node_search(node, &value_ptr, key, RADIX_TREE_FIND))
+		return false;
+
+	/* Found, set the value to return */
+	*val_p = *value_ptr;
+	return true;
+}
+
+/*
+ * Delete the given key from the radix tree. Return true if the key is found (and
+ * deleted), otherwise do nothing and return false.
+ */
+bool
+radix_tree_delete(radix_tree *tree, uint64 key)
+{
+	radix_tree_node *node;
+	int			shift;
+	radix_tree_stack stack = NULL;
+	bool		deleted;
+
+	if (!tree->root || key > tree->max_val)
+		return false;
+
+	/*
+	 * Descending the tree to search the key while building a stack of nodes
+	 * we visited.
+	 */
+	node = tree->root;
+	shift = tree->root->shift;
+	while (shift >= 0)
+	{
+		radix_tree_node *child;
+		radix_tree_stack new_stack;
+
+		new_stack = (radix_tree_stack) palloc(sizeof(radix_tree_stack_data));
+		new_stack->node = node;
+		new_stack->parent = stack;
+		stack = new_stack;
+
+		if (IS_LEAF_NODE(node))
+			break;
+
+		if (!radix_tree_node_search_child(node, &child, key))
+		{
+			radix_tree_free_stack(stack);
+			return false;
+		}
+
+		node = child;
+		shift -= RADIX_TREE_NODE_FANOUT;
+	}
+
+	/*
+	 * Delete the key from the leaf node and recursively delete internal nodes
+	 * if necessary.
+	 */
+	Assert(IS_LEAF_NODE(stack->node));
+	while (stack != NULL)
+	{
+		radix_tree_node *node;
+		Datum	   *slot;
+
+		/* pop the node from the stack */
+		node = stack->node;
+		stack = stack->parent;
+
+		deleted = radix_tree_node_search(node, &slot, key, RADIX_TREE_DELETE);
+
+		/* If the node didn't become empty, we stop deleting the key */
+		if (!IS_EMPTY_NODE(node))
+			break;
+
+		Assert(deleted);
+
+		/* The node became empty */
+		radix_tree_free_node(tree, node);
+
+		/*
+		 * If we eventually deleted the root node while recursively deleting
+		 * empty nodes, we make the tree empty.
+		 */
+		if (stack == NULL)
+		{
+			tree->root = NULL;
+			tree->max_val = 0;
+		}
+	}
+
+	if (deleted)
+		tree->num_keys--;
+
+	radix_tree_free_stack(stack);
+	return deleted;
+}
+
+/* Create and return the iterator for the given radix tree */
+radix_tree_iter *
+radix_tree_begin_iterate(radix_tree *tree)
+{
+	MemoryContext old_ctx;
+	radix_tree_iter *iter;
+	int			top_level;
+
+	old_ctx = MemoryContextSwitchTo(tree->context);
+
+	iter = (radix_tree_iter *) palloc0(sizeof(radix_tree_iter));
+	iter->tree = tree;
+
+	/* empty tree */
+	if (!iter->tree)
+		return iter;
+
+	top_level = iter->tree->root->shift / RADIX_TREE_NODE_FANOUT;
+
+	iter->stack_len = top_level;
+	iter->stack[top_level].node = iter->tree->root;
+	iter->stack[top_level].current_idx = -1;
+
+	/* Descend to the left most leaf node from the root */
+	radix_tree_update_iter_stack(iter, top_level);
+
+	MemoryContextSwitchTo(old_ctx);
+
+	return iter;
+}
+
+/*
+ * Return true with setting key_p and value_p if there is next key.  Otherwise,
+ * return false.
+ */
+bool
+radix_tree_iterate_next(radix_tree_iter *iter, uint64 *key_p, Datum *value_p)
+{
+	bool		found = false;
+	Datum		slot = (Datum) 0;
+	int			level;
+
+	/* Empty tree */
+	if (!iter->tree)
+		return false;
+
+	for (;;)
+	{
+		radix_tree_node *node;
+		radix_tree_iter_node_data *node_iter;
+
+		/*
+		 * Iterate node at each level from the bottom of the tree until we
+		 * search the next slot.
+		 */
+		for (level = 0; level <= iter->stack_len; level++)
+		{
+			slot = radix_tree_node_iterate_next(iter, &(iter->stack[level]), &found);
+
+			if (found)
+				break;
+		}
+
+		/* end of iteration */
+		if (!found)
+			return false;
+
+		/* found the next slot at the leaf node, return it */
+		if (level == 0)
+		{
+			*key_p = iter->key;
+			*value_p = slot;
+			return true;
+		}
+
+		/*
+		 * We have advanced more than one nodes including internal nodes. So
+		 * we need to update the stack by descending to the left most leaf
+		 * node from this level.
+		 */
+		node = (radix_tree_node *) DatumGetPointer(slot);
+		node_iter = &(iter->stack[level - 1]);
+		radix_tree_store_iter_node(iter, node_iter, node);
+
+		radix_tree_update_iter_stack(iter, level - 1);
+	}
+}
+
+void
+radix_tree_end_iterate(radix_tree_iter *iter)
+{
+	pfree(iter);
+}
+
+/*
+ * Update the part of the key being constructed during the iteration with the
+ * given chunk
+ */
+static inline void
+radix_tree_iter_update_key(radix_tree_iter *iter, uint8 chunk, uint8 shift)
+{
+	iter->key &= ~(((uint64) RADIX_TREE_CHUNK_MASK) << shift);
+	iter->key |= (((uint64) chunk) << shift);
+}
+
+/*
+ * Iterate over the given radix tree node and returns the next slot of the given
+ * node and set true to *found_p, if any.  Otherwise, set false to *found_p.
+ */
+static Datum
+radix_tree_node_iterate_next(radix_tree_iter *iter, radix_tree_iter_node_data *node_iter,
+							 bool *found_p)
+{
+	radix_tree_node *node = node_iter->node;
+	Datum		slot = (Datum) 0;
+
+	switch (node->kind)
+	{
+		case RADIX_TREE_NODE_KIND_4:
+			{
+				radix_tree_node_4 *n4 = (radix_tree_node_4 *) node_iter->node;
+
+				node_iter->current_idx++;
+
+				if (node_iter->current_idx >= n4->n.count)
+					goto not_found;
+
+				slot = n4->slots[node_iter->current_idx];
+
+				/* Update the part of the key with the current chunk */
+				if (IS_LEAF_NODE(node))
+					radix_tree_iter_update_key(iter, n4->chunks[node_iter->current_idx], 0);
+
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_32:
+			{
+				radix_tree_node_32 *n32 = (radix_tree_node_32 *) node;
+
+				node_iter->current_idx++;
+
+				if (node_iter->current_idx >= n32->n.count)
+					goto not_found;
+
+				slot = n32->slots[node_iter->current_idx];
+
+				/* Update the part of the key with the current chunk */
+				if (IS_LEAF_NODE(node))
+					radix_tree_iter_update_key(iter, n32->chunks[node_iter->current_idx], 0);
+
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_128:
+			{
+				radix_tree_node_128 *n128 = (radix_tree_node_128 *) node;
+				int			i;
+
+				for (i = node_iter->current_idx + 1; i < 256; i++)
+				{
+					if (node_128_is_chunk_used(n128, i))
+						break;
+				}
+
+				if (i >= 256)
+					goto not_found;
+
+				node_iter->current_idx = i;
+				slot = node_128_get_chunk_slot(n128, i);
+
+				/* Update the part of the key */
+				if (IS_LEAF_NODE(node))
+					radix_tree_iter_update_key(iter, node_iter->current_idx, 0);
+
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_256:
+			{
+				radix_tree_node_256 *n256 = (radix_tree_node_256 *) node;
+				int			i;
+
+				for (i = node_iter->current_idx + 1; i < 256; i++)
+				{
+					if (node_256_is_chunk_used(n256, i))
+						break;
+				}
+
+				if (i >= 256)
+					goto not_found;
+
+				node_iter->current_idx = i;
+				slot = n256->slots[i];
+
+				/* Update the part of the key */
+				if (IS_LEAF_NODE(node))
+					radix_tree_iter_update_key(iter, node_iter->current_idx, 0);
+
+				break;
+			}
+	}
+
+	*found_p = true;
+	return slot;
+
+not_found:
+	*found_p = false;
+	return (Datum) 0;
+}
+
+/*
+ * Initialize and update the node iteration struct with the given radix tree node.
+ * This function also updates the part of the key with the chunk of the given node.
+ */
+static void
+radix_tree_store_iter_node(radix_tree_iter *iter, radix_tree_iter_node_data *node_iter,
+						   radix_tree_node *node)
+{
+	node_iter->node = node;
+	node_iter->current_idx = -1;
+
+	radix_tree_iter_update_key(iter, node->chunk, node->shift + RADIX_TREE_NODE_FANOUT);
+}
+
+/*
+ * Build the stack of the radix tree node while descending to the leaf from the 'from'
+ * level.
+ */
+static void
+radix_tree_update_iter_stack(radix_tree_iter *iter, int from)
+{
+	radix_tree_node *node = iter->stack[from].node;
+	int			level = from;
+
+	for (;;)
+	{
+		radix_tree_iter_node_data *node_iter = &(iter->stack[level--]);
+		bool		found;
+
+		/* Set the current node */
+		radix_tree_store_iter_node(iter, node_iter, node);
+
+		if (IS_LEAF_NODE(node))
+			break;
+
+		node = (radix_tree_node *)
+			DatumGetPointer(radix_tree_node_iterate_next(iter, node_iter, &found));
+
+		/*
+		 * Since we always get the first slot in the node, we have to found
+		 * the slot.
+		 */
+		Assert(found);
+	}
+}
+
+/*
+ * Return the number of keys in the radix tree.
+ */
+uint64
+radix_tree_num_entries(radix_tree *tree)
+{
+	return tree->num_keys;
+}
+
+/*
+ * Return the statistics of the amount of memory used by the radix tree.
+ */
+uint64
+radix_tree_memory_usage(radix_tree *tree)
+{
+	return tree->mem_used;
+}
+
+/*
+ * Verify the radix tree node.
+ */
+static void
+radix_tree_verify_node(radix_tree_node *node)
+{
+#ifdef USE_ASSERT_CHECKING
+	Assert(node->count >= 0);
+
+	switch (node->kind)
+	{
+		case RADIX_TREE_NODE_KIND_4:
+			{
+				radix_tree_node_4 *n4 = (radix_tree_node_4 *) node;
+
+				/* Check if the chunks in the node are sorted */
+				for (int i = 1; i < n4->n.count; i++)
+					Assert(n4->chunks[i - 1] < n4->chunks[i]);
+
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_32:
+			{
+				radix_tree_node_32 *n32 = (radix_tree_node_32 *) node;
+
+				/* Check if the chunks in the node are sorted */
+				for (int i = 1; i < n32->n.count; i++)
+					Assert(n32->chunks[i - 1] < n32->chunks[i]);
+
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_128:
+			{
+				radix_tree_node_128 *n128 = (radix_tree_node_128 *) node;
+				int			cnt = 0;
+
+				for (int i = 0; i < RADIX_TREE_NODE_MAX_SLOTS; i++)
+				{
+					if (!node_128_is_chunk_used(n128, i))
+						continue;
+
+					/* Check if the corresponding slot is used */
+					Assert(node_128_is_slot_used(n128, n128->slot_idxs[i] - 1));
+
+					cnt++;
+				}
+
+				Assert(n128->n.count == cnt);
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_256:
+			{
+				radix_tree_node_256 *n256 = (radix_tree_node_256 *) node;
+				int			cnt = 0;
+
+				for (int i = 0; i < RADIX_TREE_NODE_MAX_BITS; i++)
+					cnt += pg_popcount32(n256->isset[i]);
+
+				/* Check if the number of used chunk matches */
+				Assert(n256->n.count == cnt);
+
+				break;
+			}
+	}
+#endif
+}
+
+/***************** DEBUG FUNCTIONS *****************/
+#ifdef RADIX_TREE_DEBUG
+void
+radix_tree_stats(radix_tree *tree)
+{
+	fprintf(stderr, "num_keys = %lu, height = %u, n4 = %u(%lu), n32 = %u(%lu), n128 = %u(%lu), n256 = %u(%lu)",
+			tree->num_keys,
+			tree->root->shift / RADIX_TREE_NODE_FANOUT,
+			tree->cnt[0], tree->cnt[0] * sizeof(radix_tree_node_4),
+			tree->cnt[1], tree->cnt[1] * sizeof(radix_tree_node_32),
+			tree->cnt[2], tree->cnt[2] * sizeof(radix_tree_node_128),
+			tree->cnt[3], tree->cnt[3] * sizeof(radix_tree_node_256));
+	/* radix_tree_dump(tree); */
+}
+
+static void
+radix_tree_print_slot(StringInfo buf, uint8 chunk, Datum slot, int idx, bool is_leaf, int level)
+{
+	char		space[128] = {0};
+
+	if (level > 0)
+		sprintf(space, "%*c", level * 4, ' ');
+
+	if (is_leaf)
+		appendStringInfo(buf, "%s[%d] \"0x%X\" val(0x%lX) LEAF\n",
+						 space,
+						 idx,
+						 chunk,
+						 DatumGetInt64(slot));
+	else
+		appendStringInfo(buf, "%s[%d] \"0x%X\" -> ",
+						 space,
+						 idx,
+						 chunk);
+}
+
+static void
+radix_tree_dump_node(radix_tree_node *node, int level, StringInfo buf, bool recurse)
+{
+	bool		is_leaf = IS_LEAF_NODE(node);
+
+	appendStringInfo(buf, "[\"%s\" type %d, cnt %u, shift %u, chunk \"0x%X\"] chunks:\n",
+					 IS_LEAF_NODE(node) ? "LEAF" : "INNR",
+					 (node->kind == RADIX_TREE_NODE_KIND_4) ? 4 :
+					 (node->kind == RADIX_TREE_NODE_KIND_32) ? 32 :
+					 (node->kind == RADIX_TREE_NODE_KIND_128) ? 128 : 256,
+					 node->count, node->shift, node->chunk);
+
+	switch (node->kind)
+	{
+		case RADIX_TREE_NODE_KIND_4:
+			{
+				radix_tree_node_4 *n4 = (radix_tree_node_4 *) node;
+
+				for (int i = 0; i < n4->n.count; i++)
+				{
+					radix_tree_print_slot(buf, n4->chunks[i], n4->slots[i], i, is_leaf, level);
+
+					if (!is_leaf)
+					{
+						if (recurse)
+						{
+							StringInfoData buf2;
+
+							initStringInfo(&buf2);
+							radix_tree_dump_node((radix_tree_node *) n4->slots[i], level + 1, &buf2, recurse);
+							appendStringInfo(buf, "%s", buf2.data);
+						}
+						else
+							appendStringInfo(buf, "\n");
+					}
+				}
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_32:
+			{
+				radix_tree_node_32 *n32 = (radix_tree_node_32 *) node;
+
+				for (int i = 0; i < n32->n.count; i++)
+				{
+					radix_tree_print_slot(buf, n32->chunks[i], n32->slots[i], i, is_leaf, level);
+
+					if (!is_leaf)
+					{
+						if (recurse)
+						{
+							StringInfoData buf2;
+
+							initStringInfo(&buf2);
+							radix_tree_dump_node((radix_tree_node *) n32->slots[i], level + 1, &buf2, recurse);
+							appendStringInfo(buf, "%s", buf2.data);
+						}
+						else
+							appendStringInfo(buf, "\n");
+					}
+				}
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_128:
+			{
+				radix_tree_node_128 *n128 = (radix_tree_node_128 *) node;
+
+				for (int j = 0; j < 256; j++)
+				{
+					if (!node_128_is_chunk_used(n128, j))
+						continue;
+
+					appendStringInfo(buf, "slot_idxs[%d]=%d, ", j, n128->slot_idxs[j]);
+				}
+				appendStringInfo(buf, "\nisset-bitmap:");
+				for (int j = 0; j < 16; j++)
+				{
+					appendStringInfo(buf, "%X ", (uint8) n128->isset[j]);
+				}
+				appendStringInfo(buf, "\n");
+
+				for (int i = 0; i < 256; i++)
+				{
+					if (!node_128_is_chunk_used(n128, i))
+						continue;
+
+					radix_tree_print_slot(buf, i, node_128_get_chunk_slot(n128, i),
+										  i, is_leaf, level);
+
+					if (!is_leaf)
+					{
+						if (recurse)
+						{
+							StringInfoData buf2;
+
+							initStringInfo(&buf2);
+							radix_tree_dump_node((radix_tree_node *) node_128_get_chunk_slot(n128, i),
+												 level + 1, &buf2, recurse);
+							appendStringInfo(buf, "%s", buf2.data);
+						}
+						else
+							appendStringInfo(buf, "\n");
+					}
+				}
+				break;
+			}
+		case RADIX_TREE_NODE_KIND_256:
+			{
+				radix_tree_node_256 *n256 = (radix_tree_node_256 *) node;
+
+				for (int i = 0; i < 256; i++)
+				{
+					if (!node_256_is_chunk_used(n256, i))
+						continue;
+
+					radix_tree_print_slot(buf, i, n256->slots[i], i, is_leaf, level);
+
+					if (!is_leaf)
+					{
+						if (recurse)
+						{
+							StringInfoData buf2;
+
+							initStringInfo(&buf2);
+							radix_tree_dump_node((radix_tree_node *) n256->slots[i], level + 1, &buf2, recurse);
+							appendStringInfo(buf, "%s", buf2.data);
+						}
+						else
+							appendStringInfo(buf, "\n");
+					}
+				}
+				break;
+			}
+	}
+}
+
+void
+radix_tree_dump_search(radix_tree *tree, uint64 key)
+{
+	StringInfoData buf;
+	radix_tree_node *node;
+	int			shift;
+	int			level = 0;
+
+	elog(NOTICE, "-----------------------------------------------------------");
+	elog(NOTICE, "max_val = %lu (0x%lX)", tree->max_val, tree->max_val);
+
+	if (!tree->root)
+	{
+		elog(NOTICE, "tree is empty");
+		return;
+	}
+
+	if (key > tree->max_val)
+	{
+		elog(NOTICE, "key %lu (0x%lX) is larger than max val",
+			 key, key);
+		return;
+	}
+
+	initStringInfo(&buf);
+	node = tree->root;
+	shift = tree->root->shift;
+	while (shift >= 0)
+	{
+		radix_tree_node *child;
+
+		radix_tree_dump_node(node, level, &buf, false);
+
+		if (IS_LEAF_NODE(node))
+		{
+			Datum	   *dummy;
+
+			/* We reached at a leaf node, find the corresponding slot */
+			radix_tree_node_search(node, &dummy, key, RADIX_TREE_FIND);
+
+			break;
+		}
+
+		if (!radix_tree_node_search_child(node, &child, key))
+			break;
+
+		node = child;
+		shift -= RADIX_TREE_NODE_FANOUT;
+		level++;
+	}
+
+	elog(NOTICE, "\n%s", buf.data);
+}
+
+void
+radix_tree_dump(radix_tree *tree)
+{
+	StringInfoData buf;
+
+	initStringInfo(&buf);
+
+	elog(NOTICE, "-----------------------------------------------------------");
+	elog(NOTICE, "max_val = %lu", tree->max_val);
+	radix_tree_dump_node(tree->root, 0, &buf, true);
+	elog(NOTICE, "\n%s", buf.data);
+	elog(NOTICE, "-----------------------------------------------------------");
+}
+#endif
diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h
new file mode 100644
index 0000000000..7e864d124b
--- /dev/null
+++ b/src/include/lib/radixtree.h
@@ -0,0 +1,42 @@
+/*-------------------------------------------------------------------------
+ *
+ * radixtree.h
+ *	  Interface for radix tree.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/include/lib/radixtree.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RADIXTREE_H
+#define RADIXTREE_H
+
+#include "postgres.h"
+
+/* #define RADIX_TREE_DEBUG 1 */
+
+typedef struct radix_tree radix_tree;
+typedef struct radix_tree_iter radix_tree_iter;
+
+extern radix_tree *radix_tree_create(MemoryContext ctx);
+extern bool radix_tree_search(radix_tree *tree, uint64 key, Datum *val_p);
+extern void radix_tree_insert(radix_tree *tree, uint64 key, Datum val, bool *found_p);
+extern bool radix_tree_delete(radix_tree *tree, uint64 key);
+extern void radix_tree_free(radix_tree *tree);
+extern uint64 radix_tree_memory_usage(radix_tree *tree);
+extern uint64 radix_tree_num_entries(radix_tree *tree);
+
+extern radix_tree_iter *radix_tree_begin_iterate(radix_tree *tree);
+extern bool radix_tree_iterate_next(radix_tree_iter *iter, uint64 *key_p, Datum *value_p);
+extern void radix_tree_end_iterate(radix_tree_iter *iter);
+
+
+#ifdef RADIX_TREE_DEBUG
+extern void radix_tree_dump(radix_tree *tree);
+extern void radix_tree_dump_search(radix_tree *tree, uint64 key);
+extern void radix_tree_stats(radix_tree *tree);
+#endif
+
+#endif							/* RADIXTREE_H */
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 9090226daa..51b2514faf 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -24,6 +24,7 @@ SUBDIRS = \
 		  test_parser \
 		  test_pg_dump \
 		  test_predtest \
+		  test_radixtree \
 		  test_rbtree \
 		  test_regex \
 		  test_rls_hooks \
diff --git a/src/test/modules/test_radixtree/.gitignore b/src/test/modules/test_radixtree/.gitignore
new file mode 100644
index 0000000000..5dcb3ff972
--- /dev/null
+++ b/src/test/modules/test_radixtree/.gitignore
@@ -0,0 +1,4 @@
+# Generated subdirectories
+/log/
+/results/
+/tmp_check/
diff --git a/src/test/modules/test_radixtree/Makefile b/src/test/modules/test_radixtree/Makefile
new file mode 100644
index 0000000000..da06b93da3
--- /dev/null
+++ b/src/test/modules/test_radixtree/Makefile
@@ -0,0 +1,23 @@
+# src/test/modules/test_radixtree/Makefile
+
+MODULE_big = test_radixtree
+OBJS = \
+	$(WIN32RES) \
+	test_radixtree.o
+PGFILEDESC = "test_radixtree - test code for src/backend/lib/radixtree.c"
+
+EXTENSION = test_radixtree
+DATA = test_radixtree--1.0.sql
+
+REGRESS = test_radixtree
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_radixtree
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_radixtree/README b/src/test/modules/test_radixtree/README
new file mode 100644
index 0000000000..a8b271869a
--- /dev/null
+++ b/src/test/modules/test_radixtree/README
@@ -0,0 +1,7 @@
+test_integerset contains unit tests for testing the integer set implementation
+in src/backend/lib/integerset.c.
+
+The tests verify the correctness of the implementation, but they can also be
+used as a micro-benchmark.  If you set the 'intset_test_stats' flag in
+test_integerset.c, the tests will print extra information about execution time
+and memory usage.
diff --git a/src/test/modules/test_radixtree/expected/test_radixtree.out b/src/test/modules/test_radixtree/expected/test_radixtree.out
new file mode 100644
index 0000000000..cc6970c87c
--- /dev/null
+++ b/src/test/modules/test_radixtree/expected/test_radixtree.out
@@ -0,0 +1,28 @@
+CREATE EXTENSION test_radixtree;
+--
+-- All the logic is in the test_radixtree() function. It will throw
+-- an error if something fails.
+--
+SELECT test_radixtree();
+NOTICE:  testing radix tree node types with shift "0"
+NOTICE:  testing radix tree node types with shift "8"
+NOTICE:  testing radix tree node types with shift "16"
+NOTICE:  testing radix tree node types with shift "24"
+NOTICE:  testing radix tree node types with shift "32"
+NOTICE:  testing radix tree node types with shift "40"
+NOTICE:  testing radix tree node types with shift "48"
+NOTICE:  testing radix tree node types with shift "56"
+NOTICE:  testing radix tree with pattern "all ones"
+NOTICE:  testing radix tree with pattern "alternating bits"
+NOTICE:  testing radix tree with pattern "clusters of ten"
+NOTICE:  testing radix tree with pattern "clusters of hundred"
+NOTICE:  testing radix tree with pattern "one-every-64k"
+NOTICE:  testing radix tree with pattern "sparse"
+NOTICE:  testing radix tree with pattern "single values, distance > 2^32"
+NOTICE:  testing radix tree with pattern "clusters, distance > 2^32"
+NOTICE:  testing radix tree with pattern "clusters, distance > 2^60"
+ test_radixtree 
+----------------
+ 
+(1 row)
+
diff --git a/src/test/modules/test_radixtree/sql/test_radixtree.sql b/src/test/modules/test_radixtree/sql/test_radixtree.sql
new file mode 100644
index 0000000000..41ece5e9f5
--- /dev/null
+++ b/src/test/modules/test_radixtree/sql/test_radixtree.sql
@@ -0,0 +1,7 @@
+CREATE EXTENSION test_radixtree;
+
+--
+-- All the logic is in the test_radixtree() function. It will throw
+-- an error if something fails.
+--
+SELECT test_radixtree();
diff --git a/src/test/modules/test_radixtree/test_radixtree--1.0.sql b/src/test/modules/test_radixtree/test_radixtree--1.0.sql
new file mode 100644
index 0000000000..074a5a7ea7
--- /dev/null
+++ b/src/test/modules/test_radixtree/test_radixtree--1.0.sql
@@ -0,0 +1,8 @@
+/* src/test/modules/test_radixtree/test_radixtree--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_radixtree" to load this file. \quit
+
+CREATE FUNCTION test_radixtree()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_radixtree/test_radixtree.c b/src/test/modules/test_radixtree/test_radixtree.c
new file mode 100644
index 0000000000..6d5b06a800
--- /dev/null
+++ b/src/test/modules/test_radixtree/test_radixtree.c
@@ -0,0 +1,502 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_radixtree.c
+ *		Test radixtree set data structure.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/test/modules/test_radixtree/test_radixtree.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "common/pg_prng.h"
+#include "fmgr.h"
+#include "lib/radixtree.h"
+#include "miscadmin.h"
+#include "nodes/bitmapset.h"
+#include "storage/block.h"
+#include "storage/itemptr.h"
+#include "utils/memutils.h"
+#include "utils/timestamp.h"
+
+#define UINT64_HEX_FORMAT "%" INT64_MODIFIER "X"
+
+/*
+ * If you enable this, the "pattern" tests will print information about
+ * how long populating, probing, and iterating the test set takes, and
+ * how much memory the test set consumed.  That can be used as
+ * micro-benchmark of various operations and input patterns (you might
+ * want to increase the number of values used in each of the test, if
+ * you do that, to reduce noise).
+ *
+ * The information is printed to the server's stderr, mostly because
+ * that's where MemoryContextStats() output goes.
+ */
+static const bool radix_tree_test_stats = false;
+
+/* The maximum number of entries each node type can have */
+static int radix_tree_node_max_entries[] = {
+	4,		/* RADIX_TREE_NODE_KIND_4 */
+	16,		/* RADIX_TREE_NODE_KIND_16 */
+	128,	/* RADIX_TREE_NODE_KIND_128 */
+	256		/* RADIX_TREE_NODE_KIND_256 */
+};
+
+/*
+ * A struct to define a pattern of integers, for use with the test_pattern()
+ * function.
+ */
+typedef struct
+{
+	char	   *test_name;		/* short name of the test, for humans */
+	char	   *pattern_str;	/* a bit pattern */
+	uint64		spacing;		/* pattern repeats at this interval */
+	uint64		num_values;		/* number of integers to set in total */
+} test_spec;
+
+/* Test patterns borrowed from test_integerset.c */
+static const test_spec test_specs[] = {
+	{
+		"all ones", "1111111111",
+		10, 1000000
+	},
+	{
+		"alternating bits", "0101010101",
+		10, 1000000
+	},
+	{
+		"clusters of ten", "1111111111",
+		10000, 1000000
+	},
+	{
+		"clusters of hundred",
+		"1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111",
+		10000, 10000000
+	},
+	{
+		"one-every-64k", "1",
+		65536, 1000000
+	},
+	{
+		"sparse", "100000000000000000000000000000001",
+		10000000, 1000000
+	},
+	{
+		"single values, distance > 2^32", "1",
+		UINT64CONST(10000000000), 100000
+	},
+	{
+		"clusters, distance > 2^32", "10101010",
+		UINT64CONST(10000000000), 1000000
+	},
+	{
+		"clusters, distance > 2^60", "10101010",
+		UINT64CONST(2000000000000000000),
+		23						/* can't be much higher than this, or we
+								 * overflow uint64 */
+	}
+};
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(test_radixtree);
+
+static void
+test_empty(void)
+{
+	radix_tree *radixtree;
+	Datum dummy;
+
+	radixtree = radix_tree_create(CurrentMemoryContext);
+
+	if (radix_tree_search(radixtree, 0, &dummy))
+		elog(ERROR, "radix_tree_search on empty tree returned true");
+
+	if (radix_tree_search(radixtree, 1, &dummy))
+		elog(ERROR, "radix_tree_search on empty tree returned true");
+
+	if (radix_tree_search(radixtree, PG_UINT64_MAX, &dummy))
+		elog(ERROR, "radix_tree_search on empty tree returned true");
+
+	if (radix_tree_num_entries(radixtree) != 0)
+		elog(ERROR, "radix_tree_num_entries on empty tree return non-zero");
+
+	radix_tree_free(radixtree);
+}
+
+/*
+ * Check if keys from start to end with the shift exist in the tree.
+ */
+static void
+check_search_on_node(radix_tree *radixtree, uint8 shift, int start, int end)
+{
+	for (int i = start; i < end; i++)
+	{
+		uint64 key = ((uint64) i << shift);
+		Datum val;
+
+		if (!radix_tree_search(radixtree, key, &val))
+			elog(ERROR, "key 0x" UINT64_HEX_FORMAT " is not found on node-%d",
+				 key, end);
+		if (DatumGetUInt64(val) != key)
+			elog(ERROR, "radix_tree_search with key 0x" UINT64_HEX_FORMAT " returns 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT,
+				 key, DatumGetUInt64(val), key);
+	}
+}
+
+static void
+test_node_types_insert(radix_tree *radixtree, uint8 shift)
+{
+	uint64 num_entries;
+
+	for (int i = 0; i < 256; i++)
+	{
+		uint64 key = ((uint64) i << shift);
+		bool found;
+
+		radix_tree_insert(radixtree, key, Int64GetDatum(key), &found);
+
+		if (found)
+			elog(ERROR, "newly inserted key 0x" UINT64_HEX_FORMAT " found", key);
+
+		for (int j = 0; j < lengthof(radix_tree_node_max_entries); j++)
+		{
+			/*
+			 * After filling all slots in each node type, check if the values are
+			 * stored properly.
+			 */
+			if (i == (radix_tree_node_max_entries[j] - 1))
+			{
+				check_search_on_node(radixtree, shift,
+									 (j == 0) ? 0 : radix_tree_node_max_entries[j - 1],
+									 radix_tree_node_max_entries[j]);
+				break;
+			}
+		}
+	}
+
+	num_entries = radix_tree_num_entries(radixtree);
+
+	if (num_entries != 256)
+		elog(ERROR,
+			 "radix_tree_num_entries returned" UINT64_FORMAT ", expected " UINT64_FORMAT,
+			 num_entries, UINT64CONST(256));
+}
+
+static void
+test_node_types_delete(radix_tree *radixtree, uint8 shift)
+{
+	uint64 num_entries;
+
+	for (int i = 0; i < 256; i++)
+	{
+		uint64	key = ((uint64) i << shift);
+		bool	found;
+
+		found = radix_tree_delete(radixtree, key);
+
+		if (!found)
+			elog(ERROR, "inserted key 0x" UINT64_HEX_FORMAT " is not found", key);
+	}
+
+	num_entries = radix_tree_num_entries(radixtree);
+
+	/* The tree must be empty */
+	if (num_entries != 0)
+		elog(ERROR,
+			 "radix_tree_num_entries returned" UINT64_FORMAT ", expected " UINT64_FORMAT,
+			 num_entries, UINT64CONST(256));
+}
+
+/*
+ * Test for inserting and deleting key-value pairs to each node type at the given shift
+ * level.
+ */
+static void
+test_node_types(uint8 shift)
+{
+	radix_tree *radixtree;
+
+	elog(NOTICE, "testing radix tree node types with shift \"%d\"", shift);
+
+	radixtree = radix_tree_create(CurrentMemoryContext);
+
+	/*
+	 * Insert and search entries for every node type at the 'shift' level,
+	 * then delete all entries to make it empty, and insert and search
+	 * entries again.
+	 */
+	test_node_types_insert(radixtree, shift);
+	test_node_types_delete(radixtree, shift);
+	test_node_types_insert(radixtree, shift);
+
+	radix_tree_free(radixtree);
+}
+
+/*
+ * Test with a repeating pattern, defined by the 'spec'.
+ */
+static void
+test_pattern(const test_spec *spec)
+{
+	radix_tree *radixtree;
+	radix_tree_iter *iter;
+	MemoryContext radixtree_ctx;
+	TimestampTz starttime;
+	TimestampTz endtime;
+	uint64		n;
+	uint64		last_int;
+	uint64		ndeleted;
+	uint64		nbefore;
+	uint64		nafter;
+	int			patternlen;
+	uint64	   *pattern_values;
+	uint64		pattern_num_values;
+
+	elog(NOTICE, "testing radix tree with pattern \"%s\"", spec->test_name);
+	if (radix_tree_test_stats)
+		fprintf(stderr, "-----\ntesting radix tree with pattern \"%s\"\n", spec->test_name);
+
+	/* Pre-process the pattern, creating an array of integers from it. */
+	patternlen = strlen(spec->pattern_str);
+	pattern_values = palloc(patternlen * sizeof(uint64));
+	pattern_num_values = 0;
+	for (int i = 0; i < patternlen; i++)
+	{
+		if (spec->pattern_str[i] == '1')
+			pattern_values[pattern_num_values++] = i;
+	}
+
+	/*
+	 * Allocate the radix tree.
+	 *
+	 * Allocate it in a separate memory context, so that we can print its
+	 * memory usage easily.
+	 */
+	radixtree_ctx = AllocSetContextCreate(CurrentMemoryContext,
+										  "radixtree test",
+										  ALLOCSET_SMALL_SIZES);
+	MemoryContextSetIdentifier(radixtree_ctx, spec->test_name);
+	radixtree = radix_tree_create(radixtree_ctx);
+
+	/*
+	 * Add values to the set.
+	 */
+	starttime = GetCurrentTimestamp();
+
+	n = 0;
+	last_int = 0;
+	while (n < spec->num_values)
+	{
+		uint64		x = 0;
+
+		for (int i = 0; i < pattern_num_values && n < spec->num_values; i++)
+		{
+			bool found;
+
+			x = last_int + pattern_values[i];
+
+			radix_tree_insert(radixtree, x, Int64GetDatum(x), &found);
+
+			if (found)
+				elog(ERROR, "newly inserted key 0x" UINT64_HEX_FORMAT " found", x);
+
+			n++;
+		}
+		last_int += spec->spacing;
+	}
+
+	endtime = GetCurrentTimestamp();
+
+	if (radix_tree_test_stats)
+		fprintf(stderr, "added " UINT64_FORMAT " values in %d ms\n",
+				spec->num_values, (int) (endtime - starttime) / 1000);
+
+	/*
+	 * Print stats on the amount of memory used.
+	 *
+	 * We print the usage reported by radix_tree_memory_usage(), as well as the
+	 * stats from the memory context.  They should be in the same ballpark,
+	 * but it's hard to automate testing that, so if you're making changes to
+	 * the implementation, just observe that manually.
+	 */
+	if (radix_tree_test_stats)
+	{
+		uint64		mem_usage;
+
+		/*
+		 * Also print memory usage as reported by radix_tree_memory_usage().  It
+		 * should be in the same ballpark as the usage reported by
+		 * MemoryContextStats().
+		 */
+		mem_usage = radix_tree_memory_usage(radixtree);
+		fprintf(stderr, "radix_tree_memory_usage() reported " UINT64_FORMAT " (%0.2f bytes / integer)\n",
+				mem_usage, (double) mem_usage / spec->num_values);
+
+		MemoryContextStats(radixtree_ctx);
+	}
+
+	/* Check that radix_tree_num_entries works */
+	n = radix_tree_num_entries(radixtree);
+	if (n != spec->num_values)
+		elog(ERROR, "radix_tree_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT, n, spec->num_values);
+
+	/*
+	 * Test random-access probes with radix_tree_search()
+	 */
+	starttime = GetCurrentTimestamp();
+
+	for (n = 0; n < 100000; n++)
+	{
+		bool		found;
+		bool		expected;
+		uint64		x;
+		Datum		v;
+
+		/*
+		 * Pick next value to probe at random.  We limit the probes to the
+		 * last integer that we added to the set, plus an arbitrary constant
+		 * (1000).  There's no point in probing the whole 0 - 2^64 range, if
+		 * only a small part of the integer space is used.  We would very
+		 * rarely hit values that are actually in the set.
+		 */
+		x = pg_prng_uint64_range(&pg_global_prng_state, 0, last_int + 1000);
+
+		/* Do we expect this value to be present in the set? */
+		if (x >= last_int)
+			expected = false;
+		else
+		{
+			uint64		idx = x % spec->spacing;
+
+			if (idx >= patternlen)
+				expected = false;
+			else if (spec->pattern_str[idx] == '1')
+				expected = true;
+			else
+				expected = false;
+		}
+
+		/* Is it present according to radix_tree_search() ? */
+		found = radix_tree_search(radixtree, x, &v);
+
+		if (found != expected)
+			elog(ERROR, "mismatch at 0x" UINT64_HEX_FORMAT ": %d vs %d", x, found, expected);
+		if (found && (DatumGetUInt64(v) != x))
+			elog(ERROR, "found 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT,
+				 DatumGetUInt64(v), x);
+	}
+	endtime = GetCurrentTimestamp();
+	if (radix_tree_test_stats)
+		fprintf(stderr, "probed " UINT64_FORMAT " values in %d ms\n",
+				n, (int) (endtime - starttime) / 1000);
+
+	/*
+	 * Test iterator
+	 */
+	starttime = GetCurrentTimestamp();
+
+	iter = radix_tree_begin_iterate(radixtree);
+	n = 0;
+	last_int = 0;
+	while (n < spec->num_values)
+	{
+		for (int i = 0; i < pattern_num_values && n < spec->num_values; i++)
+		{
+			uint64		expected = last_int + pattern_values[i];
+			uint64		x;
+			uint64		val;
+
+			if (!radix_tree_iterate_next(iter, &x, &val))
+				break;
+
+			if (x != expected)
+				elog(ERROR,
+					 "iterate returned wrong key; got 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT " at %d", x, expected, i);
+			if (DatumGetUInt64(val) != expected)
+				elog(ERROR,
+					 "iterate returned wrong value; got 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT " at %d", x, expected, i);
+			n++;
+		}
+		last_int += spec->spacing;
+	}
+	endtime = GetCurrentTimestamp();
+	if (radix_tree_test_stats)
+		fprintf(stderr, "iterated " UINT64_FORMAT " values in %d ms\n",
+				n, (int) (endtime - starttime) / 1000);
+
+	if (n < spec->num_values)
+		elog(ERROR, "iterator stopped short after " UINT64_FORMAT " entries, expected " UINT64_FORMAT, n, spec->num_values);
+	if (n > spec->num_values)
+		elog(ERROR, "iterator returned " UINT64_FORMAT " entries, " UINT64_FORMAT " was expected", n, spec->num_values);
+
+	/*
+	 * Test random-access probes with radix_tree_delete()
+	 */
+	starttime = GetCurrentTimestamp();
+
+	nbefore = radix_tree_num_entries(radixtree);
+	ndeleted = 0;
+	for (n = 0; n < 100000; n++)
+	{
+		bool		found;
+		uint64		x;
+		Datum		v;
+
+		/*
+		 * Pick next value to probe at random.  We limit the probes to the
+		 * last integer that we added to the set, plus an arbitrary constant
+		 * (1000).  There's no point in probing the whole 0 - 2^64 range, if
+		 * only a small part of the integer space is used.  We would very
+		 * rarely hit values that are actually in the set.
+		 */
+		x = pg_prng_uint64_range(&pg_global_prng_state, 0, last_int + 1000);
+
+		/* Is it present according to radix_tree_search() ? */
+		found = radix_tree_search(radixtree, x, &v);
+
+		if (!found)
+			continue;
+
+		/* If the key is found, delete it and check again */
+		if (!radix_tree_delete(radixtree, x))
+			elog(ERROR, "could not delete key 0x" UINT64_HEX_FORMAT, x);
+		if (radix_tree_search(radixtree, x, &v))
+			elog(ERROR, "found deleted key 0x" UINT64_HEX_FORMAT, x);
+		if (radix_tree_delete(radixtree, x))
+			elog(ERROR, "deleted already-deleted key 0x" UINT64_HEX_FORMAT, x);
+
+		ndeleted++;
+	}
+	endtime = GetCurrentTimestamp();
+	if (radix_tree_test_stats)
+		fprintf(stderr, "deleted " UINT64_FORMAT " values in %d ms\n",
+				ndeleted, (int) (endtime - starttime) / 1000);
+
+	nafter = radix_tree_num_entries(radixtree);
+
+	/* Check that radix_tree_num_entries works */
+	if ((nbefore - ndeleted) != nafter)
+		elog(ERROR, "radix_tree_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT "after " UINT64_FORMAT " deletion",
+			 nafter, (nbefore - ndeleted), ndeleted);
+
+	MemoryContextDelete(radixtree_ctx);
+}
+
+Datum
+test_radixtree(PG_FUNCTION_ARGS)
+{
+	test_empty();
+
+	for (int shift = 0; shift <= (64 - 8); shift += 8)
+		test_node_types(shift);
+
+	/* Test different test patterns, with lots of entries */
+	for (int i = 0; i < lengthof(test_specs); i++)
+		test_pattern(&test_specs[i]);
+
+	PG_RETURN_VOID();
+}
diff --git a/src/test/modules/test_radixtree/test_radixtree.control b/src/test/modules/test_radixtree/test_radixtree.control
new file mode 100644
index 0000000000..e53f2a3e0c
--- /dev/null
+++ b/src/test/modules/test_radixtree/test_radixtree.control
@@ -0,0 +1,4 @@
+comment = 'Test code for radix tree'
+default_version = '1.0'
+module_pathname = '$libdir/test_radixtree'
+relocatable = true

Reply via email to