From e0509eab42b46ae13cec21bd3a86e2fe0eee4698 Mon Sep 17 00:00:00 2001
From: Andrey <amborodin@acm.org>
Date: Fri, 8 Mar 2019 21:19:32 +0500
Subject: [PATCH 1/2] Add block set data structure

Currently we have Bitmapset, which works only with 32-bit signed ints.
Furthermore, it is not very space-efficient in the case of a sparse
bitmap.

In this commit we invent the block set: a statically typed radix tree
for keeping BlockNumbers. It can still be improved in many ways
applicable to bitmaps; most notably, in-pointer lists can be used.
But for the sake of code simplicity it is now plain in its
structure.

The block set is introduced to support efficient page deletion in
GiST's VACUUM.
---
 src/backend/lib/Makefile                      |   4 +-
 src/backend/lib/blockset.c                    | 201 ++++++++++++++++++
 src/include/lib/blockset.h                    |  24 +++
 src/test/modules/test_blockset/.gitignore     |   4 +
 src/test/modules/test_blockset/Makefile       |  21 ++
 src/test/modules/test_blockset/README         |   8 +
 .../test_blockset/expected/test_blockset.out  |   7 +
 .../test_blockset/sql/test_blockset.sql       |   3 +
 .../test_blockset/test_blockset--1.0.sql      |   8 +
 .../modules/test_blockset/test_blockset.c     | 182 ++++++++++++++++
 .../test_blockset/test_blockset.control       |   4 +
 11 files changed, 464 insertions(+), 2 deletions(-)
 create mode 100644 src/backend/lib/blockset.c
 create mode 100644 src/include/lib/blockset.h
 create mode 100644 src/test/modules/test_blockset/.gitignore
 create mode 100644 src/test/modules/test_blockset/Makefile
 create mode 100644 src/test/modules/test_blockset/README
 create mode 100644 src/test/modules/test_blockset/expected/test_blockset.out
 create mode 100644 src/test/modules/test_blockset/sql/test_blockset.sql
 create mode 100644 src/test/modules/test_blockset/test_blockset--1.0.sql
 create mode 100644 src/test/modules/test_blockset/test_blockset.c
 create mode 100644 src/test/modules/test_blockset/test_blockset.control

diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile
index 191ea9bca2..9601894f07 100644
--- a/src/backend/lib/Makefile
+++ b/src/backend/lib/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/lib
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = binaryheap.o bipartite_match.o bloomfilter.o dshash.o hyperloglog.o \
-       ilist.o knapsack.o pairingheap.o rbtree.o stringinfo.o
+OBJS = binaryheap.o bipartite_match.o blockset.o bloomfilter.o dshash.o \
+       hyperloglog.o ilist.o knapsack.o pairingheap.o rbtree.o stringinfo.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/lib/blockset.c b/src/backend/lib/blockset.c
new file mode 100644
index 0000000000..c07b2974b1
--- /dev/null
+++ b/src/backend/lib/blockset.c
@@ -0,0 +1,201 @@
+/*-------------------------------------------------------------------------
+ *
+ * blockset.c
+ *		Data structure for operations on set of block numbers
+ *
+ * This data structure resembles Bitmapset in idea and operations, but
+ * has two main differences:
+ * 1. It uses an unsigned BlockNumber as the position in the set instead of
+ * int32_t. This allows working with relation forks bigger than 2B blocks.
+ * 2. It is more space efficient for sparse bitmaps: regions are allocated
+ * in chunks of 256 items at once.
+ *
+ * Copyright (c) 2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/lib/blockset.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "lib/blockset.h"
+
+/* Lowest level of radix tree: a 256-bit bitmap, one bit per block number */
+typedef struct
+{
+	unsigned char data[256/8];	/* unsigned: bit 7 must not act as a sign */
+} BlockSetLevel4Data;
+
+typedef BlockSetLevel4Data *BlockSetLevel4;
+
+/* Third level: each slot points to a lowest-level bitmap (level 4) */
+typedef struct
+{
+	/* null references denote empty subtree */
+	BlockSetLevel4 next[256];
+} BlockSetLevel3Data;
+
+typedef BlockSetLevel3Data *BlockSetLevel3;
+
+/* Second level: each slot points to a third-level node, NULL if empty */
+typedef struct
+{
+	BlockSetLevel3 next[256];
+} BlockSetLevel2Data;
+
+typedef BlockSetLevel2Data *BlockSetLevel2;
+
+/* Radix tree root, indexed by the top part of a block number (blkno / 256^3) */
+typedef struct BlockSetData
+{
+	BlockSetLevel2 next[256];
+} BlockSetData;
+
+/* split blkno (consumed in place) into one radix tree index per byte */
+#define BLOCKSET_SPLIT_BLKNO				\
+	uint32_t i1, i2, i3, i4, byte_no, byte_mask;	\
+	i4 = blkno & 0xFF;						\
+	blkno >>= 8;							\
+	i3 = blkno & 0xFF;						\
+	blkno >>= 8;							\
+	i2 = blkno & 0xFF;						\
+	blkno >>= 8;							\
+	i1 = blkno;								\
+	byte_no = i4 >> 3;						\
+	byte_mask = 1 << (i4 & 7)
+
+/*
+ * Mark blkno as present in the set.
+ *
+ * The root and any missing nodes on the path are allocated with palloc0 on
+ * demand.  Returns the set: bs itself, or a new root if bs was NULL.
+ */
+BlockSet blockset_set(BlockSet bs, BlockNumber blkno)
+{
+	BLOCKSET_SPLIT_BLKNO;
+	BlockSetLevel2 level2;
+	BlockSetLevel3 level3;
+	BlockSetLevel4 leaf;
+
+	if (!bs)
+		bs = palloc0(sizeof(BlockSetData));
+	level2 = bs->next[i1];
+	if (!level2)
+		level2 = bs->next[i1] = palloc0(sizeof(BlockSetLevel2Data));
+	level3 = level2->next[i2];
+	if (!level3)
+		level3 = level2->next[i2] = palloc0(sizeof(BlockSetLevel3Data));
+	leaf = level3->next[i3];
+	if (!leaf)
+		leaf = level3->next[i3] = palloc0(sizeof(BlockSetLevel4Data));
+
+	/* set the bit selected by the lowest byte of the block number */
+	leaf->data[byte_no] |= byte_mask;
+	return bs;
+}
+
+/*
+ * Test presence of block in set.
+ *
+ * Returns false for a NULL (empty) set or when any node on the path to
+ * blkno has not been allocated.  Note the argument order: the block number
+ * comes first, the set second.
+ */
+bool blockset_get(BlockNumber blkno, BlockSet bs)
+{
+	BLOCKSET_SPLIT_BLKNO;
+	BlockSetLevel2 bs2 = bs ? bs->next[i1] : NULL;
+	BlockSetLevel3 bs3 = bs2 ? bs2->next[i2] : NULL;
+	BlockSetLevel4 bs4 = bs3 ? bs3->next[i3] : NULL;
+
+	/* compare with 0 so the result is exactly true or false, never 2..128 */
+	return bs4 != NULL && (bs4->data[byte_no] & byte_mask) != 0;
+}
+
+/*
+ * Find the smallest block number in the set that is strictly greater than
+ * blkno.  Pass InvalidBlockNumber to get the minimal element of the set.
+ * Returns InvalidBlockNumber when there is no such element.
+ */
+BlockNumber blockset_next(BlockSet bs, BlockNumber blkno)
+{
+	if (blkno == InvalidBlockNumber)
+		blkno = 0; /* equivalent to ++, left for clear code */
+	else
+		blkno++;
+
+	BLOCKSET_SPLIT_BLKNO;
+
+	if (bs == NULL)
+		return InvalidBlockNumber;
+	/*
+	 * The split indexes only give the scan's starting point: whenever a loop
+	 * moves to its next slot (also via "continue"), lower levels are rewound.
+	 */
+	for (; i1 < 256; i1++, i2 = 0, i3 = 0, byte_no = 0, byte_mask = 1)
+	{
+		BlockSetLevel2 bs2 = bs->next[i1];
+		if (!bs2)
+			continue;
+		for (; i2 < 256; i2++, i3 = 0, byte_no = 0, byte_mask = 1)
+		{
+			BlockSetLevel3 bs3 = bs2->next[i2];
+			if (!bs3)
+				continue;
+			for (; i3 < 256; i3++, byte_no = 0, byte_mask = 1)
+			{
+				BlockSetLevel4 bs4 = bs3->next[i3];
+				if (!bs4)
+					continue;
+				for (; byte_no < 256 / 8; byte_no++, byte_mask = 1)
+				{
+					if (!bs4->data[byte_no])
+						continue;
+					while (byte_mask < 256)
+					{
+						if ((byte_mask & bs4->data[byte_no]) == byte_mask)
+						{
+							i4 = byte_no * 8;
+							while (byte_mask >>= 1) i4++;
+							return i4 + 256 * (i3 + 256 * (i2 + 256 * i1));
+						}
+						byte_mask <<= 1;
+					}
+				}
+			}
+		}
+	}
+	return InvalidBlockNumber;
+}
+
+/*
+ * Free everything palloc'ed for the set; a NULL set is a no-op.  Each level
+ * is walked from slot 0 so that no allocated subtree is left behind.
+ */
+void blockset_free(BlockSet bs)
+{
+	if (bs == NULL)
+		return;
+	for (int i1 = 0; i1 < 256; i1++)
+	{
+		BlockSetLevel2 bs2 = bs->next[i1];
+		if (!bs2)
+			continue;
+		for (int i2 = 0; i2 < 256; i2++)
+		{
+			BlockSetLevel3 bs3 = bs2->next[i2];
+			if (!bs3)
+				continue;
+			for (int i3 = 0; i3 < 256; i3++)
+			{
+				if (bs3->next[i3])
+					pfree(bs3->next[i3]);
+			}
+			pfree(bs3);
+		}
+		pfree(bs2);
+	}
+	pfree(bs);
+}
\ No newline at end of file
diff --git a/src/include/lib/blockset.h b/src/include/lib/blockset.h
new file mode 100644
index 0000000000..1966d17a8d
--- /dev/null
+++ b/src/include/lib/blockset.h
@@ -0,0 +1,24 @@
+/*-------------------------------------------------------------------------
+ *
+ * blockset.h
+ *		Data structure for operations on set of block numbers
+ *
+ * IDENTIFICATION
+ *    src/include/lib/blockset.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef BLOCKSET_H
+#define BLOCKSET_H
+
+#include "storage/block.h"
+
+typedef struct BlockSetData *BlockSet;	/* opaque handle; NULL is an empty set */
+/* NB: keep the result of blockset_set(); blockset_get() takes blkno first */
+extern BlockSet blockset_set(BlockSet bs, BlockNumber blkno);
+extern bool blockset_get(BlockNumber blkno, BlockSet bs);
+extern BlockNumber blockset_next(BlockSet bs, BlockNumber blkno);
+extern void blockset_free(BlockSet bs);
+
+#endif							/* BLOCKSET_H */
diff --git a/src/test/modules/test_blockset/.gitignore b/src/test/modules/test_blockset/.gitignore
new file mode 100644
index 0000000000..5dcb3ff972
--- /dev/null
+++ b/src/test/modules/test_blockset/.gitignore
@@ -0,0 +1,4 @@
+# Generated subdirectories
+/log/
+/results/
+/tmp_check/
diff --git a/src/test/modules/test_blockset/Makefile b/src/test/modules/test_blockset/Makefile
new file mode 100644
index 0000000000..091cf8c095
--- /dev/null
+++ b/src/test/modules/test_blockset/Makefile
@@ -0,0 +1,21 @@
+# src/test/modules/test_blockset/Makefile
+
+MODULE_big = test_blockset
+OBJS = test_blockset.o $(WIN32RES)
+PGFILEDESC = "test_blockset - test code for block set library"
+
+EXTENSION = test_blockset
+DATA = test_blockset--1.0.sql
+
+REGRESS = test_blockset
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_blockset
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_blockset/README b/src/test/modules/test_blockset/README
new file mode 100644
index 0000000000..730951ff03
--- /dev/null
+++ b/src/test/modules/test_blockset/README
@@ -0,0 +1,8 @@
+test_blockset overview
+=========================
+
+test_blockset is a test harness module for testing the block set data structure.
+There are two main tests:
+1. Test of compliance with Bitmapset on numbers available to Bitmapset
+2. Test of numbers that can exceed INT32_MAX but are just shifted by one bit
+from numbers kept in Bitmapset
\ No newline at end of file
diff --git a/src/test/modules/test_blockset/expected/test_blockset.out b/src/test/modules/test_blockset/expected/test_blockset.out
new file mode 100644
index 0000000000..02c29d5fc0
--- /dev/null
+++ b/src/test/modules/test_blockset/expected/test_blockset.out
@@ -0,0 +1,7 @@
+CREATE EXTENSION test_blockset;
+SELECT test_blockset();
+ test_blockset 
+---------------
+ 
+(1 row)
+
diff --git a/src/test/modules/test_blockset/sql/test_blockset.sql b/src/test/modules/test_blockset/sql/test_blockset.sql
new file mode 100644
index 0000000000..85d2886676
--- /dev/null
+++ b/src/test/modules/test_blockset/sql/test_blockset.sql
@@ -0,0 +1,3 @@
+CREATE EXTENSION test_blockset;
+
+SELECT test_blockset();
\ No newline at end of file
diff --git a/src/test/modules/test_blockset/test_blockset--1.0.sql b/src/test/modules/test_blockset/test_blockset--1.0.sql
new file mode 100644
index 0000000000..04eea8a614
--- /dev/null
+++ b/src/test/modules/test_blockset/test_blockset--1.0.sql
@@ -0,0 +1,8 @@
+/* src/test/modules/test_blockset/test_blockset--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_blockset" to load this file. \quit
+
+CREATE FUNCTION test_blockset()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_blockset/test_blockset.c b/src/test/modules/test_blockset/test_blockset.c
new file mode 100644
index 0000000000..7a1d1c86c8
--- /dev/null
+++ b/src/test/modules/test_blockset/test_blockset.c
@@ -0,0 +1,182 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_blockset.c
+ *		Test block set data structure.
+ *
+ * Copyright (c) 2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/test/modules/test_blockset/test_blockset.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "lib/blockset.h"
+#include "nodes/bitmapset.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(test_blockset);
+/* Full prototypes (not "()") so the calls below are type-checked */
+static void test_blockset_bms_compliance(int32_t test_limit);
+static void test_blockset_big_block_numbers(int32_t test_limit);
+
+/*
+ * SQL-callable entry point: runs both tests over a list of set sizes.
+ */
+Datum
+test_blockset(PG_FUNCTION_ARGS)
+{
+	static const int32_t bms_sizes[] = {0, 1, 2, 1337, 100000};
+	static const int32_t big_sizes[] = {1337, 31337};
+
+	for (size_t n = 0; n < sizeof(bms_sizes) / sizeof(bms_sizes[0]); n++)
+		test_blockset_bms_compliance(bms_sizes[n]);
+	for (size_t n = 0; n < sizeof(big_sizes) / sizeof(big_sizes[0]); n++)
+		test_blockset_big_block_numbers(big_sizes[n]);
+	PG_RETURN_VOID();
+}
+
+/*
+ * Fill a block set and a Bitmapset with test_limit random block numbers and
+ * check that iteration (blockset_next) and membership (blockset_get) agree.
+ */
+static void test_blockset_bms_compliance(int32_t test_limit)
+{
+	BlockSet bs = NULL;
+	Bitmapset *bms = NULL;
+	BlockNumber blockno;
+	int index;
+	int32_t point_index = 0;
+
+	for (int32_t i = 0; i < test_limit; i++)
+	{
+		/* bms does not support block numbers above INT32_MAX */
+		blockno = random() & INT32_MAX;
+		bs = blockset_set(bs, blockno);
+		bms = bms_add_member(bms, (int)blockno);
+	}
+
+	index = -1;
+	blockno = InvalidBlockNumber;
+
+	while (true)
+	{
+		point_index++;
+		BlockNumber next_bn = blockset_next(bs, blockno);
+		int next_index = bms_next_member(bms, index);
+
+		/* -2 is the Bitmapset end marker; break so the checks below still run */
+		if (next_bn == InvalidBlockNumber && next_index == -2)
+			break; /* We have found everything */
+
+		if (((BlockNumber)next_index) != next_bn)
+		{
+			elog(ERROR,
+				 "Bitmapset returned value %X different from block set %X,"
+				 " test_limit %d, point index %d",
+				 next_index, next_bn, test_limit, point_index);
+		}
+
+		if (!blockset_get(next_bn, bs))
+		{
+			elog(ERROR,
+				 "Block set did not found present item %X"
+				 " test_limit %d, point index %d",
+				 next_bn, test_limit, point_index);
+		}
+
+		index = next_index;
+		blockno = next_bn;
+	}
+
+	for (int32_t i = 0; i < test_limit; i++)
+	{
+		blockno = random() & INT32_MAX;
+		if (blockset_get(blockno, bs) != bms_is_member((int)blockno, bms))
+		{
+			elog(ERROR,
+				 "Block set did not agree with bitmapset item %X"
+				 " test_limit %d, point index %d",
+				 blockno, test_limit, point_index);
+		}
+	}
+
+	blockset_free(bs);
+	bms_free(bms);
+}
+
+/*
+ * This test is similar to test_blockset_bms_compliance(), except that
+ * every block number is shifted left by one bit before it is stored in
+ * the block set, to ensure that the block set operates correctly on
+ * values higher than INT32_MAX.  The Bitmapset keeps the unshifted
+ * numbers, so block set results are shifted back before comparing.
+ * The code deliberately mirrors the previous function.
+ */
+static void test_blockset_big_block_numbers(int32_t test_limit)
+{
+	BlockSet bs = NULL;
+	Bitmapset *bms = NULL;
+	BlockNumber blockno;
+	int index;
+	int32_t point_index = 0;
+
+	for (int32_t i = 0; i < test_limit; i++)
+	{
+		/* bms does not support block numbers above INT32_MAX */
+		blockno = random() & INT32_MAX;
+		bs = blockset_set(bs, blockno << 1);
+		bms = bms_add_member(bms, (int)blockno);
+	}
+
+	index = -1;
+	blockno = InvalidBlockNumber;
+
+	while (true)
+	{
+		point_index++;
+		BlockNumber next_bn = blockset_next(bs, blockno);
+		int next_index = bms_next_member(bms, index);
+
+		/* -2 is the Bitmapset end marker; break so the checks below still run */
+		if (next_bn == InvalidBlockNumber && next_index == -2)
+			break; /* We have found everything */
+
+		if (((BlockNumber)next_index) != (next_bn >> 1))
+		{
+			elog(ERROR,
+				 "Bitmapset returned value %X different from block set %X,"
+				 " test_limit %d, point index %d",
+				 next_index, next_bn, test_limit, point_index);
+		}
+
+		if (!blockset_get(next_bn, bs))
+		{
+			elog(ERROR,
+				 "Block set did not found present item %X"
+				 " test_limit %d, point index %d",
+				 next_bn, test_limit, point_index);
+		}
+
+		index = next_index;
+		blockno = next_bn;
+	}
+
+	for (int32_t i = 0; i < test_limit; i++)
+	{
+		blockno = random() & INT32_MAX;
+		if (blockset_get(blockno << 1, bs) != bms_is_member((int)blockno, bms))
+		{
+			elog(ERROR,
+				 "Block set did not agree with bitmapset item %X"
+				 " test_limit %d, point index %d",
+				 blockno, test_limit, point_index);
+		}
+	}
+
+	blockset_free(bs);
+	bms_free(bms);
+}
diff --git a/src/test/modules/test_blockset/test_blockset.control b/src/test/modules/test_blockset/test_blockset.control
new file mode 100644
index 0000000000..fdb7598c5a
--- /dev/null
+++ b/src/test/modules/test_blockset/test_blockset.control
@@ -0,0 +1,4 @@
+comment = 'Test code for block set library'
+default_version = '1.0'
+module_pathname = '$libdir/test_blockset'
+relocatable = true
-- 
2.20.1

