From 7f557dc99190469218f769042a695b777a40bc93 Mon Sep 17 00:00:00 2001
From: Amit Kapila <amit.kapila@enterprisedb.com>
Date: Wed, 23 Aug 2017 16:15:49 +0530
Subject: [PATCH 3/3] Improve locking strategy during VACUUM in Hash Index for
 regular tables.

Patch by Ashutosh Sharma.
---
 src/backend/access/hash/README     | 28 +++++++++++-------------
 src/backend/access/hash/hash.c     | 44 ++++++++++++++++++++++++++------------
 src/backend/access/hash/hashovfl.c | 13 +++++++----
 3 files changed, 51 insertions(+), 34 deletions(-)

diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 4465605..8921c78 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -396,8 +396,8 @@ The fourth operation is garbage collection (bulk deletion):
 			mark the target page dirty
 			write WAL for deleting tuples from target page
 			if this is the last bucket page, break out of loop
-			pin and x-lock next page
 			release prior lock and pin (except keep pin on primary bucket page)
+			pin and x-lock next page
 		if the page we have locked is not the primary bucket page:
 			release lock and take exclusive lock on primary bucket page
 		if there are no other pins on the primary bucket page:
@@ -415,21 +415,17 @@ The fourth operation is garbage collection (bulk deletion):
 Note that this is designed to allow concurrent splits and scans.  If a split
 occurs, tuples relocated into the new bucket will be visited twice by the
 scan, but that does no harm.  As we release the lock on bucket page during
-cleanup scan of a bucket, it will allow concurrent scan to start on a bucket
-and ensures that scan will always be behind cleanup.  It is must to keep scans
-behind cleanup, else vacuum could decrease the TIDs that are required to
-complete the scan.  Now, as the scan that returns multiple tuples from the
-same bucket page always expect next valid TID to be greater than or equal to
-the current TID, it might miss the tuples.  This holds true for backward scans
-as well (backward scans first traverse each bucket starting from first bucket
-to last overflow page in the chain).  We must be careful about the statistics
-reported by the VACUUM operation.  What we can do is count the number of
-tuples scanned, and believe this in preference to the stored tuple count if
-the stored tuple count and number of buckets did *not* change at any time
-during the scan.  This provides a way of correcting the stored tuple count if
-it gets out of sync for some reason.  But if a split or insertion does occur
-concurrently, the scan count is untrustworthy; instead, subtract the number of
-tuples deleted from the stored tuple count and use that.
+cleanup scan of a bucket, a concurrent scan can start on that bucket.  It is
+quite possible for a scan to get ahead of vacuum and for vacuum to remove
+items from the page being scanned, but that does no harm, as a scan always
+copies all the matching items from a page at once into a backend-local array.
+We must be careful about the statistics reported by the VACUUM operation.  What
+we can do is count the number of tuples scanned, and believe this in preference
+to the stored tuple count if the stored tuple count and number of buckets did
+*not* change at any time during the scan.  This provides a way of correcting the
+stored tuple count if it gets out of sync for some reason.  But if a split or
+insertion does occur concurrently, the scan count is untrustworthy; instead,
+subtract the number of tuples deleted from the stored tuple count and use that.
 
 
 Free Space Management
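
To make the counting rule above concrete, here is a minimal C sketch of it.
All names below are hypothetical; the real logic lives in the hash index
vacuum code, not in a helper like this.

/*
 * Sketch of the tuple-count rule from the README text above.  The function
 * and parameter names are illustrative only.
 */
static double
choose_index_tuple_count(double tuples_scanned,	/* counted by this VACUUM */
						 double stored_tuples,	/* count stored in metapage */
						 double tuples_removed,	/* deleted by this VACUUM */
						 bool meta_changed)		/* did the stored tuple count
												 * or bucket count change
												 * during the scan? */
{
	if (!meta_changed)
		return tuples_scanned;	/* the fresh scan count corrects any drift */

	/* a concurrent split or insertion makes the scan count untrustworthy */
	return stored_tuples - tuples_removed;
}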
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 45a3a5a..012e00f 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -660,11 +660,9 @@ hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
  * that the next valid TID will be greater than or equal to the current
  * valid TID.  There can't be any concurrent scans in progress when we first
  * enter this function because of the cleanup lock we hold on the primary
- * bucket page, but as soon as we release that lock, there might be.  We
- * handle that by conspiring to prevent those scans from passing our cleanup
- * scan.  To do that, we lock the next page in the bucket chain before
- * releasing the lock on the previous page.  (This type of lock chaining is
- * not ideal, so we might want to look for a better solution at some point.)
+ * bucket page, but as soon as we release that lock, there might be.  We
+ * need not worry about such scans, however, because hash index scans work
+ * in page-at-a-time mode.
  *
  * We need to retain a pin on the primary bucket to ensure that no concurrent
  * split can start.
@@ -833,18 +831,36 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
 		if (!BlockNumberIsValid(blkno))
 			break;
 
-		next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
-											  LH_OVERFLOW_PAGE,
-											  bstrategy);
-
 		/*
-		 * release the lock on previous page after acquiring the lock on next
-		 * page
+		 * Because hash index scans work in page-at-a-time mode, vacuum can
+		 * release the lock on the previous page before acquiring the lock
+		 * on the next page for regular tables.  For unlogged tables,
+		 * however, we avoid this, as we do not want a scan to pass vacuum
+		 * while both are running on the same bucket page; that keeps the
+		 * dead-marking of index tuples in _hash_kill_items() safe.
 		 */
-		if (retain_pin)
-			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+		if (RelationNeedsWAL(rel))
+		{
+			if (retain_pin)
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+			else
+				_hash_relbuf(rel, buf);
+
+			next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
+												  LH_OVERFLOW_PAGE,
+												  bstrategy);
+		}
 		else
-			_hash_relbuf(rel, buf);
+		{
+			next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
+												  LH_OVERFLOW_PAGE,
+												  bstrategy);
+
+			if (retain_pin)
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+			else
+				_hash_relbuf(rel, buf);
+		}
 
 		buf = next_buf;
 	}
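
Releasing the lock first is safe for regular tables because of the
page-at-a-time behavior the comment above relies on: a scan saves every
matching item from a page while it still holds the page lock.  The sketch
below illustrates that pattern; the type and function names are hypothetical
and simplified from what hashsearch.c actually does.

/*
 * Page-at-a-time scanning, simplified.  Matching TIDs are copied into a
 * backend-local array under the page lock, so items that VACUUM removes
 * afterwards have already been saved.
 */
typedef struct LocalScanPos
{
	ItemPointerData items[MaxIndexTuplesPerPage];	/* backend-local copies */
	int			nitems;
} LocalScanPos;

static void
copy_matching_items(Page page, LocalScanPos *pos)
{
	OffsetNumber offnum,
				maxoff = PageGetMaxOffsetNumber(page);

	pos->nitems = 0;
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		IndexTuple	itup = (IndexTuple) PageGetItem(page,
													PageGetItemId(page, offnum));

		/* key comparison elided; assume every item matches the scan */
		pos->items[pos->nitems++] = itup->t_tid;
	}
	/* caller can now drop the page lock; VACUUM may then modify the page */
}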
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index c206e70..b41afbb 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -524,7 +524,7 @@ _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
 	 * Fix up the bucket chain.  this is a doubly-linked list, so we must fix
 	 * up the bucket chain members behind and ahead of the overflow page being
 	 * deleted.  Concurrency issues are avoided by using lock chaining as
-	 * described atop hashbucketcleanup.
+	 * described atop _hash_squeezebucket.
 	 */
 	if (BlockNumberIsValid(prevblkno))
 	{
@@ -790,9 +790,14 @@ _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
  *	Caller must acquire cleanup lock on the primary page of the target
  *	bucket to exclude any scans that are in progress, which could easily
  *	be confused into returning the same tuple more than once or some tuples
- *	not at all by the rearrangement we are performing here.  To prevent
- *	any concurrent scan to cross the squeeze scan we use lock chaining
- *	similar to hasbucketcleanup.  Refer comments atop hashbucketcleanup.
+ *	not at all by the rearrangement we are performing here.  This means
+ *	there can't be any concurrent scans in progress when we first enter
+ *	this function because of the cleanup lock we hold on the primary bucket
+ *	page, but as soon as we release that lock, there might be.  To prevent
+ *	any concurrent scan from crossing the squeeze scan, we use lock
+ *	chaining, i.e. we lock the next page in the bucket chain before
+ *	releasing the lock on the previous page.  (This type of lock chaining
+ *	is not ideal, so we might want to look for a better solution someday.)
  *
  *	We need to retain a pin on the primary bucket to ensure that no concurrent
  *	split can start.
-- 
1.8.4.msysgit.0
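
For contrast with the relaxed ordering now used for regular tables, the
lock-chaining pattern that _hash_squeezebucket continues to rely on looks
roughly like the hypothetical helper below (chain_to_next_page is not a
real function; the real chaining is done inline).

/*
 * Lock chaining: take the lock on the next page in the bucket chain before
 * giving up the lock on the current one, so no concurrent scan can slip
 * past between the two pages.
 */
static Buffer
chain_to_next_page(Relation rel, Buffer curbuf, BlockNumber nextblkno)
{
	Buffer		nextbuf = ReadBuffer(rel, nextblkno);

	LockBuffer(nextbuf, BUFFER_LOCK_EXCLUSIVE);	/* lock the next page first */
	_hash_relbuf(rel, curbuf);					/* ...then release current */

	return nextbuf;
}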

