Hi,

>
> On 03/29/2017 09:16 PM, Ashutosh Sharma wrote:
>>>
>>> This patch needs a rebase.
>>
>>
>> Please try applying these patches on top of [1]. I think you should be
>> able
>> to apply it cleanly. Sorry, I think I forgot to mention this point in my
>> earlier mail.
>>
>> [1] -
>>
>> https://www.postgresql.org/message-id/CAE9k0P%3DV2LhtyeMXd295fhisp%3DNWUhRVJ9EZQCDowWiY9rSohQ%40mail.gmail.com
>>
>
> Thanks, that works !
>
> As you have provided a patch for Robert's comments, and no other review have
> been posted I'm moving this patch to "Ready for Committer" for additional
> committer feedback.

Please find the attached new version of patches for page scan mode in
hash index rebased on top of - [1].

[1] - 
https://www.postgresql.org/message-id/CAE9k0P%3D3rtgUtxopG%2BXQVxgASFzAnGd9sNmYUaj_%3DKeVsKGUdA%40mail.gmail.com
From 498723199f4b14ff9917aca13abf30f9ea261ca7 Mon Sep 17 00:00:00 2001
From: ashu <ashu@localhost.localdomain>
Date: Sat, 1 Apr 2017 12:09:46 +0530
Subject: [PATCH] Rewrite hash index scans to work a page at a timev5

Patch by Ashutosh Sharma
---
 src/backend/access/hash/README       |   9 +-
 src/backend/access/hash/hash.c       | 140 ++---------
 src/backend/access/hash/hashpage.c   |  17 +-
 src/backend/access/hash/hashsearch.c | 441 +++++++++++++++++++++++++++++++----
 src/backend/access/hash/hashutil.c   |  29 ++-
 src/include/access/hash.h            |  44 ++++
 6 files changed, 509 insertions(+), 171 deletions(-)

diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 1541438..f0a7bdf 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -243,10 +243,11 @@ The reader algorithm is:
 -- then, per read request:
 	reacquire content lock on current page
 	step to next page if necessary (no chaining of content locks, but keep
-     the pin on the primary bucket throughout the scan; we also maintain
-     a pin on the page currently being scanned)
-	get tuple
-	release content lock
+     the pin on the primary bucket throughout the scan)
+    save all the matching tuples from the current index page into an items array
+    release pin and content lock (but if it is the primary bucket page, retain
+    its pin until the end of the scan)
+    get tuple from the items array
 -- at scan shutdown:
 	release all pins still held
 
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index b835f77..bd2827a 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -268,66 +268,23 @@ bool
 hashgettuple(IndexScanDesc scan, ScanDirection dir)
 {
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
-	Relation	rel = scan->indexRelation;
-	Buffer		buf;
-	Page		page;
 	OffsetNumber offnum;
-	ItemPointer current;
 	bool		res;
+	HashScanPosItem	*currItem;
 
 	/* Hash indexes are always lossy since we store only the hash code */
 	scan->xs_recheck = true;
 
 	/*
-	 * We hold pin but not lock on current buffer while outside the hash AM.
-	 * Reacquire the read lock here.
-	 */
-	if (BufferIsValid(so->hashso_curbuf))
-		LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
-
-	/*
 	 * If we've already initialized this scan, we can just advance it in the
 	 * appropriate direction.  If we haven't done so yet, we call a routine to
 	 * get the first item in the scan.
 	 */
-	current = &(so->hashso_curpos);
-	if (ItemPointerIsValid(current))
+	if (!HashScanPosIsValid(so->currPos))
+		res = _hash_first(scan, dir);
+	else
 	{
 		/*
-		 * An insertion into the current index page could have happened while
-		 * we didn't have read lock on it.  Re-find our position by looking
-		 * for the TID we previously returned.  (Because we hold a pin on the
-		 * primary bucket page, no deletions or splits could have occurred;
-		 * therefore we can expect that the TID still exists in the current
-		 * index page, at an offset >= where we were.)
-		 */
-		OffsetNumber maxoffnum;
-
-		buf = so->hashso_curbuf;
-		Assert(BufferIsValid(buf));
-		page = BufferGetPage(buf);
-
-		/*
-		 * We don't need test for old snapshot here as the current buffer is
-		 * pinned, so vacuum can't clean the page.
-		 */
-		maxoffnum = PageGetMaxOffsetNumber(page);
-		for (offnum = ItemPointerGetOffsetNumber(current);
-			 offnum <= maxoffnum;
-			 offnum = OffsetNumberNext(offnum))
-		{
-			IndexTuple	itup;
-
-			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
-			if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
-				break;
-		}
-		if (offnum > maxoffnum)
-			elog(ERROR, "failed to re-find scan position within index \"%s\"",
-				 RelationGetRelationName(rel));
-		ItemPointerSetOffsetNumber(current, offnum);
-
-		/*
 		 * Check to see if we should kill the previously-fetched tuple.
 		 */
 		if (scan->kill_prior_tuple)
@@ -346,9 +303,11 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
 
 			if (so->numKilled < MaxIndexTuplesPerPage)
 			{
-				so->killedItems[so->numKilled].heapTid = so->hashso_heappos;
-				so->killedItems[so->numKilled].indexOffset =
-							ItemPointerGetOffsetNumber(&(so->hashso_curpos));
+				currItem = &so->currPos.items[so->currPos.itemIndex];
+				offnum = currItem->indexOffset;
+
+				so->killedItems[so->numKilled].heapTid = currItem->heapTid;
+				so->killedItems[so->numKilled].indexOffset = offnum;
 				so->numKilled++;
 			}
 		}
@@ -358,30 +317,10 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
 		 */
 		res = _hash_next(scan, dir);
 	}
-	else
-		res = _hash_first(scan, dir);
-
-	/*
-	 * Skip killed tuples if asked to.
-	 */
-	if (scan->ignore_killed_tuples)
-	{
-		while (res)
-		{
-			offnum = ItemPointerGetOffsetNumber(current);
-			page = BufferGetPage(so->hashso_curbuf);
-			if (!ItemIdIsDead(PageGetItemId(page, offnum)))
-				break;
-			res = _hash_next(scan, dir);
-		}
-	}
-
-	/* Release read lock on current buffer, but keep it pinned */
-	if (BufferIsValid(so->hashso_curbuf))
-		LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
 
 	/* Return current heap TID on success */
-	scan->xs_ctup.t_self = so->hashso_heappos;
+	currItem = &so->currPos.items[so->currPos.itemIndex];
+	scan->xs_ctup.t_self = currItem->heapTid;
 
 	return res;
 }
@@ -396,35 +335,22 @@ hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
 	bool		res;
 	int64		ntids = 0;
+	HashScanPosItem *currItem;
 
 	res = _hash_first(scan, ForwardScanDirection);
 
 	while (res)
 	{
-		bool		add_tuple;
+		currItem = &so->currPos.items[so->currPos.itemIndex];
 
 		/*
-		 * Skip killed tuples if asked to.
+		 * _hash_first() or _hash_next() never returns
+		 * dead tuples. Therefore, we can always add
+		 * the tuples into TIDBitmap without checking
+		 * if a tuple is dead or not.
 		 */
-		if (scan->ignore_killed_tuples)
-		{
-			Page		page;
-			OffsetNumber offnum;
-
-			offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos));
-			page = BufferGetPage(so->hashso_curbuf);
-			add_tuple = !ItemIdIsDead(PageGetItemId(page, offnum));
-		}
-		else
-			add_tuple = true;
-
-		/* Save tuple ID, and continue scanning */
-		if (add_tuple)
-		{
-			/* Note we mark the tuple ID as requiring recheck */
-			tbm_add_tuples(tbm, &(so->hashso_heappos), 1, true);
-			ntids++;
-		}
+		tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
+		ntids++;
 
 		res = _hash_next(scan, ForwardScanDirection);
 	}
@@ -448,12 +374,9 @@ hashbeginscan(Relation rel, int nkeys, int norderbys)
 	scan = RelationGetIndexScan(rel, nkeys, norderbys);
 
 	so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
-	so->hashso_curbuf = InvalidBuffer;
+	HashScanPosInvalidate(so->currPos);
 	so->hashso_bucket_buf = InvalidBuffer;
 	so->hashso_split_bucket_buf = InvalidBuffer;
-	/* set position invalid (this will cause _hash_first call) */
-	ItemPointerSetInvalid(&(so->hashso_curpos));
-	ItemPointerSetInvalid(&(so->hashso_heappos));
 
 	so->hashso_buc_populated = false;
 	so->hashso_buc_split = false;
@@ -476,23 +399,14 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
 	Relation	rel = scan->indexRelation;
 
-	/*
-	 * Before leaving current page, deal with any killed items.
-	 * Also, ensure that we acquire lock on current page before
-	 * calling _hash_kill_items.
-	 */
+	/* Before leaving current page, deal with any killed items */
 	if (so->numKilled > 0)
-	{
-		LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
 		_hash_kill_items(scan);
-		LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
-	}
 
 	_hash_dropscanbuf(rel, so);
 
 	/* set position invalid (this will cause _hash_first call) */
-	ItemPointerSetInvalid(&(so->hashso_curpos));
-	ItemPointerSetInvalid(&(so->hashso_heappos));
+	HashScanPosInvalidate(so->currPos);
 
 	/* Update scan key, if a new one is given */
 	if (scankey && scan->numberOfKeys > 0)
@@ -515,17 +429,9 @@ hashendscan(IndexScanDesc scan)
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
 	Relation	rel = scan->indexRelation;
 
-	/*
-	 * Before leaving current page, deal with any killed items.
-	 * Also, ensure that we acquire lock on current page before
-	 * calling _hash_kill_items.
-	 */
+	/* Before leaving current page, deal with any killed items */
 	if (so->numKilled > 0)
-	{
-		LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
 		_hash_kill_items(scan);
-		LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
-	}
 
 	_hash_dropscanbuf(rel, so);
 
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 61ca2ec..cd3c679 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -298,20 +298,23 @@ _hash_dropscanbuf(Relation rel, HashScanOpaque so)
 {
 	/* release pin we hold on primary bucket page */
 	if (BufferIsValid(so->hashso_bucket_buf) &&
-		so->hashso_bucket_buf != so->hashso_curbuf)
+		so->hashso_bucket_buf != so->currPos.buf)
 		_hash_dropbuf(rel, so->hashso_bucket_buf);
-	so->hashso_bucket_buf = InvalidBuffer;
 
 	/* release pin we hold on primary bucket page  of bucket being split */
 	if (BufferIsValid(so->hashso_split_bucket_buf) &&
-		so->hashso_split_bucket_buf != so->hashso_curbuf)
+		so->hashso_split_bucket_buf != so->currPos.buf)
 		_hash_dropbuf(rel, so->hashso_split_bucket_buf);
-	so->hashso_split_bucket_buf = InvalidBuffer;
 
 	/* release any pin we still hold */
-	if (BufferIsValid(so->hashso_curbuf))
-		_hash_dropbuf(rel, so->hashso_curbuf);
-	so->hashso_curbuf = InvalidBuffer;
+	if (BufferIsValid(so->currPos.buf) &&
+		((so->hashso_bucket_buf == so->currPos.buf) ||
+		(so->hashso_split_bucket_buf == so->currPos.buf)))
+		_hash_dropbuf(rel, so->currPos.buf);
+
+	so->hashso_bucket_buf = InvalidBuffer;
+	so->hashso_split_bucket_buf = InvalidBuffer;
+	so->currPos.buf = InvalidBuffer;
 
 	/* reset split scan */
 	so->hashso_buc_populated = false;
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 2d92049..80c51d1 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -20,44 +20,144 @@
 #include "pgstat.h"
 #include "utils/rel.h"
 
+static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP,
+			ScanDirection dir);
+static int _hash_load_qualified_items(IndexScanDesc scan, Page page,
+			OffsetNumber offnum, OffsetNumber maxoff,
+			ScanDirection dir);
+static inline void _hash_saveitem(HashScanOpaque so, int itemIndex,
+			OffsetNumber offnum, IndexTuple itup);
+static void _hash_readnext(IndexScanDesc scan, Buffer *bufp,
+			Page *pagep, HashPageOpaque *opaquep);
 
 /*
  *	_hash_next() -- Get the next item in a scan.
  *
- *		On entry, we have a valid hashso_curpos in the scan, and a
- *		pin and read lock on the page that contains that item.
- *		We find the next item in the scan, if any.
- *		On success exit, we have the page containing the next item
- *		pinned and locked.
+ *		On entry, so->currPos describes the current page, which may
+ *		be pinned but not locked, and so->currPos.itemIndex identifies
+ *		which item was previously returned.
+ *
+ *		On successful exit, scan->xs_ctup.t_self is set to the TID
+ *		of the next heap tuple and so->currPos is updated as needed.
+ *		scan->xs_itup is not set, since only the hash code is stored
+ *		in the index.
+ *
+ *		On failure exit (no more tuples), we return FALSE with no
+ *		pins or locks held.
  */
 bool
 _hash_next(IndexScanDesc scan, ScanDirection dir)
 {
 	Relation	rel = scan->indexRelation;
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
+	HashScanPosItem *currItem;
+	BlockNumber blkno;
 	Buffer		buf;
 	Page		page;
-	OffsetNumber offnum;
-	ItemPointer current;
-	IndexTuple	itup;
-
-	/* we still have the buffer pinned and read-locked */
-	buf = so->hashso_curbuf;
-	Assert(BufferIsValid(buf));
+	HashPageOpaque opaque;
+	bool		end_of_scan = false;
 
 	/*
-	 * step to next valid tuple.
+	 * Advance to next tuple on current page; or if there's no more,
+	 * try to read data from next or prev page based on the scan
+	 * direction. Before moving to the next or prev page make sure
+	 * that we deal with all the killed items.
 	 */
-	if (!_hash_step(scan, &buf, dir))
+	if (ScanDirectionIsForward(dir))
+	{
+		if (++so->currPos.itemIndex > so->currPos.lastItem)
+		{
+			if (so->numKilled > 0)
+				_hash_kill_items(scan);
+
+			blkno = so->currPos.nextPage;
+			if (BlockNumberIsValid(blkno))
+			{
+				buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
+				so->currPos.buf = buf;
+				if (!_hash_readpage(scan, &buf, dir))
+					end_of_scan = true;
+			}
+			else if (so->hashso_buc_populated && !so->hashso_buc_split)
+			{
+				/*
+				 * end of bucket, scan bucket being split if there was a
+				 * split in progress at the start of scan.
+				 */
+				buf = so->currPos.buf = so->hashso_split_bucket_buf;
+				Assert(BufferIsValid(buf));
+				LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+				/*
+				 * setting hashso_buc_split to true indicates that we are
+				 * scanning bucket being split.
+				 */
+				so->hashso_buc_split = true;
+
+				if (!_hash_readpage(scan, &buf, dir))
+					end_of_scan = true;
+			}
+			else
+				end_of_scan = true;
+		}
+	}
+	else
+	{
+		if (--so->currPos.itemIndex < so->currPos.firstItem)
+		{
+			if (so->numKilled > 0)
+				_hash_kill_items(scan);
+
+			blkno = so->currPos.prevPage;
+			if (BlockNumberIsValid(blkno))
+			{
+				buf = _hash_getbuf(rel, blkno, HASH_READ,
+								   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+				so->currPos.buf = buf;
+				if (!_hash_readpage(scan, &buf, dir))
+					end_of_scan = true;
+			}
+			else if (so->hashso_buc_populated && so->hashso_buc_split)
+			{
+				/*
+				 * end of bucket, scan bucket being populated if there was a
+				 * split in progress at the start of scan.
+				 */
+				buf = so->hashso_bucket_buf;
+				Assert(BufferIsValid(buf));
+				LockBuffer(buf, BUFFER_LOCK_SHARE);
+				page = BufferGetPage(buf);
+				opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+				/* move to the end of bucket chain */
+				while (BlockNumberIsValid(opaque->hasho_nextblkno))
+					_hash_readnext(scan, &buf, &page, &opaque);
+
+				/*
+				 * clearing hashso_buc_split indicates that we are now
+				 * scanning the bucket being populated.
+				 */
+				so->hashso_buc_split = false;
+				so->currPos.buf = buf;
+
+				if (!_hash_readpage(scan, &buf, dir))
+					end_of_scan = true;
+			}
+			else
+				end_of_scan = true;
+		}
+	}
+
+	if (end_of_scan)
+	{
+		HashScanPosInvalidate(so->currPos);
+		_hash_dropscanbuf(rel, so);
 		return false;
+	}
 
-	/* if we're here, _hash_step found a valid tuple */
-	current = &(so->hashso_curpos);
-	offnum = ItemPointerGetOffsetNumber(current);
-	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
-	page = BufferGetPage(buf);
-	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
-	so->hashso_heappos = itup->t_tid;
+	/* OK, itemIndex says what to return */
+	currItem = &so->currPos.items[so->currPos.itemIndex];
+	scan->xs_ctup.t_self = currItem->heapTid;
 
 	return true;
 }
@@ -212,11 +312,15 @@ _hash_readprev(IndexScanDesc scan,
 /*
  *	_hash_first() -- Find the first item in a scan.
  *
- *		Find the first item in the index that
- *		satisfies the qualification associated with the scan descriptor. On
- *		success, the page containing the current index tuple is read locked
- *		and pinned, and the scan's opaque data entry is updated to
- *		include the buffer.
+ *		We find the first item (or, if backward scan, the last item) in
+ *		the index that satisfies the qualification associated with the
+ *		scan descriptor. On success, data about the matching tuple(s)
+ *		on the page has been loaded into so->currPos and the page lock
+ *		has been released (only a primary bucket page stays pinned);
+ *		scan->xs_ctup.t_self is set to the heap TID of the current tuple.
+ *
+ *		If there are no matching items in the index, we return FALSE,
+ *		with no pins or locks held.
  */
 bool
 _hash_first(IndexScanDesc scan, ScanDirection dir)
@@ -229,15 +333,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 	Buffer		buf;
 	Page		page;
 	HashPageOpaque opaque;
-	IndexTuple	itup;
-	ItemPointer current;
-	OffsetNumber offnum;
 
 	pgstat_count_index_scan(rel);
 
-	current = &(so->hashso_curpos);
-	ItemPointerSetInvalid(current);
-
 	/*
 	 * We do not support hash scans with no index qualification, because we
 	 * would have to read the whole index rather than just one bucket. That
@@ -356,17 +454,15 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 			_hash_readnext(scan, &buf, &page, &opaque);
 	}
 
-	/* Now find the first tuple satisfying the qualification */
-	if (!_hash_step(scan, &buf, dir))
-		return false;
+	/* remember which buffer we have pinned, if any */
+	Assert(BufferIsInvalid(so->currPos.buf));
+	so->currPos.buf = buf;
 
-	/* if we're here, _hash_step found a valid tuple */
-	offnum = ItemPointerGetOffsetNumber(current);
-	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
-	page = BufferGetPage(buf);
-	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
-	so->hashso_heappos = itup->t_tid;
+	/* Now find all the tuples satisfying the qualification from a page */
+	if (!_hash_readpage(scan, &buf, dir))
+		return false;
 
+	/* if we're here, _hash_readpage found at least one valid tuple */
 	return true;
 }
 
@@ -575,3 +671,266 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 	ItemPointerSet(current, blkno, offnum);
 	return true;
 }
+
+/*
+ *	_hash_readpage() -- Load data from current index page into so->currPos
+ *
+ *	*bufP is the pinned and locked page to start from.  Every tuple on it
+ *	that satisfies the scan qualification is saved into so->currPos.items;
+ *	if none match, we step to the next or previous page of the bucket
+ *	chain, as indicated by dir, until a match is found or the chain ends.
+ *	Before returning true the page is unlocked (and unpinned, unless it is
+ *	a primary bucket page).  Returns false if no matching item was found.
+ */
+static bool
+_hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+{
+	Relation	rel = scan->indexRelation;
+	HashScanOpaque so = (HashScanOpaque) scan->opaque;
+	Buffer		buf;
+	Page		page;
+	HashPageOpaque	opaque;
+	OffsetNumber	maxoff;
+	OffsetNumber	offnum;
+	uint16			itemIndex;
+
+	buf = *bufP;
+	Assert(BufferIsValid(buf));
+	so->currPos.buf = buf;		/* keep the scan position in sync with *bufP */
+	so->currPos.currPage = BufferGetBlockNumber(buf);
+	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+	page = BufferGetPage(buf);
+	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+	maxoff = PageGetMaxOffsetNumber(page);
+
+	if (ScanDirectionIsForward(dir))
+	{
+		/* new page, locate starting position by binary search */
+		offnum = _hash_binsearch(page, so->hashso_sk_hash);
+
+		itemIndex = _hash_load_qualified_items(scan, page, offnum, maxoff, dir);
+
+		while (itemIndex == 0)
+		{
+			/*
+			 * Could not find any matching tuples in the current page, move
+			 * to the next page. Before leaving the current page, also deal
+			 * with any killed items.
+			 */
+			if (so->numKilled > 0)
+				_hash_kill_items(scan);
+
+			_hash_readnext(scan, &buf, &page, &opaque);
+			if (BufferIsValid(buf))
+			{
+				so->currPos.buf = buf;
+				so->currPos.currPage = BufferGetBlockNumber(buf);
+				maxoff = PageGetMaxOffsetNumber(page);
+				offnum = _hash_binsearch(page, so->hashso_sk_hash);
+				itemIndex = _hash_load_qualified_items(scan, page, offnum,
+													   maxoff, dir);
+			}
+			else
+			{
+				/*
+				 * No more matching tuples were found; return false.
+				 * Remember the prev and next block numbers so that a
+				 * cursor can resume the scan from this page.
+				 * NOTE(review): opaque belongs to the page that
+				 * _hash_readnext() just left -- confirm it is still valid.
+				 */
+				so->currPos.prevPage = (opaque)->hasho_prevblkno;
+				so->currPos.nextPage = (opaque)->hasho_nextblkno;
+				return false;
+			}
+		}
+
+		/* Read the chain link while the page is still locked. */
+		so->currPos.nextPage = opaque->hasho_nextblkno;
+		if (so->currPos.buf == so->hashso_bucket_buf ||
+			so->currPos.buf == so->hashso_split_bucket_buf)
+		{
+			so->currPos.prevPage = InvalidBlockNumber;
+			LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		}
+		else
+		{
+			so->currPos.prevPage = opaque->hasho_prevblkno;
+			_hash_relbuf(rel, so->currPos.buf);
+		}
+		so->currPos.firstItem = 0;
+		so->currPos.lastItem = itemIndex - 1;
+		so->currPos.itemIndex = 0;
+	}
+	else
+	{
+		/* new page, locate starting position by binary search */
+		offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+
+		itemIndex = _hash_load_qualified_items(scan, page, offnum, maxoff, dir);
+
+		while (itemIndex == MaxIndexTuplesPerPage)
+		{
+			/*
+			 * Could not find any matching tuples in the current page, move
+			 * to the prev page. Before leaving the current page, also deal
+			 * with any killed items.
+			 */
+			if (so->numKilled > 0)
+				_hash_kill_items(scan);
+
+			_hash_readprev(scan, &buf, &page, &opaque);
+			if (BufferIsValid(buf))
+			{
+				so->currPos.buf = buf;
+				so->currPos.currPage = BufferGetBlockNumber(buf);
+				maxoff = PageGetMaxOffsetNumber(page);
+				offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+				itemIndex = _hash_load_qualified_items(scan, page, offnum,
+													   maxoff, dir);
+			}
+			else
+			{
+				/*
+				 * No more matching tuples were found; return false.
+				 * Remember the next block number so that a cursor can
+				 * resume the scan from this page.
+				 * NOTE(review): opaque belongs to the page that
+				 * _hash_readprev() just left -- confirm it is still valid.
+				 */
+				so->currPos.prevPage = InvalidBlockNumber;
+				so->currPos.nextPage = (opaque)->hasho_nextblkno;
+				return false;
+			}
+		}
+
+		/* Read the chain link while the page is still locked. */
+		so->currPos.nextPage = opaque->hasho_nextblkno;
+		if (so->currPos.buf == so->hashso_bucket_buf ||
+			so->currPos.buf == so->hashso_split_bucket_buf)
+		{
+			so->currPos.prevPage = InvalidBlockNumber;
+			LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		}
+		else
+		{
+			so->currPos.prevPage = opaque->hasho_prevblkno;
+			_hash_relbuf(rel, so->currPos.buf);
+		}
+		so->currPos.firstItem = itemIndex;
+		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
+		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
+	}
+
+	return (so->currPos.firstItem <= so->currPos.lastItem);
+}
+
+/*
+ * Load all the qualified items from the current index page into
+ * so->currPos.items.  Helper function for _hash_readpage.
+ *
+ * offnum is the starting offset, already located by the caller's binary
+ * search; maxoff is the page's highest offset.  A forward scan fills
+ * items[] upward from slot 0 and returns the number of items saved; a
+ * backward scan fills it downward from the last slot and returns the
+ * index of the first slot used (MaxIndexTuplesPerPage if none matched).
+ */
+static int
+_hash_load_qualified_items(IndexScanDesc scan, Page page, OffsetNumber offnum,
+						   OffsetNumber maxoff, ScanDirection dir)
+{
+	HashScanOpaque so = (HashScanOpaque) scan->opaque;
+	IndexTuple      itup;
+	int				itemIndex;
+
+	if (ScanDirectionIsForward(dir))
+	{
+		/* load items[] in ascending order */
+		itemIndex = 0;
+
+		while (offnum <= maxoff)
+		{
+			Assert(offnum >= FirstOffsetNumber);
+			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+
+			/*
+			 * skip the tuples that are moved by split operation
+			 * for the scan that has started when split was in
+			 * progress. Also, skip the tuples that are marked
+			 * as dead.
+			 */
+			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
+				(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
+				(scan->ignore_killed_tuples &&
+				(ItemIdIsDead(PageGetItemId(page, offnum)))))
+			{
+				offnum = OffsetNumberNext(offnum);	/* move forward */
+				continue;
+			}
+
+			if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) &&
+				_hash_checkqual(scan, itup))
+			{
+				/* tuple is qualified, so remember it */
+				_hash_saveitem(so, itemIndex, offnum, itup);
+				itemIndex++;
+			}
+
+			offnum = OffsetNumberNext(offnum);
+		}
+
+		Assert(itemIndex <= MaxIndexTuplesPerPage);
+		return itemIndex;
+	}
+	else
+	{
+		/* load items[] in descending order */
+		itemIndex = MaxIndexTuplesPerPage;
+
+		while (offnum >= FirstOffsetNumber)
+		{
+			Assert(offnum <= maxoff);
+			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+
+			/*
+			 * skip the tuples that are moved by split operation
+			 * for the scan that has started when split was in
+			 * progress. Also, skip the tuples that are marked
+			 * as dead.
+			 */
+			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
+				(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
+				(scan->ignore_killed_tuples &&
+				(ItemIdIsDead(PageGetItemId(page, offnum)))))
+			{
+				offnum = OffsetNumberPrev(offnum);	/* move back */
+				continue;
+			}
+
+			if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) &&
+				_hash_checkqual(scan, itup))
+			{
+				itemIndex--;
+				/* tuple is qualified, so remember it */
+				_hash_saveitem(so, itemIndex, offnum, itup);
+			}
+
+			offnum = OffsetNumberPrev(offnum);
+		}
+
+		Assert(itemIndex >= 0);
+		return itemIndex;
+	}
+}
+
+/* Record itup's heap TID and page offset in so->currPos.items[itemIndex] */
+static inline void
+_hash_saveitem(HashScanOpaque so, int itemIndex,
+			   OffsetNumber offnum, IndexTuple itup)
+{
+	HashScanPosItem *slot = so->currPos.items + itemIndex;
+
+	slot->indexOffset = offnum;
+	slot->heapTid = itup->t_tid;
+}
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index 2e99719..a4a03e0 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -463,6 +463,9 @@ void
 _hash_kill_items(IndexScanDesc scan)
 {
 	HashScanOpaque	so = (HashScanOpaque) scan->opaque;
+	Relation    rel = scan->indexRelation;
+	BlockNumber blkno;
+	Buffer  buf;
 	Page	page;
 	HashPageOpaque	opaque;
 	OffsetNumber	offnum, maxoff;
@@ -472,6 +475,7 @@ _hash_kill_items(IndexScanDesc scan)
 
 	Assert(so->numKilled > 0);
 	Assert(so->killedItems != NULL);
+	Assert(BufferIsValid(so->currPos.buf));
 
 	/*
 	 * Always reset the scan state, so we don't look for same
@@ -479,7 +483,23 @@ _hash_kill_items(IndexScanDesc scan)
 	 */
 	so->numKilled = 0;
 
-	page = BufferGetPage(so->hashso_curbuf);
+	blkno = so->currPos.currPage;
+	if (so->hashso_bucket_buf == so->currPos.buf)
+	{
+		buf = so->currPos.buf;
+		LockBuffer(buf, BUFFER_LOCK_SHARE);
+	}
+	else
+	{
+		if (BlockNumberIsValid(blkno))
+			buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
+
+		/* It might not exist anymore; in which case we can't hint it. */
+		if (!BufferIsValid(buf))
+			return;
+	}
+
+	page = BufferGetPage(buf);
 	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 	maxoff = PageGetMaxOffsetNumber(page);
 
@@ -511,6 +531,11 @@ _hash_kill_items(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
-		MarkBufferDirtyHint(so->hashso_curbuf, true);
+		MarkBufferDirtyHint(buf, true);
 	}
+
+	if (so->hashso_bucket_buf == so->currPos.buf)
+		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+	else
+		_hash_relbuf(rel, buf);
 }
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index eb1df57..3b01e3e 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -103,6 +103,44 @@ typedef struct HashScanPosItem    /* what we remember about each match */
 	OffsetNumber indexOffset;	/* index item's location within page */
 } HashScanPosItem;
 
+typedef struct HashScanPosData
+{
+	Buffer		buf;			/* page's buffer; pin kept only on bucket pages */
+	BlockNumber currPage;		/* current hash index page */
+	BlockNumber nextPage;		/* next overflow page */
+	BlockNumber prevPage;		/* prev overflow or bucket page, or invalid */
+
+	/*
+	 * The items array is always ordered in index order (ie, increasing
+	 * indexoffset).  When scanning backwards it is convenient to fill the
+	 * array back-to-front, so we start at the last slot and fill downwards.
+	 * Hence we need both a first-valid-entry and a last-valid-entry counter.
+	 * itemIndex is a cursor showing which entry was last returned to caller.
+	 */
+	int			firstItem;		/* first valid index in items[] */
+	int			lastItem;		/* last valid index in items[] */
+	int			itemIndex;		/* current index in items[] */
+
+	HashScanPosItem items[MaxIndexTuplesPerPage];		/* MUST BE LAST */
+}	HashScanPosData;
+
+#define HashScanPosIsValid(scanpos) \
+( \
+	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
+				!BufferIsValid((scanpos).buf)), \
+	BlockNumberIsValid((scanpos).currPage) \
+)
+/* Reset scanpos to "no current page"; no trailing ';' so it acts as one statement */
+#define HashScanPosInvalidate(scanpos) \
+	do { \
+		(scanpos).buf = InvalidBuffer; \
+		(scanpos).currPage = InvalidBlockNumber; \
+		(scanpos).nextPage = InvalidBlockNumber; \
+		(scanpos).prevPage = InvalidBlockNumber; \
+		(scanpos).firstItem = 0; \
+		(scanpos).lastItem = 0; \
+		(scanpos).itemIndex = 0; \
+	} while (0)
 
 /*
  *	HashScanOpaqueData is private state for a hash index scan.
@@ -147,6 +185,12 @@ typedef struct HashScanOpaqueData
 	/* info about killed items if any (killedItems is NULL if never used) */
 	HashScanPosItem	*killedItems;	/* tids and offset numbers of killed items */
 	int			numKilled;			/* number of currently stored items */
+
+	/*
+	 * Identify all the matching items on a page and save them
+	 * in HashScanPosData
+	 */
+	HashScanPosData	currPos;		/* current position data */
 } HashScanOpaqueData;
 
 typedef HashScanOpaqueData *HashScanOpaque;
-- 
1.8.3.1

From 1ec664ac796b5b631fc772849c6de1aa1737f309 Mon Sep 17 00:00:00 2001
From: ashu <ashu@localhost.localdomain>
Date: Sun, 12 Feb 2017 10:52:22 +0530
Subject: [PATCH] Remove redundant function _hash_step() and some of the unused
 members of HashScanOpaqueData. The function _hash_step() used to find the
 next qualifying tuple in the index page is no longer required, as the new hash
 index scan works a page at a time: it reads all the qualifying tuples in a
 page at once with the help of a new function _hash_readpage().

Patch by Ashutosh Sharma.
---
 src/backend/access/hash/hashsearch.c | 206 -----------------------------------
 src/include/access/hash.h            |  15 ---
 2 files changed, 221 deletions(-)

diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 96da9b5..913a996 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -410,212 +410,6 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 }
 
 /*
- *	_hash_step() -- step to the next valid item in a scan in the bucket.
- *
- *		If no valid record exists in the requested direction, return
- *		false.  Else, return true and set the hashso_curpos for the
- *		scan to the right thing.
- *
- *		Here we need to ensure that if the scan has started during split, then
- *		skip the tuples that are moved by split while scanning bucket being
- *		populated and then scan the bucket being split to cover all such
- *		tuples.  This is done to ensure that we don't miss tuples in the scans
- *		that are started during split.
- *
- *		'bufP' points to the current buffer, which is pinned and read-locked.
- *		On success exit, we have pin and read-lock on whichever page
- *		contains the right item; on failure, we have released all buffers.
- */
-bool
-_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
-{
-	Relation	rel = scan->indexRelation;
-	HashScanOpaque so = (HashScanOpaque) scan->opaque;
-	ItemPointer current;
-	Buffer		buf;
-	Page		page;
-	HashPageOpaque opaque;
-	OffsetNumber maxoff;
-	OffsetNumber offnum;
-	BlockNumber blkno;
-	IndexTuple	itup;
-
-	current = &(so->hashso_curpos);
-
-	buf = *bufP;
-	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
-	page = BufferGetPage(buf);
-	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
-
-	/*
-	 * If _hash_step is called from _hash_first, current will not be valid, so
-	 * we can't dereference it.  However, in that case, we presumably want to
-	 * start at the beginning/end of the page...
-	 */
-	maxoff = PageGetMaxOffsetNumber(page);
-	if (ItemPointerIsValid(current))
-		offnum = ItemPointerGetOffsetNumber(current);
-	else
-		offnum = InvalidOffsetNumber;
-
-	/*
-	 * 'offnum' now points to the last tuple we examined (if any).
-	 *
-	 * continue to step through tuples until: 1) we get to the end of the
-	 * bucket chain or 2) we find a valid tuple.
-	 */
-	do
-	{
-		switch (dir)
-		{
-			case ForwardScanDirection:
-				if (offnum != InvalidOffsetNumber)
-					offnum = OffsetNumberNext(offnum);	/* move forward */
-				else
-				{
-					/* new page, locate starting position by binary search */
-					offnum = _hash_binsearch(page, so->hashso_sk_hash);
-				}
-
-				for (;;)
-				{
-					/*
-					 * check if we're still in the range of items with the
-					 * target hash key
-					 */
-					if (offnum <= maxoff)
-					{
-						Assert(offnum >= FirstOffsetNumber);
-						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
-
-						/*
-						 * skip the tuples that are moved by split operation
-						 * for the scan that has started when split was in
-						 * progress
-						 */
-						if (so->hashso_buc_populated && !so->hashso_buc_split &&
-							(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
-						{
-							offnum = OffsetNumberNext(offnum);	/* move forward */
-							continue;
-						}
-
-						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
-							break;		/* yes, so exit for-loop */
-					}
-
-					/* Before leaving current page, deal with any killed items */
-					if (so->numKilled > 0)
-						_hash_kill_items(scan);
-
-					/*
-					 * ran off the end of this page, try the next
-					 */
-					_hash_readnext(scan, &buf, &page, &opaque);
-					if (BufferIsValid(buf))
-					{
-						maxoff = PageGetMaxOffsetNumber(page);
-						offnum = _hash_binsearch(page, so->hashso_sk_hash);
-					}
-					else
-					{
-						itup = NULL;
-						break;	/* exit for-loop */
-					}
-				}
-				break;
-
-			case BackwardScanDirection:
-				if (offnum != InvalidOffsetNumber)
-					offnum = OffsetNumberPrev(offnum);	/* move back */
-				else
-				{
-					/* new page, locate starting position by binary search */
-					offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
-				}
-
-				for (;;)
-				{
-					/*
-					 * check if we're still in the range of items with the
-					 * target hash key
-					 */
-					if (offnum >= FirstOffsetNumber)
-					{
-						Assert(offnum <= maxoff);
-						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
-
-						/*
-						 * skip the tuples that are moved by split operation
-						 * for the scan that has started when split was in
-						 * progress
-						 */
-						if (so->hashso_buc_populated && !so->hashso_buc_split &&
-							(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK))
-						{
-							offnum = OffsetNumberPrev(offnum);	/* move back */
-							continue;
-						}
-
-						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
-							break;		/* yes, so exit for-loop */
-					}
-
-					/* Before leaving current page, deal with any killed items */
-					if (so->numKilled > 0)
-						_hash_kill_items(scan);
-
-					/*
-					 * ran off the end of this page, try the next
-					 */
-					_hash_readprev(scan, &buf, &page, &opaque);
-					if (BufferIsValid(buf))
-					{
-						TestForOldSnapshot(scan->xs_snapshot, rel, page);
-						maxoff = PageGetMaxOffsetNumber(page);
-						offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
-					}
-					else
-					{
-						itup = NULL;
-						break;	/* exit for-loop */
-					}
-				}
-				break;
-
-			default:
-				/* NoMovementScanDirection */
-				/* this should not be reached */
-				itup = NULL;
-				break;
-		}
-
-		if (itup == NULL)
-		{
-			/*
-			 * We ran off the end of the bucket without finding a match.
-			 * Release the pin on bucket buffers.  Normally, such pins are
-			 * released at end of scan, however scrolling cursors can
-			 * reacquire the bucket lock and pin in the same scan multiple
-			 * times.
-			 */
-			*bufP = so->hashso_curbuf = InvalidBuffer;
-			ItemPointerSetInvalid(current);
-			_hash_dropscanbuf(rel, so);
-			return false;
-		}
-
-		/* check the tuple quals, loop around if not met */
-	} while (!_hash_checkqual(scan, itup));
-
-	/* if we made it to here, we've found a valid tuple */
-	blkno = BufferGetBlockNumber(buf);
-	*bufP = so->hashso_curbuf = buf;
-	ItemPointerSet(current, blkno, offnum);
-	return true;
-}
-
-/*
  *	_hash_readpage() -- Load data from current index page into so->currPos
  *
  *	We scan all the items in the current index page and save them into
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 4efed52..4056da5 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -150,14 +150,6 @@ typedef struct HashScanOpaqueData
 	/* Hash value of the scan key, ie, the hash key we seek */
 	uint32		hashso_sk_hash;
 
-	/*
-	 * We also want to remember which buffer we're currently examining in the
-	 * scan. We keep the buffer pinned (but not locked) across hashgettuple
-	 * calls, in order to avoid doing a ReadBuffer() for every tuple in the
-	 * index.
-	 */
-	Buffer		hashso_curbuf;
-
 	/* remember the buffer associated with primary bucket */
 	Buffer		hashso_bucket_buf;
 
@@ -168,12 +160,6 @@ typedef struct HashScanOpaqueData
 	 */
 	Buffer		hashso_split_bucket_buf;
 
-	/* Current position of the scan, as an index TID */
-	ItemPointerData hashso_curpos;
-
-	/* Current position of the scan, as a heap TID */
-	ItemPointerData hashso_heappos;
-
 	/* Whether scan starts on bucket being populated due to split */
 	bool		hashso_buc_populated;
 
@@ -409,7 +395,6 @@ extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf,
 /* hashsearch.c */
 extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
 extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
-extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
 
 /* hashsort.c */
 typedef struct HSpool HSpool;	/* opaque struct in hashsort.c */
-- 
1.8.3.1

From 532287d11f4fd03e78a613cabb6751f8ab22fa59 Mon Sep 17 00:00:00 2001
From: ashu <ashu@localhost.localdomain>
Date: Wed, 22 Mar 2017 18:51:22 +0530
Subject: [PATCH] Improve locking strategy during VACUUM in Hash Index v2

Patch by Ashutosh Sharma
---
 src/backend/access/hash/hash.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 2450ee1..3ac8154 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -841,19 +841,20 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
 		if (!BlockNumberIsValid(blkno))
 			break;
 
-		next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
-											  LH_OVERFLOW_PAGE,
-											  bstrategy);
-
 		/*
-		 * release the lock on previous page after acquiring the lock on next
-		 * page
+		 * As the hash index scan works in page-at-a-time mode,
+		 * vacuum can release the lock on the previous page before
+		 * acquiring the lock on the next page.
 		 */
 		if (retain_pin)
 			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 		else
 			_hash_relbuf(rel, buf);
 
+		next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
+											  LH_OVERFLOW_PAGE,
+											  bstrategy);
+
 		buf = next_buf;
 	}
 
-- 
1.8.3.1

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to