On 25/09/2025 7:35 AM, Michael Paquier wrote:
On Mon, Sep 22, 2025 at 07:44:18PM +0300, Константин Книжник wrote:
Attached please find rebased version of the patch with fixed mistypings.
I have looked at v3.
- leftpage = PageGetTempPage(origpage);
+ leftpage = leftpage_buf.data;
+ memcpy(leftpage, origpage, BLCKSZ);
_bt_pageinit(leftpage, BufferGetPageSize(buf));
What's the point of copying the contents of origpage to leftpage?
_bt_pageinit() is going to initialize leftpage (plus a few more things
set like the page LSN, etc.), so the memcpy should not be necessary,
no?
Sorry, memcpy is really not needed here.
+ rightpage = BufferGetPage(rbuf);
+ memcpy(rightpage, rightpage_buf.data, BLCKSZ);
+ ropaque = BTPageGetOpaque(rightpage);
When we reach this state of the logic, aka at the beginning of the
critical section, the right and left pages are in a correct state,
and your code is OK because we copy the contents of the right page
back into its legitimate place. What is not OK to me is that the
copy of the "temporary" right page back to "rbuf" is not explained. I
think that this deserves a comment, especially the part about ropaque
which is set once by this proposal, then manipulated while in the
critical section.
I added the comment, please check that it is clear and complete.
Updated version of the patch is attached.
diff --git a/src/backend/access/nbtree/nbtinsert.c
b/src/backend/access/nbtree/nbtinsert.c
index be60781fc98..268f02bbdf0 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -1473,6 +1473,8 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert
itup_key, Buffer buf,
Page origpage;
Page leftpage,
rightpage;
+ PGAlignedBlock leftpage_buf,
+ rightpage_buf;
BlockNumber origpagenumber,
rightpagenumber;
BTPageOpaque ropaque,
@@ -1543,8 +1545,7 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert
itup_key, Buffer buf,
firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
newitem, &newitemonleft);
- /* Allocate temp buffer for leftpage */
- leftpage = PageGetTempPage(origpage);
+ leftpage = leftpage_buf.data;
_bt_pageinit(leftpage, BufferGetPageSize(buf));
lopaque = BTPageGetOpaque(leftpage);
@@ -1707,19 +1708,13 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert
itup_key, Buffer buf,
/*
* Acquire a new right page to split into, now that left page has a new
- * high key. From here on, it's not okay to throw an error without
- * zeroing rightpage first. This coding rule ensures that we won't
- * confuse future VACUUM operations, which might otherwise try to
re-find
- * a downlink to a leftover junk page as the page undergoes deletion.
- *
- * It would be reasonable to start the critical section just after the
new
- * rightpage buffer is acquired instead; that would allow us to avoid
- * leftover junk pages without bothering to zero rightpage. We do it
this
- * way because it avoids an unnecessary PANIC when either origpage or
its
- * existing sibling page are corrupt.
+ * high key. To prevent confusion of VACUUM with orphan page, we also
+ * first fill in-memory copy of this page.
*/
rbuf = _bt_allocbuf(rel, heaprel);
- rightpage = BufferGetPage(rbuf);
+ rightpage = rightpage_buf.data;
+ memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ);
+ memset(BufferGetPage(rbuf), 0, BLCKSZ);
rightpagenumber = BufferGetBlockNumber(rbuf);
/* rightpage was initialized by _bt_allocbuf */
ropaque = BTPageGetOpaque(rightpage);
@@ -1768,7 +1763,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert
itup_key, Buffer buf,
if (PageAddItem(rightpage, (Item) righthighkey, itemsz,
afterrightoff,
false, false) ==
InvalidOffsetNumber)
{
- memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add high key to the right
sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
@@ -1816,7 +1810,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert
itup_key, Buffer buf,
if (!_bt_pgaddtup(leftpage, newitemsz, newitem,
afterleftoff,
false))
{
- memset(rightpage, 0,
BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to
the left sibling"
" while splitting block %u of
index \"%s\"",
origpagenumber,
RelationGetRelationName(rel));
@@ -1829,7 +1822,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert
itup_key, Buffer buf,
if (!_bt_pgaddtup(rightpage, newitemsz,
newitem, afterrightoff,
afterrightoff
== minusinfoff))
{
- memset(rightpage, 0,
BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to
the right sibling"
" while splitting block %u of
index \"%s\"",
origpagenumber,
RelationGetRelationName(rel));
@@ -1843,7 +1835,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert
itup_key, Buffer buf,
{
if (!_bt_pgaddtup(leftpage, itemsz, dataitem,
afterleftoff, false))
{
- memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add old item to the left
sibling"
" while splitting block %u of index
\"%s\"",
origpagenumber,
RelationGetRelationName(rel));
@@ -1855,7 +1846,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert
itup_key, Buffer buf,
if (!_bt_pgaddtup(rightpage, itemsz, dataitem,
afterrightoff,
afterrightoff ==
minusinfoff))
{
- memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add old item to the
right sibling"
" while splitting block %u of index
\"%s\"",
origpagenumber,
RelationGetRelationName(rel));
@@ -1876,7 +1866,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert
itup_key, Buffer buf,
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
afterrightoff == minusinfoff))
{
- memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the right
sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
@@ -1896,7 +1885,6 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert
itup_key, Buffer buf,
sopaque = BTPageGetOpaque(spage);
if (sopaque->btpo_prev != origpagenumber)
{
- memset(rightpage, 0, BufferGetPageSize(rbuf));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("right sibling's
left-link doesn't match: "
@@ -1939,9 +1927,13 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert
itup_key, Buffer buf,
* original. We need to do this before writing the WAL record, so that
* XLogInsert can WAL log an image of the page if necessary.
*/
- PageRestoreTempPage(leftpage, origpage);
+ memcpy(origpage, leftpage, BLCKSZ);
/* leftpage, lopaque must not be used below here */
+ rightpage = BufferGetPage(rbuf);
+ memcpy(rightpage, rightpage_buf.data, BLCKSZ);
+ ropaque = BTPageGetOpaque(rightpage);
+
MarkBufferDirty(buf);
MarkBufferDirty(rbuf);