On Wed, Feb 3, 2021 at 08:07:16PM -0500, Bruce Momjian wrote:
> > > I can try to add a hint-bit-page-write page counter, but that might
> > > overflow, and then we will need a way to change the LSN anyway.
> >
> > That's just a question of width...
>
> Yeah, the hint bit counter is just delaying the inevitable, plus it
> changes the page format, which I am trying to avoid. Also, I need this
> dummy record only if the page is marked clean, meaning a write
> to the file system already happened in the current checkpoint --- should
> not be to bad.
In looking your comments on Sawada-san's POC patch for buffer
encryption:
https://www.postgresql.org/message-id/20210112193431.2edcz776qjen7kao%40alap3.anarazel.de
I see that he put a similar function call in exactly the same place I
did, but you pointed out that he was inserting into WAL while holding a
buffer lock.
I restructured my patch to not make that same mistake, and modified it
for non-permanent buffers --- attached.
--
Bruce Momjian <[email protected]> https://momjian.us
EDB https://enterprisedb.com
The usefulness of a cup is in its emptiness, Bruce Lee
>From d535b08d264281348ab9e8eac645a5382ec35a8a Mon Sep 17 00:00:00 2001
From: Bruce Momjian <[email protected]>
Date: Thu, 4 Feb 2021 13:43:56 -0500
Subject: [PATCH] hint squash commit
---
src/backend/access/rmgrdesc/xlogdesc.c | 6 +-
src/backend/access/transam/xlog.c | 4 ++
src/backend/access/transam/xloginsert.c | 16 +++++
src/backend/replication/logical/decode.c | 1 +
src/backend/storage/buffer/bufmgr.c | 89 ++++++++++++++++--------
src/include/access/xloginsert.h | 2 +
src/include/catalog/pg_control.h | 1 +
7 files changed, 90 insertions(+), 29 deletions(-)
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 92cc7ea073..16a967bb71 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -80,7 +80,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
appendStringInfoString(buf, xlrec->rp_name);
}
- else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
+ else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
+ info == XLOG_HINT_LSN)
{
/* no further information to print */
}
@@ -185,6 +186,9 @@ xlog_identify(uint8 info)
case XLOG_FPI_FOR_HINT:
id = "FPI_FOR_HINT";
break;
+ case XLOG_HINT_LSN:
+ id = "HINT_LSN";
+ break;
}
return id;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f03bd473e2..f67316ee07 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -10260,6 +10260,10 @@ xlog_redo(XLogReaderState *record)
UnlockReleaseBuffer(buffer);
}
}
+ else if (info == XLOG_HINT_LSN)
+ {
+ /* nothing to do here */
+ }
else if (info == XLOG_BACKUP_END)
{
XLogRecPtr startpoint;
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 7052dc245e..7635a3d44d 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -980,6 +980,22 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
return recptr;
}
+/*
+ * On the first hint bit change during a checkpoint, XLogSaveBufferForHint()
+ * writes a full page image to the WAL and returns a new LSN which can be
+ * assigned to the page. Cluster file encryption needs a new LSN for
+ * every hint bit page write because the LSN is used in the nonce, and
+ * the nonce must be unique. Therefore, this routine just advances the LSN
+ * so the page can be assigned a new LSN.
+ */
+XLogRecPtr
+XLogLSNForHint(void)
+{
+ XLogBeginInsert();
+
+ return XLogInsert(RM_XLOG_ID, XLOG_HINT_LSN);
+}
+
/*
* Write a WAL record containing a full image of a page. Caller is responsible
* for writing the page to disk after calling this routine.
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index afa1df00d0..2ada89c6b1 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -223,6 +223,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
case XLOG_FPW_CHANGE:
case XLOG_FPI_FOR_HINT:
case XLOG_FPI:
+ case XLOG_HINT_LSN:
break;
default:
elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 561c212092..b9f3f76798 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3810,13 +3810,14 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
* If we need to protect hint bit updates from torn writes, WAL-log a
* full page image of the page. This full page image is only necessary
* if the hint bit update is the first change to the page since the
- * last checkpoint.
+ * last checkpoint. If cluster file encryption is enabled, we also
+ * need to generate new page LSNs for non-first-page writes during
+ * a checkpoint.
*
* We don't check full_page_writes here because that logic is included
* when we call XLogInsert() since the value changes dynamically.
*/
- if (XLogHintBitIsNeeded() &&
- (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
+ if (XLogHintBitIsNeeded())
{
/*
* If we must not write WAL, due to a relfilenode-specific
@@ -3826,35 +3827,67 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
*
* See src/backend/storage/page/README for longer discussion.
*/
- if (RecoveryInProgress() ||
- RelFileNodeSkippingWAL(bufHdr->tag.rnode))
+ if (((pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT)
+ /* XXX || file_encryption_method != 0 */) &&
+ (RecoveryInProgress() ||
+ RelFileNodeSkippingWAL(bufHdr->tag.rnode)))
return;
+ if (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT)
+ {
+ /*
+ * If the block is already dirty because we either made a change
+ * or set a hint already, then we don't need to write a full page
+ * image. Note that aggressive cleaning of blocks dirtied by hint
+ * bit setting would increase the call rate. Bulk setting of hint
+ * bits would reduce the call rate...
+ *
+ * We must issue the WAL record before we mark the buffer dirty.
+ * Otherwise we might write the page before we write the WAL. That
+ * causes a race condition, since a checkpoint might occur between
+ * writing the WAL record and marking the buffer dirty. We solve
+ * that with a kluge, but one that is already in use during
+ * transaction commit to prevent race conditions. Basically, we
+ * simply prevent the checkpoint WAL record from being written
+ * until we have marked the buffer dirty. We don't start the
+ * checkpoint flush until we have marked dirty, so our checkpoint
+ * must flush the change to disk successfully or the checkpoint
+ * never gets written, so crash recovery will fix.
+ *
+ * It's possible we may enter here without an xid, so it is
+ * essential that CreateCheckpoint waits for virtual transactions
+ * rather than full transactionids.
+ */
+ MyProc->delayChkpt = delayChkpt = true;
+ lsn = XLogSaveBufferForHint(buffer, buffer_std);
+ }
+
/*
- * If the block is already dirty because we either made a change
- * or set a hint already, then we don't need to write a full page
- * image. Note that aggressive cleaning of blocks dirtied by hint
- * bit setting would increase the call rate. Bulk setting of hint
- * bits would reduce the call rate...
- *
- * We must issue the WAL record before we mark the buffer dirty.
- * Otherwise we might write the page before we write the WAL. That
- * causes a race condition, since a checkpoint might occur between
- * writing the WAL record and marking the buffer dirty. We solve
- * that with a kluge, but one that is already in use during
- * transaction commit to prevent race conditions. Basically, we
- * simply prevent the checkpoint WAL record from being written
- * until we have marked the buffer dirty. We don't start the
- * checkpoint flush until we have marked dirty, so our checkpoint
- * must flush the change to disk successfully or the checkpoint
- * never gets written, so crash recovery will fix.
- *
- * It's possible we may enter here without an xid, so it is
- * essential that CreateCheckpoint waits for virtual transactions
- * rather than full transactionids.
+ * For cluster file encryption, we generate a new page LSN
+ * for hint-bit non-permanent and non-first page writes.
*/
- MyProc->delayChkpt = delayChkpt = true;
- lsn = XLogSaveBufferForHint(buffer, buffer_std);
+ if (/* XXX file_encryption_method != 0 && */
+ XLogRecPtrIsInvalid(lsn) &&
+ /* XXX can we check BM_DIRTY without a page lock? */
+ !(pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY))
+ {
+ /*
+ * If the buffer is clean, and lsn is valid, it must be the
+ * first hint bit change of the checkpoint (the old page lsn
+ * is earlier than the RedoRecPtr). If the page is clean and
+ * the lsn is invalid, the page must have been already written
+ * during this checkpoint, and this is a new hint bit change.
+ * Such page changes usually don't create new page lsns, but we
+ * need one for cluster file encryption because the lsn is used as
+ * part of the nonce, and we don't want to reencrypt the page
+ * with the hint bit changes using the same nonce as the previous
+ * writes during this checkpoint. (It would expose the hint bit
+ * change locations.) Therefore we create a simple WAL record
+ * to advance the lsn, which can can then be assigned to the
+ * page below.
+ */
+ lsn = XLogLSNForHint();
+ }
}
buf_state = LockBufHdr(bufHdr);
diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h
index f1d8c39edf..a066f65229 100644
--- a/src/include/access/xloginsert.h
+++ b/src/include/access/xloginsert.h
@@ -61,6 +61,8 @@ extern void log_newpage_range(Relation rel, ForkNumber forkNum,
BlockNumber startblk, BlockNumber endblk, bool page_std);
extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std);
+extern XLogRecPtr XLogLSNForHint(void);
+
extern void InitXLogInsert(void);
#endif /* XLOGINSERT_H */
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index e3f48158ce..36ac7dfbb8 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -76,6 +76,7 @@ typedef struct CheckPoint
#define XLOG_END_OF_RECOVERY 0x90
#define XLOG_FPI_FOR_HINT 0xA0
#define XLOG_FPI 0xB0
+#define XLOG_HINT_LSN 0xC0
/*
--
2.20.1