On Mon, Apr 4, 2016 at 3:22 PM, Michael Paquier
<[email protected]> wrote:
> The approach introducing the concept of WAL progress addresses the
> problem of unnecessary checkpoints and of unnecessary standby
> snapshots. If we take the problem only from the angle of RUNNING_XACTS
> the current logic used to check if a checkpoint should be skipped or
> not is not addressed.
This discussion has been stalling for a while regarding the main
patches... Attached are rebased versions based on the approach with
XLogInsertExtended(). And I have added as well a couple of days ago an
entry on the 9.6 open item page, not as a 9.6 open item, but as an
older bug to keep at least track of this problem.
--
Michael
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index d60c123..5ae6f49 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8357,8 +8357,12 @@ CreateCheckPoint(int flags)
if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_FORCE)) == 0)
{
+ elog(LOG, "Not a forced or shutdown checkpoint: progress_lsn %X/%X, ckpt %X/%X",
+ (uint32) (progress_lsn >> 32), (uint32) progress_lsn,
+ (uint32) (ControlFile->checkPoint >> 32), (uint32) ControlFile->checkPoint);
if (progress_lsn == ControlFile->checkPoint)
{
+ elog(LOG, "Checkpoint is skipped");
WALInsertLockRelease();
LWLockRelease(CheckpointLock);
END_CRIT_SECTION();
@@ -8525,7 +8529,11 @@ CreateCheckPoint(int flags)
* recovery we don't need to write running xact data.
*/
if (!shutdown && XLogStandbyInfoActive())
- LogStandbySnapshot();
+ {
+ XLogRecPtr lsn = LogStandbySnapshot();
+ elog(LOG, "snapshot taken by checkpoint %X/%X",
+ (uint32) (lsn >> 32), (uint32) lsn);
+ }
START_CRIT_SECTION();
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 79cfd7b..082e589 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -333,7 +333,9 @@ BackgroundWriterMain(void)
GetLastCheckpointRecPtr() < current_progress_lsn &&
last_progress_lsn < current_progress_lsn)
{
- (void) LogStandbySnapshot();
+ XLogRecPtr lsn = LogStandbySnapshot();
+ elog(LOG, "snapshot taken by bgwriter %X/%X",
+ (uint32) (lsn >> 32), (uint32) lsn);
last_snapshot_ts = now;
last_progress_lsn = current_progress_lsn;
}
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f9644db..d60c123 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -439,11 +439,30 @@ typedef struct XLogwrtResult
* the WAL record is just copied to the page and the lock is released. But
* to avoid the deadlock-scenario explained above, the indicator is always
* updated before sleeping while holding an insertion lock.
+ *
+ * The progressAt values indicate the insertion progress used to determine
+ * WAL insertion activity since a previous checkpoint, which is aimed at
+ * finding out if a checkpoint should be skipped or not or if standby
+ * activity should be logged. Progress position is basically updated
+ * for all types of records, for the time being only snapshot logging
+ * is out of this scope to properly skip their logging on idle systems.
+ * Tracking the WAL activity directly in WALInsertLock has the advantage
+ * to not rely on taking an exclusive lock on all the WAL insertion locks,
+ * hence reducing the impact of the activity lookup. This takes also
+ * advantage to avoid 8-byte torn reads on some platforms by using the
+ * fact that each insert lock is located on the same cache line.
+ * XXX: There is still room for more improvements here, particularly
+ * WAL operations related to unlogged relations (INIT_FORKNUM) should not
+ * update the progress LSN as those relations are reset during crash
+ * recovery so enforcing buffers of such relations to be flushed for
+ * example in the case of a load only on unlogged relations is a waste
+ * of disk write.
*/
typedef struct
{
LWLock lock;
XLogRecPtr insertingAt;
+ XLogRecPtr progressAt;
} WALInsertLock;
/*
@@ -881,6 +900,9 @@ static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
* which pages need a full-page image, and retry. If fpw_lsn is invalid, the
* record is always inserted.
*
+ * 'flags' gives more in-depth control on the record being inserted. As of
+ * now, this controls if the progress LSN positions are updated.
+ *
* The first XLogRecData in the chain must be for the record header, and its
* data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and
* xl_crc fields in the header, the rest of the header must already be filled
@@ -893,7 +915,9 @@ static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
* WAL rule "write the log before the data".)
*/
XLogRecPtr
-XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn)
+XLogInsertRecord(XLogRecData *rdata,
+ XLogRecPtr fpw_lsn,
+ uint8 flags)
{
XLogCtlInsert *Insert = &XLogCtl->Insert;
pg_crc32c rdata_crc;
@@ -992,6 +1016,25 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn)
inserted = true;
}
+ /*
+ * Update the progress LSN positions. At least one WAL insertion lock
+ * is already taken appropriately before doing that, and it is just more
+ * simple to do that here where WAL record data and type is at hand.
+ * The progress is set at the start position of the record tracked that
+ * is being added, making easier checkpoint progress tracking as the
+ * control file already saves the start LSN position of the last
+ * checkpoint run. If an exclusive lock is taken for WAL insertion,
+ * there is actually no need to update all the progression fields, so
+ * just do it on the first one.
+ */
+ if ((flags & XLOG_INSERT_NO_PROGRESS) == 0)
+ {
+ if (holdingAllLocks)
+ WALInsertLocks[0].l.progressAt = StartPos;
+ else
+ WALInsertLocks[MyLockNo].l.progressAt = StartPos;
+ }
+
if (inserted)
{
/*
@@ -4717,6 +4760,7 @@ XLOGShmemInit(void)
{
LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
+ WALInsertLocks[i].l.progressAt = InvalidXLogRecPtr;
}
/*
@@ -7901,6 +7945,55 @@ GetFlushRecPtr(void)
}
/*
+ * GetProgressRecPtr -- Returns the newest WAL activity position, aimed
+ * at the last significant WAL activity, or in other words any activity
+ * not referring to standby logging as of now. Finding the last activity
+ * position is done by scanning each WAL insertion lock by taking directly
+ * the light-weight lock associated to it.
+ */
+XLogRecPtr
+GetProgressRecPtr(void)
+{
+ XLogRecPtr res = InvalidXLogRecPtr;
+ int i;
+
+ /*
+ * Look at the latest LSN position referring to the activity done by
+ * WAL insertion. An exclusive lock is taken because currently the
+ * locking logic for WAL insertion only expects such a level of locking.
+ * Taking a lock is as well necessary to prevent potential torn reads
+ * on some platforms.
+ */
+ for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
+ {
+ XLogRecPtr progress_lsn;
+
+ LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
+ progress_lsn = WALInsertLocks[i].l.progressAt;
+ LWLockRelease(&WALInsertLocks[i].l.lock);
+
+ if (res < progress_lsn)
+ res = progress_lsn;
+ }
+
+ return res;
+}
+
+/*
+ * GetLastCheckpointRecPtr -- Returns the last checkpoint insert position.
+ */
+XLogRecPtr
+GetLastCheckpointRecPtr(void)
+{
+ XLogRecPtr ckpt_lsn;
+
+ LWLockAcquire(ControlFileLock, LW_SHARED);
+ ckpt_lsn = ControlFile->checkPoint;
+ LWLockRelease(ControlFileLock);
+ return ckpt_lsn;
+}
+
+/*
* Get the time of the last xlog segment switch
*/
pg_time_t
@@ -8160,7 +8253,7 @@ CreateCheckPoint(int flags)
uint32 freespace;
XLogRecPtr PriorRedoPtr;
XLogRecPtr curInsert;
- XLogRecPtr prevPtr;
+ XLogRecPtr progress_lsn;
VirtualTransactionId *vxids;
int nvxids;
@@ -8241,34 +8334,30 @@ CreateCheckPoint(int flags)
checkPoint.oldestActiveXid = InvalidTransactionId;
/*
+ * Get progress before acquiring insert locks to shorten the locked
+ * section waiting ahead.
+ */
+ progress_lsn = GetProgressRecPtr();
+
+ /*
* We must block concurrent insertions while examining insert state to
* determine the checkpoint REDO pointer.
*/
WALInsertLockAcquireExclusive();
curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
- prevPtr = XLogBytePosToRecPtr(Insert->PrevBytePos);
/*
- * If this isn't a shutdown or forced checkpoint, and we have not inserted
- * any XLOG records since the start of the last checkpoint, skip the
- * checkpoint. The idea here is to avoid inserting duplicate checkpoints
- * when the system is idle. That wastes log space, and more importantly it
- * exposes us to possible loss of both current and previous checkpoint
- * records if the machine crashes just as we're writing the update.
- * (Perhaps it'd make even more sense to checkpoint only when the previous
- * checkpoint record is in a different xlog page?)
- *
- * If the previous checkpoint crossed a WAL segment, however, we create
- * the checkpoint anyway, to have the latest checkpoint fully contained in
- * the new segment. This is for a little bit of extra robustness: it's
- * better if you don't need to keep two WAL segments around to recover the
- * checkpoint.
+ * If this isn't a shutdown or forced checkpoint, and if there has been no
+ * WAL activity, skip the checkpoint. The idea here is to avoid inserting
+ * duplicate checkpoints when the system is idle. That wastes log space,
+ * and more importantly it exposes us to possible loss of both current and
+ * previous checkpoint records if the machine crashes just as we're writing
+ * the update.
*/
if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_FORCE)) == 0)
{
- if (prevPtr == ControlFile->checkPointCopy.redo &&
- prevPtr / XLOG_SEG_SIZE == curInsert / XLOG_SEG_SIZE)
+ if (progress_lsn == ControlFile->checkPoint)
{
WALInsertLockRelease();
LWLockRelease(CheckpointLock);
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 1e336ed..137492d 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -394,9 +394,23 @@ XLogIncludeOrigin(void)
}
/*
+ * XLogInsert
+ *
+ * A shorthand for XLogInsertExtended, to update the progress of WAL
+ * activity by default.
+ */
+XLogRecPtr
+XLogInsert(RmgrId rmid, uint8 info)
+{
+ return XLogInsertExtended(rmid, info, 0);
+}
+
+/*
* Insert an XLOG record having the specified RMID and info bytes, with the
* body of the record being the data and buffer references registered earlier
- * with XLogRegister* calls.
+ * with XLogRegister* calls. 'flags' allow users to control more in-depth
+ * operations during WAL record insertion. As of now, this gives control on
+ * if the progress LSN positions are updated or not.
*
* Returns XLOG pointer to end of record (beginning of next record).
* This can be used as LSN for data pages affected by the logged action.
@@ -405,7 +419,7 @@ XLogIncludeOrigin(void)
* WAL rule "write the log before the data".)
*/
XLogRecPtr
-XLogInsert(RmgrId rmid, uint8 info)
+XLogInsertExtended(RmgrId rmid, uint8 info, uint8 flags)
{
XLogRecPtr EndPos;
@@ -450,7 +464,7 @@ XLogInsert(RmgrId rmid, uint8 info)
rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites,
&fpw_lsn);
- EndPos = XLogInsertRecord(rdt, fpw_lsn);
+ EndPos = XLogInsertRecord(rdt, fpw_lsn, flags);
} while (EndPos == InvalidXLogRecPtr);
XLogResetInsertion();
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 00f03d8..79cfd7b 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -78,12 +78,12 @@ int BgWriterDelay = 200;
#define LOG_SNAPSHOT_INTERVAL_MS 15000
/*
- * LSN and timestamp at which we last issued a LogStandbySnapshot(), to avoid
- * doing so too often or repeatedly if there has been no other write activity
- * in the system.
+ * Last progress LSN and timestamp at which we last logged a standby
+ * snapshot, to avoid doing so too often or repeatedly if there has been
+ * no other write activity in the system.
*/
static TimestampTz last_snapshot_ts;
-static XLogRecPtr last_snapshot_lsn = InvalidXLogRecPtr;
+static XLogRecPtr last_progress_lsn = InvalidXLogRecPtr;
/*
* Flags set by interrupt handlers for later service in the main loop.
@@ -310,7 +310,7 @@ BackgroundWriterMain(void)
* check whether there has been any WAL inserted since the last time
* we've logged a running xacts.
*
- * We do this logging in the bgwriter as its the only process that is
+ * We do this logging in the bgwriter as it is the only process that is
* run regularly and returns to its mainloop all the time. E.g.
* Checkpointer, when active, is barely ever in its mainloop and thus
* makes it hard to log regularly.
@@ -319,19 +319,23 @@ BackgroundWriterMain(void)
{
TimestampTz timeout = 0;
TimestampTz now = GetCurrentTimestamp();
+ XLogRecPtr current_progress_lsn = GetProgressRecPtr();
timeout = TimestampTzPlusMilliseconds(last_snapshot_ts,
LOG_SNAPSHOT_INTERVAL_MS);
/*
- * only log if enough time has passed and some xlog record has
- * been inserted.
+ * only log if enough time has passed, that some WAL activity
+ * has happened since last checkpoint, and that some xlog record
+ * has been inserted.
*/
if (now >= timeout &&
- last_snapshot_lsn != GetXLogInsertRecPtr())
+ GetLastCheckpointRecPtr() < current_progress_lsn &&
+ last_progress_lsn < current_progress_lsn)
{
- last_snapshot_lsn = LogStandbySnapshot();
+ (void) LogStandbySnapshot();
last_snapshot_ts = now;
+ last_progress_lsn = current_progress_lsn;
}
}
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 6a9bf84..75c367c 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -952,7 +952,8 @@ LogStandbySnapshot(void)
* The definitions of RunningTransactionsData and xl_xact_running_xacts
* are similar. We keep them separate because xl_xact_running_xacts
* is a contiguous chunk of memory and never exists fully until it is
- * assembled in WAL.
+ * assembled in WAL. Progress of WAL activity is not updated when
+ * this record is logged.
*/
static XLogRecPtr
LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
@@ -976,7 +977,9 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
XLogRegisterData((char *) CurrRunningXacts->xids,
(xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
- recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
+ recptr = XLogInsertExtended(RM_STANDBY_ID,
+ XLOG_RUNNING_XACTS,
+ XLOG_INSERT_NO_PROGRESS);
if (CurrRunningXacts->subxid_overflow)
elog(trace_recovery(DEBUG2),
@@ -1024,7 +1027,8 @@ LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks));
XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock));
- (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
+ (void) XLogInsertExtended(RM_STANDBY_ID, XLOG_STANDBY_LOCK,
+ XLOG_INSERT_NO_PROGRESS);
}
/*
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 0744c3f..794388e 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -209,7 +209,9 @@ extern CheckpointStatsData CheckpointStats;
struct XLogRecData;
-extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, XLogRecPtr fpw_lsn);
+extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata,
+ XLogRecPtr fpw_lsn,
+ uint8 flags);
extern void XLogFlush(XLogRecPtr RecPtr);
extern bool XLogBackgroundFlush(void);
extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
@@ -260,6 +262,8 @@ extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
extern XLogRecPtr GetRedoRecPtr(void);
extern XLogRecPtr GetInsertRecPtr(void);
extern XLogRecPtr GetFlushRecPtr(void);
+extern XLogRecPtr GetProgressRecPtr(void);
+extern XLogRecPtr GetLastCheckpointRecPtr(void);
extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch);
extern void RemovePromoteSignalFiles(void);
diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h
index cc0177e..6ff0176 100644
--- a/src/include/access/xloginsert.h
+++ b/src/include/access/xloginsert.h
@@ -38,10 +38,15 @@
#define REGBUF_KEEP_DATA 0x10/* include data even if a full-page image is
* taken */
+/* flags for XLogInsertExtended */
+#define XLOG_INSERT_NO_PROGRESS 0x01 /* do not update progress LSN
+ * when inserting record */
+
/* prototypes for public functions in xloginsert.c: */
extern void XLogBeginInsert(void);
extern void XLogIncludeOrigin(void);
extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info);
+extern XLogRecPtr XLogInsertExtended(RmgrId rmid, uint8 info, uint8 flags);
extern void XLogEnsureRecordSpace(int nbuffers, int ndatas);
extern void XLogRegisterData(char *data, int len);
extern void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags);
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers