At Thu, 24 Aug 2023 11:22:32 +0900 (JST), Kyotaro Horiguchi <horikyota....@gmail.com> wrote in > I could turn this into something like undo logs in a simple form, but > I'd rather not craft a general-purpose undo log system for this unless > it's absolutely necessary.
This is a patch for a basic undo log implementation. It looks like it works well for some orphan-files-after-a-crash and data-loss-on-reinit cases. However, it is far from complete and likely has issues with crash-safety and the durability of undo log files (as well as memory leaks, performance, and so on). I'm posting this to move the discussion forward. (This doesn't contain the third file, the "ALTER TABLE ... ALL IN TABLESPACE" part.) Regards, -- Kyotaro Horiguchi NTT Open Source Software Center
>From da5696b9026fa916ae991f7da616062c5b19e705 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horikyota....@gmail.com> Date: Thu, 31 Aug 2023 11:49:10 +0900 Subject: [PATCH v29 1/2] Introduce undo log implementation This patch adds a simple implementation of UNDO log feature. --- src/backend/access/transam/Makefile | 1 + src/backend/access/transam/rmgr.c | 4 +- src/backend/access/transam/simpleundolog.c | 343 +++++++++++++++++++++ src/backend/access/transam/twophase.c | 3 + src/backend/access/transam/xact.c | 24 ++ src/backend/access/transam/xlog.c | 20 +- src/backend/catalog/storage.c | 171 ++++++++++ src/backend/storage/file/reinit.c | 78 +++++ src/backend/storage/smgr/smgr.c | 9 + src/bin/initdb/initdb.c | 17 + src/bin/pg_rewind/parsexlog.c | 2 +- src/bin/pg_waldump/rmgrdesc.c | 2 +- src/include/access/rmgr.h | 2 +- src/include/access/rmgrlist.h | 44 +-- src/include/access/simpleundolog.h | 36 +++ src/include/catalog/storage.h | 3 + src/include/catalog/storage_ulog.h | 35 +++ src/include/catalog/storage_xlog.h | 9 + src/include/storage/reinit.h | 2 + src/include/storage/smgr.h | 1 + src/tools/pgindent/typedefs.list | 6 + 21 files changed, 780 insertions(+), 32 deletions(-) create mode 100644 src/backend/access/transam/simpleundolog.c create mode 100644 src/include/access/simpleundolog.h create mode 100644 src/include/catalog/storage_ulog.h diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 661c55a9db..531505cbbd 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -21,6 +21,7 @@ OBJS = \ rmgr.o \ slru.o \ subtrans.o \ + simpleundolog.o \ timeline.o \ transam.o \ twophase.o \ diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 7d67eda5f7..840cbdecd3 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -35,8 +35,8 @@ #include "utils/relmapper.h" /* must be kept in sync with RmgrData definition in 
xlog_internal.h */ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ - { name, redo, desc, identify, startup, cleanup, mask, decode }, +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,undo) \ + { name, redo, desc, identify, startup, cleanup, mask, decode}, RmgrData RmgrTable[RM_MAX_ID + 1] = { #include "access/rmgrlist.h" diff --git a/src/backend/access/transam/simpleundolog.c b/src/backend/access/transam/simpleundolog.c new file mode 100644 index 0000000000..ebbacce298 --- /dev/null +++ b/src/backend/access/transam/simpleundolog.c @@ -0,0 +1,343 @@ +/*------------------------------------------------------------------------- + * + * simpleundolog.c + * Simple implementation of PostgreSQL transaction-undo-log manager + * + * In this module, procedures required during a transaction abort are + * logged. Persisting this information becomes crucial, particularly for + * ensuring reliable post-processing during the restart following a transaction + * crash. At present, in this module, logging of information is performed by + * simply appending data to a created file. 
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/clog.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/simpleundolog.h" +#include "access/twophase_rmgr.h" +#include "access/parallel.h" +#include "access/xact.h" +#include "catalog/storage_ulog.h" +#include "storage/fd.h" + +#define ULOG_FILE_MAGIC 0x12345678 + +typedef struct UndoLogFileHeader +{ + int32 magic; + bool prepared; +} UndoLogFileHeader; + +typedef struct UndoDescData +{ + const char *name; + void (*rm_undo) (SimpleUndoLogRecord *record, bool prepared); +} UndoDescData; + +/* must be kept in sync with RmgrData definition in xlog_internal.h */ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,undo) \ + { name, undo }, + +UndoDescData UndoRoutines[RM_MAX_ID + 1] = { +#include "access/rmgrlist.h" +}; +#undef PG_RMGR + +#if defined(O_DSYNC) +static int undo_sync_mode = O_DSYNC; +#elif defined(O_SYNC) +static int undo_sync_mode = O_SYNC; +#else +static int undo_sync_mode = 0; +#endif + +static char current_ulogfile_name[MAXPGPATH]; +static int current_ulogfile_fd = -1; +static int current_xid = InvalidTransactionId; +static UndoLogFileHeader current_fhdr; + +static void +undolog_check_file_header(void) +{ + if (read(current_ulogfile_fd, ¤t_fhdr, sizeof(current_fhdr)) < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not read undolog file \"%s\": %m", + current_ulogfile_name)); + if (current_fhdr.magic != ULOG_FILE_MAGIC) + ereport(PANIC, + errcode_for_file_access(), + errmsg("invalid undolog file \"%s\": magic don't match", + current_ulogfile_name)); +} + +static bool +undolog_open_current_file(TransactionId xid, bool forread, bool append) +{ + int omode; + + if (current_ulogfile_fd >= 0) + { + /* use existing open file */ + if (current_xid == xid) 
+ { + if (append) + return true; + + if (lseek(current_ulogfile_fd, + sizeof(UndoLogFileHeader), SEEK_SET) < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not seek undolog file \"%s\": %m", + current_ulogfile_name)); + } + + close(current_ulogfile_fd); + current_ulogfile_fd = -1; + ReleaseExternalFD(); + } + + current_xid = xid; + if (!TransactionIdIsValid(xid)) + return false; + + omode = PG_BINARY | undo_sync_mode; + + if (forread) + omode |= O_RDONLY; + else + { + omode |= O_RDWR; + + if (!append) + omode |= O_TRUNC; + } + + snprintf(current_ulogfile_name, MAXPGPATH, "%s/%08x", + SIMPLE_UNDOLOG_DIR, xid); + current_ulogfile_fd = BasicOpenFile(current_ulogfile_name, omode); + if (current_ulogfile_fd >= 0) + undolog_check_file_header(); + else + { + if (forread) + return false; + + current_fhdr.magic = ULOG_FILE_MAGIC; + current_fhdr.prepared = false; + + omode |= O_CREAT; + current_ulogfile_fd = BasicOpenFile(current_ulogfile_name, omode); + if (current_ulogfile_fd < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not create undolog file \"%s\": %m", + current_ulogfile_name)); + + if (write(current_ulogfile_fd, ¤t_fhdr, sizeof(current_fhdr)) < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not write undolog file \"%s\": %m", + current_ulogfile_name)); + } + + /* + * move file pointer to the end of the file. we do this not using O_APPEND, + * to allow us to modify data at any location in the file. We already moved + * to the first record in the case of !append. 
+ */ + if (append) + { + if (lseek(current_ulogfile_fd, 0, SEEK_END) < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not seek undolog file \"%s\": %m", + current_ulogfile_name)); + } + ReserveExternalFD(); + + return true; +} + +/* + * Write ulog record + */ +void +SimpleUndoLogWrite(RmgrId rmgr, uint8 info, + TransactionId xid, void *data, int len) +{ + int reclen = sizeof(SimpleUndoLogRecord) + len; + SimpleUndoLogRecord *rec = palloc(reclen); + pg_crc32c undodata_crc; + + Assert(!IsParallelWorker()); + Assert(xid != InvalidTransactionId); + + undolog_open_current_file(xid, false, true); + + rec->ul_tot_len = reclen; + rec->ul_rmid = rmgr; + rec->ul_info = info; + rec->ul_xid = current_xid; + + memcpy((char *)rec + sizeof(SimpleUndoLogRecord), data, len); + + /* Calculate CRC of the data */ + INIT_CRC32C(undodata_crc); + COMP_CRC32C(undodata_crc, rec, + reclen - offsetof(SimpleUndoLogRecord, ul_rmid)); + rec->ul_crc = undodata_crc; + + + if (write(current_ulogfile_fd, rec, reclen) < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not write to undolog file \"%s\": %m", + current_ulogfile_name)); +} + +static void +SimpleUndoLogUndo(bool cleanup) +{ + int bufsize; + char *buf; + + bufsize = 1024; + buf = palloc(bufsize); + + Assert(current_ulogfile_fd >= 0); + + while (read(current_ulogfile_fd, buf, sizeof(SimpleUndoLogRecord)) == + sizeof(SimpleUndoLogRecord)) + { + SimpleUndoLogRecord *rec = (SimpleUndoLogRecord *) buf; + int readlen = rec->ul_tot_len - sizeof(SimpleUndoLogRecord); + int ret; + + if (rec->ul_tot_len > bufsize) + { + bufsize *= 2; + buf = repalloc(buf, bufsize); + } + + ret = read(current_ulogfile_fd, + buf + sizeof(SimpleUndoLogRecord), readlen); + if (ret != readlen) + { + if (ret < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not read undo log file \"%s\": %m", + current_ulogfile_name)); + + ereport(ERROR, + errcode_for_file_access(), + errmsg("reading undo log expected %d bytes, but 
actually %d: %s", + readlen, ret, current_ulogfile_name)); + + } + + UndoRoutines[rec->ul_rmid].rm_undo(rec, + current_fhdr.prepared && cleanup); + } +} + +void +AtEOXact_SimpleUndoLog(bool isCommit, TransactionId xid) +{ + if (IsParallelWorker()) + return; + + if (!undolog_open_current_file(xid, true, false)) + return; + + if (!isCommit) + SimpleUndoLogUndo(false); + + if (current_ulogfile_fd > 0) + { + if (close(current_ulogfile_fd) != 0) + ereport(PANIC, errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + current_ulogfile_name)); + + current_ulogfile_fd = -1; + ReleaseExternalFD(); + durable_unlink(current_ulogfile_name, FATAL); + } + + return; +} + +void +UndoLogCleanup(void) +{ + DIR *dirdesc; + struct dirent *de; + char **loglist; + int loglistspace = 128; + int loglistlen = 0; + int i; + + loglist = palloc(sizeof(char*) * loglistspace); + + dirdesc = AllocateDir(SIMPLE_UNDOLOG_DIR); + while ((de = ReadDir(dirdesc, SIMPLE_UNDOLOG_DIR)) != NULL) + { + if (strspn(de->d_name, "01234567890abcdef") < strlen(de->d_name)) + continue; + + if (loglistlen >= loglistspace) + { + loglistspace *= 2; + loglist = repalloc(loglist, sizeof(char*) * loglistspace); + } + loglist[loglistlen++] = pstrdup(de->d_name); + } + + for (i = 0 ; i < loglistlen ; i++) + { + snprintf(current_ulogfile_name, MAXPGPATH, "%s/%s", + SIMPLE_UNDOLOG_DIR, loglist[i]); + current_ulogfile_fd = BasicOpenFile(current_ulogfile_name, + O_RDWR | PG_BINARY | + undo_sync_mode); + undolog_check_file_header(); + SimpleUndoLogUndo(true); + if (close(current_ulogfile_fd) != 0) + ereport(PANIC, errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + current_ulogfile_name)); + current_ulogfile_fd = -1; + + /* do not remove ulog files for prepared transactions */ + if (!current_fhdr.prepared) + durable_unlink(current_ulogfile_name, FATAL); + } +} + +void +SimpleUndoLogSetPrpared(TransactionId xid, bool prepared) +{ + Assert(xid != InvalidTransactionId); + + 
undolog_open_current_file(xid, false, true); + current_fhdr.prepared = prepared; + if (lseek(current_ulogfile_fd, 0, SEEK_SET) < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not seek undolog file \"%s\": %m", + current_ulogfile_name)); + + if (write(current_ulogfile_fd, ¤t_fhdr, sizeof(current_fhdr)) < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not write undolog file \"%s\": %m", + current_ulogfile_name)); +} diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index c6af8cfd7e..a32ec28eb0 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -78,6 +78,7 @@ #include "access/commit_ts.h" #include "access/htup_details.h" +#include "access/simpleundolog.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/twophase.h" @@ -1565,6 +1566,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit) abortstats, gid); + AtEOXact_SimpleUndoLog(isCommit, xid); + ProcArrayRemove(proc, latestXid); /* diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 8daaa535ed..8bbe8fdb08 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -24,6 +24,7 @@ #include "access/multixact.h" #include "access/parallel.h" #include "access/subtrans.h" +#include "access/simpleundolog.h" #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" @@ -2224,6 +2225,9 @@ CommitTransaction(void) */ smgrDoPendingSyncs(true, is_parallel_worker); + /* Likewise perform uncommitted storage file deletion. 
*/ + smgrDoPendingCleanups(true); + /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); @@ -2365,6 +2369,7 @@ CommitTransaction(void) AtEOXact_on_commit_actions(true); AtEOXact_Namespace(true, is_parallel_worker); AtEOXact_SMgr(); + AtEOXact_SimpleUndoLog(true, GetCurrentTransactionIdIfAny()); AtEOXact_Files(true); AtEOXact_ComboCid(); AtEOXact_HashTables(true); @@ -2475,6 +2480,9 @@ PrepareTransaction(void) */ smgrDoPendingSyncs(true, false); + /* Likewise perform uncommitted storage file deletion. */ + smgrDoPendingCleanups(true); + /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); @@ -2799,6 +2807,7 @@ AbortTransaction(void) AfterTriggerEndXact(false); /* 'false' means it's abort */ AtAbort_Portals(); smgrDoPendingSyncs(false, is_parallel_worker); + smgrDoPendingCleanups(false); AtEOXact_LargeObject(false); AtAbort_Notify(); AtEOXact_RelationMap(false, is_parallel_worker); @@ -2866,6 +2875,7 @@ AbortTransaction(void) AtEOXact_on_commit_actions(false); AtEOXact_Namespace(false, is_parallel_worker); AtEOXact_SMgr(); + AtEOXact_SimpleUndoLog(false, GetCurrentTransactionIdIfAny()); AtEOXact_Files(false); AtEOXact_ComboCid(); AtEOXact_HashTables(false); @@ -5002,6 +5012,8 @@ CommitSubTransaction(void) AtEOSubXact_Inval(true); AtSubCommit_smgr(); + AtEOXact_SimpleUndoLog(true, GetCurrentTransactionIdIfAny()); + /* * The only lock we actually release here is the subtransaction XID lock. 
*/ @@ -5181,6 +5193,7 @@ AbortSubTransaction(void) RESOURCE_RELEASE_AFTER_LOCKS, false, false); AtSubAbort_smgr(); + AtEOXact_SimpleUndoLog(false, GetCurrentTransactionIdIfAny()); AtEOXact_GUC(false, s->gucNestLevel); AtEOSubXact_SPI(false, s->subTransactionId); @@ -5660,7 +5673,10 @@ XactLogCommitRecord(TimestampTz commit_time, if (!TransactionIdIsValid(twophase_xid)) info = XLOG_XACT_COMMIT; else + { + elog(LOG, "COMMIT PREPARED: %d", twophase_xid); info = XLOG_XACT_COMMIT_PREPARED; + } /* First figure out and collect all the information needed */ @@ -6060,6 +6076,8 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, DropRelationFiles(parsed->xlocators, parsed->nrels, true); } + AtEOXact_SimpleUndoLog(true, xid); + if (parsed->nstats > 0) { /* see equivalent call for relations above */ @@ -6171,6 +6189,8 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid, DropRelationFiles(parsed->xlocators, parsed->nrels, true); } + AtEOXact_SimpleUndoLog(false, xid); + if (parsed->nstats > 0) { /* see equivalent call for relations above */ @@ -6236,6 +6256,10 @@ xact_redo(XLogReaderState *record) } else if (info == XLOG_XACT_PREPARE) { + xl_xact_prepare *xlrec = (xl_xact_prepare *) XLogRecGetData(record); + + AtEOXact_SimpleUndoLog(true, xlrec->xid); + /* * Store xid and start/end pointers of the WAL record in TwoPhaseState * gxact entry. diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f6f8adc72a..d6cb9aceec 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -51,6 +51,7 @@ #include "access/heaptoast.h" #include "access/multixact.h" #include "access/rewriteheap.h" +#include "access/simpleundolog.h" #include "access/subtrans.h" #include "access/timeline.h" #include "access/transam.h" @@ -5385,6 +5386,12 @@ StartupXLOG(void) /* Check that the GUCs used to generate the WAL allow recovery */ CheckRequiredParameterValues(); + /* + * Perform undo processing. 
This must be done before resetting unlogged + * relations. + */ + UndoLogCleanup(); + /* * We're in recovery, so unlogged relations may be trashed and must be * reset. This should be done BEFORE allowing Hot Standby @@ -5530,14 +5537,17 @@ StartupXLOG(void) } /* - * Reset unlogged relations to the contents of their INIT fork. This is - * done AFTER recovery is complete so as to include any unlogged relations - * created during recovery, but BEFORE recovery is marked as having - * completed successfully. Otherwise we'd not retry if any of the post - * end-of-recovery steps fail. + * Process undo logs left after recovery, then reset unlogged relations to + * the contents of their INIT fork. This is done AFTER recovery is complete + * so as to include any file creations during recovery, but BEFORE recovery + * is marked as having completed successfully. Otherwise we'd not retry if + * any of the post end-of-recovery steps fail. */ if (InRecovery) + { + UndoLogCleanup(); ResetUnloggedRelations(UNLOGGED_RELATION_INIT); + } /* * Pre-scan prepared transactions to find out the range of XIDs present. 
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 2add053489..1778801bbd 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -19,16 +19,20 @@ #include "postgres.h" +#include "access/amapi.h" #include "access/parallel.h" #include "access/visibilitymap.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" #include "access/xlogutils.h" +#include "access/simpleundolog.h" #include "catalog/storage.h" #include "catalog/storage_xlog.h" +#include "catalog/storage_ulog.h" #include "miscadmin.h" #include "storage/freespace.h" +#include "storage/reinit.h" #include "storage/smgr.h" #include "utils/hsearch.h" #include "utils/memutils.h" @@ -66,6 +70,19 @@ typedef struct PendingRelDelete struct PendingRelDelete *next; /* linked-list link */ } PendingRelDelete; +#define PCOP_UNLINK_FORK (1 << 0) + +typedef struct PendingCleanup +{ + RelFileLocator rlocator; /* relation that need a cleanup */ + int op; /* operation mask */ + ForkNumber unlink_forknum; /* forknum to unlink */ + BackendId backend; /* InvalidBackendId if not a temp rel */ + bool atCommit; /* T=delete at commit; F=delete at abort */ + int nestLevel; /* xact nesting level of request */ + struct PendingCleanup *next; /* linked-list link */ +} PendingCleanup; + typedef struct PendingRelSync { RelFileLocator rlocator; @@ -73,6 +90,7 @@ typedef struct PendingRelSync } PendingRelSync; static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */ +static PendingCleanup * pendingCleanups = NULL; /* head of linked list */ static HTAB *pendingSyncHash = NULL; @@ -148,6 +166,19 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, srel = smgropen(rlocator, backend); smgrcreate(srel, MAIN_FORKNUM, false); + /* Write undo log, this requires irrelevant to needs_wal */ + if (register_delete) + { + ul_uncommitted_storage ul_storage; + + ul_storage.rlocator = rlocator; + ul_storage.forknum = MAIN_FORKNUM; + 
ul_storage.remove = true; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + } + if (needs_wal) log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM); @@ -191,12 +222,32 @@ log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum) */ xlrec.rlocator = *rlocator; xlrec.forkNum = forkNum; + xlrec.xid = GetTopTransactionId(); XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); } +/* + * Perform XLogInsert of an XLOG_SMGR_UNLINK record to WAL. + */ +void +log_smgrunlink(const RelFileLocator *rlocator, ForkNumber forkNum) +{ + xl_smgr_unlink xlrec; + + /* + * Make an XLOG entry reporting the file unlink. + */ + xlrec.rlocator = *rlocator; + xlrec.forkNum = forkNum; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_SMGR_ID, XLOG_SMGR_UNLINK | XLR_SPECIAL_REL_UPDATE); +} + /* * RelationDropStorage * Schedule unlinking of physical storage at transaction commit. @@ -711,6 +762,75 @@ smgrDoPendingDeletes(bool isCommit) } } +/* + * smgrDoPendingUnmark() -- Clean up work that emits WAL records + * + * The operations handled in the function emits WAL records, which must be + * part of the current transaction. 
+ */ +void +smgrDoPendingCleanups(bool isCommit) +{ + int nestLevel = GetCurrentTransactionNestLevel(); + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + if (pending->nestLevel < nestLevel) + { + /* outer-level entries should not be processed yet */ + prev = pending; + } + else + { + /* unlink list entry first, so we don't retry on failure */ + if (prev) + prev->next = next; + else + pendingCleanups = next; + + /* do cleanup if called for */ + if (pending->atCommit == isCommit) + { + SMgrRelation srel; + + srel = smgropen(pending->rlocator, pending->backend); + + Assert((pending->op & ~(PCOP_UNLINK_FORK)) == 0); + + if (pending->op & PCOP_UNLINK_FORK) + { + BlockNumber firstblock = 0; + + /* + * Unlink the fork file. Currently this operation is + * applied only to init-forks. As it is not ceratin that + * the init-fork is not loaded on shared buffers, drop all + * buffers for it. + */ + Assert(pending->unlink_forknum == INIT_FORKNUM); + DropRelationBuffers(srel, &pending->unlink_forknum, 1, + &firstblock); + + /* Don't emit wal while recovery. */ + if (!InRecovery) + log_smgrunlink(&pending->rlocator, + pending->unlink_forknum); + smgrunlink(srel, pending->unlink_forknum, false); + } + } + + /* must explicitly free the list entry */ + pfree(pending); + /* prev does not change */ + } + } +} + /* * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact. 
*/ @@ -920,6 +1040,9 @@ PostPrepare_smgr(void) /* must explicitly free the list entry */ pfree(pending); } + + /* Mark undolog as prepared */ + SimpleUndoLogSetPrpared(GetCurrentTransactionId(), true); } @@ -967,10 +1090,28 @@ smgr_redo(XLogReaderState *record) { xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record); SMgrRelation reln; + ul_uncommitted_storage ul_storage; + + /* write undo log */ + ul_storage.rlocator = xlrec->rlocator; + ul_storage.forknum = xlrec->forkNum; + ul_storage.remove = true; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + xlrec->xid, + &ul_storage, sizeof(ul_storage)); reln = smgropen(xlrec->rlocator, InvalidBackendId); smgrcreate(reln, xlrec->forkNum, true); } + else if (info == XLOG_SMGR_UNLINK) + { + xl_smgr_unlink *xlrec = (xl_smgr_unlink *) XLogRecGetData(record); + SMgrRelation reln; + + reln = smgropen(xlrec->rlocator, InvalidBackendId); + smgrunlink(reln, xlrec->forkNum, true); + smgrclose(reln); + } else if (info == XLOG_SMGR_TRUNCATE) { xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record); @@ -1062,3 +1203,33 @@ smgr_redo(XLogReaderState *record) else elog(PANIC, "smgr_redo: unknown op code %u", info); } + +void +smgr_undo(SimpleUndoLogRecord *record, bool crash_prepared) +{ + uint8 info = record->ul_info; + + + if (info == ULOG_SMGR_UNCOMMITED_STORAGE) + { + ul_uncommitted_storage *ul_storage = + (ul_uncommitted_storage *) ULogRecGetData(record); + + if (!crash_prepared) + { + SMgrRelation reln; + + reln = smgropen(ul_storage->rlocator, InvalidBackendId); + smgrunlink(reln, ul_storage->forknum, true); + smgrclose(reln); + } + else + { + /* Inform reinit to ignore this file during cleanup */ + ResetUnloggedRelationIgnore(ul_storage->rlocator); + } + + } + else + elog(PANIC, "smgr_undo: unknown op code %u", info); +} diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c index fb55371b1b..d302feadb1 100644 --- a/src/backend/storage/file/reinit.c +++ 
b/src/backend/storage/file/reinit.c @@ -34,6 +34,39 @@ typedef struct Oid reloid; /* hash key */ } unlogged_relation_entry; +static char **ignore_files = NULL; +static int nignore_elems = 0; +static int nignore_files = 0; + +/* + * identify the file should be ignored during resetting unlogged relations. + */ +static bool +reinit_ignore_file(const char *dirname, const char *name) +{ + char fnamebuf[MAXPGPATH]; + int len; + + if (nignore_files == 0) + return false; + + strncpy(fnamebuf, dirname, MAXPGPATH - 1); + strncat(fnamebuf, "/", MAXPGPATH - 1); + strncat(fnamebuf, name, MAXPGPATH - 1); + fnamebuf[MAXPGPATH - 1] = 0; + + for (int i = 0 ; i < nignore_files ; i++) + { + /* match ignoring fork part */ + len = strlen(ignore_files[i]); + if (strncmp(fnamebuf, ignore_files[i], len) == 0 && + (fnamebuf[len] == 0 || fnamebuf[len] == '_')) + return true; + } + + return false; +} + /* * Reset unlogged relations from before the last restart. * @@ -203,6 +236,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) &forkNum)) continue; + /* Skip anything that undo log suggested to ignore */ + if (reinit_ignore_file(dbspacedirname, de->d_name)) + continue; + /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; @@ -243,6 +280,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) &forkNum)) continue; + /* Skip anything that undo log suggested to ignore */ + if (reinit_ignore_file(dbspacedirname, de->d_name)) + continue; + /* We never remove the init fork. */ if (forkNum == INIT_FORKNUM) continue; @@ -295,6 +336,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) &forkNum)) continue; + /* Skip anything that undo log suggested to ignore */ + if (reinit_ignore_file(dbspacedirname, de->d_name)) + continue; + /* Also skip it unless this is the init fork. 
*/ if (forkNum != INIT_FORKNUM) continue; @@ -337,6 +382,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) &forkNum)) continue; + /* Skip anything that undo log suggested to ignore */ + if (reinit_ignore_file(dbspacedirname, de->d_name)) + continue; + /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; @@ -365,6 +414,35 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) } } +/* + * Record relfilenodes that should be left alone during reinitializing unlogged + * relations. + */ +void +ResetUnloggedRelationIgnore(RelFileLocator rloc) +{ + RelFileLocatorBackend rbloc; + + if (nignore_files >= nignore_elems) + { + if (ignore_files == NULL) + { + nignore_elems = 16; + ignore_files = palloc(sizeof(char *) * nignore_elems); + } + else + { + nignore_elems *= 2; + ignore_files = repalloc(ignore_files, + sizeof(char *) * nignore_elems); + } + } + + rbloc.backend = InvalidBackendId; + rbloc.locator = rloc; + ignore_files[nignore_files++] = relpath(rbloc, MAIN_FORKNUM); +} + /* * Basic parsing of putative relation filenames. 
* diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 5d0f3d515c..92945c32c3 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -723,6 +723,15 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum) smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); } +/* + * smgrunlink() -- unlink the storage file + */ +void +smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ + smgrsw[reln->smgr_which].smgr_unlink(reln->smgr_rlocator, forknum, isRedo); +} + /* * AtEOXact_SMgr * diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 905b979947..c0938bdf3a 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -303,6 +303,7 @@ void setup_signals(void); void setup_text_search(void); void create_data_directory(void); void create_xlog_or_symlink(void); +void create_ulog(void); void warn_on_mount_point(int error); void initialize_data_directory(void); @@ -2938,6 +2939,21 @@ create_xlog_or_symlink(void) free(subdirloc); } +/* Create undo log directory */ +void +create_ulog(void) +{ + char *subdirloc; + + /* form name of the place for the subdirectory */ + subdirloc = psprintf("%s/pg_ulog", pg_data); + + if (mkdir(subdirloc, pg_dir_create_mode) < 0) + pg_fatal("could not create directory \"%s\": %m", + subdirloc); + + free(subdirloc); +} void warn_on_mount_point(int error) @@ -2972,6 +2988,7 @@ initialize_data_directory(void) create_data_directory(); create_xlog_or_symlink(); + create_ulog(); /* Create required subdirectories (other than pg_wal) */ printf(_("creating subdirectories ... ")); diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 27782237d0..87b4659e27 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -28,7 +28,7 @@ * RmgrNames is an array of the built-in resource manager names, to make error * messages a bit nicer. 
*/ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,undo) \ name, static const char *RmgrNames[RM_MAX_ID + 1] = { diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c index 6b8c17bb4c..a21009c5b8 100644 --- a/src/bin/pg_waldump/rmgrdesc.c +++ b/src/bin/pg_waldump/rmgrdesc.c @@ -32,7 +32,7 @@ #include "storage/standbydefs.h" #include "utils/relmapper.h" -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,undo) \ { name, desc, identify}, static const RmgrDescData RmgrDescTable[RM_N_BUILTIN_IDS] = { diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h index 3b6a497e1b..d705de9256 100644 --- a/src/include/access/rmgr.h +++ b/src/include/access/rmgr.h @@ -19,7 +19,7 @@ typedef uint8 RmgrId; * Note: RM_MAX_ID must fit in RmgrId; widening that type will affect the XLOG * file format. 
*/ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,undo) \ symname, typedef enum RmgrIds diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 463bcb67c5..e15d951000 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -25,25 +25,25 @@ */ /* symbol name, textual name, redo, desc, identify, startup, cleanup, mask, decode */ -PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL, xlog_decode) -PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL, xact_decode) -PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL, standby_decode) -PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask, heap2_decode) -PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask, heap_decode) -PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask, NULL) -PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask, NULL) -PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask, NULL) -PG_RMGR(RM_GIST_ID, "Gist", gist_redo, 
gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask, NULL) -PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask, NULL) -PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask, NULL) -PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask, NULL) -PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL) -PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, logicalmsg_decode) +PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL, xlog_decode, NULL) +PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL, xact_decode, NULL) +PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL, smgr_undo) +PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL, standby_decode, NULL) +PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask, heap2_decode, NULL) +PG_RMGR(RM_HEAP_ID, "Heap", 
heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask, heap_decode, NULL) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask, NULL, NULL) +PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask, NULL, NULL) +PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask, NULL, NULL) +PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask, NULL, NULL) +PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask, NULL, NULL) +PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask, NULL, NULL) +PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask, NULL, NULL) +PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL, NULL) +PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, logicalmsg_decode, NULL) diff --git a/src/include/access/simpleundolog.h b/src/include/access/simpleundolog.h new file mode 100644 index 0000000000..3d3bd2f7e2 --- /dev/null +++ b/src/include/access/simpleundolog.h @@ -0,0 +1,36 @@ +#ifndef SIMPLE_UNDOLOG_H +#define SIMPLE_UNDOLOG_H + +#include "access/rmgr.h" +#include "port/pg_crc32c.h" + +#define SIMPLE_UNDOLOG_DIR "pg_ulog" + +typedef struct SimpleUndoLogRecord +{ + uint32 ul_tot_len; /* total length of entire record */ + pg_crc32c ul_crc; /* CRC for this record */ + RmgrId ul_rmid; /* resource manager for this record */ + uint8 ul_info; /* record info */ + 
TransactionId ul_xid; /* transaction id */ + /* rmgr-specific data follow, no padding */ +} SimpleUndoLogRecord; + +extern void SimpleUndoLogWrite(RmgrId rmgr, uint8 info, + TransactionId xid, void *data, int len); +extern void SimpleUndoLogSetPrpared(TransactionId xid, bool prepared); +extern void AtEOXact_SimpleUndoLog(bool isCommit, TransactionId xid); +extern void UndoLogCleanup(void); + +extern void AtPrepare_UndoLog(TransactionId xid); +extern void PostPrepare_UndoLog(void); +extern void undolog_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len); +extern void undolog_twophase_postcommit(TransactionId xid, uint16 info, + void *recdata, uint32 len); +extern void undolog_twophase_postabort(TransactionId xid, uint16 info, + void *recdata, uint32 len); +extern void undolog_twophase_standby_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len); + +#endif /* SIMPLE_UNDOLOG_H */ diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h index 45a3c7835c..0b39c6ef56 100644 --- a/src/include/catalog/storage.h +++ b/src/include/catalog/storage.h @@ -25,6 +25,8 @@ extern PGDLLIMPORT int wal_skip_threshold; extern SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete); +extern void RelationCreateInitFork(Relation rel); +extern void RelationDropInitFork(Relation rel); extern void RelationDropStorage(Relation rel); extern void RelationPreserveStorage(RelFileLocator rlocator, bool atCommit); extern void RelationPreTruncate(Relation rel); @@ -43,6 +45,7 @@ extern void RestorePendingSyncs(char *startAddress); extern void smgrDoPendingDeletes(bool isCommit); extern void smgrDoPendingSyncs(bool isCommit, bool isParallelWorker); extern int smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr); +extern void smgrDoPendingCleanups(bool isCommit); extern void AtSubCommit_smgr(void); extern void AtSubAbort_smgr(void); extern void PostPrepare_smgr(void); diff --git 
a/src/include/catalog/storage_ulog.h b/src/include/catalog/storage_ulog.h
new file mode 100644
index 0000000000..8e47428e66
--- /dev/null
+++ b/src/include/catalog/storage_ulog.h
@@ -0,0 +1,46 @@
+/*-------------------------------------------------------------------------
+ *
+ * storage_ulog.h
+ *	  prototypes for Undo Log support for backend/catalog/storage.c
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/catalog/storage_ulog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STORAGE_ULOG_H
+#define STORAGE_ULOG_H
+
+#include "access/simpleundolog.h"
+#include "common/relpath.h"
+#include "storage/relfilelocator.h"
+
+/* ULOG gives us high 4 bits (just following xlog) */
+#define ULOG_SMGR_UNCOMMITED_STORAGE	0x10
+
+/*
+ * Undo log entry for uncommitted storage files.
+ *
+ * Callers set "remove" to true when logging a fork they are about to create,
+ * and write a record with "remove" set to false to cancel a preceding entry.
+ */
+typedef struct ul_uncommitted_storage
+{
+	RelFileLocator rlocator;
+	ForkNumber	forknum;
+	bool		remove;
+} ul_uncommitted_storage;
+
+/* flags for xl_smgr_truncate */
+#define SMGR_TRUNCATE_HEAP		0x0001
+
+/* undo callback of the "Storage" resource manager (see rmgrlist.h) */
+extern void smgr_undo(SimpleUndoLogRecord *record, bool crash_prepared);
+
+/* rmgr-specific data that directly follows the record header */
+#define ULogRecGetData(record) ((char *) (record) + sizeof(SimpleUndoLogRecord))
+
+#endif							/* STORAGE_ULOG_H */
diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h
index 6b0a7aa3df..5122f5b61d 100644
--- a/src/include/catalog/storage_xlog.h
+++ b/src/include/catalog/storage_xlog.h
@@ -29,13 +29,21 @@
 /* XLOG gives us high 4 bits */
 #define XLOG_SMGR_CREATE	0x10
 #define XLOG_SMGR_TRUNCATE	0x20
+#define XLOG_SMGR_UNLINK	0x30
 
 typedef struct xl_smgr_create
 {
 	RelFileLocator rlocator;
 	ForkNumber	forkNum;
+	TransactionId xid;
 } xl_smgr_create;
 
+typedef struct xl_smgr_unlink
+{
+	RelFileLocator rlocator;
+	ForkNumber	forkNum;
+} xl_smgr_unlink;
+
 /* flags for xl_smgr_truncate */
 #define SMGR_TRUNCATE_HEAP		0x0001
 #define SMGR_TRUNCATE_VM		0x0002
@@ -51,6 +59,7 @@ typedef struct xl_smgr_truncate
 } xl_smgr_truncate;
 
 extern void
log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum); +extern void log_smgrunlink(const RelFileLocator *rlocator, ForkNumber forkNum); extern void smgr_redo(XLogReaderState *record); extern void smgr_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/include/storage/reinit.h b/src/include/storage/reinit.h index e2bbb5abe9..ccd182531d 100644 --- a/src/include/storage/reinit.h +++ b/src/include/storage/reinit.h @@ -16,9 +16,11 @@ #define REINIT_H #include "common/relpath.h" +#include "storage/relfilelocator.h" extern void ResetUnloggedRelations(int op); +extern void ResetUnloggedRelationIgnore(RelFileLocator rloc); extern bool parse_filename_for_nontemp_relation(const char *name, int *relnumchars, ForkNumber *fork); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a9a179aaba..74194cf1e4 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -88,6 +88,7 @@ extern void smgrcloserellocator(RelFileLocatorBackend rlocator); extern void smgrrelease(SMgrRelation reln); extern void smgrreleaseall(void); extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern void smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern void smgrdosyncall(SMgrRelation *rels, int nrels); extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo); extern void smgrextend(SMgrRelation reln, ForkNumber forknum, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 49a33c0387..b9255e5e25 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1996,6 +1996,7 @@ PatternInfo PatternInfoArray Pattern_Prefix_Status Pattern_Type +PendingCleanup PendingFsyncEntry PendingRelDelete PendingRelSync @@ -2553,6 +2554,7 @@ SimplePtrListCell SimpleStats SimpleStringList SimpleStringListCell +SimpleUndoLogRecord SingleBoundSortItem Size SkipPages @@ -2909,6 +2911,8 @@ ULONG ULONG_PTR UV UVersionInfo +UndoDescData 
+UndoLogFileHeader UnicodeNormalizationForm UnicodeNormalizationQC Unique @@ -3826,6 +3830,7 @@ uint8 uint8_t uint8x16_t uintptr_t +ul_uncommitted_storage unicodeStyleBorderFormat unicodeStyleColumnFormat unicodeStyleFormat @@ -3938,6 +3943,7 @@ xl_running_xacts xl_seq_rec xl_smgr_create xl_smgr_truncate +xl_smgr_unlink xl_standby_lock xl_standby_locks xl_tblspc_create_rec -- 2.39.3
>From 8cfe03157c412f6936e5c1b156d1ce28ac922763 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horikyota....@gmail.com> Date: Mon, 4 Sep 2023 17:23:05 +0900 Subject: [PATCH v29 2/2] In-place table persistence change Previously, the command caused a large amount of file I/O due to heap rewrites, even though ALTER TABLE SET UNLOGGED does not require any data rewrites. This patch eliminates the need for rewrites. Additionally, ALTER TABLE SET LOGGED is updated to emit XLOG_FPI records instead of numerous HEAP_INSERTs when wal_level > minimal, reducing resource consumption. --- src/backend/access/rmgrdesc/smgrdesc.c | 12 + src/backend/catalog/storage.c | 338 ++++++++++++++++++++++++- src/backend/commands/tablecmds.c | 268 +++++++++++++++++--- src/backend/storage/buffer/bufmgr.c | 84 ++++++ src/bin/pg_rewind/parsexlog.c | 6 + src/include/catalog/storage_xlog.h | 10 + src/include/storage/bufmgr.h | 3 + src/include/storage/reinit.h | 2 +- src/tools/pgindent/typedefs.list | 1 + 9 files changed, 683 insertions(+), 41 deletions(-) diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c index bd841b96e8..620e02bc26 100644 --- a/src/backend/access/rmgrdesc/smgrdesc.c +++ b/src/backend/access/rmgrdesc/smgrdesc.c @@ -40,6 +40,15 @@ smgr_desc(StringInfo buf, XLogReaderState *record) xlrec->blkno, xlrec->flags); pfree(path); } + else if (info == XLOG_SMGR_BUFPERSISTENCE) + { + xl_smgr_bufpersistence *xlrec = (xl_smgr_bufpersistence *) rec; + char *path = relpathperm(xlrec->rlocator, MAIN_FORKNUM); + + appendStringInfoString(buf, path); + appendStringInfo(buf, " persistence %d", xlrec->persistence); + pfree(path); + } } const char * @@ -55,6 +64,9 @@ smgr_identify(uint8 info) case XLOG_SMGR_TRUNCATE: id = "TRUNCATE"; break; + case XLOG_SMGR_BUFPERSISTENCE: + id = "BUFPERSISTENCE"; + break; } return id; diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 1778801bbd..e7c917c50f 100644 --- 
a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -71,11 +71,13 @@ typedef struct PendingRelDelete } PendingRelDelete; #define PCOP_UNLINK_FORK (1 << 0) +#define PCOP_SET_PERSISTENCE (1 << 1) typedef struct PendingCleanup { RelFileLocator rlocator; /* relation that need a cleanup */ int op; /* operation mask */ + bool bufpersistence; /* buffer persistence to set */ ForkNumber unlink_forknum; /* forknum to unlink */ BackendId backend; /* InvalidBackendId if not a temp rel */ bool atCommit; /* T=delete at commit; F=delete at abort */ @@ -209,6 +211,208 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, return srel; } +/* + * RelationCreateInitFork + * Create physical storage for the init fork of a relation. + * + * Create the init fork for the relation. + * + * This function is transactional. The creation is WAL-logged, and if the + * transaction aborts later on, the init fork will be removed. + */ +void +RelationCreateInitFork(Relation rel) +{ + RelFileLocator rlocator = rel->rd_locator; + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + SMgrRelation srel; + ul_uncommitted_storage ul_storage; + bool create = true; + + /* switch buffer persistence */ + SetRelationBuffersPersistence(RelationGetSmgr(rel), false, false); + + /* + * If a pending-unlink exists for this relation's init-fork, it indicates + * the init-fork's existed before the current transaction; this function + * reverts the pending-unlink by removing the entry. See + * RelationDropInitFork. 
+ */ + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + + if (RelFileLocatorEquals(rlocator, pending->rlocator) && + pending->unlink_forknum == INIT_FORKNUM) + { + /* write cancel log for preceding undo log entry */ + ul_storage.rlocator = rlocator; + ul_storage.forknum = INIT_FORKNUM; + ul_storage.remove = false; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + + if (prev) + prev->next = next; + else + pendingCleanups = next; + + pfree(pending); + /* prev does not change */ + + create = false; + } + else + prev = pending; + } + + if (!create) + return; + + /* create undo log entry, then the init fork */ + srel = smgropen(rlocator, InvalidBackendId); + + /* write undo log */ + ul_storage.rlocator = rlocator; + ul_storage.forknum = INIT_FORKNUM; + ul_storage.remove = true; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + + /* We don't have existing init fork, create it. */ + smgrcreate(srel, INIT_FORKNUM, false); + + /* + * For index relations, WAL-logging and file sync are handled by + * ambuildempty. In contrast, for heap relations, these tasks are performed + * directly. 
+ */ + if (rel->rd_rel->relkind == RELKIND_INDEX) + rel->rd_indam->ambuildempty(rel); + else + { + log_smgrcreate(&rlocator, INIT_FORKNUM); + smgrimmedsync(srel, INIT_FORKNUM); + } + + /* drop the init fork, mark file then revert persistence at abort */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = rlocator; + pending->op = PCOP_UNLINK_FORK | PCOP_SET_PERSISTENCE; + pending->unlink_forknum = INIT_FORKNUM; + pending->bufpersistence = true; + pending->backend = InvalidBackendId; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; +} + +/* + * RelationDropInitFork + * Delete physical storage for the init fork of a relation. + */ +void +RelationDropInitFork(Relation rel) +{ + RelFileLocator rlocator = rel->rd_locator; + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + bool inxact_created = false; + + /* switch buffer persistence */ + SetRelationBuffersPersistence(RelationGetSmgr(rel), true, false); + + /* + * Search for a pending-unlink associated with the init-fork of the + * relation. Its presence indicates that the init-fork was created within + * the current transaction. 
+ */ + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + + if (RelFileLocatorEquals(rlocator, pending->rlocator) && + pending->unlink_forknum == INIT_FORKNUM) + { + ul_uncommitted_storage ul_storage; + + /* write cancel log for preceding undo log entry */ + ul_storage.rlocator = rlocator; + ul_storage.forknum = INIT_FORKNUM; + ul_storage.remove = false; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + + /* unlink list entry */ + if (prev) + prev->next = next; + else + pendingCleanups = next; + + pfree(pending); + + /* prev does not change */ + + inxact_created = true; + } + else + prev = pending; + } + + /* + * If the init-fork was created in this transaction, remove the init-fork + * and cancel preceding undo log. Otherwise, register an at-commit + * pending-unlink for the existing init-fork. See RelationCreateInitFork. + */ + if (inxact_created) + { + SMgrRelation srel = smgropen(rlocator, InvalidBackendId); + ForkNumber forknum = INIT_FORKNUM; + BlockNumber firstblock = 0; + ul_uncommitted_storage ul_storage; + + /* + * Some AMs initialize init-fork via the buffer manager. To properly + * drop the init-fork, first drop all buffers for the init-fork, then + * unlink the init-fork and cancel preceding undo log. 
+ */ + DropRelationBuffers(srel, &forknum, 1, &firstblock); + + /* cancel existing undo log */ + ul_storage.rlocator = rlocator; + ul_storage.forknum = INIT_FORKNUM; + ul_storage.remove = false; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + log_smgrunlink(&rlocator, INIT_FORKNUM); + smgrunlink(srel, INIT_FORKNUM, false); + return; + } + + /* register drop of this init fork file at commit */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = rlocator; + pending->op = PCOP_UNLINK_FORK; + pending->unlink_forknum = INIT_FORKNUM; + pending->backend = InvalidBackendId; + pending->atCommit = true; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; +} + /* * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL. */ @@ -248,6 +452,25 @@ log_smgrunlink(const RelFileLocator *rlocator, ForkNumber forkNum) XLogInsert(RM_SMGR_ID, XLOG_SMGR_UNLINK | XLR_SPECIAL_REL_UPDATE); } +/* + * Perform XLogInsert of an XLOG_SMGR_BUFPERSISTENCE record to WAL. + */ +void +log_smgrbufpersistence(const RelFileLocator rlocator, bool persistence) +{ + xl_smgr_bufpersistence xlrec; + + /* + * Make an XLOG entry reporting the change of buffer persistence. + */ + xlrec.rlocator = rlocator; + xlrec.persistence = persistence; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_SMGR_ID, XLOG_SMGR_BUFPERSISTENCE | XLR_SPECIAL_REL_UPDATE); +} + /* * RelationDropStorage * Schedule unlinking of physical storage at transaction commit. 
@@ -800,7 +1023,14 @@ smgrDoPendingCleanups(bool isCommit)
 
 				srel = smgropen(pending->rlocator, pending->backend);
 
-				Assert((pending->op & ~(PCOP_UNLINK_FORK)) == 0);
+				Assert((pending->op &
+						~(PCOP_UNLINK_FORK | PCOP_SET_PERSISTENCE)) == 0);
+
+				if (pending->op & PCOP_SET_PERSISTENCE)
+				{
+					SetRelationBuffersPersistence(srel, pending->bufpersistence,
+												  InRecovery);
+				}
 
 				if (pending->op & PCOP_UNLINK_FORK)
 				{
@@ -1200,6 +1430,64 @@ smgr_redo(XLogReaderState *record)
 
 		FreeFakeRelcacheEntry(rel);
 	}
+	else if (info == XLOG_SMGR_BUFPERSISTENCE)
+	{
+		xl_smgr_bufpersistence *xlrec =
+			(xl_smgr_bufpersistence *) XLogRecGetData(record);
+		SMgrRelation reln;
+		PendingCleanup *pending;
+		PendingCleanup *prev = NULL;
+
+		/* Apply the logged persistence to the relation's buffers. */
+		reln = smgropen(xlrec->rlocator, InvalidBackendId);
+		SetRelationBuffersPersistence(reln, xlrec->persistence, true);
+
+		/*
+		 * Delete any pending action for persistence change, if present. There
+		 * should be at most one entry for this action.
+		 */
+		for (pending = pendingCleanups; pending != NULL;
+			 pending = pending->next)
+		{
+			if (RelFileLocatorEquals(xlrec->rlocator, pending->rlocator) &&
+				(pending->op & PCOP_SET_PERSISTENCE) != 0)
+			{
+				Assert(pending->bufpersistence == xlrec->persistence);
+
+				if (prev)
+					prev->next = pending->next;
+				else
+					pendingCleanups = pending->next;
+
+				pfree(pending);
+				break;
+			}
+
+			prev = pending;
+		}
+
+		/*
+		 * During abort, revert any changes to buffer persistence made in
+		 * this transaction.  "pending" is NULL here only when the loop above
+		 * found no entry to delete, so a revert action is registered only in
+		 * that case.
+		 */
+		if (!pending)
+		{
+			pending = (PendingCleanup *)
+				MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup));
+			pending->rlocator = xlrec->rlocator;
+			pending->op = PCOP_SET_PERSISTENCE;
+			pending->bufpersistence = !xlrec->persistence;
+			/* no fork to unlink; don't leave the field uninitialized */
+			pending->unlink_forknum = InvalidForkNumber;
+			pending->backend = InvalidBackendId;
+			pending->atCommit = false;
+			pending->nestLevel = GetCurrentTransactionNestLevel();
+			pending->next = pendingCleanups;
+			pendingCleanups = pending;
+		}
+	}
 	else
 		elog(PANIC, "smgr_redo: unknown op code %u", info);
 }
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 47c556669f..6c4cfbfa78 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -55,6 +55,7 @@
 #include "commands/defrem.h"
 #include "commands/event_trigger.h"
 #include "commands/policy.h"
+#include "commands/progress.h"
 #include "commands/sequence.h"
 #include "commands/tablecmds.h"
 #include "commands/tablespace.h"
@@ -5571,6 +5572,188 @@ ATParseTransformCmd(List **wqueue, AlteredTableInfo *tab, Relation rel,
 	return newcmd;
 }
 
+/*
+ * RelationChangePersistence: perform in-place persistence change of a relation
+ */
+static void
+RelationChangePersistence(AlteredTableInfo *tab, char persistence,
+						  LOCKMODE lockmode)
+{
+	Relation	rel;
+	Relation	classRel;
+	HeapTuple	tuple,
+				newtuple;
+	Datum		new_val[Natts_pg_class];
+	bool		new_null[Natts_pg_class],
+				new_repl[Natts_pg_class];
+	int			i;
+	List	   *relids;
+	ListCell   *lc_oid;
+
+	Assert(tab->rewrite == AT_REWRITE_ALTER_PERSISTENCE);
+	Assert(lockmode == AccessExclusiveLock);
+
+	/*
+	 * Use ATRewriteTable instead of this function under the following
+	 * condition.
+ */ + Assert(tab->constraints == NULL && tab->partition_constraint == NULL && + tab->newvals == NULL && !tab->verify_new_notnull); + + rel = table_open(tab->relid, lockmode); + + Assert(rel->rd_rel->relpersistence != persistence); + + elog(DEBUG1, "perform in-place persistence change"); + + /* + * Initially, gather all relations that require a persistence change. + */ + + /* Collect OIDs of indexes and toast relations */ + relids = RelationGetIndexList(rel); + relids = lcons_oid(rel->rd_id, relids); + + /* Add toast relation if any */ + if (OidIsValid(rel->rd_rel->reltoastrelid)) + { + List *toastidx; + Relation toastrel = table_open(rel->rd_rel->reltoastrelid, lockmode); + + relids = lappend_oid(relids, rel->rd_rel->reltoastrelid); + toastidx = RelationGetIndexList(toastrel); + relids = list_concat(relids, toastidx); + pfree(toastidx); + table_close(toastrel, NoLock); + } + + table_close(rel, NoLock); + + /* Make changes in storage */ + classRel = table_open(RelationRelationId, RowExclusiveLock); + + foreach(lc_oid, relids) + { + Oid reloid = lfirst_oid(lc_oid); + Relation r = relation_open(reloid, lockmode); + + /* + * XXXX: Some access methods don't support in-place persistence + * changes. GiST uses page LSNs to figure out whether a block has been + * modified. However, UNLOGGED GiST indexes use fake LSNs, which are + * incompatible with the real LSNs used for LOGGED indexes. + * + * Potentially, if gistGetFakeLSN behaved similarly for both permanent + * and unlogged indexes, we could avoid index rebuilds by emitting + * extra WAL records while the index is unlogged. + * + * Compare relam against a positive list to ensure the hard way is + * taken for unknown AMs. 
+ */ + if (r->rd_rel->relkind == RELKIND_INDEX && + /* GiST is excluded */ + r->rd_rel->relam != BTREE_AM_OID && + r->rd_rel->relam != HASH_AM_OID && + r->rd_rel->relam != GIN_AM_OID && + r->rd_rel->relam != SPGIST_AM_OID && + r->rd_rel->relam != BRIN_AM_OID) + { + int reindex_flags; + ReindexParams params = {0}; + + /* reindex doesn't allow concurrent use of the index */ + table_close(r, NoLock); + + reindex_flags = + REINDEX_REL_SUPPRESS_INDEX_USE | + REINDEX_REL_CHECK_CONSTRAINTS; + + /* Set the same persistence with the parent relation. */ + if (persistence == RELPERSISTENCE_UNLOGGED) + reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED; + else + reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT; + + reindex_index(reloid, reindex_flags, persistence, &params); + + continue; + } + + /* Create or drop init fork */ + if (persistence == RELPERSISTENCE_UNLOGGED) + RelationCreateInitFork(r); + else + RelationDropInitFork(r); + + /* + * If this relation becomes WAL-logged, immediately sync all files + * except the init-fork to establish the initial state on storage. The + * buffers should have already been flushed out by + * RelationCreate(Drop)InitFork called just above. The init-fork should + * already be synchronized as required.
+ */ + if (persistence == RELPERSISTENCE_PERMANENT) + { + for (i = 0; i < INIT_FORKNUM; i++) + { + if (smgrexists(RelationGetSmgr(r), i)) + smgrimmedsync(RelationGetSmgr(r), i); + } + } + + /* Update catalog */ + tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(reloid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", reloid); + + memset(new_val, 0, sizeof(new_val)); + memset(new_null, false, sizeof(new_null)); + memset(new_repl, false, sizeof(new_repl)); + + new_val[Anum_pg_class_relpersistence - 1] = CharGetDatum(persistence); + new_null[Anum_pg_class_relpersistence - 1] = false; + new_repl[Anum_pg_class_relpersistence - 1] = true; + + newtuple = heap_modify_tuple(tuple, RelationGetDescr(classRel), + new_val, new_null, new_repl); + + CatalogTupleUpdate(classRel, &newtuple->t_self, newtuple); + heap_freetuple(newtuple); + + /* + * If wal_level >= replica, switching to LOGGED necessitates WAL-logging + * the relation content for later recovery. This is not emitted when + * wal_level = minimal. + */ + if (persistence == RELPERSISTENCE_PERMANENT && XLogIsNeeded()) + { + ForkNumber fork; + xl_smgr_truncate xlrec; + + xlrec.blkno = 0; + xlrec.rlocator = r->rd_locator; + xlrec.flags = SMGR_TRUNCATE_ALL; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + + XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + + for (fork = 0; fork < INIT_FORKNUM; fork++) + { + if (smgrexists(RelationGetSmgr(r), fork)) + log_newpage_range(r, fork, 0, + smgrnblocks(RelationGetSmgr(r), fork), + false); + } + } + + table_close(r, NoLock); + } + + table_close(classRel, NoLock); +} + /* * ATRewriteTables: ALTER TABLE phase 3 */ @@ -5701,48 +5884,55 @@ ATRewriteTables(AlterTableStmt *parsetree, List **wqueue, LOCKMODE lockmode, tab->relid, tab->rewrite); - /* - * Create transient table that will receive the modified data. - * - * Ensure it is marked correctly as logged or unlogged. 
We have - * to do this here so that buffers for the new relfilenumber will - * have the right persistence set, and at the same time ensure - * that the original filenumbers's buffers will get read in with - * the correct setting (i.e. the original one). Otherwise a - * rollback after the rewrite would possibly result with buffers - * for the original filenumbers having the wrong persistence - * setting. - * - * NB: This relies on swap_relation_files() also swapping the - * persistence. That wouldn't work for pg_class, but that can't be - * unlogged anyway. - */ - OIDNewHeap = make_new_heap(tab->relid, NewTableSpace, NewAccessMethod, - persistence, lockmode); + if (tab->rewrite == AT_REWRITE_ALTER_PERSISTENCE) + RelationChangePersistence(tab, persistence, lockmode); + else + { + /* + * Create transient table that will receive the modified data. + * + * Ensure it is marked correctly as logged or unlogged. We + * have to do this here so that buffers for the new + * relfilenumber will have the right persistence set, and at + * the same time ensure that the original filenumbers's buffers + * will get read in with the correct setting (i.e. the original + * one). Otherwise a rollback after the rewrite would possibly + * result with buffers for the original filenumbers having the + * wrong persistence setting. + * + * NB: This relies on swap_relation_files() also swapping the + * persistence. That wouldn't work for pg_class, but that + * can't be unlogged anyway. + */ + OIDNewHeap = make_new_heap(tab->relid, NewTableSpace, + NewAccessMethod, + persistence, lockmode); - /* - * Copy the heap data into the new table with the desired - * modifications, and test the current data within the table - * against new constraints generated by ALTER TABLE commands. 
- */ - ATRewriteTable(tab, OIDNewHeap, lockmode); + /* + * Copy the heap data into the new table with the desired + * modifications, and test the current data within the table + * against new constraints generated by ALTER TABLE commands. + */ + ATRewriteTable(tab, OIDNewHeap, lockmode); - /* - * Swap the physical files of the old and new heaps, then rebuild - * indexes and discard the old heap. We can use RecentXmin for - * the table's new relfrozenxid because we rewrote all the tuples - * in ATRewriteTable, so no older Xid remains in the table. Also, - * we never try to swap toast tables by content, since we have no - * interest in letting this code work on system catalogs. - */ - finish_heap_swap(tab->relid, OIDNewHeap, - false, false, true, - !OidIsValid(tab->newTableSpace), - RecentXmin, - ReadNextMultiXactId(), - persistence); + /* + * Swap the physical files of the old and new heaps, then + * rebuild indexes and discard the old heap. We can use + * RecentXmin for the table's new relfrozenxid because we + * rewrote all the tuples in ATRewriteTable, so no older Xid + * remains in the table. Also, we never try to swap toast + * tables by content, since we have no interest in letting + * this code work on system catalogs. 
+				 */
+				finish_heap_swap(tab->relid, OIDNewHeap,
+								 false, false, true,
+								 !OidIsValid(tab->newTableSpace),
+								 RecentXmin,
+								 ReadNextMultiXactId(),
+								 persistence);
 
-			InvokeObjectPostAlterHook(RelationRelationId, tab->relid, 0);
+				InvokeObjectPostAlterHook(RelationRelationId, tab->relid, 0);
+			}
 		}
 		else if (tab->rewrite > 0 && tab->relkind == RELKIND_SEQUENCE)
 		{
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 3bd82dbfca..04ab6ec8a7 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3708,6 +3708,100 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
 	}
 }
 
+/* ---------------------------------------------------------------------
+ *		SetRelationBuffersPersistence
+ *
+ *		This function changes the persistence of all buffer pages of a relation
+ *		then writes all dirty pages to disk (or kernel disk buffers) when
+ *		switching to PERMANENT, ensuring the kernel has an up-to-date view of
+ *		the relation.
+ *
+ *		"permanent" is the persistence to set.  Unless "isRedo" is true, the
+ *		change is WAL-logged with log_smgrbufpersistence().
+ *
+ *		The caller must be holding AccessExclusiveLock on the target relation
+ *		to ensure no other backend is busy dirtying more blocks.
+ *
+ *		XXX currently it sequentially searches the buffer pool; consider
+ *		implementing more efficient search methods.  This routine isn't used in
+ *		performance-critical code paths, so it's not worth additional overhead
+ *		to make it go faster; see also DropRelationBuffers.
+ * --------------------------------------------------------------------
+ */
+void
+SetRelationBuffersPersistence(SMgrRelation srel, bool permanent, bool isRedo)
+{
+	int			i;
+	RelFileLocatorBackend rlocator = srel->smgr_rlocator;
+
+	Assert(!RelFileLocatorBackendIsTemp(rlocator));
+
+	if (!isRedo)
+		log_smgrbufpersistence(srel->smgr_rlocator.locator, permanent);
+
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+	for (i = 0; i < NBuffers; i++)
+	{
+		BufferDesc *bufHdr = GetBufferDescriptor(i);
+		uint32		buf_state;
+
+		if (!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufHdr->tag),
+								  rlocator.locator))
+			continue;
+
+		ReservePrivateRefCountEntry();
+
+		buf_state = LockBufHdr(bufHdr);
+
+		/* recheck the tag now that the buffer header is locked */
+		if (!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufHdr->tag),
+								  rlocator.locator))
+		{
+			UnlockBufHdr(bufHdr, buf_state);
+			continue;
+		}
+
+		if (permanent)
+		{
+			/* The init fork is being dropped, drop buffers for it. */
+			if (BufTagGetForkNum(&bufHdr->tag) == INIT_FORKNUM)
+			{
+				InvalidateBuffer(bufHdr);
+				continue;
+			}
+
+			buf_state |= BM_PERMANENT;
+			pg_atomic_write_u32(&bufHdr->state, buf_state);
+
+			/* flush this buffer when switching to PERMANENT */
+			if ((buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
+			{
+				PinBuffer_Locked(bufHdr);
+				LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+							  LW_SHARED);
+				FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
+				LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+				UnpinBuffer(bufHdr);
+			}
+			else
+				UnlockBufHdr(bufHdr, buf_state);
+		}
+		else
+		{
+			/* There shouldn't be an init fork */
+			Assert(BufTagGetForkNum(&bufHdr->tag) != INIT_FORKNUM);
+
+			/*
+			 * Clear the flag so the buffer is no longer treated as
+			 * permanent; UnlockBufHdr stores the modified state.
+			 */
+			buf_state &= ~BM_PERMANENT;
+			UnlockBufHdr(bufHdr, buf_state);
+		}
+	}
+}
+
 /* ---------------------------------------------------------------------
  *		DropRelationsAllBuffers
  *
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 87b4659e27..db12f4f397 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -418,6 +418,12 @@
extractPageInfo(XLogReaderState *record) * source system. */ } + else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_BUFPERSISTENCE) + { + /* + * We can safely ignore these. These don't make any on-disk changes. + */ + } else if (rmid == RM_XACT_ID && ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT || (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED || diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h index 5122f5b61d..eaa162f0c7 100644 --- a/src/include/catalog/storage_xlog.h +++ b/src/include/catalog/storage_xlog.h @@ -14,6 +14,7 @@ #ifndef STORAGE_XLOG_H #define STORAGE_XLOG_H +#include "access/simpleundolog.h" #include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/block.h" @@ -30,6 +31,7 @@ #define XLOG_SMGR_CREATE 0x10 #define XLOG_SMGR_TRUNCATE 0x20 #define XLOG_SMGR_UNLINK 0x30 +#define XLOG_SMGR_BUFPERSISTENCE 0x40 typedef struct xl_smgr_create { @@ -44,6 +46,12 @@ typedef struct xl_smgr_unlink ForkNumber forkNum; } xl_smgr_unlink; +typedef struct xl_smgr_bufpersistence +{ + RelFileLocator rlocator; + bool persistence; +} xl_smgr_bufpersistence; + /* flags for xl_smgr_truncate */ #define SMGR_TRUNCATE_HEAP 0x0001 #define SMGR_TRUNCATE_VM 0x0002 @@ -60,6 +68,8 @@ typedef struct xl_smgr_truncate extern void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum); extern void log_smgrunlink(const RelFileLocator *rlocator, ForkNumber forkNum); +extern void log_smgrbufpersistence(const RelFileLocator rlocator, + bool persistence); extern void smgr_redo(XLogReaderState *record); extern void smgr_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index b379c76e27..0e4e290392 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -222,6 +222,9 @@ extern void DropRelationBuffers(struct SMgrRelationData *smgr_reln, int nforks, BlockNumber *firstDelBlock); extern void DropRelationsAllBuffers(struct 
SMgrRelationData **smgr_reln, int nlocators); +extern void SetRelationBuffersPersistence(struct SMgrRelationData *srel, + bool permanent, bool isRedo); + extern void DropDatabaseBuffers(Oid dbid); #define RelationGetNumberOfBlocks(reln) \ diff --git a/src/include/storage/reinit.h b/src/include/storage/reinit.h index ccd182531d..e59fb7892e 100644 --- a/src/include/storage/reinit.h +++ b/src/include/storage/reinit.h @@ -20,10 +20,10 @@ extern void ResetUnloggedRelations(int op); -extern void ResetUnloggedRelationIgnore(RelFileLocator rloc); extern bool parse_filename_for_nontemp_relation(const char *name, int *relnumchars, ForkNumber *fork); +extern void ResetUnloggedRelationIgnore(RelFileLocator rloc); #define UNLOGGED_RELATION_CLEANUP 0x0001 #define UNLOGGED_RELATION_INIT 0x0002 diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b9255e5e25..2c34434555 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3941,6 +3941,7 @@ xl_replorigin_set xl_restore_point xl_running_xacts xl_seq_rec +xl_smgr_bufpersistence xl_smgr_create xl_smgr_truncate xl_smgr_unlink -- 2.39.3