Here's an updated version of the patch. There was a bogus assertion in
the previous one, comparing against mdsync_cycle_ctr instead of
mdunlink_cycle_ctr.
Heikki Linnakangas wrote:
> Tom Lane wrote:
>> Heikki Linnakangas <[EMAIL PROTECTED]> writes:
>>> The best I can think of is to rename the obsolete file to
>>> <relfilenode>.stale, when it's scheduled for deletion at next
>>> checkpoint, and check for .stale-suffixed files in GetNewRelFileNode,
>>> and delete them immediately in DropTableSpace.
>> This is getting too Rube Goldbergian for my tastes. What if we just
>> make DROP TABLESPACE force a checkpoint before proceeding?
>
> Patch attached.
>
> The scenario we're preventing is still possible if for some reason the
> latest checkpoint record is damaged, and we start recovery from the
> previous checkpoint record. I think the probability of that happening,
> together with the OID wrap-around and hitting the relfilenode of a
> recently deleted file with a new one, is low enough to not worry about.
> If we cared, we could fix it by letting the files to linger for two
> checkpoint cycles instead of one.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.286
diff -c -r1.286 xlog.c
*** src/backend/access/transam/xlog.c 12 Oct 2007 19:39:59 -0000 1.286
--- src/backend/access/transam/xlog.c 18 Oct 2007 20:16:56 -0000
***************
*** 45,50 ****
--- 45,51 ----
#include "storage/fd.h"
#include "storage/pmsignal.h"
#include "storage/procarray.h"
+ #include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"
***************
*** 5668,5673 ****
--- 5669,5680 ----
checkPoint.time = time(NULL);
/*
+ * Let the md storage manager to prepare for checkpoint before
+ * we determine the REDO pointer.
+ */
+ mdcheckpointbegin();
+
+ /*
* We must hold WALInsertLock while examining insert state to determine
* the checkpoint REDO pointer.
*/
***************
*** 5887,5892 ****
--- 5894,5904 ----
END_CRIT_SECTION();
/*
+ * Let the md storage manager to do its post-checkpoint work.
+ */
+ mdcheckpointend();
+
+ /*
* Delete old log files (those no longer needed even for previous
* checkpoint).
*/
Index: src/backend/commands/tablespace.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/tablespace.c,v
retrieving revision 1.49
diff -c -r1.49 tablespace.c
*** src/backend/commands/tablespace.c 1 Aug 2007 22:45:08 -0000 1.49
--- src/backend/commands/tablespace.c 18 Oct 2007 20:31:53 -0000
***************
*** 460,472 ****
LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
/*
! * Try to remove the physical infrastructure
*/
if (!remove_tablespace_directories(tablespaceoid, false))
! ereport(ERROR,
! (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
! errmsg("tablespace \"%s\" is not empty",
! tablespacename)));
/* Record the filesystem change in XLOG */
{
--- 460,488 ----
LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
/*
! * Try to remove the physical infrastructure.
*/
if (!remove_tablespace_directories(tablespaceoid, false))
! {
! /*
! * There can be lingering empty files in the directories, left behind
! * by for example DROP TABLE, that have been scheduled for deletion
! * at next checkpoint (see comments in mdunlink() for details). We
! * could just delete them immediately, but we can't tell them apart
! * from important data files that we mustn't delete. So instead, we
! * force a checkpoint which will clean out any lingering files, and
! * try again.
! */
! RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
! if (!remove_tablespace_directories(tablespaceoid, false))
! {
! /* Still not empty, the files must be important then */
! ereport(ERROR,
! (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
! errmsg("tablespace \"%s\" is not empty",
! tablespacename)));
! }
! }
/* Record the filesystem change in XLOG */
{
Index: src/backend/storage/smgr/md.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/smgr/md.c,v
retrieving revision 1.129
diff -c -r1.129 md.c
*** src/backend/storage/smgr/md.c 3 Jul 2007 14:51:24 -0000 1.129
--- src/backend/storage/smgr/md.c 20 Oct 2007 14:10:08 -0000
***************
*** 34,39 ****
--- 34,40 ----
/* special values for the segno arg to RememberFsyncRequest */
#define FORGET_RELATION_FSYNC (InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
+ #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
/*
* On Windows, we have to interpret EACCES as possibly meaning the same as
***************
*** 113,118 ****
--- 114,123 ----
* table remembers the pending operations. We use a hash table mostly as
* a convenient way of eliminating duplicate requests.
*
+ * We use a similar mechanism to remember no-longer-needed files that can
+ * be deleted after next checkpoint, but we use a linked list instead of hash
+ * table, because we don't expect there to be any duplicates.
+ *
* (Regular backends do not track pending operations locally, but forward
* them to the bgwriter.)
*/
***************
*** 131,139 ****
--- 136,152 ----
CycleCtr cycle_ctr; /* mdsync_cycle_ctr when request was made */
} PendingOperationEntry;
+ typedef struct
+ {
+ RelFileNode rnode; /* the dead relation to delete */
+ CycleCtr cycle_ctr; /* mdunlink_cycle_ctr when request was made */
+ } PendingUnlinkEntry;
+
static HTAB *pendingOpsTable = NULL;
+ static List *pendingUnlinks = NIL;
static CycleCtr mdsync_cycle_ctr = 0;
+ static CycleCtr mdunlink_cycle_ctr = 0;
typedef enum /* behavior for mdopen & _mdfd_getseg */
***************
*** 146,151 ****
--- 159,165 ----
/* local routines */
static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
+ static void register_unlink(RelFileNode rnode);
static MdfdVec *_fdvec_alloc(void);
#ifndef LET_OS_MANAGE_FILESIZE
***************
*** 188,193 ****
--- 202,208 ----
100L,
&hash_ctl,
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+ pendingUnlinks = NIL;
}
}
***************
*** 257,267 ****
--- 272,300 ----
* If isRedo is true, it's okay for the relation to be already gone.
* Also, any failure should be reported as WARNING not ERROR, because
* we are usually not in a transaction anymore when this is called.
+ *
+ * If isRedo is false, the relation is actually just truncated to release
+ * the space, and left to linger as an empty file until the next checkpoint.
+ * This is to avoid reusing the same relfilenode for a new relation file,
+ * until the commit record containing the deletion has been flushed out.
+ *
+ * The scenario we're trying to protect from is this:
+ * After crash, we replay the commit WAL record of a transaction that
+ * deleted a relation, like DROP TABLE. But before the crash, after deleting
+ * the old relation, we created a new one, and it happened to get the same
+ * relfilenode as the deleted relation (OID must've wrapped around for
+ * that to happen). Now replaying the deletion of the old relation
+ * deletes the new one instead, because they had the same relfilenode.
+ * Normally the new relation would be recreated later in the WAL replay, but
+ * if we relied on fsyncing the file after populating it, like CLUSTER and
+ * CREATE INDEX do, for example, the contents of the file would be lost
+ * forever.
*/
void
mdunlink(RelFileNode rnode, bool isRedo)
{
char *path;
+ int ret;
/*
* We have to clean out any pending fsync requests for the doomed relation,
***************
*** 271,278 ****
path = relpath(rnode);
! /* Delete the first segment, or only segment if not doing segmenting */
! if (unlink(path) < 0)
{
if (!isRedo || errno != ENOENT)
ereport(WARNING,
--- 304,318 ----
path = relpath(rnode);
! /*
! * Delete or truncate the first segment, or only segment if not doing
! * segmenting
! */
! if (!isRedo)
! ret = truncate(path, 0);
! else
! ret = unlink(path);
! if (ret < 0)
{
if (!isRedo || errno != ENOENT)
ereport(WARNING,
***************
*** 316,321 ****
--- 356,364 ----
#endif
pfree(path);
+
+ if (!isRedo)
+ register_unlink(rnode);
}
/*
***************
*** 1096,1114 ****
}
}
/*
* RememberFsyncRequest() -- callback from bgwriter side of fsync request
*
! * We stuff the fsync request into the local hash table for execution
* during the bgwriter's next checkpoint.
*
* The range of possible segment numbers is way less than the range of
* BlockNumber, so we can reserve high values of segno for special purposes.
! * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
! * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
! * a whole database. (These are a tad slow because the hash table has to be
! * searched linearly, but it doesn't seem worth rethinking the table structure
! * for them.)
*/
void
RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
--- 1139,1268 ----
}
}
+
+ /*
+ * register_unlink() -- Schedule a relation to be deleted after next checkpoint
+ */
+ static void
+ register_unlink(RelFileNode rnode)
+ {
+ if (pendingOpsTable)
+ {
+ /* standalone backend or startup process: fsync state is local */
+ RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
+ }
+ else if (IsUnderPostmaster)
+ {
+ /*
+ * Notify the bgwriter about it. If we fail to queue the revoke
+ * message, we have to sleep and try again ... ugly, but hopefully
+ * won't happen often.
+ *
+ * XXX should we just leave the file orphaned instead?
+ */
+ while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
+ pg_usleep(10000L); /* 10 msec seems a good number */
+ }
+ }
+
+ /*
+ * mdcheckpointbegin() -- Do pre-checkpoint work
+ *
+ * To distinguish unlink requests that arrived before this checkpoint
+ * started and those that arrived during the checkpoint, we use a cycle
+ * counter similar to the one we use for fsync requests. That cycle
+ * counter is incremented here.
+ *
+ * This must called *before* RedoRecPtr is determined.
+ */
+ void
+ mdcheckpointbegin()
+ {
+ PendingUnlinkEntry *entry;
+ ListCell *cell;
+
+ /*
+ * Just in case the prior checkpoint failed, stamp all entries in
+ * the list with the current cycle counter. Anything that's in the
+ * list at the start of checkpoint can surely be deleted after the
+ * checkpoint is finished, regardless of when the request was made.
+ */
+ foreach(cell, pendingUnlinks)
+ {
+ entry = lfirst(cell);
+ entry->cycle_ctr = mdunlink_cycle_ctr;
+ }
+
+ /*
+ * Any unlink requests arriving after this point will be assigned the
+ * next cycle counter, and won't be unlinked until next checkpoint.
+ */
+ mdunlink_cycle_ctr++;
+ }
+
+ /*
+ * mdcheckpointend() -- Do post-checkpoint work
+ *
+ * Remove any lingering files that can now be safely removed.
+ */
+ void
+ mdcheckpointend()
+ {
+ while(pendingUnlinks != NIL)
+ {
+ PendingUnlinkEntry *entry = linitial(pendingUnlinks);
+ char *path;
+
+ /*
+ * New entries are appended to the end, so if the entry is new
+ * we've reached the end of old entries.
+ */
+ if (entry->cycle_ctr == mdsync_cycle_ctr)
+ break;
+
+ /* Else assert we haven't missed it */
+ Assert((CycleCtr) (entry->cycle_ctr + 1) == mdunlink_cycle_ctr);
+
+ /* Unlink the file */
+ path = relpath(entry->rnode);
+ if (unlink(path) < 0)
+ {
+ /*
+ * ENOENT shouldn't happen either, but it doesn't really matter
+ * because we would've deleted it now anyway.
+ */
+ if (errno != ENOENT)
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove relation %u/%u/%u: %m",
+ entry->rnode.spcNode,
+ entry->rnode.dbNode,
+ entry->rnode.relNode)));
+ }
+ pfree(path);
+
+ pendingUnlinks = list_delete_first(pendingUnlinks);
+ }
+ }
+
/*
* RememberFsyncRequest() -- callback from bgwriter side of fsync request
*
! * We stuff normal fsync request into the local hash table for execution
* during the bgwriter's next checkpoint.
*
* The range of possible segment numbers is way less than the range of
* BlockNumber, so we can reserve high values of segno for special purposes.
! * We define three:
! * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
! * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
! * - UNLINK_REQUEST is a request to delete a lingering file after next
! * checkpoint. These are stuffed into a linked list separate from
! * the normal fsync requests.
! *
! * (Handling the FORGET_* requests is a tad slow because the hash table has to
! * be searched linearly, but it doesn't seem worth rethinking the table
! * structure for them.)
*/
void
RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
***************
*** 1147,1152 ****
--- 1301,1322 ----
}
}
}
+ else if (segno == UNLINK_RELATION_REQUEST)
+ {
+ /* Unlink request: put it in the linked list */
+ MemoryContext oldcxt;
+ PendingUnlinkEntry *entry;
+
+ oldcxt = MemoryContextSwitchTo(MdCxt);
+
+ entry = palloc(sizeof(PendingUnlinkEntry));
+ memcpy(&entry->rnode, &rnode, sizeof(RelFileNode));
+ entry->cycle_ctr = mdunlink_cycle_ctr;
+
+ pendingUnlinks = lappend(pendingUnlinks, entry);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
else
{
/* Normal case: enter a request to fsync this segment */
Index: src/include/storage/smgr.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/smgr.h,v
retrieving revision 1.59
diff -c -r1.59 smgr.h
*** src/include/storage/smgr.h 5 Sep 2007 18:10:48 -0000 1.59
--- src/include/storage/smgr.h 18 Oct 2007 09:52:43 -0000
***************
*** 110,115 ****
--- 110,118 ----
extern void ForgetRelationFsyncRequests(RelFileNode rnode);
extern void ForgetDatabaseFsyncRequests(Oid dbid);
+ extern void mdcheckpointbegin(void);
+ extern void mdcheckpointend(void);
+
/* smgrtype.c */
extern Datum smgrout(PG_FUNCTION_ARGS);
extern Datum smgrin(PG_FUNCTION_ARGS);
---------------------------(end of broadcast)---------------------------
TIP 7: You can help support the PostgreSQL project by donating at
http://www.postgresql.org/about/donate