Here's an updated version of the patch. There was a bogus assertion in
the previous one, comparing against mdsync_cycle_ctr instead of
mdunlink_cycle_ctr.

Heikki Linnakangas wrote:
> Tom Lane wrote:
>> Heikki Linnakangas <[EMAIL PROTECTED]> writes:
>>> The best I can think of is to rename the obsolete file to
>>> <relfilenode>.stale, when it's scheduled for deletion at next
>>> checkpoint, and check for .stale-suffixed files in GetNewRelFileNode,
>>> and delete them immediately in DropTableSpace.
>> This is getting too Rube Goldbergian for my tastes.  What if we just
>> make DROP TABLESPACE force a checkpoint before proceeding?
> 
> Patch attached.
> 
> The scenario we're preventing is still possible if for some reason the
> latest checkpoint record is damaged, and we start recovery from the
> previous checkpoint record. I think the probability of that happening,
> together with the OID wrap-around and hitting the relfilenode of a
> recently deleted file with a new one, is low enough to not worry about.
> If we cared, we could fix it by letting the files to linger for two
> checkpoint cycles instead of one.

-- 
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.286
diff -c -r1.286 xlog.c
*** src/backend/access/transam/xlog.c	12 Oct 2007 19:39:59 -0000	1.286
--- src/backend/access/transam/xlog.c	18 Oct 2007 20:16:56 -0000
***************
*** 45,50 ****
--- 45,51 ----
  #include "storage/fd.h"
  #include "storage/pmsignal.h"
  #include "storage/procarray.h"
+ #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/builtins.h"
  #include "utils/pg_locale.h"
***************
*** 5668,5673 ****
--- 5669,5680 ----
  	checkPoint.time = time(NULL);
  
  	/*
+ 	 * Let the md storage manager to prepare for checkpoint before
+ 	 * we determine the REDO pointer.
+ 	 */
+ 	mdcheckpointbegin();
+ 
+ 	/*
  	 * We must hold WALInsertLock while examining insert state to determine
  	 * the checkpoint REDO pointer.
  	 */
***************
*** 5887,5892 ****
--- 5894,5904 ----
  	END_CRIT_SECTION();
  
  	/*
+ 	 * Let the md storage manager to do its post-checkpoint work.
+ 	 */
+ 	mdcheckpointend();
+ 
+ 	/*
  	 * Delete old log files (those no longer needed even for previous
  	 * checkpoint).
  	 */
Index: src/backend/commands/tablespace.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/tablespace.c,v
retrieving revision 1.49
diff -c -r1.49 tablespace.c
*** src/backend/commands/tablespace.c	1 Aug 2007 22:45:08 -0000	1.49
--- src/backend/commands/tablespace.c	18 Oct 2007 20:31:53 -0000
***************
*** 460,472 ****
  	LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
  
  	/*
! 	 * Try to remove the physical infrastructure
  	 */
  	if (!remove_tablespace_directories(tablespaceoid, false))
! 		ereport(ERROR,
! 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
! 				 errmsg("tablespace \"%s\" is not empty",
! 						tablespacename)));
  
  	/* Record the filesystem change in XLOG */
  	{
--- 460,488 ----
  	LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
  
  	/*
! 	 * Try to remove the physical infrastructure. 
  	 */
  	if (!remove_tablespace_directories(tablespaceoid, false))
! 	{
! 		/*
! 		 * There can be lingering empty files in the directories, left behind
! 		 * by for example DROP TABLE, that have been scheduled for deletion
! 		 * at next checkpoint (see comments in mdunlink() for details). We 
! 		 * could just delete them immediately, but we can't tell them apart
! 		 * from important data files that we mustn't delete. So instead, we
! 		 * force a checkpoint which will clean out any lingering files, and
! 		 * try again.
! 		 */
! 		RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
! 		if (!remove_tablespace_directories(tablespaceoid, false))
! 		{
! 			/* Still not empty, the files must be important then */
! 			ereport(ERROR,
! 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
! 					 errmsg("tablespace \"%s\" is not empty",
! 							tablespacename)));
! 		}
! 	}
  
  	/* Record the filesystem change in XLOG */
  	{
Index: src/backend/storage/smgr/md.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/smgr/md.c,v
retrieving revision 1.129
diff -c -r1.129 md.c
*** src/backend/storage/smgr/md.c	3 Jul 2007 14:51:24 -0000	1.129
--- src/backend/storage/smgr/md.c	20 Oct 2007 14:10:08 -0000
***************
*** 34,39 ****
--- 34,40 ----
  /* special values for the segno arg to RememberFsyncRequest */
  #define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
  #define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
+ #define UNLINK_RELATION_REQUEST	(InvalidBlockNumber-2)
  
  /*
   * On Windows, we have to interpret EACCES as possibly meaning the same as
***************
*** 113,118 ****
--- 114,123 ----
   * table remembers the pending operations.	We use a hash table mostly as
   * a convenient way of eliminating duplicate requests.
   *
+  * We use a similar mechanism to remember no-longer-needed files that can
+  * be deleted after next checkpoint, but we use a linked list instead of hash
+  * table, because we don't expect there to be any duplicates.
+  *
   * (Regular backends do not track pending operations locally, but forward
   * them to the bgwriter.)
   */
***************
*** 131,139 ****
--- 136,152 ----
  	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr when request was made */
  } PendingOperationEntry;
  
+ typedef struct
+ {
+ 	RelFileNode rnode;			/* the dead relation to delete */
+ 	CycleCtr cycle_ctr;			/* mdunlink_cycle_ctr when request was made */
+ } PendingUnlinkEntry;
+ 
  static HTAB *pendingOpsTable = NULL;
+ static List *pendingUnlinks = NIL;
  
  static CycleCtr mdsync_cycle_ctr = 0;
+ static CycleCtr mdunlink_cycle_ctr = 0;
  
  
  typedef enum					/* behavior for mdopen & _mdfd_getseg */
***************
*** 146,151 ****
--- 159,165 ----
  /* local routines */
  static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
  static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
+ static void register_unlink(RelFileNode rnode);
  static MdfdVec *_fdvec_alloc(void);
  
  #ifndef LET_OS_MANAGE_FILESIZE
***************
*** 188,193 ****
--- 202,208 ----
  									  100L,
  									  &hash_ctl,
  								   HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+ 		pendingUnlinks = NIL;
  	}
  }
  
***************
*** 257,267 ****
--- 272,300 ----
   * If isRedo is true, it's okay for the relation to be already gone.
   * Also, any failure should be reported as WARNING not ERROR, because
   * we are usually not in a transaction anymore when this is called.
+  *
+  * If isRedo is false, the relation is actually just truncated to release
+  * the space, and left to linger as an empty file until the next checkpoint.
+  * This is to avoid reusing the same relfilenode for a new relation file,
+  * until the commit record containing the deletion has been flushed out. 
+  *
+  * The scenario we're trying to protect from is this:
+  * After crash, we replay the commit WAL record of a transaction that
+  * deleted a relation, like DROP TABLE. But before the crash, after deleting
+  * the old relation, we created a new one, and it happened to get the same
+  * relfilenode as the deleted relation (OID must've wrapped around for
+  * that to happen). Now replaying the deletion of the old relation
+  * deletes the new one instead, because they had the same relfilenode.
+  * Normally the new relation would be recreated later in the WAL replay, but
+  * if we relied on fsyncing the file after populating it,  like CLUSTER and
+  * CREATE INDEX do, for example, the contents of the file would be lost
+  * forever.
   */
  void
  mdunlink(RelFileNode rnode, bool isRedo)
  {
  	char	   *path;
+ 	int ret;
  
  	/*
  	 * We have to clean out any pending fsync requests for the doomed relation,
***************
*** 271,278 ****
  
  	path = relpath(rnode);
  
! 	/* Delete the first segment, or only segment if not doing segmenting */
! 	if (unlink(path) < 0)
  	{
  		if (!isRedo || errno != ENOENT)
  			ereport(WARNING,
--- 304,318 ----
  
  	path = relpath(rnode);
  
! 	/*
! 	 * Delete or truncate the first segment, or only segment if not doing
! 	 * segmenting 
! 	 */
! 	if (!isRedo)
! 		ret = truncate(path, 0);
! 	else
! 		ret = unlink(path);
! 	if (ret < 0)
  	{
  		if (!isRedo || errno != ENOENT)
  			ereport(WARNING,
***************
*** 316,321 ****
--- 356,364 ----
  #endif
  
  	pfree(path);
+ 
+ 	if (!isRedo)
+ 		register_unlink(rnode);
  }
  
  /*
***************
*** 1096,1114 ****
  	}
  }
  
  /*
   * RememberFsyncRequest() -- callback from bgwriter side of fsync request
   *
!  * We stuff the fsync request into the local hash table for execution
   * during the bgwriter's next checkpoint.
   *
   * The range of possible segment numbers is way less than the range of
   * BlockNumber, so we can reserve high values of segno for special purposes.
!  * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
!  * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
!  * a whole database.  (These are a tad slow because the hash table has to be
!  * searched linearly, but it doesn't seem worth rethinking the table structure
!  * for them.)
   */
  void
  RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
--- 1139,1268 ----
  	}
  }
  
+ 
+ /*
+  * register_unlink() -- Schedule a relation to be deleted after next checkpoint
+  */
+ static void
+ register_unlink(RelFileNode rnode)
+ {
+ 	if (pendingOpsTable)
+ 	{
+ 		/* standalone backend or startup process: fsync state is local */
+ 		RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
+ 	}
+ 	else if (IsUnderPostmaster)
+ 	{
+ 		/*
+ 		 * Notify the bgwriter about it.  If we fail to queue the revoke
+ 		 * message, we have to sleep and try again ... ugly, but hopefully
+ 		 * won't happen often.
+ 		 *
+ 		 * XXX should we just leave the file orphaned instead? 
+ 		 */
+ 		while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
+ 			pg_usleep(10000L);	/* 10 msec seems a good number */
+ 	}
+ }
+ 
+ /*
+  * mdcheckpointbegin() -- Do pre-checkpoint work
+  *
+  * To distinguish unlink requests that arrived before this checkpoint
+  * started and those that arrived during the checkpoint, we use a cycle
+  * counter similar to the one we use for fsync requests. That cycle
+  * counter is incremented here.
+  *
+  * This must called *before* RedoRecPtr is determined.
+  */
+ void
+ mdcheckpointbegin()
+ {
+ 	PendingUnlinkEntry *entry;
+ 	ListCell *cell;
+ 
+ 	/*
+ 	 * Just in case the prior checkpoint failed, stamp all entries in
+ 	 * the list with the current cycle counter. Anything that's in the
+ 	 * list at the start of checkpoint can surely be deleted after the 
+ 	 * checkpoint is finished, regardless of when the request was made.
+ 	 */
+ 	foreach(cell, pendingUnlinks)
+ 	{
+ 		entry = lfirst(cell);
+ 		entry->cycle_ctr = mdunlink_cycle_ctr;
+ 	}
+ 
+ 	/*
+ 	 * Any unlink requests arriving after this point will be assigned the
+ 	 * next cycle counter, and won't be unlinked until next checkpoint.
+ 	 */
+ 	mdunlink_cycle_ctr++;
+ }
+ 
+ /*
+  * mdcheckpointend() -- Do post-checkpoint work
+  *
+  * Remove any lingering files that can now be safely removed.
+  */
+ void
+ mdcheckpointend()
+ {
+ 	while(pendingUnlinks != NIL)
+ 	{
+ 		PendingUnlinkEntry *entry = linitial(pendingUnlinks);
+ 		char *path;
+ 
+ 		/*
+ 		 * New entries are appended to the end, so if the entry is new 
+ 		 * we've reached the end of old entries.
+ 		 */
+ 		if (entry->cycle_ctr == mdsync_cycle_ctr)
+ 			break;
+ 
+ 		/* Else assert we haven't missed it */
+ 		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdunlink_cycle_ctr);
+ 
+ 		/* Unlink the file */
+ 		path = relpath(entry->rnode);
+ 		if (unlink(path) < 0)
+ 		{
+ 			/*
+ 			 * ENOENT shouldn't happen either, but it doesn't really matter
+ 			 * because we would've deleted it now anyway.
+ 			 */
+ 			if (errno != ENOENT)
+ 				ereport(WARNING,
+ 						(errcode_for_file_access(),
+ 						 errmsg("could not remove relation %u/%u/%u: %m",
+ 								entry->rnode.spcNode,
+ 								entry->rnode.dbNode,
+ 								entry->rnode.relNode)));
+ 		}
+ 		pfree(path);
+ 
+ 		pendingUnlinks = list_delete_first(pendingUnlinks);
+ 	}
+ }
+ 
  /*
   * RememberFsyncRequest() -- callback from bgwriter side of fsync request
   *
!  * We stuff normal fsync request into the local hash table for execution
   * during the bgwriter's next checkpoint.
   *
   * The range of possible segment numbers is way less than the range of
   * BlockNumber, so we can reserve high values of segno for special purposes.
!  * We define three: 
!  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
!  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
!  * - UNLINK_REQUEST is a request to delete a lingering file after next 
!  *   checkpoint. These are stuffed into a linked list separate from
!  *   the normal fsync requests.
!  *
!  * (Handling the FORGET_* requests is a tad slow because the hash table has to
!  * be searched linearly, but it doesn't seem worth rethinking the table
!  * structure for them.)
   */
  void
  RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
***************
*** 1147,1152 ****
--- 1301,1322 ----
  			}
  		}
  	}
+ 	else if (segno == UNLINK_RELATION_REQUEST)
+ 	{
+ 		/* Unlink request: put it in the linked list */
+ 		MemoryContext oldcxt;
+ 		PendingUnlinkEntry *entry;
+ 
+ 		oldcxt = MemoryContextSwitchTo(MdCxt);
+ 
+ 		entry = palloc(sizeof(PendingUnlinkEntry));
+ 		memcpy(&entry->rnode, &rnode, sizeof(RelFileNode));
+ 		entry->cycle_ctr = mdunlink_cycle_ctr;
+ 
+ 		pendingUnlinks = lappend(pendingUnlinks, entry);
+ 
+ 		MemoryContextSwitchTo(oldcxt);
+ 	}
  	else
  	{
  		/* Normal case: enter a request to fsync this segment */
Index: src/include/storage/smgr.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/smgr.h,v
retrieving revision 1.59
diff -c -r1.59 smgr.h
*** src/include/storage/smgr.h	5 Sep 2007 18:10:48 -0000	1.59
--- src/include/storage/smgr.h	18 Oct 2007 09:52:43 -0000
***************
*** 110,115 ****
--- 110,118 ----
  extern void ForgetRelationFsyncRequests(RelFileNode rnode);
  extern void ForgetDatabaseFsyncRequests(Oid dbid);
  
+ extern void mdcheckpointbegin(void);
+ extern void mdcheckpointend(void);
+ 
  /* smgrtype.c */
  extern Datum smgrout(PG_FUNCTION_ARGS);
  extern Datum smgrin(PG_FUNCTION_ARGS);
---------------------------(end of broadcast)---------------------------
TIP 7: You can help support the PostgreSQL project by donating at

                http://www.postgresql.org/about/donate

Reply via email to