diff -cpr HEAD/src/backend/storage/smgr/md.c fix_mdsync/src/backend/storage/smgr/md.c
*** HEAD/src/backend/storage/smgr/md.c	Mon Jan 22 13:08:11 2007
--- fix_mdsync/src/backend/storage/smgr/md.c	Mon Apr  9 13:27:50 2007
*************** typedef struct
*** 126,135 ****
--- 126,148 ----
  {
  	PendingOperationTag tag;	/* hash table key (must be first!) */
  	int			failures;		/* number of failed attempts to fsync */
+ 								/* or -1 if canceled */
+ 	uint32		counter;		/* counter incremented for each mdsync */
  } PendingOperationEntry;
  
  static HTAB *pendingOpsTable = NULL;
  
+ #define PendingOperationCancel(entry) \
+ 	do { (entry)->failures = -1; } while(0)
+ #define PendingOperationIsCanceled(entry) \
+ 	((entry)->failures < 0)
+ 
+ /*
+  * The mdsync counter will eventually wrap around, so we must use
+  * mdsyncCounterCompare to compare two counters.
+  */
+ static uint32	mdsyncCounter = 0;
+ #define mdsyncCounterCompare(lhs, rhs) ((int32) ((lhs) - (rhs)))
  
  typedef enum					/* behavior for mdopen & _mdfd_getseg */
  {
*************** mdsync(void)
*** 869,874 ****
--- 882,901 ----
  		elog(ERROR, "cannot sync without a pendingOpsTable");
  
  	/*
+ 	 * If we are in the bgwriter, the sync had better include all fsync
+ 	 * requests that were queued by backends before the checkpoint REDO
+ 	 * point was determined. We go that a little better by accepting all
+ 	 * requests queued up to the point where we start fsync'ing.
+ 	 */
+ 	AbsorbFsyncRequests();
+ 
+ 	/*
+ 	 * Increment mdsyncCounter; this pass fsyncs only entries whose counter
+ 	 * value is smaller than the new mdsyncCounter.
+ 	 */
+ 	mdsyncCounter++;
+ 
+ 	/*
  	 * The fsync table could contain requests to fsync relations that have
  	 * been deleted (unlinked) by the time we get to them.  Rather than
  	 * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
*************** mdsync(void)
*** 885,898 ****
  
  		need_retry = false;
  
- 		/*
- 		 * If we are in the bgwriter, the sync had better include all fsync
- 		 * requests that were queued by backends before the checkpoint REDO
- 		 * point was determined. We go that a little better by accepting all
- 		 * requests queued up to the point where we start fsync'ing.
- 		 */
- 		AbsorbFsyncRequests();
- 
  		absorb_counter = FSYNCS_PER_ABSORB;
  		hash_seq_init(&hstat, pendingOpsTable);
  		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
--- 912,917 ----
*************** mdsync(void)
*** 901,923 ****
  			 * If fsync is off then we don't have to bother opening the file
  			 * at all.  (We delay checking until this point so that changing
  			 * fsync on the fly behaves sensibly.)
  			 */
! 			if (enableFsync)
  			{
  				SMgrRelation reln;
  				MdfdVec    *seg;
  
  				/*
  				 * If in bgwriter, we want to absorb pending requests every so
! 				 * often to prevent overflow of the fsync request queue.  This
! 				 * could result in deleting the current entry out from under
! 				 * our hashtable scan, so the procedure is to fall out of the
! 				 * scan and start over from the top of the function.
  				 */
  				if (--absorb_counter <= 0)
  				{
! 					need_retry = true;
! 					break;
  				}
  
  				/*
--- 920,944 ----
  			 * If fsync is off then we don't have to bother opening the file
  			 * at all.  (We delay checking until this point so that changing
  			 * fsync on the fly behaves sensibly.)
+ 			 * Canceled pending operations are likewise just removed.
  			 */
! 			if (enableFsync && !PendingOperationIsCanceled(entry))
  			{
  				SMgrRelation reln;
  				MdfdVec    *seg;
  
+ 				/* Skip entries added after we started this mdsync. */
+ 				if (mdsyncCounterCompare(entry->counter, mdsyncCounter) >= 0)
+ 					continue;
+ 
  				/*
  				 * If in bgwriter, we want to absorb pending requests every so
! 				 * often to prevent overflow of the fsync request queue.
  				 */
  				if (--absorb_counter <= 0)
  				{
! 					AbsorbFsyncRequests();
! 					absorb_counter = FSYNCS_PER_ABSORB;
  				}
  
  				/*
*************** mdsync(void)
*** 983,988 ****
--- 1004,1015 ----
  							HASH_REMOVE, NULL) == NULL)
  				elog(ERROR, "pendingOpsTable corrupted");
  		}
+ 
+ 		/*
+ 		 * Absorb pending requests before the next retry; in particular, we
+ 		 * need to see requests against recently dropped relations.
+ 		 */
+ 		AbsorbFsyncRequests();
  	} while (need_retry);
  }
  
*************** register_dirty_segment(SMgrRelation reln
*** 1032,1037 ****
--- 1059,1066 ----
   * a whole database.  (These are a tad slow because the hash table has to be
   * searched linearly, but it doesn't seem worth rethinking the table structure
   * for them.)
+  * Note that FORGET requests only cancel entries instead of deleting them,
+  * to avoid dangling references in mdsync's sequential hash table scan.
   */
  void
  RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
*************** RememberFsyncRequest(RelFileNode rnode, 
*** 1048,1059 ****
  		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
  		{
  			if (RelFileNodeEquals(entry->tag.rnode, rnode))
! 			{
! 				/* Okay, delete this entry */
! 				if (hash_search(pendingOpsTable, &entry->tag,
! 								HASH_REMOVE, NULL) == NULL)
! 					elog(ERROR, "pendingOpsTable corrupted");
! 			}
  		}
  	}
  	else if (segno == FORGET_DATABASE_FSYNC)
--- 1077,1083 ----
  		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
  		{
  			if (RelFileNodeEquals(entry->tag.rnode, rnode))
! 				PendingOperationCancel(entry);
  		}
  	}
  	else if (segno == FORGET_DATABASE_FSYNC)
*************** RememberFsyncRequest(RelFileNode rnode, 
*** 1066,1077 ****
  		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
  		{
  			if (entry->tag.rnode.dbNode == rnode.dbNode)
! 			{
! 				/* Okay, delete this entry */
! 				if (hash_search(pendingOpsTable, &entry->tag,
! 								HASH_REMOVE, NULL) == NULL)
! 					elog(ERROR, "pendingOpsTable corrupted");
! 			}
  		}
  	}
  	else
--- 1090,1096 ----
  		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
  		{
  			if (entry->tag.rnode.dbNode == rnode.dbNode)
! 				PendingOperationCancel(entry);
  		}
  	}
  	else
*************** RememberFsyncRequest(RelFileNode rnode, 
*** 1090,1097 ****
  													  &key,
  													  HASH_ENTER,
  													  &found);
! 		if (!found)				/* new entry, so initialize it */
  			entry->failures = 0;
  	}
  }
  
--- 1109,1120 ----
  													  &key,
  													  HASH_ENTER,
  													  &found);
! 		if (!found || PendingOperationIsCanceled(entry))
! 		{
! 			/* new or recycled entry, so initialize it */
  			entry->failures = 0;
+ 			entry->counter = mdsyncCounter;
+ 		}
  	}
  }