Here's a fix for the problem that mdsync never finishes on a busy
system. See the original problem description on hackers:
http://archives.postgresql.org/pgsql-hackers/2007-04/msg00259.php
The solution is taken from ITAGAKI Takahiro's Load Distributed
Checkpoint patch. At the beginning of mdsync, the pendingOpsTable is
copied to a linked list, and that list is then processed until it's empty.
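To see why this terminates, here's a small standalone sketch of the same
snapshot-and-drain pattern. It uses a plain array and a hand-rolled
singly-linked list as stand-ins for pendingOpsTable and the backend's List
type, so none of the names below are real md.c, dynahash or pg_list APIs;
it only illustrates that draining a snapshot is bounded work even while new
requests keep arriving:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#define MAX_PENDING 16

/* stand-in for pendingOpsTable: true = an fsync request is pending */
static bool pending[MAX_PENDING];

typedef struct SyncItem
{
	int				 id;
	struct SyncItem *next;
} SyncItem;

/* stand-in for AbsorbFsyncRequests(): a new request may arrive mid-sync */
static void
absorb_requests(int round)
{
	pending[(round * 7) % MAX_PENDING] = true;
}

int
main(void)
{
	SyncItem   *snapshot = NULL;
	int			round = 0;
	int			i;

	/* some requests are queued before the sync starts */
	for (i = 0; i < MAX_PENDING; i += 3)
		pending[i] = true;

	/* step 1: snapshot the pending table into a list */
	for (i = MAX_PENDING - 1; i >= 0; i--)
	{
		if (pending[i])
		{
			SyncItem   *item = malloc(sizeof(SyncItem));

			item->id = i;
			item->next = snapshot;
			snapshot = item;
		}
	}

	/* step 2: drain the snapshot; it only shrinks, so the loop terminates */
	while (snapshot != NULL)
	{
		SyncItem   *item = snapshot;

		/* new requests land in "pending" only, never in the snapshot */
		absorb_requests(round++);

		/* skip entries revoked (e.g. relation dropped) since the snapshot */
		if (pending[item->id])
		{
			printf("fsync'ing segment %d\n", item->id);
			pending[item->id] = false;
		}

		snapshot = item->next;
		free(item);
	}

	return 0;
}

Requests absorbed after the snapshot was taken simply stay in the table and
are picked up by the next mdsync call, which is what bounds the amount of
work done per checkpoint.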
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
Index: src/backend/storage/smgr/md.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/smgr/md.c,v
retrieving revision 1.127
diff -c -r1.127 md.c
*** src/backend/storage/smgr/md.c 17 Jan 2007 16:25:01 -0000 1.127
--- src/backend/storage/smgr/md.c 5 Apr 2007 10:43:56 -0000
***************
*** 863,989 ****
void
mdsync(void)
{
! bool need_retry;
if (!pendingOpsTable)
elog(ERROR, "cannot sync without a pendingOpsTable");
/*
! * The fsync table could contain requests to fsync relations that have
! * been deleted (unlinked) by the time we get to them. Rather than
! * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
! * what we will do is retry the whole process after absorbing fsync
! * request messages again. Since mdunlink() queues a "revoke" message
! * before actually unlinking, the fsync request is guaranteed to be gone
! * the second time if it really was this case. DROP DATABASE likewise
! * has to tell us to forget fsync requests before it starts deletions.
*/
! do {
! HASH_SEQ_STATUS hstat;
! PendingOperationEntry *entry;
! int absorb_counter;
! need_retry = false;
/*
! * If we are in the bgwriter, the sync had better include all fsync
! * requests that were queued by backends before the checkpoint REDO
! * point was determined. We go that a little better by accepting all
! * requests queued up to the point where we start fsync'ing.
*/
AbsorbFsyncRequests();
! absorb_counter = FSYNCS_PER_ABSORB;
! hash_seq_init(&hstat, pendingOpsTable);
! while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
{
! /*
! * If fsync is off then we don't have to bother opening the file
! * at all. (We delay checking until this point so that changing
! * fsync on the fly behaves sensibly.)
! */
! if (enableFsync)
! {
! SMgrRelation reln;
! MdfdVec *seg;
! /*
! * If in bgwriter, we want to absorb pending requests every so
! * often to prevent overflow of the fsync request queue. This
! * could result in deleting the current entry out from under
! * our hashtable scan, so the procedure is to fall out of the
! * scan and start over from the top of the function.
! */
! if (--absorb_counter <= 0)
! {
! need_retry = true;
! break;
! }
! /*
! * Find or create an smgr hash entry for this relation. This
! * may seem a bit unclean -- md calling smgr? But it's really
! * the best solution. It ensures that the open file reference
! * isn't permanently leaked if we get an error here. (You may
! * say "but an unreferenced SMgrRelation is still a leak!" Not
! * really, because the only case in which a checkpoint is done
! * by a process that isn't about to shut down is in the
! * bgwriter, and it will periodically do smgrcloseall(). This
! * fact justifies our not closing the reln in the success path
! * either, which is a good thing since in non-bgwriter cases
! * we couldn't safely do that.) Furthermore, in many cases
! * the relation will have been dirtied through this same smgr
! * relation, and so we can save a file open/close cycle.
! */
! reln = smgropen(entry->tag.rnode);
!
! /*
! * It is possible that the relation has been dropped or
! * truncated since the fsync request was entered. Therefore,
! * allow ENOENT, but only if we didn't fail once already on
! * this file. This applies both during _mdfd_getseg() and
! * during FileSync, since fd.c might have closed the file
! * behind our back.
! */
! seg = _mdfd_getseg(reln,
! entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
! false, EXTENSION_RETURN_NULL);
! if (seg == NULL ||
! FileSync(seg->mdfd_vfd) < 0)
! {
! /*
! * XXX is there any point in allowing more than one try?
! * Don't see one at the moment, but easy to change the
! * test here if so.
! */
! if (!FILE_POSSIBLY_DELETED(errno) ||
! ++(entry->failures) > 1)
! ereport(ERROR,
! (errcode_for_file_access(),
! errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
! entry->tag.segno,
! entry->tag.rnode.spcNode,
! entry->tag.rnode.dbNode,
! entry->tag.rnode.relNode)));
! else
! ereport(DEBUG1,
! (errcode_for_file_access(),
! errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
! entry->tag.segno,
! entry->tag.rnode.spcNode,
! entry->tag.rnode.dbNode,
! entry->tag.rnode.relNode)));
! need_retry = true;
! continue; /* don't delete the hashtable entry */
! }
! }
/* Okay, delete this entry */
if (hash_search(pendingOpsTable, &entry->tag,
HASH_REMOVE, NULL) == NULL)
elog(ERROR, "pendingOpsTable corrupted");
}
! } while (need_retry);
}
/*
--- 863,1012 ----
void
mdsync(void)
{
! HASH_SEQ_STATUS hstat;
! List *syncOps;
! ListCell *cell;
! PendingOperationEntry *entry;
if (!pendingOpsTable)
elog(ERROR, "cannot sync without a pendingOpsTable");
/*
! * If fsync=off, mdsync is a no-op. Just clear the pendingOpsTable.
*/
! if (!enableFsync)
! {
! hash_seq_init(&hstat, pendingOpsTable);
! while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
! {
! if (hash_search(pendingOpsTable, &entry->tag,
! HASH_REMOVE, NULL) == NULL)
! elog(ERROR, "pendingOpsTable corrupted");
! }
! return;
! }
!
! /*
! * If we are in the bgwriter, the sync had better include all fsync
! * requests that were queued by backends before the checkpoint REDO
! * point was determined. We go that a little better by accepting all
! * requests queued up to the point where we start fsync'ing.
! */
! AbsorbFsyncRequests();
!
! /* Take a snapshot of pendingOpsTable into a list.  We can't scan
! * pendingOpsTable directly in the loop below, because the
! * AbsorbFsyncRequests call inside the loop modifies the table.
! */
! syncOps = NULL;
! hash_seq_init(&hstat, pendingOpsTable);
! while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
! {
! PendingOperationEntry *dup;
!
! /* Entries could be deleted in our scan, so copy them. */
! dup = (PendingOperationEntry *) palloc(sizeof(PendingOperationEntry));
! memcpy(dup, entry, sizeof(PendingOperationEntry));
! syncOps = lappend(syncOps, dup);
! }
!
! /* Process fsync requests until the list is empty */
! while ((cell = list_head(syncOps)) != NULL)
! {
! SMgrRelation reln;
! MdfdVec *seg;
! entry = lfirst(cell);
/*
! * If in bgwriter, we want to absorb pending requests every so
! * often to prevent overflow of the fsync request queue.
*/
AbsorbFsyncRequests();
! /* Check that the entry is still in pendingOpsTable.  It could have
! * been removed by a relation or database deletion request absorbed above.
! */
! entry = hash_search(pendingOpsTable, &entry->tag, HASH_FIND, NULL);
! if (entry == NULL)
{
! syncOps = list_delete_cell(syncOps, cell, NULL);
! continue;
! }
! /*
! * Find or create an smgr hash entry for this relation. This
! * may seem a bit unclean -- md calling smgr? But it's really
! * the best solution. It ensures that the open file reference
! * isn't permanently leaked if we get an error here. (You may
! * say "but an unreferenced SMgrRelation is still a leak!" Not
! * really, because the only case in which a checkpoint is done
! * by a process that isn't about to shut down is in the
! * bgwriter, and it will periodically do smgrcloseall(). This
! * fact justifies our not closing the reln in the success path
! * either, which is a good thing since in non-bgwriter cases
! * we couldn't safely do that.) Furthermore, in many cases
! * the relation will have been dirtied through this same smgr
! * relation, and so we can save a file open/close cycle.
! */
! reln = smgropen(entry->tag.rnode);
! /*
! * It is possible that the relation has been dropped or
! * truncated since the fsync request was entered. Therefore,
! * allow ENOENT, but only if we didn't fail once already on
! * this file. This applies both during _mdfd_getseg() and
! * during FileSync, since fd.c might have closed the file
! * behind our back.
! */
! seg = _mdfd_getseg(reln,
! entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
! false, EXTENSION_RETURN_NULL);
! if (seg == NULL || FileSync(seg->mdfd_vfd) < 0)
! {
! /*
! * The fsync table could contain requests to fsync relations that have
! * been deleted (unlinked) by the time we get to them. Rather than
! * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
! * what we will do is retry after absorbing fsync
! * request messages again. Since mdunlink() queues a "revoke" message
! * before actually unlinking, the fsync request is guaranteed to be gone
! * the second time if it really was this case. DROP DATABASE likewise
! * has to tell us to forget fsync requests before it starts deletions.
! *
! * XXX is there any point in allowing more than one try?
! * Don't see one at the moment, but easy to change the
! * test here if so.
! */
! if (!FILE_POSSIBLY_DELETED(errno) ||
! ++(entry->failures) > 1)
! ereport(ERROR,
! (errcode_for_file_access(),
! errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
! entry->tag.segno,
! entry->tag.rnode.spcNode,
! entry->tag.rnode.dbNode,
! entry->tag.rnode.relNode)));
! else
! ereport(DEBUG1,
! (errcode_for_file_access(),
! errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
! entry->tag.segno,
! entry->tag.rnode.spcNode,
! entry->tag.rnode.dbNode,
! entry->tag.rnode.relNode)));
+ }
+ else
+ {
/* Okay, delete this entry */
if (hash_search(pendingOpsTable, &entry->tag,
HASH_REMOVE, NULL) == NULL)
elog(ERROR, "pendingOpsTable corrupted");
+
+ syncOps = list_delete_cell(syncOps, cell, NULL);
}
! }
}
/*