Here's a fix for the problem that mdsync never finishes on a busy
system. See the original problem description on hackers:
http://archives.postgresql.org/pgsql-hackers/2007-04/msg00259.php
The solution is taken from ITAGAKI Takahiro's Load Distributed
Checkpoint patch. At the beginning of mdsync, the pendingOpsTable is
copied to a linked list, and that list is then processed until it's empty.
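To see why this terminates, here's a small standalone sketch of the same
snapshot-and-drain pattern. It uses a plain array and a hand-rolled
singly-linked list as stand-ins for pendingOpsTable and the backend's List
type, so none of the names below are real md.c, dynahash or pg_list APIs;
it only illustrates that draining a snapshot is bounded work even while new
requests keep arriving:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#define MAX_PENDING 16

/* stand-in for pendingOpsTable: true = an fsync request is pending */
static bool pending[MAX_PENDING];

typedef struct SyncItem
{
	int				 id;
	struct SyncItem *next;
} SyncItem;

/* stand-in for AbsorbFsyncRequests(): a new request may arrive mid-sync */
static void
absorb_requests(int round)
{
	pending[(round * 7) % MAX_PENDING] = true;
}

int
main(void)
{
	SyncItem   *snapshot = NULL;
	int			round = 0;
	int			i;

	/* some requests are queued before the sync starts */
	for (i = 0; i < MAX_PENDING; i += 3)
		pending[i] = true;

	/* step 1: snapshot the pending table into a list */
	for (i = MAX_PENDING - 1; i >= 0; i--)
	{
		if (pending[i])
		{
			SyncItem   *item = malloc(sizeof(SyncItem));

			item->id = i;
			item->next = snapshot;
			snapshot = item;
		}
	}

	/* step 2: drain the snapshot; it only shrinks, so the loop terminates */
	while (snapshot != NULL)
	{
		SyncItem   *item = snapshot;

		/* new requests land in "pending" only, never in the snapshot */
		absorb_requests(round++);

		/* skip entries revoked (e.g. relation dropped) since the snapshot */
		if (pending[item->id])
		{
			printf("fsync'ing segment %d\n", item->id);
			pending[item->id] = false;
		}

		snapshot = item->next;
		free(item);
	}

	return 0;
}

Requests absorbed after the snapshot was taken simply stay in the table and
are picked up by the next mdsync call, which is what bounds the amount of
work done per checkpoint.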
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
Index: src/backend/storage/smgr/md.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/smgr/md.c,v
retrieving revision 1.127
diff -c -r1.127 md.c
*** src/backend/storage/smgr/md.c 17 Jan 2007 16:25:01 -0000 1.127
--- src/backend/storage/smgr/md.c 5 Apr 2007 10:43:56 -0000
***************
*** 863,989 ****
void
mdsync(void)
{
! bool need_retry;
if (!pendingOpsTable)
elog(ERROR, "cannot sync without a pendingOpsTable");
/*
! * The fsync table could contain requests to fsync relations that have
! * been deleted (unlinked) by the time we get to them. Rather than
! * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
! * what we will do is retry the whole process after absorbing fsync
! * request messages again. Since mdunlink() queues a "revoke" message
! * before actually unlinking, the fsync request is guaranteed to be gone
! * the second time if it really was this case. DROP DATABASE likewise
! * has to tell us to forget fsync requests before it starts deletions.
*/
! do {
! HASH_SEQ_STATUS hstat;
! PendingOperationEntry *entry;
! int absorb_counter;
! need_retry = false;
/*
! * If we are in the bgwriter, the sync had better include all fsync
! * requests that were queued by backends before the checkpoint REDO
! * point was determined. We go that a little better by accepting all
! * requests queued up to the point where we start fsync'ing.
*/
AbsorbFsyncRequests();
! absorb_counter = FSYNCS_PER_ABSORB;
! hash_seq_init(&hstat, pendingOpsTable);
! while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
{
! /*
! * If fsync is off then we don't have to bother opening the file
! * at all. (We delay checking until this point so that changing
! * fsync on the fly behaves sensibly.)
! */
! if (enableFsync)
! {
! SMgrRelation reln;
! MdfdVec *seg;
! /*
! * If in bgwriter, we want to absorb pending requests every so
! * often to prevent overflow of the fsync request queue. This
! * could result in deleting the current entry out from under
! * our hashtable scan, so the procedure is to fall out of the
! * scan and start over from the top of the function.
! */
! if (--absorb_counter <= 0)
! {
! need_retry = true;
! break;
! }
! /*
! * Find or create an smgr hash entry for this relation. This
! * may seem a bit unclean -- md calling smgr? But it's really
! * the best solution. It ensures that the open file reference
! * isn't permanently leaked if we get an error here. (You may
! * say "but an unreferenced SMgrRelation is still a leak!" Not
! * really, because the only case in which a checkpoint is done
! * by a process that isn't about to shut down is in the
! * bgwriter, and it will periodically do smgrcloseall(). This
! * fact justifies our not closing the reln in the success path
! * either, which is a good thing since in non-bgwriter cases
! * we couldn't safely do that.) Furthermore, in many cases
! * the relation will have been dirtied through this same smgr
! * relation, and so we can save a file open/close cycle.
! */
! reln = smgropen(entry->tag.rnode);
!
! /*
! * It is possible that the relation has been dropped or
! * truncated since the fsync request was entered. Therefore,
! * allow ENOENT, but only if we didn't fail once already on
! * this file. This applies both during _mdfd_getseg() and
! * during FileSync, since fd.c might have closed the file
! * behind our back.
! */
! seg = _mdfd_getseg(reln,
! entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
! false, EXTENSION_RETURN_NULL);
! if (seg == NULL ||
! FileSync(seg->mdfd_vfd) < 0)
! {
! /*
! * XXX is there any point in allowing more than one try?
! * Don't see one at the moment, but easy to change the
! * test here if so.
! */
! if (!FILE_POSSIBLY_DELETED(errno) ||
! ++(entry->failures) > 1)
! ereport(ERROR,
! (errcode_for_file_access(),
! errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
! entry->tag.segno,
! entry->tag.rnode.spcNode,
! entry->tag.rnode.dbNode,
! entry->tag.rnode.relNode)));
! else
! ereport(DEBUG1,
! (errcode_for_file_access(),
! errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
! entry->tag.segno,
! entry->tag.rnode.spcNode,
! entry->tag.rnode.dbNode,
! entry->tag.rnode.relNode)));
! need_retry = true;
! continue; /* don't delete the hashtable entry */
! }
! }
/* Okay, delete this entry */
if (hash_search(pendingOpsTable, &entry->tag,
HASH_REMOVE, NULL) == NULL)
elog(ERROR, "pendingOpsTable corrupted");
}
! } while (need_retry);
}
/*
--- 863,1012 ----
void
mdsync(void)
{
! HASH_SEQ_STATUS hstat;
! List *syncOps;
! ListCell *cell;
! PendingOperationEntry *entry;
if (!pendingOpsTable)
elog(ERROR, "cannot sync without a pendingOpsTable");
/*
! * If fsync=off, mdsync is a no-op. Just clear the pendingOpsTable.
*/
! if (!enableFsync)
! {
! hash_seq_init(&hstat, pendingOpsTable);
! while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
! {
! if (hash_search(pendingOpsTable, &entry->tag,
! HASH_REMOVE, NULL) == NULL)
! elog(ERROR, "pendingOpsTable corrupted");
! }
! return;
! }
!
! /*
! * If we are in the bgwriter, the sync had better include all fsync
! * requests that were queued by backends before the checkpoint REDO
! * point was determined. We go that a little better by accepting all
! * requests queued up to the point where we start fsync'ing.
! */
! AbsorbFsyncRequests();
!
! /* Take a snapshot of pendingOpsTable into a list.  We can't scan
! * pendingOpsTable directly in the loop below, because the
! * AbsorbFsyncRequests call inside the loop modifies the table.
! */
! syncOps = NULL;
! hash_seq_init(&hstat, pendingOpsTable);
! while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
! {
! PendingOperationEntry *dup;
!
! /* Entries could be deleted in our scan, so copy them. */
! dup = (PendingOperationEntry *) palloc(sizeof(PendingOperationEntry));
! memcpy(dup, entry, sizeof(PendingOperationEntry));
! syncOps = lappend(syncOps, dup);
! }
!
! /* Process fsync requests until the list is empty */
! while ((cell = list_head(syncOps)) != NULL)
! {
! SMgrRelation reln;
! MdfdVec *seg;
! entry = lfirst(cell);
/*
! * If in bgwriter, we want to absorb pending requests every so
! * often to prevent overflow of the fsync request queue.
*/
AbsorbFsyncRequests();
! /* Check that the entry is still in pendingOpsTable.  It could have
! * been removed by a relation or database deletion request absorbed above.
! */
! entry = hash_search(pendingOpsTable, &entry->tag, HASH_FIND, NULL);
! if (entry == NULL)
{
! syncOps = list_delete_cell(syncOps, cell, NULL);
! continue;
! }
! /*
! * Find or create an smgr hash entry for this relation. This
! * may seem a bit unclean -- md calling smgr? But it's really
! * the best solution. It ensures that the open file reference
! * isn't permanently leaked if we get an error here. (You may
! * say "but an unreferenced SMgrRelation is still a leak!" Not
! * really, because the only case in which a checkpoint is done
! * by a process that isn't about to shut down is in the
! * bgwriter, and it will periodically do smgrcloseall(). This
! * fact justifies our not closing the reln in the success path
! * either, which is a good thing since in non-bgwriter cases
! * we couldn't safely do that.) Furthermore, in many cases
! * the relation will have been dirtied through this same smgr
! * relation, and so we can save a file open/close cycle.
! */
! reln = smgropen(entry->tag.rnode);
! /*
! * It is possible that the relation has been dropped or
! * truncated since the fsync request was entered. Therefore,
! * allow ENOENT, but only if we didn't fail once already on
! * this file. This applies both during _mdfd_getseg() and
! * during FileSync, since fd.c might have closed the file
! * behind our back.
! */
! seg = _mdfd_getseg(reln,
! entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
! false, EXTENSION_RETURN_NULL);
! if (seg == NULL || FileSync(seg->mdfd_vfd) < 0)
! {
! /*
! * The fsync table could contain requests to fsync relations that have
! * been deleted (unlinked) by the time we get to them. Rather than
! * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
! * what we will do is retry after absorbing fsync
! * request messages again. Since mdunlink() queues a "revoke" message
! * before actually unlinking, the fsync request is guaranteed to be gone
! * the second time if it really was this case. DROP DATABASE likewise
! * has to tell us to forget fsync requests before it starts deletions.
! *
! * XXX is there any point in allowing more than one try?
! * Don't see one at the moment, but easy to change the
! * test here if so.
! */
! if (!FILE_POSSIBLY_DELETED(errno) ||
! ++(entry->failures) > 1)
! ereport(ERROR,
! (errcode_for_file_access(),
! errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
! entry->tag.segno,
! entry->tag.rnode.spcNode,
! entry->tag.rnode.dbNode,
! entry->tag.rnode.relNode)));
! else
! ereport(DEBUG1,
! (errcode_for_file_access(),
! errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
! entry->tag.segno,
! entry->tag.rnode.spcNode,
! entry->tag.rnode.dbNode,
! entry->tag.rnode.relNode)));
+ }
+ else
+ {
/* Okay, delete this entry */
if (hash_search(pendingOpsTable, &entry->tag,
HASH_REMOVE, NULL) == NULL)
elog(ERROR, "pendingOpsTable corrupted");
+
+ syncOps = list_delete_cell(syncOps, cell, NULL);
}
! }
}
/*