I wrote: > I got a bit suspicious of the transient-file mechanism introduced in > commit fba105b1099f4f5fa7283bb17cba6fed2baa8d0c after noticing that > ... > I believe that we probably ought to revert this mechanism entirely, and > build a new implementation based on these concepts: > * An SMgrRelation is transient if and only if it doesn't have an > "owning" relcache entry. Keep a list of all such SmgrRelations, and > close them all at transaction end. (Obviously, an SMgrRelation gets > removed from the list if it acquires an owner mid-transaction.) > * There's no such concept as FD_XACT_TRANSIENT at the fd.c level. > Rather, we close and delete the VFD entry when told to by SmgrRelation > closure.
Attached is a draft patch for that, presented in two parts: the first part just reverts commit fba105b1099f4f5fa7283bb17cba6fed2baa8d0c, and the second part installs the new mechanism. I'm reasonably pleased with the way this turned out; I think it's cleaner as well as more reliable than the previous patch. The list-management code could possibly be replaced with slist once that patch goes in, but since this needs to be back-patched, I didn't consider that for the moment. Comments? regards, tom lane
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 56095b32501efef651b0e650c5938de9b16d46f9..bdcbe47ac9e9f5ce0db5f90b8ddb567bf34ade00 100644 *** a/src/backend/storage/buffer/bufmgr.c --- b/src/backend/storage/buffer/bufmgr.c *************** BufferGetTag(Buffer buffer, RelFileNode *** 1882,1891 **** * written.) * * If the caller has an smgr reference for the buffer's relation, pass it ! * as the second parameter. If not, pass NULL. In the latter case, the ! * relation will be marked as "transient" so that the corresponding ! * kernel-level file descriptors are closed when the current transaction ends, ! * if any. */ static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) --- 1882,1888 ---- * written.) * * If the caller has an smgr reference for the buffer's relation, pass it ! * as the second parameter. If not, pass NULL. */ static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) *************** FlushBuffer(volatile BufferDesc *buf, SM *** 1909,1920 **** errcontext.previous = error_context_stack; error_context_stack = &errcontext; ! /* Find smgr relation for buffer, and mark it as transient */ if (reln == NULL) - { reln = smgropen(buf->tag.rnode, InvalidBackendId); - smgrsettransient(reln); - } TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum, buf->tag.blockNum, --- 1906,1914 ---- errcontext.previous = error_context_stack; error_context_stack = &errcontext; ! /* Find smgr relation for buffer */ if (reln == NULL) reln = smgropen(buf->tag.rnode, InvalidBackendId); TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum, buf->tag.blockNum, diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index fed25fd7e7cdda9aaa2c5b646f2e40cf50f000ed..ecb62ba01aeb968aedab3260d95ce07a44aa61fb 100644 *** a/src/backend/storage/file/fd.c --- b/src/backend/storage/file/fd.c *************** int max_safe_fds = 32; /* default if n *** 126,133 **** /* these are the assigned bits in fdstate below: */ #define FD_TEMPORARY (1 << 0) /* T = delete when closed */ #define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */ - #define FD_XACT_TRANSIENT (1 << 2) /* T = close (not delete) at aoXact, - * but keep VFD */ typedef struct vfd { --- 126,131 ---- *************** static Size SizeVfdCache = 0; *** 158,165 **** */ static int nfile = 0; ! /* True if there are files to close/delete at end of transaction */ ! static bool have_pending_fd_cleanup = false; /* * Tracks the total size of all temporary files. Note: when temp_file_limit --- 156,166 ---- */ static int nfile = 0; ! /* ! * Flag to tell whether it's worth scanning VfdCache looking for temp files ! * to close ! */ ! static bool have_xact_temporary_files = false; /* * Tracks the total size of all temporary files. Note: when temp_file_limit *************** LruDelete(File file) *** 607,613 **** Vfd *vfdP; Assert(file != 0); - Assert(!FileIsNotOpen(file)); DO_DB(elog(LOG, "LruDelete %d (%s)", file, VfdCache[file].fileName)); --- 608,613 ---- *************** OpenTemporaryFile(bool interXact) *** 971,977 **** VfdCache[file].resowner = CurrentResourceOwner; /* ensure cleanup happens at eoxact */ ! have_pending_fd_cleanup = true; } return file; --- 971,977 ---- VfdCache[file].resowner = CurrentResourceOwner; /* ensure cleanup happens at eoxact */ ! have_xact_temporary_files = true; } return file; *************** OpenTemporaryFileInTablespace(Oid tblspc *** 1045,1069 **** } /* - * Set the transient flag on a file - * - * This flag tells CleanupTempFiles to close the kernel-level file descriptor - * (but not the VFD itself) at end of transaction. - */ - void - FileSetTransient(File file) - { - Vfd *vfdP; - - Assert(FileIsValid(file)); - - vfdP = &VfdCache[file]; - vfdP->fdstate |= FD_XACT_TRANSIENT; - - have_pending_fd_cleanup = true; - } - - /* * close a file when done with it */ void --- 1045,1050 ---- *************** AtEOSubXact_Files(bool isCommit, SubTran *** 1863,1871 **** * particularly care which). All still-open per-transaction temporary file * VFDs are closed, which also causes the underlying files to be deleted * (although they should've been closed already by the ResourceOwner ! * cleanup). Transient files have their kernel file descriptors closed. ! * Furthermore, all "allocated" stdio files are closed. We also forget any ! * transaction-local temp tablespace list. */ void AtEOXact_Files(void) --- 1844,1851 ---- * particularly care which). All still-open per-transaction temporary file * VFDs are closed, which also causes the underlying files to be deleted * (although they should've been closed already by the ResourceOwner ! * cleanup). Furthermore, all "allocated" stdio files are closed. We also ! * forget any transaction-local temp tablespace list. */ void AtEOXact_Files(void) *************** AtProcExit_Files(int code, Datum arg) *** 1888,1897 **** } /* ! * General cleanup routine for fd.c. ! * ! * Temporary files are closed, and their underlying files deleted. ! * Transient files are closed. * * isProcExit: if true, this is being called as the backend process is * exiting. If that's the case, we should remove all temporary files; if --- 1868,1874 ---- } /* ! * Close temporary files and delete their underlying files. * * isProcExit: if true, this is being called as the backend process is * exiting. If that's the case, we should remove all temporary files; if *************** CleanupTempFiles(bool isProcExit) *** 1908,1958 **** * Careful here: at proc_exit we need extra cleanup, not just * xact_temporary files. */ ! if (isProcExit || have_pending_fd_cleanup) { Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */ for (i = 1; i < SizeVfdCache; i++) { unsigned short fdstate = VfdCache[i].fdstate; ! if (VfdCache[i].fileName != NULL) { ! if (fdstate & FD_TEMPORARY) ! { ! /* ! * If we're in the process of exiting a backend process, ! * close all temporary files. Otherwise, only close ! * temporary files local to the current transaction. They ! * should be closed by the ResourceOwner mechanism ! * already, so this is just a debugging cross-check. ! */ ! if (isProcExit) ! FileClose(i); ! else if (fdstate & FD_XACT_TEMPORARY) ! { ! elog(WARNING, ! "temporary file %s not closed at end-of-transaction", ! VfdCache[i].fileName); ! FileClose(i); ! } ! } ! else if (fdstate & FD_XACT_TRANSIENT) { ! /* ! * Close the FD, and remove the entry from the LRU ring, ! * but also remove the flag from the VFD. This is to ! * ensure that if the VFD is reused in the future for ! * non-transient access, we don't close it inappropriately ! * then. ! */ ! if (!FileIsNotOpen(i)) ! LruDelete(i); ! VfdCache[i].fdstate &= ~FD_XACT_TRANSIENT; } } } ! have_pending_fd_cleanup = false; } /* Clean up "allocated" stdio files and dirs. */ --- 1885,1919 ---- * Careful here: at proc_exit we need extra cleanup, not just * xact_temporary files. */ ! if (isProcExit || have_xact_temporary_files) { Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */ for (i = 1; i < SizeVfdCache; i++) { unsigned short fdstate = VfdCache[i].fdstate; ! if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL) { ! /* ! * If we're in the process of exiting a backend process, close ! * all temporary files. Otherwise, only close temporary files ! * local to the current transaction. They should be closed by ! * the ResourceOwner mechanism already, so this is just a ! * debugging cross-check. ! */ ! if (isProcExit) ! FileClose(i); ! else if (fdstate & FD_XACT_TEMPORARY) { ! elog(WARNING, ! "temporary file %s not closed at end-of-transaction", ! VfdCache[i].fileName); ! FileClose(i); } } } ! have_xact_temporary_files = false; } /* Clean up "allocated" stdio files and dirs. */ diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 97742b92bb4ffa33db3554658511911955f47550..3f4ab49dab4a5a8d92c9673120f2d4964c29fdac 100644 *** a/src/backend/storage/smgr/md.c --- b/src/backend/storage/smgr/md.c *************** mdcreate(SMgrRelation reln, ForkNumber f *** 300,308 **** pfree(path); - if (reln->smgr_transient) - FileSetTransient(fd); - reln->md_fd[forkNum] = _fdvec_alloc(); reln->md_fd[forkNum]->mdfd_vfd = fd; --- 300,305 ---- *************** mdopen(SMgrRelation reln, ForkNumber for *** 585,593 **** pfree(path); - if (reln->smgr_transient) - FileSetTransient(fd); - reln->md_fd[forknum] = mdfd = _fdvec_alloc(); mdfd->mdfd_vfd = fd; --- 582,587 ---- *************** _mdfd_openseg(SMgrRelation reln, ForkNum *** 1680,1688 **** if (fd < 0) return NULL; - if (reln->smgr_transient) - FileSetTransient(fd); - /* allocate an mdfdvec entry for it */ v = _fdvec_alloc(); --- 1674,1679 ---- diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 0cec1477f3faf6551ad141b5107da1ea70b8b54b..6319d1e8589138be32e498e77cc4ce484211c1a1 100644 *** a/src/backend/storage/smgr/smgr.c --- b/src/backend/storage/smgr/smgr.c *************** smgropen(RelFileNode rnode, BackendId ba *** 163,196 **** reln->smgr_targblock = InvalidBlockNumber; reln->smgr_fsm_nblocks = InvalidBlockNumber; reln->smgr_vm_nblocks = InvalidBlockNumber; - reln->smgr_transient = false; reln->smgr_which = 0; /* we only have md.c at present */ /* mark it not open */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) reln->md_fd[forknum] = NULL; } - else - /* if it was transient before, it no longer is */ - reln->smgr_transient = false; return reln; } /* - * smgrsettransient() -- mark an SMgrRelation object as transaction-bound - * - * The main effect of this is that all opened files are marked to be - * kernel-level closed (but not necessarily VFD-closed) when the current - * transaction ends. - */ - void - smgrsettransient(SMgrRelation reln) - { - reln->smgr_transient = true; - } - - /* * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object * * There can be only one owner at a time; this is sufficient since currently --- 163,179 ---- diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index bad9f10c62ec84351bf191eecde7ad99d433d867..849bb1025d9d483916784378e7beb8370e647e51 100644 *** a/src/include/storage/fd.h --- b/src/include/storage/fd.h *************** extern int max_safe_fds; *** 66,72 **** /* Operations on virtual Files --- equivalent to Unix kernel file ops */ extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode); extern File OpenTemporaryFile(bool interXact); - extern void FileSetTransient(File file); extern void FileClose(File file); extern int FilePrefetch(File file, off_t offset, int amount); extern int FileRead(File file, char *buffer, int amount); --- 66,71 ---- diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 3560d539076da83a3f2897f3109fcabc4b72d5d0..92b9198d5428c1f51617f595d062da1bc8af5106 100644 *** a/src/include/storage/smgr.h --- b/src/include/storage/smgr.h *************** typedef struct SMgrRelationData *** 60,66 **** * submodules. Do not touch them from elsewhere. */ int smgr_which; /* storage manager selector */ - bool smgr_transient; /* T if files are to be closed at EOXact */ /* for md.c; NULL for forks that are not open */ struct _MdfdVec *md_fd[MAX_FORKNUM + 1]; --- 60,65 ---- *************** typedef SMgrRelationData *SMgrRelation; *** 73,79 **** extern void smgrinit(void); extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend); - extern void smgrsettransient(SMgrRelation reln); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln); extern void smgrclose(SMgrRelation reln); --- 72,77 ----
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index e7a6606ec3b33d285ac032c18fb64e557abdf821..c24df3f38c2bbf2e0becfe0394cb92fd6b6566d1 100644 *** a/src/backend/access/transam/xact.c --- b/src/backend/access/transam/xact.c *************** CommitTransaction(void) *** 1949,1955 **** AtEOXact_SPI(true); AtEOXact_on_commit_actions(true); AtEOXact_Namespace(true); ! /* smgrcommit already done */ AtEOXact_Files(); AtEOXact_ComboCid(); AtEOXact_HashTables(true); --- 1949,1955 ---- AtEOXact_SPI(true); AtEOXact_on_commit_actions(true); AtEOXact_Namespace(true); ! AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_ComboCid(); AtEOXact_HashTables(true); *************** PrepareTransaction(void) *** 2202,2208 **** AtEOXact_SPI(true); AtEOXact_on_commit_actions(true); AtEOXact_Namespace(true); ! /* smgrcommit already done */ AtEOXact_Files(); AtEOXact_ComboCid(); AtEOXact_HashTables(true); --- 2202,2208 ---- AtEOXact_SPI(true); AtEOXact_on_commit_actions(true); AtEOXact_Namespace(true); ! AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_ComboCid(); AtEOXact_HashTables(true); *************** AbortTransaction(void) *** 2348,2353 **** --- 2348,2354 ---- AtEOXact_SPI(false); AtEOXact_on_commit_actions(false); AtEOXact_Namespace(false); + AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_ComboCid(); AtEOXact_HashTables(false); diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 748fd85edb0d7175e5ffd99288b92eddd8706d20..709ccf1f2561907f96f1ae2bf8ddc47e326efbb1 100644 *** a/src/backend/postmaster/bgwriter.c --- b/src/backend/postmaster/bgwriter.c *************** BackgroundWriterMain(void) *** 183,188 **** --- 183,189 ---- false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); + AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index c5f32059a792bc407bd01e1fab26c180148ebf3d..18e6a4e8c4be1c1e6041cd854a55f367ffd4ba6e 100644 *** a/src/backend/postmaster/checkpointer.c --- b/src/backend/postmaster/checkpointer.c *************** CheckpointerMain(void) *** 291,296 **** --- 291,297 ---- false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); + AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 43139017c276648ec292a81a4684e605701e2205..c3e15ef7595de7373e319540691304285672c178 100644 *** a/src/backend/postmaster/walwriter.c --- b/src/backend/postmaster/walwriter.c *************** WalWriterMain(void) *** 188,193 **** --- 188,194 ---- false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); + AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 6319d1e8589138be32e498e77cc4ce484211c1a1..5dff8b3702ee64c355a050aa76242b99b6eea151 100644 *** a/src/backend/storage/smgr/smgr.c --- b/src/backend/storage/smgr/smgr.c *************** static const int NSmgr = lengthof(smgrsw *** 76,86 **** --- 76,90 ---- /* * Each backend has a hashtable that stores all extant SMgrRelation objects. + * In addition, "unowned" SMgrRelation objects are chained together in a list. */ static HTAB *SMgrRelationHash = NULL; + static SMgrRelation first_unowned_reln = NULL; + /* local function prototypes */ static void smgrshutdown(int code, Datum arg); + static void remove_from_unowned_list(SMgrRelation reln); /* *************** smgrshutdown(int code, Datum arg) *** 124,130 **** /* * smgropen() -- Return an SMgrRelation object, creating it if need be. * ! * This does not attempt to actually open the object. */ SMgrRelation smgropen(RelFileNode rnode, BackendId backend) --- 128,134 ---- /* * smgropen() -- Return an SMgrRelation object, creating it if need be. * ! * This does not attempt to actually open the underlying file. */ SMgrRelation smgropen(RelFileNode rnode, BackendId backend) *************** smgropen(RelFileNode rnode, BackendId ba *** 144,149 **** --- 148,154 ---- ctl.hash = tag_hash; SMgrRelationHash = hash_create("smgr relation table", 400, &ctl, HASH_ELEM | HASH_FUNCTION); + first_unowned_reln = NULL; } /* Look up or create an entry */ *************** smgropen(RelFileNode rnode, BackendId ba *** 168,173 **** --- 173,182 ---- /* mark it not open */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) reln->md_fd[forknum] = NULL; + + /* place it at head of unowned list (to make smgrsetowner cheap) */ + reln->next_unowned_reln = first_unowned_reln; + first_unowned_reln = reln; } return reln; *************** smgropen(RelFileNode rnode, BackendId ba *** 182,195 **** --- 191,212 ---- void smgrsetowner(SMgrRelation *owner, SMgrRelation reln) { + /* We don't currently support "disowning" an SMgrRelation here */ + Assert(owner != NULL); + /* * First, unhook any old owner. (Normally there shouldn't be any, but it * seems possible that this can happen during swap_relation_files() * depending on the order of processing. It's ok to close the old * relcache entry early in that case.) + * + * If there isn't an old owner, then the reln should be in the unowned + * list, and we need to remove it. */ if (reln->smgr_owner) *(reln->smgr_owner) = NULL; + else + remove_from_unowned_list(reln); /* Now establish the ownership relationship. */ reln->smgr_owner = owner; *************** smgrsetowner(SMgrRelation *owner, SMgrRe *** 197,202 **** --- 214,251 ---- } /* + * remove_from_unowned_list -- unlink an SMgrRelation from the unowned list + * + * If the reln is not present in the list, nothing happens. Typically this + * would be caller error, but there seems no reason to throw an error. + * + * In the worst case this could be rather slow; but in all the cases that seem + * likely to be performance-critical, the reln being sought will actually be + * first in the list. Furthermore, the number of unowned relns touched in any + * one transaction shouldn't be all that high typically. So it doesn't seem + * worth expending the additional space and management logic needed for a + * doubly-linked list. + */ + static void + remove_from_unowned_list(SMgrRelation reln) + { + SMgrRelation *link; + SMgrRelation cur; + + for (link = &first_unowned_reln, cur = *link; + cur != NULL; + link = &cur->next_unowned_reln, cur = *link) + { + if (cur == reln) + { + *link = cur->next_unowned_reln; + cur->next_unowned_reln = NULL; + break; + } + } + } + + /* * smgrexists() -- Does the underlying file for a fork exist? */ bool *************** smgrclose(SMgrRelation reln) *** 219,224 **** --- 268,276 ---- owner = reln->smgr_owner; + if (!owner) + remove_from_unowned_list(reln); + if (hash_search(SMgrRelationHash, (void *) &(reln->smgr_rnode), HASH_REMOVE, NULL) == NULL) *************** smgrpostckpt(void) *** 600,602 **** --- 652,680 ---- (*(smgrsw[i].smgr_post_ckpt)) (); } } + + /* + * AtEOXact_SMgr + * + * This routine is called during transaction commit or abort (it doesn't + * particularly care which). All transient SMgrRelation objects are closed. + * + * We do this as a compromise between wanting transient SMgrRelations to + * live awhile (to amortize the costs of blind writes of multiple blocks) + * and needing them to not live forever (since we're probably holding open + * a kernel file descriptor for the underlying file, and we need to ensure + * that gets closed reasonably soon if the file gets deleted). + */ + void + AtEOXact_SMgr(void) + { + /* + * Zap all unowned SMgrRelations. We rely on smgrclose() to remove each + * one from the list. + */ + while (first_unowned_reln != NULL) + { + Assert(first_unowned_reln->smgr_owner == NULL); + smgrclose(first_unowned_reln); + } + } diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 92b9198d5428c1f51617f595d062da1bc8af5106..9eccf7671ed99be4c068e3df50a75c21c1ce9e51 100644 *** a/src/include/storage/smgr.h --- b/src/include/storage/smgr.h *************** *** 33,38 **** --- 33,41 ---- * without having to make the smgr explicitly aware of relcache. There * can't be more than one "owner" pointer per SMgrRelation, but that's * all we need. + * + * SMgrRelations that do not have an "owner" are considered to be transient, + * and are deleted at end of transaction. */ typedef struct SMgrRelationData { *************** typedef struct SMgrRelationData *** 63,68 **** --- 66,74 ---- /* for md.c; NULL for forks that are not open */ struct _MdfdVec *md_fd[MAX_FORKNUM + 1]; + + /* if unowned, list link in list of all unowned SMgrRelations */ + struct SMgrRelationData *next_unowned_reln; } SMgrRelationData; typedef SMgrRelationData *SMgrRelation; *************** extern void smgrimmedsync(SMgrRelation r *** 95,100 **** --- 101,107 ---- extern void smgrpreckpt(void); extern void smgrsync(void); extern void smgrpostckpt(void); + extern void AtEOXact_SMgr(void); /* internals: move me elsewhere -- ay 7/94 */
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers