Hi,

attached is a patch that improves performance when dropping multiple
tables within a transaction. Instead of scanning the shared buffers for
each table separately, the patch removes this and evicts all the tables
in a single pass through shared buffers.

Our system creates a lot of "working tables" (even 100.000) and we need
to perform garbage collection (dropping obsolete tables) regularly. This
often took ~ 1 hour, because we're using big AWS instances with lots of
RAM (which tends to be slower than RAM on bare hw). After applying this
patch and dropping tables in groups of 100, the gc runs in less than 4
minutes (i.e. a 15x speed-up).

This is not likely to improve usual performance, but for systems like
ours, this patch is a significant improvement.

kind regards
Tomas
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 993bc49..593c360 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -335,6 +335,9 @@ smgrDoPendingDeletes(bool isCommit)
        PendingRelDelete *pending;
        PendingRelDelete *prev;
        PendingRelDelete *next;
+    
+       SMgrRelation   *srels = palloc(sizeof(SMgrRelation));
+       int                             nrels = 0, i = 0;
 
        prev = NULL;
        for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -358,14 +361,25 @@ smgrDoPendingDeletes(bool isCommit)
                                SMgrRelation srel;
 
                                srel = smgropen(pending->relnode, 
pending->backend);
-                               smgrdounlink(srel, false);
-                               smgrclose(srel);
+
+                               srels = repalloc(srels, sizeof(SMgrRelation) * 
(nrels+1));
+                               srels[nrels++] = srel;
                        }
                        /* must explicitly free the list entry */
                        pfree(pending);
                        /* prev does not change */
                }
        }
+
+       if (nrels > 0) {
+               smgrdounlinkall(srels, nrels, false);
+               
+               for (i = 0; i < nrels; i++)
+                       smgrclose(srels[i]);
+               
+               pfree(srels);
+       }
+
 }
 
 /*
diff --git a/src/backend/storage/buffer/bufmgr.c 
b/src/backend/storage/buffer/bufmgr.c
index a3bf9a4..8238f49 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -108,6 +108,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
 static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
 static void AtProcExit_Buffers(int code, Datum arg);
 
+static int rnode_comparator(const void * p1, const void * p2);
 
 /*
  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -2130,6 +2131,53 @@ DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
 }
 
 /* ---------------------------------------------------------------------
+ *     DropRelFileNodeAllBuffersList
+ *
+ *     This function removes from the buffer pool all the pages of all
+ *     forks of the specified relations.  It's equivalent to calling
+ *     DropRelFileNodeBuffers once per fork with firstDelBlock = 0 for
+ *     each of the relations.
+ * --------------------------------------------------------------------
+ */
+void
+DropRelFileNodeAllBuffersList(RelFileNodeBackend * rnodes, int nnodes)
+{
+       int         i;
+       int         j;
+
+       /* sort the list of rnodes */
+       pg_qsort(rnodes, nnodes, sizeof(RelFileNodeBackend), rnode_comparator);
+
+       /* If it's a local relation, it's localbuf.c's problem. */
+       for (j = 0; j < nnodes; j++) {
+               if (rnodes[j].backend != InvalidBackendId)
+               {
+                       if (rnodes[j].backend == MyBackendId)
+                               DropRelFileNodeAllLocalBuffers(rnodes[j].node);
+               }
+       }
+
+       for (i = 0; i < NBuffers; i++)
+       {
+               volatile BufferDesc *bufHdr = &BufferDescriptors[i];
+
+               RelFileNodeBackend * rnode = bsearch((const void 
*)&(bufHdr->tag.rnode), rnodes,
+                                                                               
                nnodes, sizeof(RelFileNodeBackend),
+                                                                               
                rnode_comparator);
+
+               /* buffer does not belong to any of the relations */
+               if (rnode == NULL)
+                       continue;
+
+               LockBufHdr(bufHdr);
+               if (RelFileNodeEquals(bufHdr->tag.rnode, rnode->node))
+                       InvalidateBuffer(bufHdr);   /* releases spinlock */
+               else
+                       UnlockBufHdr(bufHdr);
+       }
+}
+
+/* ---------------------------------------------------------------------
  *             DropDatabaseBuffers
  *
  *             This function removes all the buffers in the buffer cache for a
@@ -2957,3 +3005,30 @@ local_buffer_write_error_callback(void *arg)
                pfree(path);
        }
 }
+
+/* Used to sort relfilenode array (ordered by [relnode, dbnode, spcnode]),
+ * so that it's suitable for bsearch. */
+static int
+rnode_comparator(const void * p1, const void * p2) {
+   
+       RelFileNodeBackend n1 = *(RelFileNodeBackend *)p1;
+       RelFileNodeBackend n2 = *(RelFileNodeBackend *)p2;
+
+       if (n1.node.relNode < n2.node.relNode)
+               return -1;
+       else if (n1.node.relNode > n2.node.relNode)
+               return 1;
+
+       if (n1.node.dbNode < n2.node.dbNode)
+               return -1;
+       else if (n1.node.dbNode > n2.node.dbNode)
+               return 1;
+
+       if (n1.node.spcNode < n2.node.spcNode)
+               return -1;
+       else if (n1.node.spcNode > n2.node.spcNode)
+               return 1;
+       else
+               return 0;
+
+}
\ No newline at end of file
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 0cec147..4219958 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -384,6 +384,63 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
        (*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
 }
 
+void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo)
+{
+       int i = 0;
+       RelFileNodeBackend rnodes[nrels];
+       ForkNumber  forknum;
+
+       for (i = 0; i < nrels; i++) {
+               RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+               int         which = rels[i]->smgr_which;
+
+               rnodes[i] = rnode;
+
+               /* Close the forks at smgr level */
+               for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+                       (*(smgrsw[which].smgr_close)) (rels[i], forknum);
+       }
+
+       /*
+        * Get rid of any remaining buffers for the relation.  bufmgr will just
+        * drop them without bothering to write the contents.
+        */
+       DropRelFileNodeAllBuffersList(rnodes, nrels);
+
+       /*
+        * It'd be nice to tell the stats collector to forget it immediately, 
too.
+        * But we can't because we don't know the OID (and in cases involving
+        * relfilenode swaps, it's not always clear which table OID to forget,
+        * anyway).
+        */
+
+       /*
+        * Send a shared-inval message to force other backends to close any
+        * dangling smgr references they may have for this rel.  We should do 
this
+        * before starting the actual unlinking, in case we fail partway through
+        * that step.  Note that the sinval message will eventually come back to
+        * this backend, too, and thereby provide a backstop that we closed our
+        * own smgr rel.
+        */
+       for (i = 0; i < nrels; i++) {
+               CacheInvalidateSmgr(rnodes[i]);
+       }
+
+       /*
+        * Delete the physical file(s).
+        *
+        * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+        * ERROR, because we've already decided to commit or abort the current
+        * xact.
+        */
+
+       for (i = 0; i < nrels; i++) {
+               int         which = rels[i]->smgr_which;
+               for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+                       (*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, 
isRedo);
+       }
+}
+
 /*
  *     smgrdounlinkfork() -- Immediately unlink one fork of a relation.
  *
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51eb77b..a29441a 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -189,6 +189,7 @@ extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
                                           ForkNumber forkNum, BlockNumber 
firstDelBlock);
 extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropRelFileNodeAllBuffersList(RelFileNodeBackend * rnodes, int 
nnodes);
 extern void DropDatabaseBuffers(Oid dbid);
 
 #define RelationGetNumberOfBlocks(reln) \
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 3560d53..600ef15 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -81,6 +81,7 @@ extern void smgrcloseall(void);
 extern void smgrclosenode(RelFileNodeBackend rnode);
 extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkall(SMgrRelation * rels, int nrels, bool isRedo);
 extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool 
isRedo);
 extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
                   BlockNumber blocknum, char *buffer, bool skipFsync);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to