On Mon, Jul 14, 2025 at 01:28:03PM -0400, Tom Lane wrote:
> Is it intentional that this does
> 
> +#include "catalog/pg_largeobject_metadata.h"
> +#include "catalog/pg_shdepend.h"
> 
> rather than including the corresponding *_d.h headers?

Nope, that was an oversight.

-- 
nathan
From 99551525801a462ca7f1beaa0096890df8d67c0a Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Mon, 14 Jul 2025 12:47:14 -0500
Subject: [PATCH v5 1/1] pg_upgrade: Use COPY for large object metadata.

Presently, pg_dump generates commands like

    SELECT pg_catalog.lo_create('5432');
    ALTER LARGE OBJECT 5432 OWNER TO alice;
    GRANT SELECT ON LARGE OBJECT 5432 TO bob;

for each large object.  This is particularly slow at restore time,
especially when there are tens or hundreds of millions of large
objects.  From reports and personal experience, such slow restores
are primarily encountered at pg_upgrade time.  This commit teaches
pg_dump to instead dump pg_largeobject_metadata and the
corresponding pg_shdepend rows when in binary upgrade mode, i.e.,
pg_dump now generates commands like

    COPY pg_catalog.pg_largeobject_metadata (oid, lomowner, lomacl) FROM stdin;
    5432        16384   {alice=rw/alice,bob=r/alice}
    \.

    COPY pg_catalog.pg_shdepend (dbid, classid, objid, objsubid, refclassid, refobjid, deptype) FROM stdin;
    5   2613    5432    0       1260    16384   o
    5   2613    5432    0       1260    16385   a
    \.
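
For readers decoding the example rows: classid 2613 is
pg_catalog.pg_largeobject, refclassid 1260 is pg_catalog.pg_authid, and
deptype 'o'/'a' mark owner and ACL dependencies.  Purely as an
illustrative sketch (not part of the patch), such rows can be viewed in
readable form with:

    SELECT classid::regclass, objid, refclassid::regclass,
           refobjid::regrole, deptype
    FROM pg_catalog.pg_shdepend
    WHERE classid = 'pg_largeobject'::regclass;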

Testing indicates the COPY approach can be significantly faster.
To do any better, we'd probably need to find a way to copy/link
pg_largeobject_metadata's files during pg_upgrade, which would be
complicated and limited to upgrades from >= v16 (since commit
7b378237aa changed the storage format for aclitem, which is used
for pg_largeobject_metadata.lomacl).

Note that this change only applies to binary upgrade mode (i.e.,
dumps initiated by pg_upgrade) since it inserts rows directly into
catalogs, bypassing the ordinary privilege checks.  Also, this
optimization can only be used for upgrades from >= v12 because
pg_largeobject_metadata was created WITH OIDS in older versions,
which prevents pg_dump from handling pg_largeobject_metadata.oid
properly.  With some extra effort, it might be possible to support
upgrades from older versions, but the added complexity didn't seem
worth it to support versions that will have been out-of-support for
nearly 3 years by the time this change is released.
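
As a minimal illustration of that limitation (a sketch against a
hypothetical pre-v12 server, where oid is a hidden system column rather
than an ordinary one):

    -- On <v12, oid is not expanded by * and cannot be named in a COPY
    -- column list, so pg_dump has no way to emit it:
    COPY pg_catalog.pg_largeobject_metadata (oid, lomowner, lomacl) TO stdout;
    -- fails: the hidden oid column cannot be referenced here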

The astute hacker may remember that prior to v12, pg_upgrade
copied/linked pg_largeobject_metadata's files (see commit
12a53c732c).  Besides the aforementioned storage format issues,
this approach failed to transfer the relevant pg_shdepend rows, and
pg_dump still had to generate an lo_create() command per large
object so that creating the dependent comments and security labels
worked.  We could perhaps adopt a hybrid approach for upgrades from
v16 and newer (i.e., generate lo_create() commands for each large
object, copy/link pg_largeobject_metadata's files, and COPY the
relevant pg_shdepend rows), but further testing is needed.

Reported-by: Hannu Krosing <han...@google.com>
Suggested-by: Tom Lane <t...@sss.pgh.pa.us>
Reviewed-by: Hannu Krosing <han...@google.com>
Reviewed-by: Nitin Motiani <nitinmoti...@google.com>
Reviewed-by: Tom Lane <t...@sss.pgh.pa.us>
Discussion: https://postgr.es/m/CAMT0RQSS-6qLH%2BzYsOeUbAYhop3wmQTkNmQpo5--QRDUR%2BqYmQ%40mail.gmail.com
---
 src/bin/pg_dump/pg_backup_archiver.c | 15 +++++
 src/bin/pg_dump/pg_dump.c            | 90 ++++++++++++++++++++++++++--
 src/bin/pg_dump/t/002_pg_dump.pl     |  4 +-
 3 files changed, 103 insertions(+), 6 deletions(-)

diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 197c1295d93..30e0da31aa3 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -31,6 +31,8 @@
 #endif
 
 #include "catalog/pg_class_d.h"
+#include "catalog/pg_largeobject_metadata_d.h"
+#include "catalog/pg_shdepend_d.h"
 #include "common/string.h"
 #include "compress_io.h"
 #include "dumputils.h"
@@ -2974,6 +2976,19 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH)
        int                     res = REQ_SCHEMA | REQ_DATA;
        RestoreOptions *ropt = AH->public.ropt;
 
+       /*
+        * For binary upgrade mode, dump pg_largeobject_metadata and the
+        * associated pg_shdepend rows. This is faster to restore than the
+        * equivalent set of large object commands.  We can only do this for
+        * upgrades from v12 and newer; in older versions, pg_largeobject_metadata
+        * was created WITH OIDS, so the OID column is hidden and won't be dumped.
+        */
+       if (ropt->binary_upgrade && AH->public.remoteVersion >= 120000 &&
+               strcmp(te->desc, "TABLE DATA") == 0 &&
+               (te->catalogId.oid == LargeObjectMetadataRelationId ||
+                te->catalogId.oid == SharedDependRelationId))
+               return REQ_DATA;
+
        /* These items are treated specially */
        if (strcmp(te->desc, "ENCODING") == 0 ||
                strcmp(te->desc, "STDSTRINGS") == 0 ||
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 1937997ea67..e1e4710e913 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -49,8 +49,10 @@
 #include "catalog/pg_class_d.h"
 #include "catalog/pg_default_acl_d.h"
 #include "catalog/pg_largeobject_d.h"
+#include "catalog/pg_largeobject_metadata_d.h"
 #include "catalog/pg_proc_d.h"
 #include "catalog/pg_publication_d.h"
+#include "catalog/pg_shdepend_d.h"
 #include "catalog/pg_subscription_d.h"
 #include "catalog/pg_type_d.h"
 #include "common/connect.h"
@@ -209,6 +211,12 @@ static int nbinaryUpgradeClassOids = 0;
 static SequenceItem *sequences = NULL;
 static int     nsequences = 0;
 
+/*
+ * For binary upgrade, the dump ID of pg_largeobject_metadata is saved for use
+ * as a dependency for pg_shdepend and any large object comments/seclabels.
+ */
+static DumpId lo_metadata_dumpId;
+
 /* Maximum number of relations to fetch in a fetchAttributeStats() call. */
 #define MAX_ATTR_STATS_RELS 64
 
@@ -1085,6 +1093,36 @@ main(int argc, char **argv)
        if (!dopt.dumpData && dopt.sequence_data)
                getTableData(&dopt, tblinfo, numTables, RELKIND_SEQUENCE);
 
+       /*
+        * For binary upgrade mode, dump pg_largeobject_metadata and the
+        * associated pg_shdepend rows. This is faster to restore than the
+        * equivalent set of large object commands.  We can only do this for
+        * upgrades from v12 and newer; in older versions, pg_largeobject_metadata
+        * was created WITH OIDS, so the OID column is hidden and won't be dumped.
+        */
+       if (dopt.binary_upgrade && fout->remoteVersion >= 120000)
+       {
+               TableInfo  *lo_metadata = findTableByOid(LargeObjectMetadataRelationId);
+               TableInfo  *shdepend = findTableByOid(SharedDependRelationId);
+
+               makeTableDataInfo(&dopt, lo_metadata);
+               makeTableDataInfo(&dopt, shdepend);
+
+               /*
+                * Save pg_largeobject_metadata's dump ID for use as a dependency on
+                * pg_shdepend and any large object comments/seclabels.
+                */
+               lo_metadata_dumpId = lo_metadata->dataObj->dobj.dumpId;
+               addObjectDependency(&shdepend->dataObj->dobj, lo_metadata_dumpId);
+
+               /*
+                * Only dump large object shdepend rows for this database.
+                */
+               shdepend->dataObj->filtercond = "WHERE classid = 'pg_largeobject'::regclass "
+                       "AND dbid = (SELECT oid FROM pg_database "
+                       "            WHERE datname = current_database())";
+       }
+
        /*
         * In binary-upgrade mode, we do not have to worry about the actual LO
         * data or the associated metadata that resides in the pg_largeobject and
@@ -3924,10 +3962,37 @@ getLOs(Archive *fout)
                 * as it will be copied by pg_upgrade, which simply copies the
                 * pg_largeobject table. We *do* however dump out anything but the
                 * data, as pg_upgrade copies just pg_largeobject, but not
-                * pg_largeobject_metadata, after the dump is restored.
+                * pg_largeobject_metadata, after the dump is restored.  In versions
+                * before v12, this is done via proper large object commands.  In
+                * newer versions, we dump the content of pg_largeobject_metadata and
+                * any associated pg_shdepend rows, which is faster to restore.  (On
+                * <v12, pg_largeobject_metadata was created WITH OIDS, so the OID
+                * column is hidden and won't be dumped.)
                 */
                if (dopt->binary_upgrade)
-                       loinfo->dobj.dump &= ~DUMP_COMPONENT_DATA;
+               {
+                       if (fout->remoteVersion >= 120000)
+                       {
+                               /*
+                                * We should've saved pg_largeobject_metadata's dump ID before
+                                * this point.
+                                */
+                               Assert(lo_metadata_dumpId);
+
+                               loinfo->dobj.dump &= ~(DUMP_COMPONENT_DATA | DUMP_COMPONENT_ACL | DUMP_COMPONENT_DEFINITION);
+
+                               /*
+                                * Mark the large object as dependent on
+                                * pg_largeobject_metadata so that any large object
+                                * comments/seclabels are dumped after it.
+                                */
+                               loinfo->dobj.dependencies = (DumpId *) pg_malloc(sizeof(DumpId));
+                               loinfo->dobj.dependencies[0] = lo_metadata_dumpId;
+                               loinfo->dobj.nDeps = loinfo->dobj.allocDeps = 1;
+                       }
+                       else
+                               loinfo->dobj.dump &= ~DUMP_COMPONENT_DATA;
+               }
 
                /*
                 * Create a "BLOBS" data item for the group, too. This is just a
@@ -9039,8 +9104,20 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
                if (tbinfo->relkind == RELKIND_SEQUENCE)
                        continue;
 
-               /* Don't bother with uninteresting tables, either */
-               if (!tbinfo->interesting)
+               /*
+                * Don't bother with uninteresting tables, either.  For binary
+                * upgrades, this is bypassed for pg_largeobject_metadata and
+                * pg_shdepend so that the column names are collected for the
+                * corresponding COPY commands.  Restoring the data for those catalogs
+                * is faster than restoring the equivalent set of large object
+                * commands.  We can only do this for upgrades from v12 and newer; in
+                * older versions, pg_largeobject_metadata was created WITH OIDS, so
+                * the OID column is hidden and won't be dumped.
+                */
+               if (!tbinfo->interesting &&
+                       !(fout->dopt->binary_upgrade && fout->remoteVersion >= 120000 &&
+                         (tbinfo->dobj.catId.oid == LargeObjectMetadataRelationId ||
+                          tbinfo->dobj.catId.oid == SharedDependRelationId)))
                        continue;
 
                /* OK, we need info for this table */
@@ -9244,7 +9321,10 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
                        pg_fatal("unrecognized table OID %u", attrelid);
                /* cross-check that we only got requested tables */
                if (tbinfo->relkind == RELKIND_SEQUENCE ||
-                       !tbinfo->interesting)
+                       (!tbinfo->interesting &&
+                        !(fout->dopt->binary_upgrade && fout->remoteVersion >= 120000 &&
+                          (tbinfo->dobj.catId.oid == LargeObjectMetadataRelationId ||
+                               tbinfo->dobj.catId.oid == SharedDependRelationId))))
                        pg_fatal("unexpected column data for table \"%s\"",
                                         tbinfo->dobj.name);
 
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 2485d8f360e..d8330e2bd17 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -1087,6 +1087,7 @@ my %tests = (
                        test_schema_plus_large_objects => 1,
                },
                unlike => {
+                       binary_upgrade => 1,
                        no_large_objects => 1,
                        no_owner => 1,
                        schema_only => 1,
@@ -1605,6 +1606,7 @@ my %tests = (
                        test_schema_plus_large_objects => 1,
                },
                unlike => {
+                       binary_upgrade => 1,
                        schema_only => 1,
                        schema_only_with_statistics => 1,
                        no_large_objects => 1,
@@ -4612,9 +4614,9 @@ my %tests = (
                        no_schema => 1,
                        section_data => 1,
                        test_schema_plus_large_objects => 1,
-                       binary_upgrade => 1,
                },
                unlike => {
+                       binary_upgrade => 1,
                        no_large_objects => 1,
                        no_privs => 1,
                        schema_only => 1,
-- 
2.39.5 (Apple Git-154)
