Please find attached the latest patch, which incorporates the feedback received.
* changed flag name to --max-table-segment-pages
* added check for amname = "heap"
* switched to using pg_relation_size(c.oid)/current_setting('block_size')::int
when --max-table-segment-pages is set
* added option_parse_uint32(...) to be used for the full range of page numbers
* The COPY SELECTs now use <=, BETWEEN, or >= depending on the segment position (see the example after this list)
* added documentation
* TESTS:
* added simple chunked dump and restore test
* added a WARNING with row count and table data hash to both the source
and the chunked-restore database
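
For illustration only (assuming a hypothetical 7-page heap table
public.tplain dumped with --max-table-segment-pages=3), the data queries
built by dumpTableData_copy() would come out roughly like this, one per
segment:

    COPY (SELECT * FROM public.tplain WHERE ctid <= '(2,32000)') TO stdout;
    COPY (SELECT * FROM public.tplain WHERE ctid BETWEEN '(3,1)' AND '(5,32000)') TO stdout;
    COPY (SELECT * FROM public.tplain WHERE ctid >= '(6,1)') TO stdout;

The first segment covers pages 0..2, the second pages 3..5, and the last one
takes "all the rest" (its endPage is InvalidBlockNumber), so rows in pages
added after relpages was measured are still dumped.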
I left in the boolean indicating whether this is a full table or a chunk
(it was named chunking, now is_segment).
An alternative would be to use an expression like (td->startPage != 0
|| td->endPage != InvalidBlockNumber) wherever td->is_segment is
needed.
If you insist on not having a separate structure member, we could turn
this into something like this:

#define is_segment(td) ((td)->startPage != 0 || (td)->endPage != InvalidBlockNumber)

and then use is_segment(td) instead of td->is_segment where needed.
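
For example, the check in dumpTableData_copy() would then read

    if (tdinfo->filtercond || is_segment(tdinfo) || tbinfo->relkind == RELKIND_FOREIGN_TABLE)

instead of testing tdinfo->is_segment directly (just an illustration of what
the macro version would look like, not part of the attached patch).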
From 11bd8c953968299e62d32ec4ca648077ea8fd8c9 Mon Sep 17 00:00:00 2001
From: Hannu Krosing <[email protected]>
Date: Wed, 21 Jan 2026 13:45:46 +0100
Subject: [PATCH v9] * changed flag name to --max-table-segment-pages * added
 check for amname = "heap" * added simple chunked dump and restore test *
 switched to using
 pg_relation_size(c.oid)/current_setting('block_size')::int when
 --max-table-segment-pages is set * added documentation * added
 option_parse_uint32(...) to be used for the full range of page numbers
* TESTS: added a WARNING with count and table data hash to the source and chunked-restore databases
---
doc/src/sgml/ref/pg_dump.sgml | 24 +++
src/bin/pg_dump/pg_backup.h | 2 +
src/bin/pg_dump/pg_backup_archiver.c | 2 +
src/bin/pg_dump/pg_dump.c | 170 +++++++++++++++++-----
src/bin/pg_dump/pg_dump.h | 7 +
src/bin/pg_dump/t/004_pg_dump_parallel.pl | 52 +++++++
src/fe_utils/option_utils.c | 54 +++++++
src/include/fe_utils/option_utils.h | 3 +
8 files changed, 279 insertions(+), 35 deletions(-)
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 688e23c0e90..1811c67d141 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -1088,6 +1088,30 @@ PostgreSQL documentation
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><option>--max-table-segment-pages=<replaceable class="parameter">npages</replaceable></option></term>
+ <listitem>
+ <para>
+ Dump table data in segments based on the number of pages in the main relation.
+ If the number of data pages in the relation is more than <replaceable class="parameter">npages</replaceable>,
+ the data is split into segments of at most that many pages.
+ Individual segments can be dumped in parallel.
+ </para>
+
+ <note>
+ <para>
+ The option <option>--max-table-segment-pages</option> applies only to pages
+ in the main heap, so if the table has a large TOASTed part this has to be
+ taken into account when deciding on the number of pages to use.
+ In the extreme case a single 8kB heap page can hold about 200 TOAST pointers,
+ each corresponding to up to 1GB of data. If this data is also incompressible,
+ a single-page segment can dump as a 200GB file.
+ </para>
+ </note>
+
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><option>--no-comments</option></term>
<listitem>
diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h
index d9041dad720..b63ae05d895 100644
--- a/src/bin/pg_dump/pg_backup.h
+++ b/src/bin/pg_dump/pg_backup.h
@@ -27,6 +27,7 @@
#include "common/file_utils.h"
#include "fe_utils/simple_list.h"
#include "libpq-fe.h"
+#include "storage/block.h"
typedef enum trivalue
@@ -178,6 +179,7 @@ typedef struct _dumpOptions
bool aclsSkip;
const char *lockWaitTimeout;
int dump_inserts; /* 0 = COPY, otherwise rows per INSERT */
+ BlockNumber max_table_segment_pages; /* chunk when relpages is above this */
/* flags for various command-line long options */
int disable_dollar_quoting;
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 4a63f7392ae..ed1913d66bc 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -44,6 +44,7 @@
#include "pg_backup_archiver.h"
#include "pg_backup_db.h"
#include "pg_backup_utils.h"
+#include "storage/block.h"
#define TEXT_DUMP_HEADER "--\n-- PostgreSQL database dump\n--\n\n"
#define TEXT_DUMPALL_HEADER "--\n-- PostgreSQL database cluster dump\n--\n\n"
@@ -154,6 +155,7 @@ InitDumpOptions(DumpOptions *opts)
opts->dumpSchema = true;
opts->dumpData = true;
opts->dumpStatistics = false;
+ opts->max_table_segment_pages = InvalidBlockNumber;
}
/*
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 687dc98e46d..ca7e9c5eeba 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -539,6 +539,7 @@ main(int argc, char **argv)
{"exclude-extension", required_argument, NULL, 17},
{"sequence-data", no_argument, &dopt.sequence_data, 1},
{"restrict-key", required_argument, NULL, 25},
+ {"max-table-segment-pages", required_argument, NULL, 26},
{NULL, 0, NULL, 0}
};
@@ -803,6 +804,13 @@ main(int argc, char **argv)
dopt.restrict_key = pg_strdup(optarg);
break;
+ case 26:
+ if (!option_parse_uint32(optarg, "--max-table-segment-pages", 1, MaxBlockNumber,
+ &dopt.max_table_segment_pages))
+ exit_nicely(1);
+ pg_log_warning("CHUNKING: set dopt.max_table_segment_pages to [%u]",(BlockNumber) dopt.max_table_segment_pages);
+ break;
+
default:
/* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -1372,6 +1380,9 @@ help(const char *progname)
printf(_(" --extra-float-digits=NUM override default setting for extra_float_digits\n"));
printf(_(" --filter=FILENAME include or exclude objects and data from dump\n"
" based on expressions in FILENAME\n"));
+ printf(_(" --max-table-segment-pages=NUMPAGES\n"
+ " Number of main table pages above which data is \n"
+ " copied out in chunks, also determines the chunk size\n"));
printf(_(" --if-exists use IF EXISTS when dropping objects\n"));
printf(_(" --include-foreign-data=PATTERN\n"
" include data of foreign tables on foreign\n"
@@ -2412,7 +2423,7 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
* a filter condition was specified. For other cases a simple COPY
* suffices.
*/
- if (tdinfo->filtercond || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
+ if (tdinfo->filtercond || tdinfo->is_segment || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
{
/* Temporary allows to access to foreign tables to dump data */
if (tbinfo->relkind == RELKIND_FOREIGN_TABLE)
@@ -2428,9 +2439,25 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
else
appendPQExpBufferStr(q, "* ");
- appendPQExpBuffer(q, "FROM %s %s) TO stdout;",
+ appendPQExpBuffer(q, "FROM %s %s",
fmtQualifiedDumpable(tbinfo),
tdinfo->filtercond ? tdinfo->filtercond : "");
+ if (tdinfo->is_segment)
+ {
+ appendPQExpBufferStr(q, tdinfo->filtercond ? " AND " : " WHERE ");
+ if (tdinfo->startPage == 0)
+ appendPQExpBuffer(q, "ctid <= '(%u,32000)'", /* there is no (*,0) tuple */
+ tdinfo->endPage);
+ else if (tdinfo->endPage != InvalidBlockNumber)
+ appendPQExpBuffer(q, "ctid BETWEEN '(%u,1)' AND '(%u,32000)'", /* there is no (*,0) tuple */
+ tdinfo->startPage, tdinfo->endPage);
+ else
+ appendPQExpBuffer(q, "ctid >= '(%u,1)'", /* there is no (*,0) tuple */
+ tdinfo->startPage);
+ pg_log_warning("CHUNKING: pages [%u:%u]",tdinfo->startPage, tdinfo->endPage);
+ }
+
+ appendPQExpBuffer(q, ") TO stdout;");
}
else
{
@@ -2438,6 +2465,9 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
fmtQualifiedDumpable(tbinfo),
column_list);
}
+
+ pg_log_warning("CHUNKING: data query: %s", q->data);
+
res = ExecuteSqlQuery(fout, q->data, PGRES_COPY_OUT);
PQclear(res);
destroyPQExpBuffer(clistBuf);
@@ -2933,41 +2963,101 @@ dumpTableData(Archive *fout, const TableDataInfo *tdinfo)
{
TocEntry *te;
- te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
- ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
- .namespace = tbinfo->dobj.namespace->dobj.name,
- .owner = tbinfo->rolname,
- .description = "TABLE DATA",
- .section = SECTION_DATA,
- .createStmt = tdDefn,
- .copyStmt = copyStmt,
- .deps = &(tbinfo->dobj.dumpId),
- .nDeps = 1,
- .dumpFn = dumpFn,
- .dumpArg = tdinfo));
-
- /*
- * Set the TocEntry's dataLength in case we are doing a parallel dump
- * and want to order dump jobs by table size. We choose to measure
- * dataLength in table pages (including TOAST pages) during dump, so
- * no scaling is needed.
- *
- * However, relpages is declared as "integer" in pg_class, and hence
- * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
- * Cast so that we get the right interpretation of table sizes
- * exceeding INT_MAX pages.
+ /* Data chunking works off relpages, which is computed exactly using
+ * pg_relation_size() when --max-table-segment-pages is set.
+ *
+ * We also don't chunk if the table access method is not "heap".
+ * TODO: we may add chunking for other access methods later, maybe
+ * based on primary key ranges
*/
- te->dataLength = (BlockNumber) tbinfo->relpages;
- te->dataLength += (BlockNumber) tbinfo->toastpages;
+ if ((BlockNumber) tbinfo->relpages <= dopt->max_table_segment_pages ||
+ strcmp(tbinfo->amname, "heap") != 0)
+ {
+ te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
+ ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+ .namespace = tbinfo->dobj.namespace->dobj.name,
+ .owner = tbinfo->rolname,
+ .description = "TABLE DATA",
+ .section = SECTION_DATA,
+ .createStmt = tdDefn,
+ .copyStmt = copyStmt,
+ .deps = &(tbinfo->dobj.dumpId),
+ .nDeps = 1,
+ .dumpFn = dumpFn,
+ .dumpArg = tdinfo));
- /*
- * If pgoff_t is only 32 bits wide, the above refinement is useless,
- * and instead we'd better worry about integer overflow. Clamp to
- * INT_MAX if the correct result exceeds that.
- */
+ /*
+ * Set the TocEntry's dataLength in case we are doing a parallel dump
+ * and want to order dump jobs by table size. We choose to measure
+ * dataLength in table pages (including TOAST pages) during dump, so
+ * no scaling is needed.
+ *
+ * However, relpages is declared as "integer" in pg_class, and hence
+ * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
+ * Cast so that we get the right interpretation of table sizes
+ * exceeding INT_MAX pages.
+ */
+ te->dataLength = (BlockNumber) tbinfo->relpages;
+ te->dataLength += (BlockNumber) tbinfo->toastpages;
+ }
+ else
+ {
+ BlockNumber current_chunk_start = 0;
+ PQExpBuffer chunk_desc = createPQExpBuffer();
+
+ pg_log_warning("CHUNKING: toc for chunked relpages [%u]",(BlockNumber) tbinfo->relpages);
+
+ while (current_chunk_start < (BlockNumber) tbinfo->relpages)
+ {
+ TableDataInfo *chunk_tdinfo = (TableDataInfo *) pg_malloc(sizeof(TableDataInfo));
+
+ memcpy(chunk_tdinfo, tdinfo, sizeof(TableDataInfo));
+ AssignDumpId(&chunk_tdinfo->dobj);
+ /* addObjectDependency(&chunk_tdinfo->dobj, tbinfo->dobj.dumpId); do we need this here? */
+ chunk_tdinfo->is_segment = true;
+ chunk_tdinfo->startPage = current_chunk_start;
+ chunk_tdinfo->endPage = current_chunk_start + dopt->max_table_segment_pages - 1;
+
+ pg_log_warning("CHUNKING: toc for pages [%u:%u]",chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+ current_chunk_start += dopt->max_table_segment_pages;
+ if (current_chunk_start >= (BlockNumber) tbinfo->relpages)
+ chunk_tdinfo->endPage = InvalidBlockNumber; /* last chunk is for "all the rest" */
+
+ printfPQExpBuffer(chunk_desc, "TABLE DATA (pages %u:%u)", chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+ te = ArchiveEntry(fout, chunk_tdinfo->dobj.catId, chunk_tdinfo->dobj.dumpId,
+ ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+ .namespace = tbinfo->dobj.namespace->dobj.name,
+ .owner = tbinfo->rolname,
+ .description = chunk_desc->data,
+ .section = SECTION_DATA,
+ .createStmt = tdDefn,
+ .copyStmt = copyStmt,
+ .deps = &(tbinfo->dobj.dumpId),
+ .nDeps = 1,
+ .dumpFn = dumpFn,
+ .dumpArg = chunk_tdinfo));
+
+ if (chunk_tdinfo->endPage == InvalidBlockNumber)
+ te->dataLength = (BlockNumber) tbinfo->relpages - chunk_tdinfo->startPage;
+ else
+ te->dataLength = dopt->max_table_segment_pages;
+ /* assume TOAST pages are distributed evenly among chunks */
+ if (tbinfo->relpages)
+ te->dataLength += te->dataLength * tbinfo->toastpages / tbinfo->relpages;
+ }
+
+ destroyPQExpBuffer(chunk_desc);
+ }
+ /*
+ * If pgoff_t is only 32 bits wide, the above refinement is useless,
+ * and instead we'd better worry about integer overflow. Clamp to
+ * INT_MAX if the correct result exceeds that.
+ */
if (sizeof(te->dataLength) == 4 &&
(tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
- te->dataLength < 0))
+ te->dataLength < 0))
te->dataLength = INT_MAX;
}
@@ -3092,6 +3182,9 @@ makeTableDataInfo(DumpOptions *dopt, TableInfo *tbinfo)
tdinfo->dobj.namespace = tbinfo->dobj.namespace;
tdinfo->tdtable = tbinfo;
tdinfo->filtercond = NULL; /* might get set later */
+ tdinfo->is_segment = false; /* we could use (tdinfo->startPage != 0 || tdinfo->endPage != InvalidBlockNumber) */
+ tdinfo->startPage = 0;
+ tdinfo->endPage = InvalidBlockNumber;
addObjectDependency(&tdinfo->dobj, tbinfo->dobj.dumpId);
/* A TableDataInfo contains data, of course */
@@ -7254,8 +7347,15 @@ getTables(Archive *fout, int *numTables)
"c.relnamespace, c.relkind, c.reltype, "
"c.relowner, "
"c.relchecks, "
- "c.relhasindex, c.relhasrules, c.relpages, "
- "c.reltuples, c.relallvisible, ");
+ "c.relhasindex, c.relhasrules, ");
+
+ /* fetch current relation size if chunking is requested */
+ if (dopt->max_table_segment_pages != InvalidBlockNumber)
+ appendPQExpBufferStr(query, "pg_relation_size(c.oid)/current_setting('block_size')::int AS relpages, ");
+ else
+ appendPQExpBufferStr(query, "c.relpages, ");
+
+ appendPQExpBufferStr(query, "c.reltuples, c.relallvisible, ");
if (fout->remoteVersion >= 180000)
appendPQExpBufferStr(query, "c.relallfrozen, ");
diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h
index 4c4b14e5fc7..e362253d4d5 100644
--- a/src/bin/pg_dump/pg_dump.h
+++ b/src/bin/pg_dump/pg_dump.h
@@ -16,6 +16,7 @@
#include "pg_backup.h"
#include "catalog/pg_publication_d.h"
+#include "storage/block.h"
#define oidcmp(x,y) ( ((x) < (y) ? -1 : ((x) > (y)) ? 1 : 0) )
@@ -413,6 +414,12 @@ typedef struct _tableDataInfo
DumpableObject dobj;
TableInfo *tdtable; /* link to table to dump */
char *filtercond; /* WHERE condition to limit rows dumped */
+ bool is_segment; /* true if this is a data segment.
+ * we could use (tdinfo->startPage != 0 ||
+ * tdinfo->endPage != InvalidBlockNumber) */
+ BlockNumber startPage; /* starting table page */
+ BlockNumber endPage; /* ending table page for page-range dump,
+ * normally startPage + max_table_segment_pages - 1 */
} TableDataInfo;
typedef struct _indxInfo
diff --git a/src/bin/pg_dump/t/004_pg_dump_parallel.pl b/src/bin/pg_dump/t/004_pg_dump_parallel.pl
index 738f34b1c1b..88af25d2889 100644
--- a/src/bin/pg_dump/t/004_pg_dump_parallel.pl
+++ b/src/bin/pg_dump/t/004_pg_dump_parallel.pl
@@ -11,6 +11,7 @@ use Test::More;
my $dbname1 = 'regression_src';
my $dbname2 = 'regression_dest1';
my $dbname3 = 'regression_dest2';
+my $dbname4 = 'regression_dest3';
my $node = PostgreSQL::Test::Cluster->new('main');
$node->init;
@@ -21,6 +22,7 @@ my $backupdir = $node->backup_dir;
$node->run_log([ 'createdb', $dbname1 ]);
$node->run_log([ 'createdb', $dbname2 ]);
$node->run_log([ 'createdb', $dbname3 ]);
+$node->run_log([ 'createdb', $dbname4 ]);
$node->safe_psql(
$dbname1,
@@ -44,6 +46,18 @@ create table tht_p1 partition of tht for values with (modulus 3, remainder 0);
create table tht_p2 partition of tht for values with (modulus 3, remainder 1);
create table tht_p3 partition of tht for values with (modulus 3, remainder 2);
insert into tht select (x%10)::text::digit, x from generate_series(1,1000) x;
+
+-- raise a warning so the row count and table data hash can be checked in the .log
+DO \$\$
+DECLARE
+ thash_rec RECORD;
+BEGIN
+ SELECT 'tplain', count(*), sum(hashtext(t::text)) as tablehash
+ INTO thash_rec
+ FROM tplain AS t;
+ RAISE WARNING 'thash: %', thash_rec;
+END;
+\$\$;
});
$node->command_ok(
@@ -87,4 +101,42 @@ $node->command_ok(
],
'parallel restore as inserts');
+$node->command_ok(
+ [
+ 'pg_dump',
+ '--format' => 'directory',
+ '--max-table-segment-pages' => 2,
+ '--no-sync',
+ '--jobs' => 2,
+ '--file' => "$backupdir/dump3",
+ $node->connstr($dbname1),
+ ],
+ 'parallel dump with chunks of two heap pages');
+
+$node->command_ok(
+ [
+ 'pg_restore', '--verbose',
+ '--dbname' => $node->connstr($dbname4),
+ '--jobs' => 3,
+ "$backupdir/dump3",
+ ],
+ 'parallel restore with chunks of two heap pages');
+
+$node->safe_psql(
+ $dbname4,
+ qq{
+
+-- raise a warning so the row count and table data hash can be checked in the .log
+DO \$\$
+DECLARE
+ thash_rec RECORD;
+BEGIN
+ SELECT 'tplain', count(*), sum(hashtext(t::text)) as tablehash
+ INTO thash_rec
+ FROM tplain AS t;
+ RAISE WARNING 'thash after parallel chunked restore: %', thash_rec;
+END;
+\$\$;
+ });
+
done_testing();
diff --git a/src/fe_utils/option_utils.c b/src/fe_utils/option_utils.c
index cc483ae176c..93d58d7e1a9 100644
--- a/src/fe_utils/option_utils.c
+++ b/src/fe_utils/option_utils.c
@@ -83,6 +83,60 @@ option_parse_int(const char *optarg, const char *optname,
return true;
}
+/*
+ * option_parse_uint32
+ *
+ * Parse an unsigned 32-bit integer value for an option. If the parsing is
+ * successful, returns true and stores the result in *result if that's given;
+ * if parsing fails, returns false.
+ */
+bool
+option_parse_uint32(const char *optarg, const char *optname,
+ uint64 min_range, uint64 max_range,
+ uint32 *result)
+{
+ char *endptr;
+ uint64 val64;
+
+ /* Check that there is no minus sign in the value, because strtoull()
+ * silently accepts negative numbers and wraps them around. */
+ for (endptr = unconstify(char *, optarg); *endptr != '\0'; endptr++)
+ if (*endptr == '-')
+ {
+ pg_log_error("negative value \"%s\" for option %s",
+ optarg, optname);
+ return false;
+ }
+
+ errno = 0;
+ val64 = strtoull(optarg, &endptr, 10);
+
+ /*
+ * Skip any trailing whitespace; if anything but whitespace remains before
+ * the terminating character, fail.
+ */
+ while (*endptr != '\0' && isspace((unsigned char) *endptr))
+ endptr++;
+
+ if (*endptr != '\0')
+ {
+ pg_log_error("invalid value \"%s\" for option %s",
+ optarg, optname);
+ return false;
+ }
+
+ if (errno == ERANGE || val64 < min_range || val64 > max_range)
+ {
+ pg_log_error("%s musst be in range %lu..%lu",
+ optname, min_range, max_range);
+ return false;
+ }
+
+ if (result)
+ *result = (uint32) val64;
+ return true;
+}
+
/*
* Provide strictly harmonized handling of the --sync-method option.
*/
diff --git a/src/include/fe_utils/option_utils.h b/src/include/fe_utils/option_utils.h
index 0db6e3b6e91..268590a18bd 100644
--- a/src/include/fe_utils/option_utils.h
+++ b/src/include/fe_utils/option_utils.h
@@ -22,6 +22,9 @@ extern void handle_help_version_opts(int argc, char *argv[],
extern bool option_parse_int(const char *optarg, const char *optname,
int min_range, int max_range,
int *result);
+extern bool option_parse_uint32(const char *optarg, const char *optname,
+ uint64 min_range, uint64 max_range,
+ uint32 *result);
extern bool parse_sync_method(const char *optarg,
DataDirSyncMethod *sync_method);
--
2.43.0