On Mon, Aug 5, 2024 at 10:29 PM Robert Haas <robertmh...@gmail.com> wrote:
>
> On Fri, Aug 2, 2024 at 7:43 AM Amul Sul <sula...@gmail.com> wrote:
> > Please consider the attached version for the review.
>
> Thanks. I committed 0001-0003. The only thing that I changed was that
> in 0001, you forgot to pgindent, which actually mattered quite a bit,
> because astreamer is one character shorter than bbstreamer.
>

Understood. Thanks for tidying up and committing the patches.

> Before we proceed with the rest of this patch series, I think we
> should fix up the comments for some of the astreamer files. Proposed
> patch for that attached; please review.
>

Looks good to me, except for the following typo that I have fixed in
the attached version:

s/astreamer_plan_writer/astreamer_plain_writer/

> I also noticed that cfbot was unhappy about this patch set:
>
> [10:37:55.075] pg_verifybackup.c:100:7: error: no previous extern
> declaration for non-static variable 'format'
> [-Werror,-Wmissing-variable-declarations]
> [10:37:55.075] char            format = '\0';          /* p(lain)/t(ar) */
> [10:37:55.075]                 ^
> [10:37:55.075] pg_verifybackup.c:100:1: note: declare 'static' if the
> variable is not intended to be used outside of this translation unit
> [10:37:55.075] char            format = '\0';          /* p(lain)/t(ar) */
> [10:37:55.075] ^
> [10:37:55.075] pg_verifybackup.c:101:23: error: no previous extern
> declaration for non-static variable 'compress_algorithm'
> [-Werror,-Wmissing-variable-declarations]
> [10:37:55.075] pg_compress_algorithm compress_algorithm = PG_COMPRESSION_NONE;
> [10:37:55.075]                       ^
> [10:37:55.075] pg_verifybackup.c:101:1: note: declare 'static' if the
> variable is not intended to be used outside of this translation unit
> [10:37:55.075] pg_compress_algorithm compress_algorithm = PG_COMPRESSION_NONE;
> [10:37:55.075] ^
> [10:37:55.075] 2 errors generated.
>

Fixed in the attached version.

> Please fix and, after posting future versions of the patch set, try to
> remember to check http://cfbot.cputube.org/amul-sul.html

Sure. I used to rely on that earlier, but after Cirrus CI in the
GitHub repo, I assumed the workflow would be the same as cfbot and
started overlooking it. However, cfbot reported a warning that didn't
appear in my GitHub run. From now on, I'll make sure to check cfbot as
well.

Regards,
Amul
From 975b994d8ca76f3c16b7eb9119e45ec25dba410b Mon Sep 17 00:00:00 2001
From: Amul Sul <amul.sul@enterprisedb.com>
Date: Tue, 2 Jul 2024 10:26:35 +0530
Subject: [PATCH v7 10/12] pg_verifybackup: Add backup format and compression
 option

----
NOTE: This patch is not meant to be committed separately. It should
be squashed with the next patch that implements tar format support.
----
---
 src/bin/pg_verifybackup/pg_verifybackup.c | 146 +++++++++++++++++++++-
 1 file changed, 144 insertions(+), 2 deletions(-)

diff --git a/src/bin/pg_verifybackup/pg_verifybackup.c b/src/bin/pg_verifybackup/pg_verifybackup.c
index 801e13886c2..f20b6e2895c 100644
--- a/src/bin/pg_verifybackup/pg_verifybackup.c
+++ b/src/bin/pg_verifybackup/pg_verifybackup.c
@@ -18,6 +18,7 @@
 #include <sys/stat.h>
 #include <time.h>
 
+#include "common/compression.h"
 #include "common/parse_manifest.h"
 #include "fe_utils/simple_list.h"
 #include "getopt_long.h"
@@ -56,6 +57,8 @@ static void report_manifest_error(JsonManifestParseContext *context,
 								  const char *fmt,...)
 			pg_attribute_printf(2, 3) pg_attribute_noreturn();
 
+static char find_backup_format(verifier_context *context);
+static pg_compress_algorithm find_backup_compression(verifier_context *context);
 static void verify_backup_directory(verifier_context *context,
 									char *relpath, char *fullpath);
 static void verify_backup_file(verifier_context *context,
@@ -74,6 +77,9 @@ static void usage(void);
 
 static const char *progname;
 
+static char format = '\0';		/* p(lain)/t(ar) */
+static pg_compress_algorithm compress_algorithm = PG_COMPRESSION_NONE;
+
 /*
  * Main entry point.
  */
@@ -84,11 +90,13 @@ main(int argc, char **argv)
 		{"exit-on-error", no_argument, NULL, 'e'},
 		{"ignore", required_argument, NULL, 'i'},
 		{"manifest-path", required_argument, NULL, 'm'},
+		{"format", required_argument, NULL, 'F'},
 		{"no-parse-wal", no_argument, NULL, 'n'},
 		{"progress", no_argument, NULL, 'P'},
 		{"quiet", no_argument, NULL, 'q'},
 		{"skip-checksums", no_argument, NULL, 's'},
 		{"wal-directory", required_argument, NULL, 'w'},
+		{"compress", required_argument, NULL, 'Z'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -99,6 +107,7 @@ main(int argc, char **argv)
 	bool		quiet = false;
 	char	   *wal_directory = NULL;
 	char	   *pg_waldump_path = NULL;
+	bool		tar_compression_specified = false;
 
 	pg_logging_init(argv[0]);
 	set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_verifybackup"));
@@ -141,7 +150,7 @@ main(int argc, char **argv)
 	simple_string_list_append(&context.ignore_list, "recovery.signal");
 	simple_string_list_append(&context.ignore_list, "standby.signal");
 
-	while ((c = getopt_long(argc, argv, "ei:m:nPqsw:", long_options, NULL)) != -1)
+	while ((c = getopt_long(argc, argv, "eF:i:m:nPqsw:Z:", long_options, NULL)) != -1)
 	{
 		switch (c)
 		{
@@ -160,6 +169,15 @@ main(int argc, char **argv)
 				manifest_path = pstrdup(optarg);
 				canonicalize_path(manifest_path);
 				break;
+			case 'F':
+				if (strcmp(optarg, "p") == 0 || strcmp(optarg, "plain") == 0)
+					format = 'p';
+				else if (strcmp(optarg, "t") == 0 || strcmp(optarg, "tar") == 0)
+					format = 't';
+				else
+					pg_fatal("invalid backup format \"%s\", must be \"plain\" or \"tar\"",
+							 optarg);
+				break;
 			case 'n':
 				no_parse_wal = true;
 				break;
@@ -176,6 +194,12 @@ main(int argc, char **argv)
 				wal_directory = pstrdup(optarg);
 				canonicalize_path(wal_directory);
 				break;
+			case 'Z':
+				if (!parse_compress_algorithm(optarg, &compress_algorithm))
+					pg_fatal("unrecognized compression algorithm: \"%s\"",
+							 optarg);
+				tar_compression_specified = true;
+				break;
 			default:
 				/* getopt_long already emitted a complaint */
 				pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -207,11 +231,41 @@ main(int argc, char **argv)
 		pg_fatal("cannot specify both %s and %s",
 				 "-P/--progress", "-q/--quiet");
 
+	/* Complain if compression method specified but the format isn't tar. */
+	if (format != 't' && tar_compression_specified)
+	{
+		pg_log_error("only tar mode backups can be compressed");
+		pg_log_error_hint("Try \"%s --help\" for more information.", progname);
+		exit(1);
+	}
+
+	/* Determine the backup format if it hasn't been specified. */
+	if (format == '\0')
+		format = find_backup_format(&context);
+
+	/*
+	 * Determine the tar backup compression method if it hasn't been
+	 * specified.
+	 */
+	if (format == 't' && !tar_compression_specified)
+		compress_algorithm = find_backup_compression(&context);
+
 	/* Unless --no-parse-wal was specified, we will need pg_waldump. */
 	if (!no_parse_wal)
 	{
 		int			ret;
 
+		/*
+		 * XXX: In the future, we should consider enhancing pg_waldump to read
+		 * WAL files from the tar archive.
+		 */
+		if (format != 'p')
+		{
+			pg_log_error("pg_waldump does not support parsing WAL files from a tar archive.");
+			pg_log_error_hint("Try \"%s --help\" to skip parse WAL files option.", progname);
+			exit(1);
+		}
+
 		pg_waldump_path = pg_malloc(MAXPGPATH);
 		ret = find_other_exec(argv[0], "pg_waldump",
 							  "pg_waldump (PostgreSQL) " PG_VERSION "\n",
@@ -279,7 +333,15 @@ main(int argc, char **argv)
 	 */
 	if (!context.skip_checksums)
 	{
-		verify_backup_checksums(&context);
+		/*
+		 * We were only checking the plain backup here. For the tar backup,
+		 * file checksums verification (if requested) will be done immediately
+		 * when the file is accessed, as we don't have random access to the
+		 * files like we do with plain backups.
+		 */
+		if (format == 'p')
+			verify_backup_checksums(&context);
+
 		progress_report(&context, true);
 	}
 
@@ -972,6 +1034,84 @@ should_ignore_relpath(verifier_context *context, const char *relpath)
 	return false;
 }
 
+/*
+ * To detect the backup format, it checks for the PG_VERSION file in the backup
+ * directory. If found, it will be considered a plain-format backup; otherwise,
+ * it will be assumed to be a tar-format backup.
+ */
+static char
+find_backup_format(verifier_context *context)
+{
+	char		result;
+	char	   *path;
+	struct stat sb;
+
+	/* Should be here only if the backup format is unknown */
+	Assert(format == '\0');
+
+	/* Check PG_VERSION file. */
+	path = psprintf("%s/%s", context->backup_directory, "PG_VERSION");
+	result = (stat(path, &sb) == 0) ? 'p' : 't';
+	pfree(path);
+
+	return result;
+}
+
+/*
+ * To determine the compression format, we will search for the main data
+ * directory archive and its extension, which starts with base.tar, as
+ * pg_basebackup writes the main data directory to an archive file named
+ * base.tar followed by a compression type extension like .gz, .lz4, or .zst.
+ */
+static pg_compress_algorithm
+find_backup_compression(verifier_context *context)
+{
+	char	   *path;
+	struct stat sb;
+	bool		found;
+
+	/* Should be here only for tar backup */
+	Assert(format == 't');
+
+	/*
+	 * Is this a tar archive?
+	 */
+	path = psprintf("%s/%s", context->backup_directory, "base.tar");
+	found = (stat(path, &sb) == 0);
+	pfree(path);
+	if (found)
+		return PG_COMPRESSION_NONE;
+
+	/*
+	 * Is this a .tar.gz archive?
+	 */
+	path = psprintf("%s/%s", context->backup_directory, "base.tar.gz");
+	found = (stat(path, &sb) == 0);
+	pfree(path);
+	if (found)
+		return PG_COMPRESSION_GZIP;
+
+	/*
+	 * Is this a .tar.lz4 archive?
+	 */
+	path = psprintf("%s/%s", context->backup_directory, "base.tar.lz4");
+	found = (stat(path, &sb) == 0);
+	pfree(path);
+	if (found)
+		return PG_COMPRESSION_LZ4;
+
+	/*
+	 * Is this a .tar.zst archive?
+	 */
+	path = psprintf("%s/%s", context->backup_directory, "base.tar.zst");
+	found = (stat(path, &sb) == 0);
+	pfree(path);
+	if (found)
+		return PG_COMPRESSION_ZSTD;
+
+	return PG_COMPRESSION_NONE; /* placate compiler */
+}
+
 /*
  * Print a progress report based on the variables in verifier_context.
  *
@@ -1054,11 +1194,13 @@ usage(void)
 	printf(_("  -e, --exit-on-error         exit immediately on error\n"));
 	printf(_("  -i, --ignore=RELATIVE_PATH  ignore indicated path\n"));
 	printf(_("  -m, --manifest-path=PATH    use specified path for manifest\n"));
+	printf(_("  -F, --format=p|t            backup format (plain, tar)\n"));
 	printf(_("  -n, --no-parse-wal          do not try to parse WAL files\n"));
 	printf(_("  -P, --progress              show progress information\n"));
 	printf(_("  -q, --quiet                 do not print any output, except for errors\n"));
 	printf(_("  -s, --skip-checksums        skip checksum verification\n"));
 	printf(_("  -w, --wal-directory=PATH    use specified path for WAL files\n"));
+	printf(_("  -Z, --compress=METHOD       compress method (gzip, lz4, zstd, none) \n"));
 	printf(_("  -V, --version               output version information, then exit\n"));
 	printf(_("  -?, --help                  show this help, then exit\n"));
 	printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
-- 
2.18.0

From c8ca272edb9cc433f7eb680876f5947b98fe010f Mon Sep 17 00:00:00 2001
From: Amul Sul <amul.sul@enterprisedb.com>
Date: Tue, 2 Jul 2024 17:04:56 +0530
Subject: [PATCH v7 12/12] pg_verifybackup: Tests and document

----
NOTE: This patch is not meant to be committed separately. It should
be squashed with the previous patch that implements tar format support.
----
---
 doc/src/sgml/ref/pg_verifybackup.sgml         | 54 +++++++++++++-
 src/bin/pg_verifybackup/t/001_basic.pl        | 18 ++++-
 src/bin/pg_verifybackup/t/004_options.pl      |  2 +-
 src/bin/pg_verifybackup/t/005_bad_manifest.pl |  2 +-
 src/bin/pg_verifybackup/t/008_untar.pl        | 73 ++++++-------------
 src/bin/pg_verifybackup/t/010_client_untar.pl | 48 +-----------
 6 files changed, 96 insertions(+), 101 deletions(-)

diff --git a/doc/src/sgml/ref/pg_verifybackup.sgml b/doc/src/sgml/ref/pg_verifybackup.sgml
index a3f167f9f6e..c743bd89a92 100644
--- a/doc/src/sgml/ref/pg_verifybackup.sgml
+++ b/doc/src/sgml/ref/pg_verifybackup.sgml
@@ -34,8 +34,10 @@ PostgreSQL documentation
    integrity of a database cluster backup taken using
    <command>pg_basebackup</command> against a
    <literal>backup_manifest</literal> generated by the server at the time
-   of the backup.  The backup must be stored in the "plain"
-   format; a "tar" format backup can be checked after extracting it.
+   of the backup.  The backup must be stored in the "plain" or "tar"
+   format.  Verification is supported for <literal>gzip</literal>,
+   <literal>lz4</literal>, and  <literal>zstd</literal> compressed tar backup;
+   any other compressed format backups can be checked after decompressing them.
   </para>
 
   <para>
@@ -168,6 +170,42 @@ PostgreSQL documentation
       </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><option>-F <replaceable class="parameter">format</replaceable></option></term>
+      <term><option>--format=<replaceable class="parameter">format</replaceable></option></term>
+      <listitem>
+       <para>
+        Specifies the format of the backup. <replaceable>format</replaceable>
+        can be one of the following:
+
+        <variablelist>
+         <varlistentry>
+          <term><literal>p</literal></term>
+          <term><literal>plain</literal></term>
+          <listitem>
+           <para>
+            Backup consists of plain files with the same layout as the
+            source server's data directory and tablespaces.
+           </para>
+          </listitem>
+         </varlistentry>
+
+         <varlistentry>
+          <term><literal>t</literal></term>
+          <term><literal>tar</literal></term>
+          <listitem>
+           <para>
+            Backup consists of tar files. The main data directory's contents
+            will be written to a file named <filename>base.tar</filename>,
+            and each other tablespace will be written to a separate tar file
+            named after that tablespace's OID.
+           </para>
+           </listitem>
+         </varlistentry>
+        </variablelist></para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
       <term><option>-n</option></term>
       <term><option>--no-parse-wal</option></term>
@@ -227,6 +265,18 @@ PostgreSQL documentation
        </para>
       </listitem>
      </varlistentry>
+
+     <varlistentry>
+      <term><option>-Z <replaceable class="parameter">method</replaceable></option></term>
+      <term><option>--compress=<replaceable class="parameter">method</replaceable></option></term>
+      <listitem>
+       <para>
+        The tar backup compression method can be <literal>gzip</literal>,
+        <literal>lz4</literal>, <literal>zstd</literal>, or
+        <literal>none</literal> if no compression.
+       </para>
+      </listitem>
+     </varlistentry>
     </variablelist>
    </para>
 
diff --git a/src/bin/pg_verifybackup/t/001_basic.pl b/src/bin/pg_verifybackup/t/001_basic.pl
index 2f3e52d296f..d47ce1f04fc 100644
--- a/src/bin/pg_verifybackup/t/001_basic.pl
+++ b/src/bin/pg_verifybackup/t/001_basic.pl
@@ -17,13 +17,25 @@ command_fails_like(
 	qr/no backup directory specified/,
 	'target directory must be specified');
 command_fails_like(
-	[ 'pg_verifybackup', $tempdir ],
+	[ 'pg_verifybackup', '-Fp', $tempdir ],
 	qr/could not open file.*\/backup_manifest\"/,
 	'pg_verifybackup requires a manifest');
 command_fails_like(
-	[ 'pg_verifybackup', $tempdir, $tempdir ],
+	[ 'pg_verifybackup', '-Fp', $tempdir, $tempdir ],
 	qr/too many command-line arguments/,
 	'multiple target directories not allowed');
+command_fails_like(
+	[ 'pg_verifybackup', '-Zgzip', $tempdir ],
+	qr/only tar mode backups can be compressed/,
+	'compression method required format option');
+command_fails_like(
+	[ 'pg_verifybackup', '-Fp', '-Zlz4', $tempdir ],
+	qr/only tar mode backups can be compressed/,
+	'compression method required tar format option');
+command_fails_like(
+	[ 'pg_verifybackup', '-Fp', '-Znon_exist', $tempdir ],
+	qr/unrecognized compression algorithm/,
+	'compression method should be valid');
 
 # create fake manifest file
 open(my $fh, '>', "$tempdir/backup_manifest") || die "open: $!";
@@ -31,7 +43,7 @@ close($fh);
 
 # but then try to use an alternate, nonexisting manifest
 command_fails_like(
-	[ 'pg_verifybackup', '-m', "$tempdir/not_the_manifest", $tempdir ],
+	[ 'pg_verifybackup', '-Fp', '-m', "$tempdir/not_the_manifest", $tempdir ],
 	qr/could not open file.*\/not_the_manifest\"/,
 	'pg_verifybackup respects -m flag');
 
diff --git a/src/bin/pg_verifybackup/t/004_options.pl b/src/bin/pg_verifybackup/t/004_options.pl
index 8ed2214408e..2f197648740 100644
--- a/src/bin/pg_verifybackup/t/004_options.pl
+++ b/src/bin/pg_verifybackup/t/004_options.pl
@@ -108,7 +108,7 @@ unlike(
 # Test valid manifest with nonexistent backup directory.
 command_fails_like(
 	[
-		'pg_verifybackup', '-m',
+		'pg_verifybackup', '-Fp', '-m',
 		"$backup_path/backup_manifest", "$backup_path/fake"
 	],
 	qr/could not open directory/,
diff --git a/src/bin/pg_verifybackup/t/005_bad_manifest.pl b/src/bin/pg_verifybackup/t/005_bad_manifest.pl
index c4ed64b62d5..28c51b6feb0 100644
--- a/src/bin/pg_verifybackup/t/005_bad_manifest.pl
+++ b/src/bin/pg_verifybackup/t/005_bad_manifest.pl
@@ -208,7 +208,7 @@ sub test_bad_manifest
 	print $fh $manifest_contents;
 	close($fh);
 
-	command_fails_like([ 'pg_verifybackup', $tempdir ], $regexp, $test_name);
+	command_fails_like([ 'pg_verifybackup', '-Fp', $tempdir ], $regexp, $test_name);
 	return;
 }
 
diff --git a/src/bin/pg_verifybackup/t/008_untar.pl b/src/bin/pg_verifybackup/t/008_untar.pl
index 7a09f3b75b2..9896560adc3 100644
--- a/src/bin/pg_verifybackup/t/008_untar.pl
+++ b/src/bin/pg_verifybackup/t/008_untar.pl
@@ -16,6 +16,20 @@ my $primary = PostgreSQL::Test::Cluster->new('primary');
 $primary->init(allows_streaming => 1);
 $primary->start;
 
+# Create a couple of directories to use as tablespaces.
+my $TS1_LOCATION = $primary->backup_dir .'/ts1';
+$TS1_LOCATION =~ s/\/\.\//\//g;    # collapse foo/./bar to foo/bar
+mkdir($TS1_LOCATION);
+
+# Create a tablespace with table in it.
+$primary->safe_psql('postgres', qq(
+		CREATE TABLESPACE regress_ts1 LOCATION '$TS1_LOCATION';
+		SELECT oid FROM pg_tablespace WHERE spcname = 'regress_ts1';
+		CREATE TABLE regress_tbl1(i int) TABLESPACE regress_ts1;
+		INSERT INTO regress_tbl1 VALUES(generate_series(1,5));));
+my $tsoid = $primary->safe_psql('postgres', qq(
+		SELECT oid FROM pg_tablespace WHERE spcname = 'regress_ts1'));
+
 my $backup_path = $primary->backup_dir . '/server-backup';
 my $extract_path = $primary->backup_dir . '/extracted-backup';
 
@@ -23,39 +37,31 @@ my @test_configuration = (
 	{
 		'compression_method' => 'none',
 		'backup_flags' => [],
-		'backup_archive' => 'base.tar',
+		'backup_archive' => ['base.tar', "$tsoid.tar"],
 		'enabled' => 1
 	},
 	{
 		'compression_method' => 'gzip',
 		'backup_flags' => [ '--compress', 'server-gzip' ],
-		'backup_archive' => 'base.tar.gz',
-		'decompress_program' => $ENV{'GZIP_PROGRAM'},
-		'decompress_flags' => ['-d'],
+		'backup_archive' => [ 'base.tar.gz', "$tsoid.tar.gz" ],
 		'enabled' => check_pg_config("#define HAVE_LIBZ 1")
 	},
 	{
 		'compression_method' => 'lz4',
 		'backup_flags' => [ '--compress', 'server-lz4' ],
-		'backup_archive' => 'base.tar.lz4',
-		'decompress_program' => $ENV{'LZ4'},
-		'decompress_flags' => [ '-d', '-m' ],
+		'backup_archive' => ['base.tar.lz4', "$tsoid.tar.lz4" ],
 		'enabled' => check_pg_config("#define USE_LZ4 1")
 	},
 	{
 		'compression_method' => 'zstd',
 		'backup_flags' => [ '--compress', 'server-zstd' ],
-		'backup_archive' => 'base.tar.zst',
-		'decompress_program' => $ENV{'ZSTD'},
-		'decompress_flags' => ['-d'],
+		'backup_archive' => [ 'base.tar.zst', "$tsoid.tar.zst" ],
 		'enabled' => check_pg_config("#define USE_ZSTD 1")
 	},
 	{
 		'compression_method' => 'zstd',
 		'backup_flags' => [ '--compress', 'server-zstd:level=1,long' ],
-		'backup_archive' => 'base.tar.zst',
-		'decompress_program' => $ENV{'ZSTD'},
-		'decompress_flags' => ['-d'],
+		'backup_archive' => [ 'base.tar.zst', "$tsoid.tar.zst" ],
 		'enabled' => check_pg_config("#define USE_ZSTD 1")
 	});
 
@@ -86,47 +92,16 @@ for my $tc (@test_configuration)
 		my $backup_files = join(',',
 			sort grep { $_ ne '.' && $_ ne '..' } slurp_dir($backup_path));
 		my $expected_backup_files =
-		  join(',', sort ('backup_manifest', $tc->{'backup_archive'}));
+		  join(',', sort ('backup_manifest', @{ $tc->{'backup_archive'} }));
 		is($backup_files, $expected_backup_files,
 			"found expected backup files, compression $method");
 
-		# Decompress.
-		if (exists $tc->{'decompress_program'})
-		{
-			my @decompress = ($tc->{'decompress_program'});
-			push @decompress, @{ $tc->{'decompress_flags'} }
-			  if $tc->{'decompress_flags'};
-			push @decompress, $backup_path . '/' . $tc->{'backup_archive'};
-			system_or_bail(@decompress);
-		}
-
-	  SKIP:
-		{
-			my $tar = $ENV{TAR};
-			# don't check for a working tar here, to accommodate various odd
-			# cases. If tar doesn't work the init_from_backup below will fail.
-			skip "no tar program available", 1
-			  if (!defined $tar || $tar eq '');
-
-			# Untar.
-			mkdir($extract_path);
-			system_or_bail($tar, 'xf', $backup_path . '/base.tar',
-				'-C', $extract_path);
-
-			# Verify.
-			$primary->command_ok(
-				[
-					'pg_verifybackup', '-n',
-					'-m', "$backup_path/backup_manifest",
-					'-e', $extract_path
-				],
-				"verify backup, compression $method");
-		}
+		# Verify tar backup.
+		$primary->command_ok(['pg_verifybackup', '-n', '-e', $backup_path],
+			"verify backup, compression $method");
 
 		# Cleanup.
-		unlink($backup_path . '/backup_manifest');
-		unlink($backup_path . '/base.tar');
-		unlink($backup_path . '/' . $tc->{'backup_archive'});
+		rmtree($backup_path);
 		rmtree($extract_path);
 	}
 }
diff --git a/src/bin/pg_verifybackup/t/010_client_untar.pl b/src/bin/pg_verifybackup/t/010_client_untar.pl
index 8c076d46dee..6b7d7483f6e 100644
--- a/src/bin/pg_verifybackup/t/010_client_untar.pl
+++ b/src/bin/pg_verifybackup/t/010_client_untar.pl
@@ -29,41 +29,30 @@ my @test_configuration = (
 		'compression_method' => 'gzip',
 		'backup_flags' => [ '--compress', 'client-gzip:5' ],
 		'backup_archive' => 'base.tar.gz',
-		'decompress_program' => $ENV{'GZIP_PROGRAM'},
-		'decompress_flags' => ['-d'],
 		'enabled' => check_pg_config("#define HAVE_LIBZ 1")
 	},
 	{
 		'compression_method' => 'lz4',
 		'backup_flags' => [ '--compress', 'client-lz4:5' ],
 		'backup_archive' => 'base.tar.lz4',
-		'decompress_program' => $ENV{'LZ4'},
-		'decompress_flags' => ['-d'],
-		'output_file' => 'base.tar',
 		'enabled' => check_pg_config("#define USE_LZ4 1")
 	},
 	{
 		'compression_method' => 'zstd',
 		'backup_flags' => [ '--compress', 'client-zstd:5' ],
 		'backup_archive' => 'base.tar.zst',
-		'decompress_program' => $ENV{'ZSTD'},
-		'decompress_flags' => ['-d'],
 		'enabled' => check_pg_config("#define USE_ZSTD 1")
 	},
 	{
 		'compression_method' => 'zstd',
 		'backup_flags' => [ '--compress', 'client-zstd:level=1,long' ],
 		'backup_archive' => 'base.tar.zst',
-		'decompress_program' => $ENV{'ZSTD'},
-		'decompress_flags' => ['-d'],
 		'enabled' => check_pg_config("#define USE_ZSTD 1")
 	},
 	{
 		'compression_method' => 'parallel zstd',
 		'backup_flags' => [ '--compress', 'client-zstd:workers=3' ],
 		'backup_archive' => 'base.tar.zst',
-		'decompress_program' => $ENV{'ZSTD'},
-		'decompress_flags' => ['-d'],
 		'enabled' => check_pg_config("#define USE_ZSTD 1"),
 		'possibly_unsupported' =>
 		  qr/could not set compression worker count to 3: Unsupported parameter/
@@ -118,40 +107,9 @@ for my $tc (@test_configuration)
 		is($backup_files, $expected_backup_files,
 			"found expected backup files, compression $method");
 
-		# Decompress.
-		if (exists $tc->{'decompress_program'})
-		{
-			my @decompress = ($tc->{'decompress_program'});
-			push @decompress, @{ $tc->{'decompress_flags'} }
-			  if $tc->{'decompress_flags'};
-			push @decompress, $backup_path . '/' . $tc->{'backup_archive'};
-			push @decompress, $backup_path . '/' . $tc->{'output_file'}
-			  if $tc->{'output_file'};
-			system_or_bail(@decompress);
-		}
-
-	  SKIP:
-		{
-			my $tar = $ENV{TAR};
-			# don't check for a working tar here, to accommodate various odd
-			# cases. If tar doesn't work the init_from_backup below will fail.
-			skip "no tar program available", 1
-			  if (!defined $tar || $tar eq '');
-
-			# Untar.
-			mkdir($extract_path);
-			system_or_bail($tar, 'xf', $backup_path . '/base.tar',
-				'-C', $extract_path);
-
-			# Verify.
-			$primary->command_ok(
-				[
-					'pg_verifybackup', '-n',
-					'-m', "$backup_path/backup_manifest",
-					'-e', $extract_path
-				],
-				"verify backup, compression $method");
-		}
+		# Verify tar backup.
+		$primary->command_ok( [ 'pg_verifybackup', '-n', '-e', $backup_path ],
+			"verify backup, compression $method");
 
 		# Cleanup.
 		rmtree($extract_path);
-- 
2.18.0

From c9415e97006e5765151d7fcd3990bcb0c4a05966 Mon Sep 17 00:00:00 2001
From: Amul Sul <amul.sul@enterprisedb.com>
Date: Wed, 31 Jul 2024 16:22:07 +0530
Subject: [PATCH v7 11/12] pg_verifybackup: Read tar files and verify its
 contents

---
 src/bin/pg_verifybackup/Makefile           |   4 +-
 src/bin/pg_verifybackup/astreamer_verify.c | 367 +++++++++++++++++++++
 src/bin/pg_verifybackup/meson.build        |   6 +-
 src/bin/pg_verifybackup/pg_verifybackup.c  | 216 +++++++++++-
 src/bin/pg_verifybackup/pg_verifybackup.h  |   9 +
 src/tools/pgindent/typedefs.list           |   1 +
 6 files changed, 594 insertions(+), 9 deletions(-)
 create mode 100644 src/bin/pg_verifybackup/astreamer_verify.c

diff --git a/src/bin/pg_verifybackup/Makefile b/src/bin/pg_verifybackup/Makefile
index 7c045f142e8..df7aaabd530 100644
--- a/src/bin/pg_verifybackup/Makefile
+++ b/src/bin/pg_verifybackup/Makefile
@@ -17,11 +17,13 @@ top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
 # We need libpq only because fe_utils does.
+override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS)
 LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport)
 
 OBJS = \
 	$(WIN32RES) \
-	pg_verifybackup.o
+	pg_verifybackup.o \
+	astreamer_verify.o
 
 all: pg_verifybackup
 
diff --git a/src/bin/pg_verifybackup/astreamer_verify.c b/src/bin/pg_verifybackup/astreamer_verify.c
new file mode 100644
index 00000000000..be40922c042
--- /dev/null
+++ b/src/bin/pg_verifybackup/astreamer_verify.c
@@ -0,0 +1,367 @@
+/*-------------------------------------------------------------------------
+ *
+ * astreamer_verify.c
+ *
+ * Extend fe_utils/astreamer.h archive streaming facility to verify TAR
+ * backup.
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ *
+ * src/bin/pg_verifybackup/astreamer_verify.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres_fe.h"
+
+#include "common/logging.h"
+#include "fe_utils/astreamer.h"
+#include "pg_verifybackup.h"
+
+typedef struct astreamer_verify
+{
+	astreamer	base;
+	verifier_context *context;
+	char	   *archiveName;
+	Oid			tblspcOid;
+
+	/* Hold information for a member file verification */
+	manifest_file *mfile;
+	int64		receivedBytes;
+	bool		verifyChecksums;
+	bool		verifyControlData;
+	pg_checksum_context *checksum_ctx;
+} astreamer_verify;
+
+static void astreamer_verify_content(astreamer *streamer,
+									 astreamer_member *member,
+									 const char *data, int len,
+									 astreamer_archive_context context);
+static void astreamer_verify_finalize(astreamer *streamer);
+static void astreamer_verify_free(astreamer *streamer);
+
+static void verify_member_header(astreamer *streamer, astreamer_member *member);
+static void verify_member_contents(astreamer *streamer,
+								   astreamer_member *member,
+								   const char *data, int len);
+static void verify_content_checksum(astreamer *streamer,
+									astreamer_member *member,
+									const char *buffer, int buffer_len);
+static void verify_controldata(astreamer *streamer,
+							   astreamer_member *member,
+							   const char *data, int len);
+static void reset_member_info(astreamer *streamer);
+
+static const astreamer_ops astreamer_verify_ops = {
+	.content = astreamer_verify_content,
+	.finalize = astreamer_verify_finalize,
+	.free = astreamer_verify_free
+};
+
+/*
+ * Create a astreamer that can verifies content of a TAR file.
+ */
+astreamer *
+astreamer_verify_content_new(astreamer *next, verifier_context *context,
+							 char *archive_name, Oid tblspc_oid)
+{
+	astreamer_verify *streamer;
+
+	streamer = palloc0(sizeof(astreamer_verify));
+	*((const astreamer_ops **) &streamer->base.bbs_ops) =
+		&astreamer_verify_ops;
+
+	streamer->base.bbs_next = next;
+	streamer->context = context;
+	streamer->archiveName = archive_name;
+	streamer->tblspcOid = tblspc_oid;
+	initStringInfo(&streamer->base.bbs_buffer);
+
+	return &streamer->base;
+}
+
+/*
+ * It verifies each TAR member entry against the manifest data and performs
+ * checksum verification if enabled. Additionally, it validates the backup's
+ * system identifier against the backup_manifest.
+ */
+static void
+astreamer_verify_content(astreamer *streamer,
+						 astreamer_member *member, const char *data,
+						 int len, astreamer_archive_context context)
+{
+	Assert(context != ASTREAMER_UNKNOWN);
+
+	switch (context)
+	{
+		case ASTREAMER_MEMBER_HEADER:
+
+			/*
+			 * Perform the initial check and setup for the verification.
+			 */
+			verify_member_header(streamer, member);
+			break;
+
+		case ASTREAMER_MEMBER_CONTENTS:
+
+			/*
+			 * Peform the required contants verification.
+			 */
+			verify_member_contents(streamer, member, data, len);
+			break;
+
+		case ASTREAMER_MEMBER_TRAILER:
+
+			/*
+			 * Reset the temporary information stored for a verification.
+			 */
+			reset_member_info(streamer);
+			break;
+
+		case ASTREAMER_ARCHIVE_TRAILER:
+			break;
+
+		default:
+			/* Shouldn't happen. */
+			pg_fatal("unexpected state while parsing tar archive");
+	}
+}
+
+/*
+ * End-of-stream processing for a astreamer_verify stream.
+ */
+static void
+astreamer_verify_finalize(astreamer *streamer)
+{
+	Assert(streamer->bbs_next == NULL);
+}
+
+/*
+ * Free memory associated with a astreamer_verify stream.
+ */
+static void
+astreamer_verify_free(astreamer *streamer)
+{
+	pfree(streamer->bbs_buffer.data);
+	pfree(streamer);
+}
+
+/*
+ * Verify the entry if it is a file in the backup manifest. If the archive being
+ * processed is a tablespace, prepare the required file path for subsequent
+ * operations. Finally, check if it needs to perform checksum verification and
+ * control data verification during file content processing.
+ */
+static void
+verify_member_header(astreamer *streamer, astreamer_member *member)
+{
+	astreamer_verify *mystreamer = (astreamer_verify *) streamer;
+	manifest_file *m;
+
+	/* We are only interested in files that are not in the ignore list. */
+	if (member->is_directory || member->is_link ||
+		should_ignore_relpath(mystreamer->context, member->pathname))
+		return;
+
+	/*
+	 * The backup_manifest stores a relative path to the base directory for
+	 * files belong tablespace, whereas <tablespaceoid>.tar doesn't. Prepare
+	 * the required path, otherwise, the manfiest entry verification will
+	 * fail.
+	 */
+	if (OidIsValid(mystreamer->tblspcOid))
+	{
+		char		temp[MAXPGPATH];
+
+		/* Copy original name at temporary space */
+		memcpy(temp, member->pathname, MAXPGPATH);
+
+		snprintf(member->pathname, MAXPGPATH, "%s/%d/%s",
+				 "pg_tblspc", mystreamer->tblspcOid, temp);
+	}
+
+	/* Check the manifest entry */
+	m = verify_manifest_entry(mystreamer->context, member->pathname,
+							  member->size);
+	mystreamer->mfile = (void *) m;
+
+	/*
+	 * Prepare for checksum and manifest system identifier verification.
+	 *
+	 * We could have these checks while receiving contents. However, since
+	 * contents are received in multiple iterations, this would result in
+	 * these lengthy checks being performed multiple times. Instead, having a
+	 * single flag would be more efficient.
+	 */
+	mystreamer->verifyChecksums =
+		(!mystreamer->context->skip_checksums && should_verify_checksum(m));
+	mystreamer->verifyControlData =
+		should_verify_control_data(mystreamer->context->manifest, m);
+}
+
+/*
+ * Process the member content according to the flags set by the member header
+ * processing routine for checksum and control data verification.
+ */
+static void
+verify_member_contents(astreamer *streamer, astreamer_member *member,
+					   const char *data, int len)
+
+{
+	astreamer_verify *mystreamer = (astreamer_verify *) streamer;
+
+	/* Verify the checksums */
+	if (mystreamer->verifyChecksums)
+		verify_content_checksum(streamer, member, data, len);
+
+	/* Verify pg_control information */
+	if (mystreamer->verifyControlData)
+		verify_controldata(streamer, member, data, len);
+}
+
+/*
+ * Similar to verify_file_checksum() but this function computes the checksum
+ * incrementally for the received file content. Unlike a normal backup
+ * directory, TAR format files do not allow random access, so checksum
+ * verification occurs progressively. Additionally, the function calls the
+ * routine for control data verification if the flags indicate that it is
+ * required.
+ *
+ * On the first visit, the function initializes checksum_ctx, which will be used
+ * for incremental checksum calculation. Once the complete file content is
+ * received (tracked using the receivedBytes), the routine that performs the
+ * final checksum verification is called
+ */
+static void
+verify_content_checksum(astreamer *streamer, astreamer_member *member,
+						const char *buffer, int buffer_len)
+{
+	astreamer_verify *mystreamer = (astreamer_verify *) streamer;
+	pg_checksum_context *checksum_ctx = mystreamer->checksum_ctx;
+	verifier_context *context = mystreamer->context;
+	manifest_file *m = mystreamer->mfile;
+	const char *relpath = m->pathname;
+	uint8		checksumbuf[PG_CHECKSUM_MAX_LENGTH];
+
+	/*
+	 * Mark it false to avoid unexpected re-entrance for the same file content
+	 * (e.g. returned in error should not be revisited).
+	 */
+	Assert(mystreamer->verifyChecksums);
+	mystreamer->verifyChecksums = false;
+
+	/* Should have came for the right file */
+	Assert(strcmp(member->pathname, relpath) == 0);
+
+	/* If we were first time for this file */
+	if (!checksum_ctx)
+	{
+		checksum_ctx = pg_malloc(sizeof(pg_checksum_context));
+		mystreamer->checksum_ctx = checksum_ctx;
+
+		if (pg_checksum_init(checksum_ctx, m->checksum_type) < 0)
+		{
+			report_backup_error(context,
+								"%s: could not initialize checksum of file \"%s\"",
+								mystreamer->archiveName, relpath);
+			return;
+		}
+	}
+
+	/* Update the total count of computed checksum bytes. */
+	mystreamer->receivedBytes += buffer_len;
+
+	if (pg_checksum_update(checksum_ctx, (uint8 *) buffer, buffer_len) < 0)
+	{
+		report_backup_error(context, "could not update checksum of file \"%s\"",
+							relpath);
+		return;
+	}
+
+	/* Report progress */
+	context->done_size += buffer_len;
+	progress_report(context, false);
+
+	/* Yet to receive the full content of the file. */
+	if (mystreamer->receivedBytes < m->size)
+	{
+		mystreamer->verifyChecksums = true;
+		return;
+	}
+
+	/* Do the final computation and verification. */
+	verify_checksum(context, m, checksum_ctx, checksumbuf);
+}
+
+/*
+ * Prepare the control data from the received file contents, which are supposed
+ * to be from the pg_control file, including CRC calculation. Then, call the
+ * routines that perform the final verification of the control file information.
+ */
+static void
+verify_controldata(astreamer *streamer, astreamer_member *member,
+				   const char *data, int len)
+{
+	astreamer_verify *mystreamer = (astreamer_verify *) streamer;
+	manifest_data *manifest = mystreamer->context->manifest;
+	ControlFileData control_file;
+	pg_crc32c	crc;
+	bool		crc_ok;
+
+	/* Should be here only for control file */
+	Assert(strcmp(member->pathname, "global/pg_control") == 0);
+	Assert(manifest->version != 1);
+
+	/* Mark it as false to avoid unexpected re-entrance */
+	Assert(mystreamer->verifyControlData);
+	mystreamer->verifyControlData = false;
+
+	/* Should have whole control file data. */
+	if (!astreamer_buffer_until(streamer, &data, &len, sizeof(ControlFileData)))
+	{
+		mystreamer->verifyControlData = true;
+		return;
+	}
+
+	pg_log_debug("%s: reading \"%s\"", mystreamer->archiveName,
+				 member->pathname);
+
+	if (streamer->bbs_buffer.len != sizeof(ControlFileData))
+		report_fatal_error("%s: could not read control file: read %d of %zu",
+						   mystreamer->archiveName, streamer->bbs_buffer.len,
+						   sizeof(ControlFileData));
+
+	memcpy(&control_file, streamer->bbs_buffer.data,
+		   sizeof(ControlFileData));
+
+	/* Check the CRC. */
+	INIT_CRC32C(crc);
+	COMP_CRC32C(crc,
+				(char *) (&control_file),
+				offsetof(ControlFileData, crc));
+	FIN_CRC32C(crc);
+
+	crc_ok = EQ_CRC32C(crc, control_file.crc);
+
+	/* Do the final control data verification. */
+	verify_control_data(&control_file, member->pathname, crc_ok,
+						manifest->system_identifier);
+}
+
+/*
+ * Reset flags and free memory allocations for member file verification.
+ */
+static void
+reset_member_info(astreamer *streamer)
+{
+	astreamer_verify *mystreamer = (astreamer_verify *) streamer;
+
+	mystreamer->mfile = NULL;
+	mystreamer->receivedBytes = 0;
+	mystreamer->verifyChecksums = false;
+	mystreamer->verifyControlData = false;
+
+	if (mystreamer->checksum_ctx)
+		pfree(mystreamer->checksum_ctx);
+	mystreamer->checksum_ctx = NULL;
+}
diff --git a/src/bin/pg_verifybackup/meson.build b/src/bin/pg_verifybackup/meson.build
index 7c7d31a0350..1e3fcf7ee5a 100644
--- a/src/bin/pg_verifybackup/meson.build
+++ b/src/bin/pg_verifybackup/meson.build
@@ -1,7 +1,8 @@
 # Copyright (c) 2022-2024, PostgreSQL Global Development Group
 
 pg_verifybackup_sources = files(
-  'pg_verifybackup.c'
+  'pg_verifybackup.c',
+  'astreamer_verify.c'
 )
 
 if host_system == 'windows'
@@ -10,9 +11,10 @@ if host_system == 'windows'
     '--FILEDESC', 'pg_verifybackup - verify a backup against using a backup manifest'])
 endif
 
+pg_verifybackup_deps = [frontend_code, libpq, lz4, zlib, zstd]
 pg_verifybackup = executable('pg_verifybackup',
   pg_verifybackup_sources,
-  dependencies: [frontend_code, libpq],
+  dependencies: pg_verifybackup_deps,
   kwargs: default_bin_args,
 )
 bin_targets += pg_verifybackup
diff --git a/src/bin/pg_verifybackup/pg_verifybackup.c b/src/bin/pg_verifybackup/pg_verifybackup.c
index f20b6e2895c..ce2ba7437be 100644
--- a/src/bin/pg_verifybackup/pg_verifybackup.c
+++ b/src/bin/pg_verifybackup/pg_verifybackup.c
@@ -20,9 +20,12 @@
 
 #include "common/compression.h"
 #include "common/parse_manifest.h"
+#include "common/relpath.h"
+#include "fe_utils/astreamer.h"
 #include "fe_utils/simple_list.h"
 #include "getopt_long.h"
 #include "pg_verifybackup.h"
+#include "pgtar.h"
 #include "pgtime.h"
 
 /*
@@ -39,6 +42,11 @@
  */
 #define ESTIMATED_BYTES_PER_MANIFEST_LINE	100
 
+
+static void (*verify_backup_file_cb) (verifier_context *context,
+									  char *relpath, char *fullpath,
+									  size_t filesize);
+
 static manifest_data *parse_manifest_file(char *manifest_path);
 static void verifybackup_version_cb(JsonManifestParseContext *context,
 									int manifest_version);
@@ -63,6 +71,15 @@ static void verify_backup_directory(verifier_context *context,
 									char *relpath, char *fullpath);
 static void verify_backup_file(verifier_context *context,
 							   char *relpath, char *fullpath);
+static void verify_plain_file_cb(verifier_context *context,
+								 char *relpath, char *fullpath,
+								 size_t filesize);
+static void verify_tar_file_cb(verifier_context *context,
+							   char *relpath, char *fullpath,
+							   size_t filesize);
+static void verify_tar_content(verifier_context *context,
+							   char *relpath, char *fullpath,
+							   astreamer *streamer);
 static void report_extra_backup_files(verifier_context *context);
 static void verify_backup_checksums(verifier_context *context);
 static void verify_file_checksum(verifier_context *context,
@@ -71,6 +88,9 @@ static void verify_file_checksum(verifier_context *context,
 static void parse_required_wal(verifier_context *context,
 							   char *pg_waldump_path,
 							   char *wal_directory);
+static astreamer *create_archive_verifier(verifier_context *context,
+										  char *archive_name,
+										  Oid tblspc_oid);
 
 static void compute_total_size(verifier_context *context);
 static void usage(void);
@@ -146,6 +166,10 @@ main(int argc, char **argv)
 	 */
 	simple_string_list_append(&context.ignore_list, "backup_manifest");
 	simple_string_list_append(&context.ignore_list, "pg_wal");
+	simple_string_list_append(&context.ignore_list, "pg_wal.tar");
+	simple_string_list_append(&context.ignore_list, "pg_wal.tar.gz");
+	simple_string_list_append(&context.ignore_list, "pg_wal.tar.lz4");
+	simple_string_list_append(&context.ignore_list, "pg_wal.tar.zst");
 	simple_string_list_append(&context.ignore_list, "postgresql.auto.conf");
 	simple_string_list_append(&context.ignore_list, "recovery.signal");
 	simple_string_list_append(&context.ignore_list, "standby.signal");
@@ -250,6 +274,15 @@ main(int argc, char **argv)
 	if (format == 't' && !tar_compression_specified)
 		compress_algorithm = find_backup_compression(&context);
 
+	/*
+	 * Setup the required callback function to verify plain or tar backup
+	 * files.
+	 */
+	if (format == 'p')
+		verify_backup_file_cb = verify_plain_file_cb;
+	else
+		verify_backup_file_cb = verify_tar_file_cb;
+
 	/* Unless --no-parse-wal was specified, we will need pg_waldump. */
 	if (!no_parse_wal)
 	{
@@ -645,7 +678,8 @@ verify_backup_directory(verifier_context *context, char *relpath,
 }
 
 /*
- * Verify one file (which might actually be a directory or a symlink).
+ * Verify one file (which might actually be a directory, a symlink or a
+ * archive).
  *
  * The arguments to this function have the same meaning as the arguments to
  * verify_backup_directory.
@@ -654,7 +688,6 @@ static void
 verify_backup_file(verifier_context *context, char *relpath, char *fullpath)
 {
 	struct stat sb;
-	manifest_file *m;
 
 	if (stat(fullpath, &sb) != 0)
 	{
@@ -687,8 +720,25 @@ verify_backup_file(verifier_context *context, char *relpath, char *fullpath)
 		return;
 	}
 
+	/* Do the further verifications */
+	verify_backup_file_cb(context, relpath, fullpath, sb.st_size);
+}
+
+/*
+ * Verify one plan file or a symlink.
+ *
+ * The arguments to this function are mostly the same as the
+ * verify_backup_directory. The additional argument is the file size for
+ * verifying against manifest entry.
+ */
+static void
+verify_plain_file_cb(verifier_context *context, char *relpath,
+					 char *fullpath, size_t filesize)
+{
+	manifest_file *m;
+
 	/* Check the backup manifest entry for this file. */
-	m = verify_manifest_entry(context, relpath, sb.st_size);
+	m = verify_manifest_entry(context, relpath, filesize);
 
 	/* Validate the manifest system identifier */
 	if (should_verify_control_data(context->manifest, m))
@@ -706,6 +756,124 @@ verify_backup_file(verifier_context *context, char *relpath, char *fullpath)
 	}
 }
 
+/*
+ * Verify one tar file.
+ *
+ * The arguments to this function are mostly the same as the
+ * verify_backup_directory. The additional argument is the file size for
+ * verifying against manifest entry.
+ */
+static void
+verify_tar_file_cb(verifier_context *context, char *relpath,
+				   char *fullpath, size_t filesize)
+{
+	astreamer  *streamer;
+	Oid			tblspc_oid = InvalidOid;
+	int			file_name_len;
+	int			file_extn_len = 0;	/* placate compiler */
+	char	   *file_extn = "";
+
+	/* Should be tar backup */
+	Assert(format == 't');
+
+	/* Find the tar file extension. */
+	if (compress_algorithm == PG_COMPRESSION_NONE)
+	{
+		file_extn = ".tar";
+		file_extn_len = 4;
+
+	}
+	else if (compress_algorithm == PG_COMPRESSION_GZIP)
+	{
+		file_extn = ".tar.gz";
+		file_extn_len = 7;
+
+	}
+	else if (compress_algorithm == PG_COMPRESSION_LZ4)
+	{
+		file_extn = ".tar.lz4";
+		file_extn_len = 8;
+	}
+	else if (compress_algorithm == PG_COMPRESSION_ZSTD)
+	{
+		file_extn = ".tar.zst";
+		file_extn_len = 8;
+	}
+
+	/*
+	 * Ensure that we have the correct file type corresponding to the backup
+	 * format.
+	 */
+	file_name_len = strlen(relpath);
+	if (file_name_len < file_extn_len ||
+		strcmp(relpath + file_name_len - file_extn_len, file_extn) != 0)
+	{
+		if (compress_algorithm == PG_COMPRESSION_NONE)
+			report_backup_error(context,
+								"\"%s\" is not a valid file, expecting tar file",
+								relpath);
+		else
+			report_backup_error(context,
+								"\"%s\" is not a valid file, expecting \"%s\" compressed tar file",
+								relpath,
+								get_compress_algorithm_name(compress_algorithm));
+		return;
+	}
+
+	/*
+	 * For the tablespace, pg_basebackup writes the data out to
+	 * <tablespaceoid>.tar.  If a file matches that format, then extract the
+	 * tablespaceoid, which we need to prepare the paths of the files
+	 * belonging to that tablespace relative to the base directory.
+	 */
+	if (strspn(relpath, "0123456789") == (file_name_len - file_extn_len))
+		tblspc_oid = strtoi64(relpath, NULL, 10);
+
+	streamer = create_archive_verifier(context, relpath, tblspc_oid);
+	verify_tar_content(context, relpath, fullpath, streamer);
+
+	/* Cleanup. */
+	astreamer_finalize(streamer);
+	astreamer_free(streamer);
+}
+
+/*
+ * Reads a given tar file in predefined chunks and pass to astreamer.  Which
+ * initiates routines for decompression (if necessary) then verification
+ * of each member within the tar archive.
+ */
+static void
+verify_tar_content(verifier_context *context, char *relpath, char *fullpath,
+				   astreamer *streamer)
+{
+	int			fd;
+	int			rc;
+	char	   *buffer;
+
+	/* Open the target file. */
+	if ((fd = open(fullpath, O_RDONLY | PG_BINARY, 0)) < 0)
+	{
+		report_backup_error(context, "could not open file \"%s\": %m",
+							relpath);
+		return;
+	}
+
+	buffer = pg_malloc(READ_CHUNK_SIZE * sizeof(uint8));
+
+	/* Perform the reads */
+	while ((rc = read(fd, buffer, READ_CHUNK_SIZE)) > 0)
+		astreamer_content(streamer, NULL, buffer, rc, ASTREAMER_UNKNOWN);
+
+	if (rc < 0)
+		report_backup_error(context, "could not read file \"%s\": %m",
+							relpath);
+
+	/* Close the file. */
+	if (close(fd) != 0)
+		report_backup_error(context, "could not close file \"%s\": %m",
+							relpath);
+}
+
 /*
  * Verify file and its size entry in the manifest.
  */
@@ -1058,10 +1226,10 @@ find_backup_format(verifier_context *context)
 }
 
 /*
- * To determine the compression format, we will search for the main data
- * directory archive and its extension, which starts with base.tar, as
  * pg_basebackup writes the main data directory to an archive file named
- * base.tar followed by a compression type extension like .gz, .lz4, or .zst.
+ * base.tar, followed by a compression type extension such as .gz, .lz4, or
+ * .zst. To determine the compression format, we need to search for this main
+ * data directory archive file.
  */
 static pg_compress_algorithm
 find_backup_compression(verifier_context *context)
@@ -1112,6 +1280,42 @@ find_backup_compression(verifier_context *context)
 	return PG_COMPRESSION_NONE; /* placate compiler */
 }
 
+/*
+ * Identifies the necessary steps for verifying the contents of the
+ * provided tar file.
+ */
+static astreamer *
+create_archive_verifier(verifier_context *context, char *archive_name,
+						Oid tblspc_oid)
+{
+	astreamer  *streamer = NULL;
+
+	/* Should be here only for tar backup */
+	Assert(format == 't');
+
+	/*
+	 * To verify the contents of the tar file, the initial step is to parse
+	 * its content.
+	 */
+	streamer = astreamer_verify_content_new(streamer, context, archive_name,
+											tblspc_oid);
+	streamer = astreamer_tar_parser_new(streamer);
+
+	/*
+	 * If the tar file is compressed, we must perform the appropriate
+	 * decompression operation before proceeding with the verification of its
+	 * contents.
+	 */
+	if (compress_algorithm == PG_COMPRESSION_GZIP)
+		streamer = astreamer_gzip_decompressor_new(streamer);
+	else if (compress_algorithm == PG_COMPRESSION_LZ4)
+		streamer = astreamer_lz4_decompressor_new(streamer);
+	else if (compress_algorithm == PG_COMPRESSION_ZSTD)
+		streamer = astreamer_zstd_decompressor_new(streamer);
+
+	return streamer;
+}
+
 /*
  * Print a progress report based on the variables in verifier_context.
  *
diff --git a/src/bin/pg_verifybackup/pg_verifybackup.h b/src/bin/pg_verifybackup/pg_verifybackup.h
index c88f71ff14b..f0a7c8918fb 100644
--- a/src/bin/pg_verifybackup/pg_verifybackup.h
+++ b/src/bin/pg_verifybackup/pg_verifybackup.h
@@ -137,4 +137,13 @@ extern bool should_ignore_relpath(verifier_context *context,
 
 extern void progress_report(verifier_context *context, bool finished);
 
+/* Forward declarations to avoid fe_utils/astreamer.h include. */
+struct astreamer;
+typedef struct astreamer astreamer;
+
+extern astreamer *astreamer_verify_content_new(astreamer *next,
+											   verifier_context *context,
+											   char *archive_name,
+											   Oid tblspc_oid);
+
 #endif							/* PG_VERIFYBACKUP_H */
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 547d14b3e7c..d86b28b260e 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3329,6 +3329,7 @@ astreamer_plain_writer
 astreamer_recovery_injector
 astreamer_tar_archiver
 astreamer_tar_parser
+astreamer_verify
 astreamer_zstd_frame
 bgworker_main_type
 bh_node_type
-- 
2.18.0

From 44a78699dacfa90f36a37c8868ad11a50d53cb12 Mon Sep 17 00:00:00 2001
From: Amul Sul <amul.sul@enterprisedb.com>
Date: Thu, 1 Aug 2024 15:47:26 +0530
Subject: [PATCH v7 08/12] Refactor: split verify_control_file.

Separated the manifest entry verification code into a new function and
introduced the should_verify_control_data() macro, similar to
should_verify_checksum().

Note that should_verify_checksum() has been slightly modified to
include a NULL check for its argument, maintaining the same code
structure as should_verify_control_data().
---
 src/bin/pg_verifybackup/pg_verifybackup.c | 44 +++++++++++------------
 src/bin/pg_verifybackup/pg_verifybackup.h | 18 +++++++++-
 2 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/src/bin/pg_verifybackup/pg_verifybackup.c b/src/bin/pg_verifybackup/pg_verifybackup.c
index 3eddaa2468e..5f055a23a63 100644
--- a/src/bin/pg_verifybackup/pg_verifybackup.c
+++ b/src/bin/pg_verifybackup/pg_verifybackup.c
@@ -18,7 +18,6 @@
 #include <sys/stat.h>
 #include <time.h>
 
-#include "common/logging.h"
 #include "common/parse_manifest.h"
 #include "fe_utils/simple_list.h"
 #include "getopt_long.h"
@@ -61,8 +60,6 @@ static void verify_backup_directory(verifier_context *context,
 									char *relpath, char *fullpath);
 static void verify_backup_file(verifier_context *context,
 							   char *relpath, char *fullpath);
-static void verify_control_file(const char *controlpath,
-								uint64 manifest_system_identifier);
 static void report_extra_backup_files(verifier_context *context);
 static void verify_backup_checksums(verifier_context *context);
 static void verify_file_checksum(verifier_context *context,
@@ -625,14 +622,20 @@ verify_backup_file(verifier_context *context, char *relpath, char *fullpath)
 	/* Check the backup manifest entry for this file. */
 	m = verify_manifest_entry(context, relpath, sb.st_size);
 
-	/*
-	 * Validate the manifest system identifier, not available in manifest
-	 * version 1.
-	 */
-	if (context->manifest->version != 1 &&
-		strcmp(relpath, "global/pg_control") == 0 &&
-		m != NULL && m->matched && !m->bad)
-		verify_control_file(fullpath, context->manifest->system_identifier);
+	/* Validate the manifest system identifier */
+	if (should_verify_control_data(context->manifest, m))
+	{
+		ControlFileData *control_file;
+		bool		crc_ok;
+
+		pg_log_debug("reading \"%s\"", fullpath);
+		control_file = get_controlfile_by_exact_path(fullpath, &crc_ok);
+
+		verify_control_data(control_file, fullpath, crc_ok,
+							context->manifest->system_identifier);
+		/* Release memory. */
+		pfree(control_file);
+	}
 }
 
 /*
@@ -676,18 +679,14 @@ verify_manifest_entry(verifier_context *context, char *relpath, int64 filesize)
 }
 
 /*
- * Sanity check control file and validate system identifier against manifest
- * system identifier.
+ * Sanity check control file data and validate system identifier against
+ * manifest system identifier.
  */
-static void
-verify_control_file(const char *controlpath, uint64 manifest_system_identifier)
+void
+verify_control_data(ControlFileData *control_file,
+					const char *controlpath, bool crc_ok,
+					uint64 manifest_system_identifier)
 {
-	ControlFileData *control_file;
-	bool		crc_ok;
-
-	pg_log_debug("reading \"%s\"", controlpath);
-	control_file = get_controlfile_by_exact_path(controlpath, &crc_ok);
-
 	/* Control file contents not meaningful if CRC is bad. */
 	if (!crc_ok)
 		report_fatal_error("%s: CRC is incorrect", controlpath);
@@ -703,9 +702,6 @@ verify_control_file(const char *controlpath, uint64 manifest_system_identifier)
 						   controlpath,
 						   (unsigned long long) manifest_system_identifier,
 						   (unsigned long long) control_file->system_identifier);
-
-	/* Release memory. */
-	pfree(control_file);
 }
 
 /*
diff --git a/src/bin/pg_verifybackup/pg_verifybackup.h b/src/bin/pg_verifybackup/pg_verifybackup.h
index 1bc5f7a6b4a..c88f71ff14b 100644
--- a/src/bin/pg_verifybackup/pg_verifybackup.h
+++ b/src/bin/pg_verifybackup/pg_verifybackup.h
@@ -16,6 +16,7 @@
 
 #include "common/controldata_utils.h"
 #include "common/hashfn_unstable.h"
+#include "common/logging.h"
 #include "common/parse_manifest.h"
 #include "fe_utils/simple_list.h"
 
@@ -44,7 +45,19 @@ typedef struct manifest_file
 } manifest_file;
 
 #define should_verify_checksum(m) \
-	(((m)->matched) && !((m)->bad) && (((m)->checksum_type) != CHECKSUM_TYPE_NONE))
+	(((m) != NULL) && ((m)->matched) && !((m)->bad) && \
+	 (((m)->checksum_type) != CHECKSUM_TYPE_NONE))
+
+/*
+ * Validate the manifest system identifier against the control file; this
+ * feature is not available in manifest version 1.  This validation should be
+ * carried out only if the manifest entry validation is completed without any
+ * errors.
+ */
+#define should_verify_control_data(manifest, m) \
+	(((manifest)->version != 1) && \
+	 ((m) != NULL) && ((m)->matched) && !((m)->bad) && \
+	 (strcmp((m)->pathname, "global/pg_control") == 0))
 
 /*
  * Define a hash table which we can use to store information about the files
@@ -110,6 +123,9 @@ extern manifest_file *verify_manifest_entry(verifier_context *context,
 extern void verify_checksum(verifier_context *context, manifest_file *m,
 							pg_checksum_context *checksum_ctx,
 							uint8 *checksumbuf);
+extern void verify_control_data(ControlFileData *control_file,
+								const char *controlpath, bool crc_ok,
+								uint64 manifest_system_identifier);
 
 extern void report_backup_error(verifier_context *context,
 								const char *pg_restrict fmt,...)
-- 
2.18.0

From ee279a6a540a640857599951807438cda02f30d6 Mon Sep 17 00:00:00 2001
From: Amul Sul <amul.sul@enterprisedb.com>
Date: Fri, 2 Aug 2024 16:37:38 +0530
Subject: [PATCH v7 09/12] Refactor: move first and last progress_report call
 to Main.

---
 src/bin/pg_verifybackup/pg_verifybackup.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/bin/pg_verifybackup/pg_verifybackup.c b/src/bin/pg_verifybackup/pg_verifybackup.c
index 5f055a23a63..801e13886c2 100644
--- a/src/bin/pg_verifybackup/pg_verifybackup.c
+++ b/src/bin/pg_verifybackup/pg_verifybackup.c
@@ -253,7 +253,10 @@ main(int argc, char **argv)
 	 * read, which occurs only when checksum verification is enabled.
 	 */
 	if (!context.skip_checksums)
+	{
 		compute_total_size(&context);
+		progress_report(&context, false);
+	}
 
 	/*
 	 * Now scan the files in the backup directory. At this stage, we verify
@@ -275,7 +278,10 @@ main(int argc, char **argv)
 	 * told to skip it.
 	 */
 	if (!context.skip_checksums)
+	{
 		verify_backup_checksums(&context);
+		progress_report(&context, true);
+	}
 
 	/*
 	 * Try to parse the required ranges of WAL records, unless we were told
@@ -736,8 +742,6 @@ verify_backup_checksums(verifier_context *context)
 	manifest_file *m;
 	uint8	   *buffer;
 
-	progress_report(context, false);
-
 	buffer = pg_malloc(READ_CHUNK_SIZE * sizeof(uint8));
 
 	manifest_files_start_iterate(manifest->files, &it);
@@ -761,8 +765,6 @@ verify_backup_checksums(verifier_context *context)
 	}
 
 	pfree(buffer);
-
-	progress_report(context, true);
 }
 
 /*
-- 
2.18.0

From 37148e750aa68092eaa3e13c0c46cb4978c3f67a Mon Sep 17 00:00:00 2001
From: Amul Sul <amul.sul@enterprisedb.com>
Date: Thu, 1 Aug 2024 12:15:26 +0530
Subject: [PATCH v7 06/12] Refactor: split verify_backup_file() function.

Move the manifest entry verification code into a new function as
verify_manifest_entry(). And the total size computation code into
another new function, compute_total_size(), which is called from the
main.
---
 src/bin/pg_verifybackup/pg_verifybackup.c | 76 ++++++++++++++++++-----
 src/bin/pg_verifybackup/pg_verifybackup.h |  3 +
 2 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/src/bin/pg_verifybackup/pg_verifybackup.c b/src/bin/pg_verifybackup/pg_verifybackup.c
index 4e42757c346..ab6bda8c9dc 100644
--- a/src/bin/pg_verifybackup/pg_verifybackup.c
+++ b/src/bin/pg_verifybackup/pg_verifybackup.c
@@ -72,6 +72,7 @@ static void parse_required_wal(verifier_context *context,
 							   char *pg_waldump_path,
 							   char *wal_directory);
 
+static void compute_total_size(verifier_context *context);
 static void usage(void);
 
 static const char *progname;
@@ -250,6 +251,13 @@ main(int argc, char **argv)
 	 */
 	context.manifest = parse_manifest_file(manifest_path);
 
+	/*
+	 * For the progress report, compute the total size of the files to be
+	 * read, which occurs only when checksum verification is enabled.
+	 */
+	if (!context.skip_checksums)
+		compute_total_size(&context);
+
 	/*
 	 * Now scan the files in the backup directory. At this stage, we verify
 	 * that every file on disk is present in the manifest and that the sizes
@@ -614,6 +622,27 @@ verify_backup_file(verifier_context *context, char *relpath, char *fullpath)
 		return;
 	}
 
+	/* Check the backup manifest entry for this file. */
+	m = verify_manifest_entry(context, relpath, sb.st_size);
+
+	/*
+	 * Validate the manifest system identifier, not available in manifest
+	 * version 1.
+	 */
+	if (context->manifest->version != 1 &&
+		strcmp(relpath, "global/pg_control") == 0 &&
+		m != NULL && m->matched && !m->bad)
+		verify_control_file(fullpath, context->manifest->system_identifier);
+}
+
+/*
+ * Verify file and its size entry in the manifest.
+ */
+manifest_file *
+verify_manifest_entry(verifier_context *context, char *relpath, int64 filesize)
+{
+	manifest_file *m;
+
 	/* Check whether there's an entry in the manifest hash. */
 	m = manifest_files_lookup(context->manifest->files, relpath);
 	if (m == NULL)
@@ -621,40 +650,29 @@ verify_backup_file(verifier_context *context, char *relpath, char *fullpath)
 		report_backup_error(context,
 							"\"%s\" is present on disk but not in the manifest",
 							relpath);
-		return;
+		return NULL;
 	}
 
 	/* Flag this entry as having been encountered in the filesystem. */
 	m->matched = true;
 
 	/* Check that the size matches. */
-	if (m->size != sb.st_size)
+	if (m->size != filesize)
 	{
 		report_backup_error(context,
 							"\"%s\" has size %lld on disk but size %zu in the manifest",
-							relpath, (long long int) sb.st_size, m->size);
+							relpath, (long long int) filesize, m->size);
 		m->bad = true;
 	}
 
-	/*
-	 * Validate the manifest system identifier, not available in manifest
-	 * version 1.
-	 */
-	if (context->manifest->version != 1 &&
-		strcmp(relpath, "global/pg_control") == 0)
-		verify_control_file(fullpath, context->manifest->system_identifier);
-
-	/* Update statistics for progress report, if necessary */
-	if (context->show_progress && !context->skip_checksums &&
-		should_verify_checksum(m))
-		context->total_size += m->size;
-
 	/*
 	 * We don't verify checksums at this stage. We first finish verifying that
 	 * we have the expected set of files with the expected sizes, and only
 	 * afterwards verify the checksums. That's because computing checksums may
 	 * take a while, and we'd like to report more obvious problems quickly.
 	 */
+
+	return m;
 }
 
 /*
@@ -817,7 +835,7 @@ verify_file_checksum(verifier_context *context, manifest_file *m,
 
 	/*
 	 * Double-check that we read the expected number of bytes from the file.
-	 * Normally, a file size mismatch would be caught in verify_backup_file
+	 * Normally, a file size mismatch would be caught in verify_manifest_entry
 	 * and this check would never be reached, but this provides additional
 	 * safety and clarity in the event of concurrent modifications or
 	 * filesystem misbehavior.
@@ -988,6 +1006,30 @@ progress_report(verifier_context *context, bool finished)
 	fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
 }
 
+/*
+ * Compute the total size of backup files for progress reporting.
+ */
+static void
+compute_total_size(verifier_context *context)
+{
+	manifest_data *manifest = context->manifest;
+	manifest_files_iterator it;
+	manifest_file *m;
+	uint64		total_size = 0;
+
+	if (!context->show_progress)
+		return;
+
+	manifest_files_start_iterate(manifest->files, &it);
+	while ((m = manifest_files_iterate(manifest->files, &it)) != NULL)
+	{
+		if (!should_ignore_relpath(context, m->pathname))
+			total_size += m->size;
+	}
+
+	context->total_size = total_size;
+}
+
 /*
  * Print out usage information and exit.
  */
diff --git a/src/bin/pg_verifybackup/pg_verifybackup.h b/src/bin/pg_verifybackup/pg_verifybackup.h
index 90900048547..98c75916255 100644
--- a/src/bin/pg_verifybackup/pg_verifybackup.h
+++ b/src/bin/pg_verifybackup/pg_verifybackup.h
@@ -105,6 +105,9 @@ typedef struct verifier_context
 	uint64		done_size;
 } verifier_context;
 
+extern manifest_file *verify_manifest_entry(verifier_context *context,
+											char *relpath, int64 filesize);
+
 extern void report_backup_error(verifier_context *context,
 								const char *pg_restrict fmt,...)
 			pg_attribute_printf(2, 3);
-- 
2.18.0

From 6ec5031183fe3c42f0dbc69ba5374ea302116ef7 Mon Sep 17 00:00:00 2001
From: Amul Sul <amul.sul@enterprisedb.com>
Date: Thu, 1 Aug 2024 16:45:55 +0530
Subject: [PATCH v7 07/12] Refactor: split verify_file_checksum() function.

Move the core functionality of verify_file_checksum to a new function
to reuse it instead of duplicating the code.
---
 src/bin/pg_verifybackup/pg_verifybackup.c | 18 ++++++++++++++++--
 src/bin/pg_verifybackup/pg_verifybackup.h |  3 +++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/bin/pg_verifybackup/pg_verifybackup.c b/src/bin/pg_verifybackup/pg_verifybackup.c
index ab6bda8c9dc..3eddaa2468e 100644
--- a/src/bin/pg_verifybackup/pg_verifybackup.c
+++ b/src/bin/pg_verifybackup/pg_verifybackup.c
@@ -782,7 +782,6 @@ verify_file_checksum(verifier_context *context, manifest_file *m,
 	int			rc;
 	size_t		bytes_read = 0;
 	uint8		checksumbuf[PG_CHECKSUM_MAX_LENGTH];
-	int			checksumlen;
 
 	/* Open the target file. */
 	if ((fd = open(fullpath, O_RDONLY | PG_BINARY, 0)) < 0)
@@ -848,8 +847,23 @@ verify_file_checksum(verifier_context *context, manifest_file *m,
 		return;
 	}
 
+	/* Do the final computation and verification. */
+	verify_checksum(context, m, &checksum_ctx, checksumbuf);
+}
+
+/*
+ * A helper function to finalize checksum computation and verify it against the
+ * backup manifest information.
+ */
+void
+verify_checksum(verifier_context *context, manifest_file *m,
+				pg_checksum_context *checksum_ctx, uint8 *checksumbuf)
+{
+	int			checksumlen;
+	const char *relpath = m->pathname;
+
 	/* Get the final checksum. */
-	checksumlen = pg_checksum_final(&checksum_ctx, checksumbuf);
+	checksumlen = pg_checksum_final(checksum_ctx, checksumbuf);
 	if (checksumlen < 0)
 	{
 		report_backup_error(context,
diff --git a/src/bin/pg_verifybackup/pg_verifybackup.h b/src/bin/pg_verifybackup/pg_verifybackup.h
index 98c75916255..1bc5f7a6b4a 100644
--- a/src/bin/pg_verifybackup/pg_verifybackup.h
+++ b/src/bin/pg_verifybackup/pg_verifybackup.h
@@ -107,6 +107,9 @@ typedef struct verifier_context
 
 extern manifest_file *verify_manifest_entry(verifier_context *context,
 											char *relpath, int64 filesize);
+extern void verify_checksum(verifier_context *context, manifest_file *m,
+							pg_checksum_context *checksum_ctx,
+							uint8 *checksumbuf);
 
 extern void report_backup_error(verifier_context *context,
 								const char *pg_restrict fmt,...)
-- 
2.18.0

From bb127a0c8f25d15ee9c10d111be7611ce58dcb39 Mon Sep 17 00:00:00 2001
From: Amul Sul <amul.sul@enterprisedb.com>
Date: Thu, 1 Aug 2024 12:10:34 +0530
Subject: [PATCH v7 05/12] Refactor: move some part of pg_verifybackup.c to
 pg_verifybackup.h

---
 src/bin/pg_verifybackup/pg_verifybackup.c | 102 +------------------
 src/bin/pg_verifybackup/pg_verifybackup.h | 118 ++++++++++++++++++++++
 2 files changed, 123 insertions(+), 97 deletions(-)
 create mode 100644 src/bin/pg_verifybackup/pg_verifybackup.h

diff --git a/src/bin/pg_verifybackup/pg_verifybackup.c b/src/bin/pg_verifybackup/pg_verifybackup.c
index 71585ffc50e..4e42757c346 100644
--- a/src/bin/pg_verifybackup/pg_verifybackup.c
+++ b/src/bin/pg_verifybackup/pg_verifybackup.c
@@ -18,12 +18,11 @@
 #include <sys/stat.h>
 #include <time.h>
 
-#include "common/controldata_utils.h"
-#include "common/hashfn_unstable.h"
 #include "common/logging.h"
 #include "common/parse_manifest.h"
 #include "fe_utils/simple_list.h"
 #include "getopt_long.h"
+#include "pg_verifybackup.h"
 #include "pgtime.h"
 
 /*
@@ -40,89 +39,6 @@
  */
 #define ESTIMATED_BYTES_PER_MANIFEST_LINE	100
 
-/*
- * How many bytes should we try to read from a file at once?
- */
-#define READ_CHUNK_SIZE				(128 * 1024)
-
-/*
- * Each file described by the manifest file is parsed to produce an object
- * like this.
- */
-typedef struct manifest_file
-{
-	uint32		status;			/* hash status */
-	const char *pathname;
-	size_t		size;
-	pg_checksum_type checksum_type;
-	int			checksum_length;
-	uint8	   *checksum_payload;
-	bool		matched;
-	bool		bad;
-} manifest_file;
-
-#define should_verify_checksum(m) \
-	(((m)->matched) && !((m)->bad) && (((m)->checksum_type) != CHECKSUM_TYPE_NONE))
-
-/*
- * Define a hash table which we can use to store information about the files
- * mentioned in the backup manifest.
- */
-#define SH_PREFIX		manifest_files
-#define SH_ELEMENT_TYPE	manifest_file
-#define SH_KEY_TYPE		const char *
-#define	SH_KEY			pathname
-#define SH_HASH_KEY(tb, key)	hash_string(key)
-#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
-#define	SH_SCOPE		static inline
-#define SH_RAW_ALLOCATOR	pg_malloc0
-#define SH_DECLARE
-#define SH_DEFINE
-#include "lib/simplehash.h"
-
-/*
- * Each WAL range described by the manifest file is parsed to produce an
- * object like this.
- */
-typedef struct manifest_wal_range
-{
-	TimeLineID	tli;
-	XLogRecPtr	start_lsn;
-	XLogRecPtr	end_lsn;
-	struct manifest_wal_range *next;
-	struct manifest_wal_range *prev;
-} manifest_wal_range;
-
-/*
- * All the data parsed from a backup_manifest file.
- */
-typedef struct manifest_data
-{
-	int			version;
-	uint64		system_identifier;
-	manifest_files_hash *files;
-	manifest_wal_range *first_wal_range;
-	manifest_wal_range *last_wal_range;
-} manifest_data;
-
-/*
- * All of the context information we need while checking a backup manifest.
- */
-typedef struct verifier_context
-{
-	manifest_data *manifest;
-	char	   *backup_directory;
-	SimpleStringList ignore_list;
-	bool		skip_checksums;
-	bool		exit_on_error;
-	bool		saw_any_error;
-
-	/* Progress indicators */
-	bool		show_progress;
-	uint64		total_size;
-	uint64		done_size;
-} verifier_context;
-
 static manifest_data *parse_manifest_file(char *manifest_path);
 static void verifybackup_version_cb(JsonManifestParseContext *context,
 									int manifest_version);
@@ -156,14 +72,6 @@ static void parse_required_wal(verifier_context *context,
 							   char *pg_waldump_path,
 							   char *wal_directory);
 
-static void report_backup_error(verifier_context *context,
-								const char *pg_restrict fmt,...)
-			pg_attribute_printf(2, 3);
-static void report_fatal_error(const char *pg_restrict fmt,...)
-			pg_attribute_printf(1, 2) pg_attribute_noreturn();
-static bool should_ignore_relpath(verifier_context *context, const char *relpath);
-
-static void progress_report(verifier_context *context, bool finished);
 static void usage(void);
 
 static const char *progname;
@@ -978,7 +886,7 @@ parse_required_wal(verifier_context *context, char *pg_waldump_path,
  * Update the context to indicate that we saw an error, and exit if the
  * context says we should.
  */
-static void
+void
 report_backup_error(verifier_context *context, const char *pg_restrict fmt,...)
 {
 	va_list		ap;
@@ -995,7 +903,7 @@ report_backup_error(verifier_context *context, const char *pg_restrict fmt,...)
 /*
  * Report a fatal error and exit
  */
-static void
+void
 report_fatal_error(const char *pg_restrict fmt,...)
 {
 	va_list		ap;
@@ -1014,7 +922,7 @@ report_fatal_error(const char *pg_restrict fmt,...)
  * Note that by "prefix" we mean a parent directory; for this purpose,
  * "aa/bb" is not a prefix of "aa/bbb", but it is a prefix of "aa/bb/cc".
  */
-static bool
+bool
 should_ignore_relpath(verifier_context *context, const char *relpath)
 {
 	SimpleStringListCell *cell;
@@ -1043,7 +951,7 @@ should_ignore_relpath(verifier_context *context, const char *relpath)
  * If finished is set to true, this is the last progress report. The cursor
  * is moved to the next line.
  */
-static void
+void
 progress_report(verifier_context *context, bool finished)
 {
 	static pg_time_t last_progress_report = 0;
diff --git a/src/bin/pg_verifybackup/pg_verifybackup.h b/src/bin/pg_verifybackup/pg_verifybackup.h
new file mode 100644
index 00000000000..90900048547
--- /dev/null
+++ b/src/bin/pg_verifybackup/pg_verifybackup.h
@@ -0,0 +1,118 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_verifybackup.h
+ *	  Verify a backup against a backup manifest.
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/bin/pg_verifybackup/pg_verifybackup.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef PG_VERIFYBACKUP_H
+#define PG_VERIFYBACKUP_H
+
+#include "common/controldata_utils.h"
+#include "common/hashfn_unstable.h"
+#include "common/parse_manifest.h"
+#include "fe_utils/simple_list.h"
+
+extern bool show_progress;
+extern bool skip_checksums;
+
+/*
+ * How many bytes should we try to read from a file at once?
+ */
+#define READ_CHUNK_SIZE				(128 * 1024)
+
+/*
+ * Each file described by the manifest file is parsed to produce an object
+ * like this.
+ */
+typedef struct manifest_file
+{
+	uint32		status;			/* hash status */
+	const char *pathname;
+	size_t		size;
+	pg_checksum_type checksum_type;
+	int			checksum_length;
+	uint8	   *checksum_payload;
+	bool		matched;
+	bool		bad;
+} manifest_file;
+
+#define should_verify_checksum(m) \
+	(((m)->matched) && !((m)->bad) && (((m)->checksum_type) != CHECKSUM_TYPE_NONE))
+
+/*
+ * Define a hash table which we can use to store information about the files
+ * mentioned in the backup manifest.
+ */
+#define SH_PREFIX		manifest_files
+#define SH_ELEMENT_TYPE	manifest_file
+#define SH_KEY_TYPE		const char *
+#define	SH_KEY			pathname
+#define SH_HASH_KEY(tb, key)	hash_string(key)
+#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
+#define	SH_SCOPE		static inline
+#define SH_RAW_ALLOCATOR	pg_malloc0
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+/*
+ * Each WAL range described by the manifest file is parsed to produce an
+ * object like this.
+ */
+typedef struct manifest_wal_range
+{
+	TimeLineID	tli;
+	XLogRecPtr	start_lsn;
+	XLogRecPtr	end_lsn;
+	struct manifest_wal_range *next;
+	struct manifest_wal_range *prev;
+} manifest_wal_range;
+
+/*
+ * All the data parsed from a backup_manifest file.
+ */
+typedef struct manifest_data
+{
+	int			version;
+	uint64		system_identifier;
+	manifest_files_hash *files;
+	manifest_wal_range *first_wal_range;
+	manifest_wal_range *last_wal_range;
+} manifest_data;
+
+/*
+ * All of the context information we need while checking a backup manifest.
+ */
+typedef struct verifier_context
+{
+	manifest_data *manifest;
+	char	   *backup_directory;
+	SimpleStringList ignore_list;
+	bool		skip_checksums;
+	bool		exit_on_error;
+	bool		saw_any_error;
+
+	/* Progress indicators */
+	bool		show_progress;
+	uint64		total_size;
+	uint64		done_size;
+} verifier_context;
+
+extern void report_backup_error(verifier_context *context,
+								const char *pg_restrict fmt,...)
+			pg_attribute_printf(2, 3);
+extern void report_fatal_error(const char *pg_restrict fmt,...)
+			pg_attribute_printf(1, 2) pg_attribute_noreturn();
+extern bool should_ignore_relpath(verifier_context *context,
+								  const char *relpath);
+
+extern void progress_report(verifier_context *context, bool finished);
+
+#endif							/* PG_VERIFYBACKUP_H */
-- 
2.18.0

From 0ef14ab4be362f6ab48c6ebd501d3036ba4d21d9 Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Tue, 6 Aug 2024 10:23:45 +0530
Subject: [PATCH v7] Improve file header comments for astramer code.

Make it clear that "astreamer" stands for "archive streamer".
Generalize comments that still believe this code can only be used
by pg_basebackup. Add some comments explaining the asymmetry
between the gzip, lz4, and zstd astreamers, in the hopes of making
life easier for anyone who hacks on this code in the future.
---
 src/fe_utils/astreamer_file.c    |  4 ++++
 src/fe_utils/astreamer_gzip.c    | 15 +++++++++++++++
 src/fe_utils/astreamer_lz4.c     |  4 ++++
 src/fe_utils/astreamer_zstd.c    |  4 ++++
 src/include/fe_utils/astreamer.h | 21 +++++++++++++++------
 5 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/src/fe_utils/astreamer_file.c b/src/fe_utils/astreamer_file.c
index 13d1192c6e6..c9a030853bc 100644
--- a/src/fe_utils/astreamer_file.c
+++ b/src/fe_utils/astreamer_file.c
@@ -2,6 +2,10 @@
  *
  * astreamer_file.c
  *
+ * Archive streamers that write to files. astreamer_plain_writer writes
+ * the whole archive to a single file, and astreamer_extractor writes
+ * each archive member to a separate file in a given directory.
+ *
  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
diff --git a/src/fe_utils/astreamer_gzip.c b/src/fe_utils/astreamer_gzip.c
index dd28defac7b..1c773a23848 100644
--- a/src/fe_utils/astreamer_gzip.c
+++ b/src/fe_utils/astreamer_gzip.c
@@ -2,6 +2,21 @@
  *
  * astreamer_gzip.c
  *
+ * Archive streamers that deal with data compressed using gzip.
+ * astreamer_gzip_writer applies gzip compression to the input data
+ * and writes the result to a file. astreamer_gzip_decompressor assumes
+ * that the input stream is compressed using gzip and decompresses it.
+ *
+ * Note that the code in this file is asymmetric with what we do for
+ * other compression types: for lz4 and zstd, there is a compressor and
+ * a decompressor, rather than a writer and a decompressor. The approach
+ * taken here is less flexible, because a writer can only write to a file,
+ * while a compressor can write to a subsequent astreamer which is free
+ * to do whatever it likes. The reason it's like this is because this
+ * code was adapated from old, less-modular pg_basebackup that used the
+ * same APIs that astreamer_gzip_writer uses, and it didn't seem
+ * necessary to change anything at the time.
+ *
  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
diff --git a/src/fe_utils/astreamer_lz4.c b/src/fe_utils/astreamer_lz4.c
index d8b2a367e47..2bf14084e7f 100644
--- a/src/fe_utils/astreamer_lz4.c
+++ b/src/fe_utils/astreamer_lz4.c
@@ -2,6 +2,10 @@
  *
  * astreamer_lz4.c
  *
+ * Archive streamers that deal with data compressed using lz4.
+ * astreamer_lz4_compressor applies lz4 compression to the input stream,
+ * and astreamer_lz4_decompressor does the reverse.
+ *
  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
diff --git a/src/fe_utils/astreamer_zstd.c b/src/fe_utils/astreamer_zstd.c
index 45f6cb67363..4b2d42b2311 100644
--- a/src/fe_utils/astreamer_zstd.c
+++ b/src/fe_utils/astreamer_zstd.c
@@ -2,6 +2,10 @@
  *
  * astreamer_zstd.c
  *
+ * Archive streamers that deal with data compressed using zstd.
+ * astreamer_zstd_compressor applies lz4 compression to the input stream,
+ * and astreamer_zstd_decompressor does the reverse.
+ *
  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
diff --git a/src/include/fe_utils/astreamer.h b/src/include/fe_utils/astreamer.h
index 2c014dbddbe..570cfba3040 100644
--- a/src/include/fe_utils/astreamer.h
+++ b/src/include/fe_utils/astreamer.h
@@ -2,9 +2,18 @@
  *
  * astreamer.h
  *
- * Each tar archive returned by the server is passed to one or more
- * astreamer objects for further processing. The astreamer may do
- * something simple, like write the archive to a file, perhaps after
+ * The "archive streamer" interface is intended to allow frontend code
+ * to stream from possibly-compressed archive files from any source and
+ * perform arbitrary actions based on the contents of those archives.
+ * Archive streamers are intended to be composable, and most tasks will
+ * require two or more archive streamers to complete. For instance,
+ * if the input is an uncompressed tar stream, a tar parser astreamer
+ * could be used to interpret it, and then an extractor astreamer could
+ * be used to write each archive member out to a file.
+ *
+ * In general, each archive streamer is relatively free to take whatever
+ * action it desires in the stream of chunks provided by the caller. It
+ * may do something simple, like write the archive to a file, perhaps after
  * compressing it, but it can also do more complicated things, like
  * annotating the byte stream to indicate which parts of the data
  * correspond to tar headers or trailing padding, vs. which parts are
@@ -33,9 +42,9 @@ typedef struct astreamer_ops astreamer_ops;
 
 /*
  * Each chunk of archive data passed to a astreamer is classified into one
- * of these categories. When data is first received from the remote server,
- * each chunk will be categorized as ASTREAMER_UNKNOWN, and the chunks will
- * be of whatever size the remote server chose to send.
+ * of these categories. When data is initially passed to an archive streamer,
+ * each chunk will be categorized as ASTREAMER_UNKNOWN, and the chunks can
+ * be of whatever size the caller finds convenient.
  *
  * If the archive is parsed (e.g. see astreamer_tar_parser_new()), then all
  * chunks should be labelled as one of the other types listed here. In
-- 
2.18.0

From 43fb489aecb872cc6f9a59ebbdca9c5a1110ea72 Mon Sep 17 00:00:00 2001
From: Amul Sul <amul.sul@enterprisedb.com>
Date: Wed, 31 Jul 2024 11:43:52 +0530
Subject: [PATCH v7 04/12] Refactor: move few global variable to
 verifier_context struct

Global variables are:
	1. show_progress
	2. skip_checksums
	3. total_size
	4. done_size
---
 src/bin/pg_verifybackup/pg_verifybackup.c | 50 +++++++++++------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/bin/pg_verifybackup/pg_verifybackup.c b/src/bin/pg_verifybackup/pg_verifybackup.c
index d77e70fbe38..71585ffc50e 100644
--- a/src/bin/pg_verifybackup/pg_verifybackup.c
+++ b/src/bin/pg_verifybackup/pg_verifybackup.c
@@ -113,8 +113,14 @@ typedef struct verifier_context
 	manifest_data *manifest;
 	char	   *backup_directory;
 	SimpleStringList ignore_list;
+	bool		skip_checksums;
 	bool		exit_on_error;
 	bool		saw_any_error;
+
+	/* Progress indicators */
+	bool		show_progress;
+	uint64		total_size;
+	uint64		done_size;
 } verifier_context;
 
 static manifest_data *parse_manifest_file(char *manifest_path);
@@ -157,19 +163,11 @@ static void report_fatal_error(const char *pg_restrict fmt,...)
 			pg_attribute_printf(1, 2) pg_attribute_noreturn();
 static bool should_ignore_relpath(verifier_context *context, const char *relpath);
 
-static void progress_report(bool finished);
+static void progress_report(verifier_context *context, bool finished);
 static void usage(void);
 
 static const char *progname;
 
-/* options */
-static bool show_progress = false;
-static bool skip_checksums = false;
-
-/* Progress indicators */
-static uint64 total_size = 0;
-static uint64 done_size = 0;
-
 /*
  * Main entry point.
  */
@@ -260,13 +258,13 @@ main(int argc, char **argv)
 				no_parse_wal = true;
 				break;
 			case 'P':
-				show_progress = true;
+				context.show_progress = true;
 				break;
 			case 'q':
 				quiet = true;
 				break;
 			case 's':
-				skip_checksums = true;
+				context.skip_checksums = true;
 				break;
 			case 'w':
 				wal_directory = pstrdup(optarg);
@@ -299,7 +297,7 @@ main(int argc, char **argv)
 	}
 
 	/* Complain if the specified arguments conflict */
-	if (show_progress && quiet)
+	if (context.show_progress && quiet)
 		pg_fatal("cannot specify both %s and %s",
 				 "-P/--progress", "-q/--quiet");
 
@@ -363,7 +361,7 @@ main(int argc, char **argv)
 	 * Now do the expensive work of verifying file checksums, unless we were
 	 * told to skip it.
 	 */
-	if (!skip_checksums)
+	if (!context.skip_checksums)
 		verify_backup_checksums(&context);
 
 	/*
@@ -739,8 +737,9 @@ verify_backup_file(verifier_context *context, char *relpath, char *fullpath)
 		verify_control_file(fullpath, context->manifest->system_identifier);
 
 	/* Update statistics for progress report, if necessary */
-	if (show_progress && !skip_checksums && should_verify_checksum(m))
-		total_size += m->size;
+	if (context->show_progress && !context->skip_checksums &&
+		should_verify_checksum(m))
+		context->total_size += m->size;
 
 	/*
 	 * We don't verify checksums at this stage. We first finish verifying that
@@ -815,7 +814,7 @@ verify_backup_checksums(verifier_context *context)
 	manifest_file *m;
 	uint8	   *buffer;
 
-	progress_report(false);
+	progress_report(context, false);
 
 	buffer = pg_malloc(READ_CHUNK_SIZE * sizeof(uint8));
 
@@ -841,7 +840,7 @@ verify_backup_checksums(verifier_context *context)
 
 	pfree(buffer);
 
-	progress_report(true);
+	progress_report(context, true);
 }
 
 /*
@@ -889,8 +888,8 @@ verify_file_checksum(verifier_context *context, manifest_file *m,
 		}
 
 		/* Report progress */
-		done_size += rc;
-		progress_report(false);
+		context->done_size += rc;
+		progress_report(context, false);
 	}
 	if (rc < 0)
 		report_backup_error(context, "could not read file \"%s\": %m",
@@ -1036,7 +1035,7 @@ should_ignore_relpath(verifier_context *context, const char *relpath)
 }
 
 /*
- * Print a progress report based on the global variables.
+ * Print a progress report based on the variables in verifier_context.
  *
  * Progress report is written at maximum once per second, unless the finished
  * parameter is set to true.
@@ -1045,7 +1044,7 @@ should_ignore_relpath(verifier_context *context, const char *relpath)
  * is moved to the next line.
  */
 static void
-progress_report(bool finished)
+progress_report(verifier_context *context, bool finished)
 {
 	static pg_time_t last_progress_report = 0;
 	pg_time_t	now;
@@ -1053,7 +1052,7 @@ progress_report(bool finished)
 	char		totalsize_str[32];
 	char		donesize_str[32];
 
-	if (!show_progress)
+	if (!context->show_progress)
 		return;
 
 	now = time(NULL);
@@ -1061,12 +1060,13 @@ progress_report(bool finished)
 		return;					/* Max once per second */
 
 	last_progress_report = now;
-	percent_size = total_size ? (int) ((done_size * 100 / total_size)) : 0;
+	percent_size = context->total_size ?
+		(int) ((context->done_size * 100 / context->total_size)) : 0;
 
 	snprintf(totalsize_str, sizeof(totalsize_str), UINT64_FORMAT,
-			 total_size / 1024);
+			 context->total_size / 1024);
 	snprintf(donesize_str, sizeof(donesize_str), UINT64_FORMAT,
-			 done_size / 1024);
+			 context->done_size / 1024);
 
 	fprintf(stderr,
 			_("%*s/%s kB (%d%%) verified"),
-- 
2.18.0

Reply via email to