On Wed, Dec 24, 2014 at 9:03 PM, Michael Paquier
<michael.paqu...@gmail.com> wrote:
> Returning only a boolean is fine for me (that's what my first patch
> did), especially if we add at some point hooks for compression and
> decompression calls.
Here is a patch rebased on current HEAD (60838df) for the core feature
with the APIs of pglz using booleans as return values.
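To give an idea of the calling convention, the new boolean-returning calls
reduce to something like the sketch below (not part of the patch; the
function names are only illustrative, the real callers being
XLogCompressBackupBlock() and RestoreBlockImage() in the attached patch):

#include "postgres.h"
#include "common/pg_lzcompress.h"

/*
 * Compress a block image of BLCKSZ bytes; "dest" must be able to hold
 * PGLZ_MAX_OUTPUT(BLCKSZ) bytes.  Returns false if the data is not
 * compressible.
 */
static bool
compress_block_sketch(char *source, char *dest, uint16 *len)
{
	if (!pglz_compress(source, BLCKSZ, (PGLZ_Header *) dest,
					   PGLZ_strategy_default))
		return false;
	*len = VARSIZE((struct varlena *) dest);
	return true;
}

/*
 * Decompress a block image into a BLCKSZ-byte buffer; returns false on
 * corrupted data.
 */
static bool
decompress_block_sketch(char *source, char *dest)
{
	return pglz_decompress((PGLZ_Header *) source, dest);
}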
-- 
Michael
From c8891e6086682d4e5bc197ef3047068b3a3875c5 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@otacoo.com>
Date: Tue, 25 Nov 2014 14:24:26 +0900
Subject: [PATCH] Support compression for full-page writes in WAL

Compression is controlled with a new parameter called wal_compression.
This parameter can be changed at the session level.
---
 contrib/pg_xlogdump/pg_xlogdump.c             |  20 ++--
 doc/src/sgml/config.sgml                      |  29 +++++
 src/backend/access/transam/xlog.c             |   1 +
 src/backend/access/transam/xloginsert.c       | 155 ++++++++++++++++++++++----
 src/backend/access/transam/xlogreader.c       |  77 ++++++++++---
 src/backend/utils/misc/guc.c                  |   9 ++
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/access/xlog.h                     |   1 +
 src/include/access/xlogreader.h               |   7 +-
 src/include/access/xlogrecord.h               |  36 ++++--
 src/include/pg_config.h.in                    |   4 +-
 11 files changed, 283 insertions(+), 57 deletions(-)

diff --git a/contrib/pg_xlogdump/pg_xlogdump.c b/contrib/pg_xlogdump/pg_xlogdump.c
index 9f05e25..3ba1d8f 100644
--- a/contrib/pg_xlogdump/pg_xlogdump.c
+++ b/contrib/pg_xlogdump/pg_xlogdump.c
@@ -363,15 +363,12 @@ XLogDumpCountRecord(XLogDumpConfig *config, XLogDumpStats *stats,
 	 * takes up BLCKSZ bytes, minus the "hole" length.
 	 *
 	 * XXX: We peek into xlogreader's private decoded backup blocks for the
-	 * hole_length. It doesn't seem worth it to add an accessor macro for
+	 * the block length. It doesn't seem worth it to add an accessor macro for
 	 * this.
 	 */
 	fpi_len = 0;
 	for (block_id = 0; block_id <= record->max_block_id; block_id++)
-	{
-		if (XLogRecHasBlockImage(record, block_id))
-			fpi_len += BLCKSZ - record->blocks[block_id].hole_length;
-	}
+		fpi_len += record->blocks[block_id].bkp_len;
 
 	/* Update per-rmgr statistics */
 
@@ -465,9 +462,16 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record)
 				   blk);
 			if (XLogRecHasBlockImage(record, block_id))
 			{
-				printf(" (FPW); hole: offset: %u, length: %u\n",
-					   record->blocks[block_id].hole_offset,
-					   record->blocks[block_id].hole_length);
+				if (record->blocks[block_id].is_compressed)
+					printf(" (FPW compressed); hole offset: %u, "
+						   "compressed length: %u, original length: %u\n",
+						   record->blocks[block_id].hole_offset,
+						   record->blocks[block_id].bkp_len,
+						   record->blocks[block_id].bkp_uncompress_len);
+				else
+					printf(" (FPW); hole offset: %u, length: %u\n",
+						   record->blocks[block_id].hole_offset,
+						   record->blocks[block_id].bkp_len);
 			}
 			putchar('\n');
 		}
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 6bcb106..acbbd20 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2282,6 +2282,35 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-wal-compression" xreflabel="wal_compression">
+      <term><varname>wal_compression</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>wal_compression</> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        When this parameter is <literal>on</>, the <productname>PostgreSQL</>
+        server compresses the content of full-page writes when necessary,
+        inserting them into WAL as records of smaller size and reducing
+        the amount of WAL stored on disk.
+       </para>
+
+       <para>
+        Compression has the advantage of reducing the amount of disk I/O
+        done for WAL-logging, at the cost of some extra CPU spent on the
+        compression of each block image.  At WAL replay, compressed block
+        images need extra CPU cycles to perform the decompression of each
+        block image, but this can also reduce replay time in I/O-bound
+        environments.
+       </para>
+
+       <para>
+        The default value is <literal>off</>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-wal-buffers" xreflabel="wal_buffers">
       <term><varname>wal_buffers</varname> (<type>integer</type>)
       <indexterm>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index e5dddd4..d68d9e3 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -88,6 +88,7 @@ char	   *XLogArchiveCommand = NULL;
 bool		EnableHotStandby = false;
 bool		fullPageWrites = true;
 bool		wal_log_hints = false;
+bool		wal_compression = false;
 bool		log_checkpoints = false;
 int			sync_method = DEFAULT_SYNC_METHOD;
 int			wal_level = WAL_LEVEL_MINIMAL;
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index f3d610f..a1496aa 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -24,12 +24,16 @@
 #include "access/xlog_internal.h"
 #include "access/xloginsert.h"
 #include "catalog/pg_control.h"
+#include "common/pg_lzcompress.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/proc.h"
 #include "utils/memutils.h"
 #include "pg_trace.h"
 
+/* maximum size of the compression buffer for a block image */
+#define PGLZ_MAX_BLCKSZ	PGLZ_MAX_OUTPUT(BLCKSZ)
+
 /*
  * For each block reference registered with XLogRegisterBuffer, we fill in
  * a registered_buffer struct.
@@ -50,6 +54,8 @@ typedef struct
 
 	XLogRecData bkp_rdatas[2];	/* temporary rdatas used to hold references to
 								 * backup block data in XLogRecordAssemble() */
+	char		compressed_page[PGLZ_MAX_BLCKSZ]; /* destination buffer
+												   * for compressed page */
 }	registered_buffer;
 
 static registered_buffer *registered_buffers;
@@ -81,6 +87,9 @@ static char *hdr_scratch = NULL;
 	 MaxSizeOfXLogRecordBlockHeader * (XLR_MAX_BLOCK_ID + 1) + \
 	 SizeOfXLogRecordDataHeaderLong)
 
+/* Scratch buffer holding block image data to be compressed */
+static char *compression_scratch = NULL;
+
 /*
  * An array of XLogRecData structs, to hold registered data.
  */
@@ -97,6 +106,9 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info,
 				   XLogRecPtr RedoRecPtr, bool doPageWrites,
 				   XLogRecPtr *fpw_lsn);
 
+static bool XLogCompressBackupBlock(char *page, uint32 orig_len,
+									char *dest, uint16 *len);
+
 /*
  * Begin constructing a WAL record. This must be called before the
  * XLogRegister* functions and XLogInsert().
@@ -529,9 +541,14 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
 		if (needs_backup)
 		{
 			Page		page = regbuf->page;
+			uint16		hole_length;
+			uint16		hole_offset;
+			uint16		compress_len = 0;
+			bool		is_compressed = false;
 
 			/*
-			 * The page needs to be backed up, so set up *bimg
+			 * The page needs to be backed up, so calculate its hole length
+			 * and offset.
 			 */
 			if (regbuf->flags & REGBUF_STANDARD)
 			{
@@ -543,49 +560,107 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
 					upper > lower &&
 					upper <= BLCKSZ)
 				{
-					bimg.hole_offset = lower;
-					bimg.hole_length = upper - lower;
+					hole_offset = lower;
+					hole_length = upper - lower;
 				}
 				else
 				{
 					/* No "hole" to compress out */
-					bimg.hole_offset = 0;
-					bimg.hole_length = 0;
+					hole_offset = 0;
+					hole_length = 0;
 				}
 			}
 			else
 			{
 				/* Not a standard page header, don't try to eliminate "hole" */
-				bimg.hole_offset = 0;
-				bimg.hole_length = 0;
+				hole_offset = 0;
+				hole_length = 0;
 			}
 
-			/* Fill in the remaining fields in the XLogRecordBlockData struct */
-			bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE;
+			/*
+			 * First try to compress the block without its hole, to improve
+			 * the compression ratio of the whole image. If the block turns
+			 * out not to be compressible, fill in the block header
+			 * information for an uncompressed image instead.
+			 */
+			if (wal_compression)
+			{
+				int page_len = BLCKSZ - hole_length;
+				char *scratch_buf;
+
+				if (hole_length != 0)
+				{
+					scratch_buf = compression_scratch;
+					memcpy(scratch_buf, page, hole_offset);
+					memcpy(scratch_buf + hole_offset,
+						   page + (hole_offset + hole_length),
+						   BLCKSZ - (hole_length + hole_offset));
+				}
+				else
+					scratch_buf = page;
 
-			total_len += BLCKSZ - bimg.hole_length;
+				/* Perform compression of block */
+				if (XLogCompressBackupBlock(scratch_buf,
+											page_len,
+											regbuf->compressed_page,
+											&compress_len))
+				{
+					/* compression succeeded, mark the block as compressed */
+					is_compressed = true;
+				}
+			}
 
 			/*
-			 * Construct XLogRecData entries for the page content.
+			 * Fill in the remaining fields in the XLogRecordBlockImageHeader
+			 * struct and add new entries in the record chain.
 			 */
-			rdt_datas_last->next = &regbuf->bkp_rdatas[0];
-			rdt_datas_last = rdt_datas_last->next;
-			if (bimg.hole_length == 0)
+			bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE;
+
+			/* the hole offset must fit in 15 bits */
+			Assert((hole_offset & 0x8000) == 0);
+
+			if (is_compressed)
 			{
-				rdt_datas_last->data = page;
-				rdt_datas_last->len = BLCKSZ;
+				/* compressed block information */
+				bimg.length = compress_len;
+				bimg.hole_offset = hole_offset;
+				bimg.is_compressed = 1;
+
+				/* record entry for compressed block */
+				rdt_datas_last->next = &regbuf->bkp_rdatas[0];
+				rdt_datas_last = rdt_datas_last->next;
+				rdt_datas_last->data = regbuf->compressed_page;
+				rdt_datas_last->len = compress_len;
+				total_len += bimg.length;
 			}
 			else
 			{
-				/* must skip the hole */
-				rdt_datas_last->data = page;
-				rdt_datas_last->len = bimg.hole_offset;
-
-				rdt_datas_last->next = &regbuf->bkp_rdatas[1];
+				/* uncompressed block information */
+				bimg.length = BLCKSZ - hole_length;
+				bimg.hole_offset = hole_offset;
+				bimg.is_compressed = 0;
+				total_len += bimg.length;
+
+				/* record entries for uncompressed block */
+				rdt_datas_last->next = &regbuf->bkp_rdatas[0];
 				rdt_datas_last = rdt_datas_last->next;
+				if (hole_length == 0)
+				{
+					rdt_datas_last->data = page;
+					rdt_datas_last->len = BLCKSZ;
+				}
+				else
+				{
+					/* must skip the hole */
+					rdt_datas_last->data = page;
+					rdt_datas_last->len = hole_offset;
+
+					rdt_datas_last->next = &regbuf->bkp_rdatas[1];
+					rdt_datas_last = rdt_datas_last->next;
 
-				rdt_datas_last->data = page + (bimg.hole_offset + bimg.hole_length);
-				rdt_datas_last->len = BLCKSZ - (bimg.hole_offset + bimg.hole_length);
+					rdt_datas_last->data = page + (hole_offset + hole_length);
+					rdt_datas_last->len = BLCKSZ - (hole_offset + hole_length);
+				}
 			}
 		}
 
@@ -681,6 +756,35 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
 }
 
 /*
+ * Create a compressed version of a backup block. If successful, return
+ * true and set 'len' to its compressed length. If the block cannot be
+ * compressed or compression fails, return false.
+ */
+static bool
+XLogCompressBackupBlock(char *page, uint32 orig_len, char *dest, uint16 *len)
+{
+	/* bail out if the data cannot be compressed */
+	if (!pglz_compress(page, orig_len, (PGLZ_Header *) dest,
+					   PGLZ_strategy_default))
+		return false;
+
+	/*
+	 * We recheck the actual size even if pglz_compress() reports success,
+	 * because it might be satisfied with having saved as little as one byte
+	 * in the compressed data --- which could turn into a net loss once you
+	 * consider header and alignment padding.  Worst case, the compressed
+	 * format might require three padding bytes (plus header, which is
+	 * included in VARSIZE(dest)), whereas the uncompressed format would take
+	 * only one header byte and no padding if the value is short enough.  So
+	 * we insist on a savings of more than 2 bytes to ensure we have a gain.
+	 */
+	*len = VARSIZE((struct varlena *) dest);
+	if (*len >= orig_len - 2)
+		return false;
+	return true;
+}
+
+/*
  * Determine whether the buffer referenced has to be backed up.
  *
  * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites
@@ -893,4 +997,9 @@ InitXLogInsert(void)
 	if (hdr_scratch == NULL)
 		hdr_scratch = MemoryContextAllocZero(xloginsert_cxt,
 											 HEADER_SCRATCH_SIZE);
+
+	/* allocate scratch buffer used for compression of block images */
+	if (compression_scratch == NULL)
+		compression_scratch = MemoryContextAllocZero(xloginsert_cxt,
+													 BLCKSZ);
 }
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 67d6223..e483288 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -20,6 +20,7 @@
 #include "access/xlog_internal.h"
 #include "access/xlogreader.h"
 #include "catalog/pg_control.h"
+#include "common/pg_lzcompress.h"
 
 static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength);
 
@@ -74,13 +75,15 @@ XLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data)
 	state->max_block_id = -1;
 
 	/*
-	 * Permanently allocate readBuf.  We do it this way, rather than just
-	 * making a static array, for two reasons: (1) no need to waste the
-	 * storage in most instantiations of the backend; (2) a static char array
-	 * isn't guaranteed to have any particular alignment, whereas palloc()
-	 * will provide MAXALIGN'd storage.
+	 * Permanently allocate readBuf and compressBuf.  We do it this way,
+	 * rather than just making a static array, for two reasons:
+	 * (1) no need to waste the storage in most instantiations of the
+	 * backend; (2) a static char array isn't guaranteed to have any
+	 * particular alignment, whereas palloc() will provide MAXALIGN'd
+	 * storage.
 	 */
 	state->readBuf = (char *) palloc(XLOG_BLCKSZ);
+	state->compressBuf = (char *) palloc(BLCKSZ);
 
 	state->read_page = pagereadfunc;
 	/* system_identifier initialized to zeroes above */
@@ -98,6 +101,7 @@ XLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data)
 	{
 		pfree(state->errormsg_buf);
 		pfree(state->readBuf);
+		pfree(state->compressBuf);
 		pfree(state);
 		return NULL;
 	}
@@ -125,6 +129,7 @@ XLogReaderFree(XLogReaderState *state)
 	if (state->readRecordBuf)
 		pfree(state->readRecordBuf);
 	pfree(state->readBuf);
+	pfree(state->compressBuf);
 	pfree(state);
 }
 
@@ -923,6 +928,7 @@ ResetDecoder(XLogReaderState *state)
 		state->blocks[block_id].in_use = false;
 		state->blocks[block_id].has_image = false;
 		state->blocks[block_id].has_data = false;
+		state->blocks[block_id].is_compressed = false;
 	}
 	state->max_block_id = -1;
 }
@@ -1032,9 +1038,12 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
 
 			if (blk->has_image)
 			{
-				COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16));
-				COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16));
-				datatotal += BLCKSZ - blk->hole_length;
+				XLogRecordBlockImageHeader bkp_info;
+				COPY_HEADER_FIELD(&bkp_info, sizeof(XLogRecordBlockImageHeader));
+				blk->is_compressed = bkp_info.is_compressed;
+				blk->bkp_len = bkp_info.length;
+				blk->hole_offset = bkp_info.hole_offset;
+				datatotal += blk->bkp_len;
 			}
 			if (!(fork_flags & BKPBLOCK_SAME_REL))
 			{
@@ -1088,8 +1097,17 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
 			continue;
 		if (blk->has_image)
 		{
+			/*
+			 * Peek into the block image to grab the raw (uncompressed)
+			 * length of a compressed image.
+			 */
+			if (blk->is_compressed)
+				blk->bkp_uncompress_len = PGLZ_RAW_SIZE((PGLZ_Header *) ptr);
+			else
+				blk->bkp_uncompress_len = blk->bkp_len;
+
 			blk->bkp_image = ptr;
-			ptr += BLCKSZ - blk->hole_length;
+			ptr += blk->bkp_len;
 		}
 		if (blk->has_data)
 		{
@@ -1195,6 +1213,8 @@ bool
 RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
 {
 	DecodedBkpBlock *bkpb;
+	char   *block_image;
+	int		hole_length;
 
 	if (!record->blocks[block_id].in_use)
 		return false;
@@ -1202,19 +1222,44 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
 		return false;
 
 	bkpb = &record->blocks[block_id];
+	block_image = bkpb->bkp_image;
+
+	/*
+	 * Fetch the page data, with different processing depending on whether
+	 * the page is compressed or not.
+	 */
+	if (bkpb->is_compressed)
+	{
+		PGLZ_Header *header = (PGLZ_Header *) block_image;
+
+		if (!pglz_decompress(header, record->compressBuf))
+		{
+			report_invalid_record(record, "invalid compressed image at %X/%X, block %d",
+								  (uint32) (record->ReadRecPtr >> 32),
+								  (uint32) record->ReadRecPtr,
+								  block_id);
+			return false;
+		}
+
+		block_image = record->compressBuf;
+		hole_length = BLCKSZ - PGLZ_RAW_SIZE(header);
+	}
+	else
+		hole_length = BLCKSZ - bkpb->bkp_len;
 
-	if (bkpb->hole_length == 0)
+	/* generate page, taking into account hole if necessary */
+	if (hole_length == 0)
 	{
-		memcpy(page, bkpb->bkp_image, BLCKSZ);
+		memcpy(page, block_image, BLCKSZ);
 	}
 	else
 	{
-		memcpy(page, bkpb->bkp_image, bkpb->hole_offset);
+		memcpy(page, block_image, bkpb->hole_offset);
 		/* must zero-fill the hole */
-		MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length);
-		memcpy(page + (bkpb->hole_offset + bkpb->hole_length),
-			   bkpb->bkp_image + bkpb->hole_offset,
-			   BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
+		MemSet(page + bkpb->hole_offset, 0, hole_length);
+		memcpy(page + (bkpb->hole_offset + hole_length),
+			   block_image + bkpb->hole_offset,
+			   BLCKSZ - (bkpb->hole_offset + hole_length));
 	}
 
 	return true;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 77c3494..de17e29 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -927,6 +927,15 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"wal_compression", PGC_USERSET, WAL_SETTINGS,
+			 gettext_noop("Compresses full-page writes written in WAL file."),
+			 NULL
+		},
+		&wal_compression,
+		false,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"log_checkpoints", PGC_SIGHUP, LOGGING_WHAT,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index b053659..b367e2c 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -186,6 +186,7 @@
 					#   fsync_writethrough
 					#   open_sync
 #full_page_writes = on			# recover from partial page writes
+#wal_compression = off			# enable compression of full-page writes
 #wal_log_hints = off			# also do full page writes of non-critical updates
 					# (change requires restart)
 #wal_buffers = -1			# min 32kB, -1 sets based on shared_buffers
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index d06fbc0..6bdfa4a 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -98,6 +98,7 @@ extern char *XLogArchiveCommand;
 extern bool EnableHotStandby;
 extern bool fullPageWrites;
 extern bool wal_log_hints;
+extern bool wal_compression;
 extern bool log_checkpoints;
 
 /* WAL levels */
diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h
index eb6cc89..e13b796 100644
--- a/src/include/access/xlogreader.h
+++ b/src/include/access/xlogreader.h
@@ -52,9 +52,11 @@ typedef struct
 
 	/* Information on full-page image, if any */
 	bool		has_image;
+	bool		is_compressed;
 	char	   *bkp_image;
+	uint16		bkp_len;
+	uint16		bkp_uncompress_len;
 	uint16		hole_offset;
-	uint16		hole_length;
 
 	/* Buffer holding the rmgr-specific data associated with this block */
 	bool		has_data;
@@ -138,6 +140,9 @@ struct XLogReaderState
 	/* Buffer for currently read page (XLOG_BLCKSZ bytes) */
 	char	   *readBuf;
 
+	/* Scratch buffer used for uncompressed pages */
+	char	   *compressBuf;
+
 	/* last read segment, segment offset, read length, TLI */
 	XLogSegNo	readSegNo;
 	uint32		readOff;
diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h
index e5826ee..3ccd69d 100644
--- a/src/include/access/xlogrecord.h
+++ b/src/include/access/xlogrecord.h
@@ -98,17 +98,37 @@ typedef struct XLogRecordBlockHeader
  * Additional header information when a full-page image is included
  * (i.e. when BKPBLOCK_HAS_IMAGE is set).
  *
- * As a trivial form of data compression, the XLOG code is aware that
- * PG data pages usually contain an unused "hole" in the middle, which
- * contains only zero bytes.  If hole_length > 0 then we have removed
- * such a "hole" from the stored data (and it's not counted in the
- * XLOG record's CRC, either).  Hence, the amount of block data actually
- * present is BLCKSZ - hole_length bytes.
+ * Block images can be stored in two ways:
+ * - When wal_compression is off, as a trivial form of compression, the
+ * XLOG code is aware that PG data pages usually contain an unused "hole"
+ * in the middle, which contains only zero bytes.  If length < BLCKSZ
+ * then we have removed such a "hole" from the stored data (and it is
+ * not counted in the XLOG record's CRC, either).  Hence, the amount
+ * of block data actually present is "length" bytes.  The position of
+ * the hole on the page is given by "hole_offset".
+ * - When wal_compression is on, block images are compressed with their
+ * hole removed, to improve the compression ratio of the page.  In this
+ * case "length" is the compressed length of the block; the original
+ * length of the block without its hole can be deduced from the
+ * compressed data itself. "hole_offset" is the hole offset of the page.
+ *
+ * "is_compressed" identifies whether a given block image is compressed
+ * or not.  As the maximum page size allowed on the system is 32kB, the
+ * hole offset cannot be more than 15 bits long, so the last free bit is
+ * used to store the compression state of the block image.  If the
+ * maximum allowed page size were increased beyond that, we should
+ * consider increasing this structure's size as well, though that would
+ * also increase the length of block headers in WAL records due to
+ * alignment.
  */
 typedef struct XLogRecordBlockImageHeader
 {
-	uint16		hole_offset;	/* number of bytes before "hole" */
-	uint16		hole_length;	/* number of bytes in "hole" */
+	uint16	length;		/* length of block data in record. If compressed
+						 * this is the length of compressed block. If not
+						 * compressed, this is the length of page without
+						 * its hole */
+	uint16	hole_offset:15,		/* number of bytes before "hole" */
+			is_compressed:1;	/* compression status of image */
 } XLogRecordBlockImageHeader;
 
 #define SizeOfXLogRecordBlockImageHeader sizeof(XLogRecordBlockImageHeader)
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 465281c..f86f028 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -35,7 +35,9 @@
    to have large tuples, since fields can be spread across multiple tuples).
    BLCKSZ must be a power of 2. The maximum possible value of BLCKSZ is
    currently 2^15 (32768). This is determined by the 15-bit widths of the
-   lp_off and lp_len fields in ItemIdData (see include/storage/itemid.h).
+   lp_off and lp_len fields in ItemIdData (see include/storage/itemid.h) and
+   by the 15-bit hole_offset field of XLogRecordBlockImageHeader
+   (see src/include/access/xlogrecord.h).
    Changing BLCKSZ requires an initdb. */
 #undef BLCKSZ
 
-- 
2.2.1
