On Thu, Mar 5, 2015 at 10:08 PM, Michael Paquier <michael.paqu...@gmail.com> wrote: > On Thu, Mar 5, 2015 at 9:14 PM, Syed, Rahila <rahila.s...@nttdata.com> wrote: >> Please find attached a patch. As discussed, flag to denote compression and >> presence of hole in block image has been added in XLogRecordImageHeader >> rather than block header.
Thanks for updating the patch! Attached is the refactored version of the patch. Regards, -- Fujii Masao
*** a/contrib/pg_xlogdump/pg_xlogdump.c --- b/contrib/pg_xlogdump/pg_xlogdump.c *************** *** 359,376 **** XLogDumpCountRecord(XLogDumpConfig *config, XLogDumpStats *stats, rec_len = XLogRecGetDataLen(record) + SizeOfXLogRecord; /* ! * Calculate the amount of FPI data in the record. Each backup block ! * takes up BLCKSZ bytes, minus the "hole" length. * * XXX: We peek into xlogreader's private decoded backup blocks for the ! * hole_length. It doesn't seem worth it to add an accessor macro for ! * this. */ fpi_len = 0; for (block_id = 0; block_id <= record->max_block_id; block_id++) { if (XLogRecHasBlockImage(record, block_id)) ! fpi_len += BLCKSZ - record->blocks[block_id].hole_length; } /* Update per-rmgr statistics */ --- 359,375 ---- rec_len = XLogRecGetDataLen(record) + SizeOfXLogRecord; /* ! * Calculate the amount of FPI data in the record. * * XXX: We peek into xlogreader's private decoded backup blocks for the ! * bimg_len indicating the length of FPI data. It doesn't seem worth it to ! * add an accessor macro for this. */ fpi_len = 0; for (block_id = 0; block_id <= record->max_block_id; block_id++) { if (XLogRecHasBlockImage(record, block_id)) ! fpi_len += record->blocks[block_id].bimg_len; } /* Update per-rmgr statistics */ *************** *** 465,473 **** XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) blk); if (XLogRecHasBlockImage(record, block_id)) { ! printf(" (FPW); hole: offset: %u, length: %u\n", ! record->blocks[block_id].hole_offset, ! record->blocks[block_id].hole_length); } putchar('\n'); } --- 464,485 ---- blk); if (XLogRecHasBlockImage(record, block_id)) { ! if (record->blocks[block_id].bimg_info & ! BKPIMAGE_IS_COMPRESSED) ! { ! printf(" (FPW); hole: offset: %u, length: %u, compression saved: %u\n", ! record->blocks[block_id].hole_offset, ! record->blocks[block_id].hole_length, ! BLCKSZ - ! record->blocks[block_id].hole_length - ! record->blocks[block_id].bimg_len); ! } ! else ! { ! 
printf(" (FPW); hole: offset: %u, length: %u\n", ! record->blocks[block_id].hole_offset, ! record->blocks[block_id].hole_length); ! } } putchar('\n'); } *** a/doc/src/sgml/config.sgml --- b/doc/src/sgml/config.sgml *************** *** 2282,2287 **** include_dir 'conf.d' --- 2282,2311 ---- </listitem> </varlistentry> + <varlistentry id="guc-wal-compression" xreflabel="wal_compression"> + <term><varname>wal_compression</varname> (<type>boolean</type>) + <indexterm> + <primary><varname>wal_compression</> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + When this parameter is <literal>on</>, the <productname>PostgreSQL</> + server compresses a full page image written to WAL when + <xref linkend="guc-full-page-writes"> is on or during a base backup. + A compressed page image will be decompressed during WAL replay. + The default value is <literal>off</>. + </para> + + <para> + Turning this parameter on can reduce the WAL volume without + increasing the risk of unrecoverable data corruption, + but at the cost of some extra CPU time by the compression during + WAL logging and the decompression during WAL replay. 
+ </para> + </listitem> + </varlistentry> + <varlistentry id="guc-wal-buffers" xreflabel="wal_buffers"> <term><varname>wal_buffers</varname> (<type>integer</type>) <indexterm> *** a/src/backend/access/transam/xlog.c --- b/src/backend/access/transam/xlog.c *************** *** 89,94 **** char *XLogArchiveCommand = NULL; --- 89,95 ---- bool EnableHotStandby = false; bool fullPageWrites = true; bool wal_log_hints = false; + bool wal_compression = false; bool log_checkpoints = false; int sync_method = DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; *** a/src/backend/access/transam/xloginsert.c --- b/src/backend/access/transam/xloginsert.c *************** *** 24,35 **** --- 24,39 ---- #include "access/xlog_internal.h" #include "access/xloginsert.h" #include "catalog/pg_control.h" + #include "common/pg_lzcompress.h" #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "utils/memutils.h" #include "pg_trace.h" + /* Buffer size required to store a compressed version of backup block image */ + #define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) + /* * For each block reference registered with XLogRegisterBuffer, we fill in * a registered_buffer struct. *************** *** 50,55 **** typedef struct --- 54,62 ---- XLogRecData bkp_rdatas[2]; /* temporary rdatas used to hold references to * backup block data in XLogRecordAssemble() */ + + /* buffer to store a compressed version of backup block image */ + char compressed_page[PGLZ_MAX_BLCKSZ]; } registered_buffer; static registered_buffer *registered_buffers; *************** *** 96,101 **** static MemoryContext xloginsert_cxt; --- 103,110 ---- static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecPtr RedoRecPtr, bool doPageWrites, XLogRecPtr *fpw_lsn); + static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, + uint16 hole_length, char *dest, uint16 *dlen); /* * Begin constructing a WAL record. 
This must be called before the *************** *** 482,488 **** XLogRecordAssemble(RmgrId rmid, uint8 info, --- 491,501 ---- bool needs_data; XLogRecordBlockHeader bkpb; XLogRecordBlockImageHeader bimg; + XLogRecordBlockCompressHeader cbimg; bool samerel; + bool is_compressed = false; + uint16 hole_length; + uint16 hole_offset; if (!regbuf->in_use) continue; *************** *** 529,537 **** XLogRecordAssemble(RmgrId rmid, uint8 info, if (needs_backup) { Page page = regbuf->page; /* ! * The page needs to be backed up, so set up *bimg */ if (regbuf->flags & REGBUF_STANDARD) { --- 542,552 ---- if (needs_backup) { Page page = regbuf->page; + uint16 compressed_len; /* ! * The page needs to be backed up, so calculate its hole length ! * and offset. */ if (regbuf->flags & REGBUF_STANDARD) { *************** *** 543,592 **** XLogRecordAssemble(RmgrId rmid, uint8 info, upper > lower && upper <= BLCKSZ) { ! bimg.hole_offset = lower; ! bimg.hole_length = upper - lower; } else { /* No "hole" to compress out */ ! bimg.hole_offset = 0; ! bimg.hole_length = 0; } } else { /* Not a standard page header, don't try to eliminate "hole" */ ! bimg.hole_offset = 0; ! bimg.hole_length = 0; } /* Fill in the remaining fields in the XLogRecordBlockHeader struct */ bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE; - total_len += BLCKSZ - bimg.hole_length; - /* * Construct XLogRecData entries for the page content. */ rdt_datas_last->next = ®buf->bkp_rdatas[0]; rdt_datas_last = rdt_datas_last->next; ! if (bimg.hole_length == 0) { ! rdt_datas_last->data = page; ! rdt_datas_last->len = BLCKSZ; } else { ! /* must skip the hole */ ! rdt_datas_last->data = page; ! rdt_datas_last->len = bimg.hole_offset; ! rdt_datas_last->next = ®buf->bkp_rdatas[1]; ! rdt_datas_last = rdt_datas_last->next; ! rdt_datas_last->data = page + (bimg.hole_offset + bimg.hole_length); ! rdt_datas_last->len = BLCKSZ - (bimg.hole_offset + bimg.hole_length); } } if (needs_data) --- 558,638 ---- upper > lower && upper <= BLCKSZ) { ! 
hole_offset = lower; ! hole_length = upper - lower; } else { /* No "hole" to compress out */ ! hole_offset = 0; ! hole_length = 0; } } else { /* Not a standard page header, don't try to eliminate "hole" */ ! hole_offset = 0; ! hole_length = 0; ! } ! ! /* ! * Try to compress a block image if wal_compression is enabled ! */ ! if (wal_compression) ! { ! is_compressed = ! XLogCompressBackupBlock(page, hole_offset, hole_length, ! regbuf->compressed_page, ! &compressed_len); } /* Fill in the remaining fields in the XLogRecordBlockHeader struct */ bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE; /* * Construct XLogRecData entries for the page content. */ rdt_datas_last->next = ®buf->bkp_rdatas[0]; rdt_datas_last = rdt_datas_last->next; ! ! bimg.bimg_info = (hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE; ! ! if (is_compressed) { ! bimg.length = compressed_len; ! bimg.hole_offset = hole_offset; ! bimg.bimg_info |= BKPIMAGE_IS_COMPRESSED; ! if (hole_length != 0) ! cbimg.hole_length = hole_length; ! ! rdt_datas_last->data = regbuf->compressed_page; ! rdt_datas_last->len = compressed_len; } else { ! bimg.length = BLCKSZ - hole_length; ! bimg.hole_offset = hole_offset; ! if (hole_length == 0) ! { ! rdt_datas_last->data = page; ! rdt_datas_last->len = BLCKSZ; ! } ! else ! { ! /* must skip the hole */ ! rdt_datas_last->data = page; ! rdt_datas_last->len = hole_offset; ! rdt_datas_last->next = ®buf->bkp_rdatas[1]; ! rdt_datas_last = rdt_datas_last->next; ! ! rdt_datas_last->data = page + (hole_offset + hole_length); ! rdt_datas_last->len = BLCKSZ - (hole_offset + hole_length); ! 
} } + + total_len += bimg.length; } if (needs_data) *************** *** 619,624 **** XLogRecordAssemble(RmgrId rmid, uint8 info, --- 665,676 ---- { memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader); scratch += SizeOfXLogRecordBlockImageHeader; + if (hole_length != 0 && is_compressed) + { + memcpy(scratch, &cbimg, + SizeOfXLogRecordBlockCompressHeader); + scratch += SizeOfXLogRecordBlockCompressHeader; + } } if (!samerel) { *************** *** 681,686 **** XLogRecordAssemble(RmgrId rmid, uint8 info, --- 733,789 ---- } /* + * Create a compressed version of a backup block image. + * + * Returns FALSE if compression fails (i.e., compressed result is actually + * bigger than original). Otherwise, returns TRUE and sets 'dlen' to + * the length of compressed block image. + */ + static bool + XLogCompressBackupBlock(char * page, uint16 hole_offset, uint16 hole_length, + char *dest, uint16 *dlen) + { + int32 orig_len = BLCKSZ - hole_length; + int32 len; + int32 extra_bytes = 0; + char *source; + char tmp[BLCKSZ]; + + if (hole_length != 0) + { + /* must skip the hole */ + source = tmp; + memcpy(source, page, hole_offset); + memcpy(source + hole_offset, + page + (hole_offset + hole_length), + BLCKSZ - (hole_length + hole_offset)); + + /* + * Extra data needs to be stored in WAL record for the compressed + * version of block image if the hole exists. + */ + extra_bytes = SizeOfXLogRecordBlockCompressHeader; + } + else + source = page; + + /* + * We recheck the actual size even if pglz_compress() reports success + * and see if the number of bytes saved by compression is larger than + * the length of extra data needed for the compressed version of block + * image. + */ + len = pglz_compress(source, orig_len, dest, PGLZ_strategy_default); + if (len >= 0 && + len + extra_bytes < orig_len) + { + *dlen = (uint16) len; /* successful compression */ + return true; + } + return false; + } + + /* * Determine whether the buffer referenced has to be backed up. 
* * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites *** a/src/backend/access/transam/xlogreader.c --- b/src/backend/access/transam/xlogreader.c *************** *** 20,25 **** --- 20,26 ---- #include "access/xlog_internal.h" #include "access/xlogreader.h" #include "catalog/pg_control.h" + #include "common/pg_lzcompress.h" static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength); *************** *** 1037,1045 **** DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) if (blk->has_image) { COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); ! COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16)); ! datatotal += BLCKSZ - blk->hole_length; } if (!(fork_flags & BKPBLOCK_SAME_REL)) { --- 1038,1115 ---- if (blk->has_image) { + COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16)); COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); ! COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8)); ! if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED) ! { ! if (blk->bimg_info & BKPIMAGE_HAS_HOLE) ! COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16)); ! else ! blk->hole_length = 0; ! } ! else ! blk->hole_length = BLCKSZ - blk->bimg_len; ! datatotal += blk->bimg_len; ! ! /* ! * cross-check that hole_offset > 0, hole_length > 0 and ! * bimg_len < BLCKSZ if the HAS_HOLE flag is set. ! */ ! if (blk->bimg_info & BKPIMAGE_HAS_HOLE && ! (blk->hole_offset == 0 || ! blk->hole_length == 0 || ! blk->bimg_len == BLCKSZ)) ! { ! report_invalid_record(state, ! "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", ! (unsigned int) blk->hole_offset, ! (unsigned int) blk->hole_length, ! (unsigned int) blk->bimg_len, ! (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); ! goto err; ! } ! /* ! * cross-check that hole_offset == 0 and hole_length == 0 ! * if the HAS_HOLE flag is not set. ! */ ! if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) && ! (blk->hole_offset != 0 || blk->hole_length != 0)) ! { ! 
report_invalid_record(state, ! "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", ! (unsigned int) blk->hole_offset, ! (unsigned int) blk->hole_length, ! (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); ! goto err; ! } ! /* ! * cross-check that bimg_len < BLCKSZ ! * if the IS_COMPRESSED flag is set. ! */ ! if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED && ! blk->bimg_len == BLCKSZ) ! { ! report_invalid_record(state, ! "BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", ! (unsigned int) blk->bimg_len, ! (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); ! goto err; ! } ! /* ! * cross-check that bimg_len = BLCKSZ if neither ! * HAS_HOLE nor IS_COMPRESSED flag is set. ! */ ! if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) && ! !(blk->bimg_info & BKPIMAGE_IS_COMPRESSED) && ! blk->bimg_len != BLCKSZ) ! { ! report_invalid_record(state, ! "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", ! (unsigned int) blk->data_len, ! (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); ! goto err; ! } } if (!(fork_flags & BKPBLOCK_SAME_REL)) { *************** *** 1094,1100 **** DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) if (blk->has_image) { blk->bkp_image = ptr; ! ptr += BLCKSZ - blk->hole_length; } if (blk->has_data) { --- 1164,1170 ---- if (blk->has_image) { blk->bkp_image = ptr; ! ptr += blk->bimg_len; } if (blk->has_data) { *************** *** 1200,1205 **** bool --- 1270,1277 ---- RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) { DecodedBkpBlock *bkpb; + char *ptr; + char tmp[BLCKSZ]; if (!record->blocks[block_id].in_use) return false; *************** *** 1207,1224 **** RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) return false; bkpb = &record->blocks[block_id]; if (bkpb->hole_length == 0) { ! memcpy(page, bkpb->bkp_image, BLCKSZ); } else { ! 
memcpy(page, bkpb->bkp_image, bkpb->hole_offset); /* must zero-fill the hole */ MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length); memcpy(page + (bkpb->hole_offset + bkpb->hole_length), ! bkpb->bkp_image + bkpb->hole_offset, BLCKSZ - (bkpb->hole_offset + bkpb->hole_length)); } --- 1279,1313 ---- return false; bkpb = &record->blocks[block_id]; + ptr = bkpb->bkp_image; + + if (bkpb->bimg_info & BKPIMAGE_IS_COMPRESSED) + { + /* If a backup block image is compressed, decompress it */ + if (pglz_decompress(ptr, bkpb->bimg_len, tmp, + BLCKSZ - bkpb->hole_length) < 0) + { + report_invalid_record(record, "invalid compressed image at %X/%X, block %d", + (uint32) (record->ReadRecPtr >> 32), + (uint32) record->ReadRecPtr, + block_id); + return false; + } + ptr = tmp; + } + /* generate page, taking into account hole if necessary */ if (bkpb->hole_length == 0) { ! memcpy(page, ptr, BLCKSZ); } else { ! memcpy(page, ptr, bkpb->hole_offset); /* must zero-fill the hole */ MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length); memcpy(page + (bkpb->hole_offset + bkpb->hole_length), ! 
ptr + bkpb->hole_offset, BLCKSZ - (bkpb->hole_offset + bkpb->hole_length)); } *** a/src/backend/utils/misc/guc.c --- b/src/backend/utils/misc/guc.c *************** *** 997,1002 **** static struct config_bool ConfigureNamesBool[] = --- 997,1012 ---- }, { + {"wal_compression", PGC_USERSET, WAL_SETTINGS, + gettext_noop("Compresses full-page writes written in WAL file."), + NULL + }, + &wal_compression, + false, + NULL, NULL, NULL + }, + + { {"log_checkpoints", PGC_SIGHUP, LOGGING_WHAT, gettext_noop("Logs each checkpoint."), NULL *** a/src/backend/utils/misc/postgresql.conf.sample --- b/src/backend/utils/misc/postgresql.conf.sample *************** *** 186,191 **** --- 186,192 ---- # fsync_writethrough # open_sync #full_page_writes = on # recover from partial page writes + #wal_compression = off # enable compression of full-page writes #wal_log_hints = off # also do full page writes of non-critical updates # (change requires restart) #wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers *** a/src/include/access/xlog.h --- b/src/include/access/xlog.h *************** *** 100,105 **** extern char *XLogArchiveCommand; --- 100,106 ---- extern bool EnableHotStandby; extern bool fullPageWrites; extern bool wal_log_hints; + extern bool wal_compression; extern bool log_checkpoints; extern int CheckPointSegments; *** a/src/include/access/xlogreader.h --- b/src/include/access/xlogreader.h *************** *** 55,60 **** typedef struct --- 55,62 ---- char *bkp_image; uint16 hole_offset; uint16 hole_length; + uint16 bimg_len; + uint8 bimg_info; /* Buffer holding the rmgr-specific data associated with this block */ bool has_data; *** a/src/include/access/xlogrecord.h --- b/src/include/access/xlogrecord.h *************** *** 100,117 **** typedef struct XLogRecordBlockHeader * * As a trivial form of data compression, the XLOG code is aware that * PG data pages usually contain an unused "hole" in the middle, which ! * contains only zero bytes. 
If hole_length > 0 then we have removed * such a "hole" from the stored data (and it's not counted in the * XLOG record's CRC, either). Hence, the amount of block data actually ! * present is BLCKSZ - hole_length bytes. */ typedef struct XLogRecordBlockImageHeader { ! uint16 hole_offset; /* number of bytes before "hole" */ ! uint16 hole_length; /* number of bytes in "hole" */ } XLogRecordBlockImageHeader; ! #define SizeOfXLogRecordBlockImageHeader sizeof(XLogRecordBlockImageHeader) /* * Maximum size of the header for a block reference. This is used to size a --- 100,154 ---- * * As a trivial form of data compression, the XLOG code is aware that * PG data pages usually contain an unused "hole" in the middle, which ! * contains only zero bytes. If the length of "hole" > 0 then we have removed * such a "hole" from the stored data (and it's not counted in the * XLOG record's CRC, either). Hence, the amount of block data actually ! * present is BLCKSZ - the length of "hole" bytes. ! * ! * When wal_compression is enabled, a full page image whose "hole" was ! * removed is additionally compressed using the PGLZ compression algorithm. ! * This can reduce the WAL volume, but at some extra cost of CPU time ! * by the compression during WAL logging. In this case, since the "hole" ! * length cannot be calculated by subtracting the number of page image bytes ! * from BLCKSZ, basically it needs to be stored as extra information. ! * But when no "hole" exists, we can assume that the "hole" length is zero ! * and no such extra information needs to be stored. Note that ! * the original version of page image is stored in WAL instead of the ! * compressed one if the number of bytes saved by compression is less than ! * the length of extra information. Hence, when a page image is successfully ! * compressed, the amount of block data actually present is less than ! * BLCKSZ - the length of "hole" bytes - the length of extra information. */ typedef struct XLogRecordBlockImageHeader { ! 
uint16 length; /* number of page image bytes */ ! uint16 hole_offset; /* number of bytes before "hole" */ ! uint8 bimg_info; /* flag bits, see below */ ! ! /* ! * If BKPIMAGE_HAS_HOLE and BKPIMAGE_IS_COMPRESSED, ! * an XLogRecordBlockCompressHeader follows ! */ } XLogRecordBlockImageHeader; ! #define SizeOfXLogRecordBlockImageHeader \ ! (offsetof(XLogRecordBlockImageHeader, bimg_info) + sizeof(uint8)) ! ! /* Information stored in bimg_info */ ! #define BKPIMAGE_HAS_HOLE 0x01 /* page image has "hole" */ ! #define BKPIMAGE_IS_COMPRESSED 0x02 /* page image is compressed */ ! ! /* ! * Extra header information used when page image has "hole" and ! * is compressed. ! */ ! typedef struct XLogRecordBlockCompressHeader ! { ! uint16 hole_length; /* number of bytes in "hole" */ ! } XLogRecordBlockCompressHeader; ! ! #define SizeOfXLogRecordBlockCompressHeader \ ! sizeof(XLogRecordBlockCompressHeader) /* * Maximum size of the header for a block reference. This is used to size a *************** *** 120,125 **** typedef struct XLogRecordBlockImageHeader --- 157,163 ---- #define MaxSizeOfXLogRecordBlockHeader \ (SizeOfXLogRecordBlockHeader + \ SizeOfXLogRecordBlockImageHeader + \ + SizeOfXLogRecordBlockCompressHeader + \ sizeof(RelFileNode) + \ sizeof(BlockNumber))
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers