On Wed, Dec 24, 2014 at 9:03 PM, Michael Paquier <michael.paqu...@gmail.com> wrote: > Returning only a boolean is fine for me (that's what my first patch > did), especially if we add at some point hooks for compression and > decompression calls. Here is a patch rebased on current HEAD (60838df) for the core feature with the APIs of pglz using booleans as return values. -- Michael
From c8891e6086682d4e5bc197ef3047068b3a3875c5 Mon Sep 17 00:00:00 2001 From: Michael Paquier <michael@otacoo.com> Date: Tue, 25 Nov 2014 14:24:26 +0900 Subject: [PATCH] Support compression for full-page writes in WAL
Compression is controlled with a new parameter called wal_compression. This parameter can be changed at session level to control WAL compression. --- contrib/pg_xlogdump/pg_xlogdump.c | 20 ++-- doc/src/sgml/config.sgml | 29 +++++ src/backend/access/transam/xlog.c | 1 + src/backend/access/transam/xloginsert.c | 155 ++++++++++++++++++++++---- src/backend/access/transam/xlogreader.c | 77 ++++++++++--- src/backend/utils/misc/guc.c | 9 ++ src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/access/xlog.h | 1 + src/include/access/xlogreader.h | 7 +- src/include/access/xlogrecord.h | 36 ++++-- src/include/pg_config.h.in | 4 +- 11 files changed, 283 insertions(+), 57 deletions(-) diff --git a/contrib/pg_xlogdump/pg_xlogdump.c b/contrib/pg_xlogdump/pg_xlogdump.c index 9f05e25..3ba1d8f 100644 --- a/contrib/pg_xlogdump/pg_xlogdump.c +++ b/contrib/pg_xlogdump/pg_xlogdump.c @@ -363,15 +363,12 @@ XLogDumpCountRecord(XLogDumpConfig *config, XLogDumpStats *stats, * takes up BLCKSZ bytes, minus the "hole" length. * * XXX: We peek into xlogreader's private decoded backup blocks for the - * hole_length. It doesn't seem worth it to add an accessor macro for + * length of block. It doesn't seem worth it to add an accessor macro for * this. 
*/ fpi_len = 0; for (block_id = 0; block_id <= record->max_block_id; block_id++) - { - if (XLogRecHasBlockImage(record, block_id)) - fpi_len += BLCKSZ - record->blocks[block_id].hole_length; - } + fpi_len += record->blocks[block_id].bkp_len; /* Update per-rmgr statistics */ @@ -465,9 +462,16 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) blk); if (XLogRecHasBlockImage(record, block_id)) { - printf(" (FPW); hole: offset: %u, length: %u\n", - record->blocks[block_id].hole_offset, - record->blocks[block_id].hole_length); + if (record->blocks[block_id].is_compressed) + printf(" (FPW compressed); hole offset: %u, " + "compressed length: %u, original length: %u\n", + record->blocks[block_id].hole_offset, + record->blocks[block_id].bkp_len, + record->blocks[block_id].bkp_uncompress_len); + else + printf(" (FPW); hole offset: %u, length: %u\n", + record->blocks[block_id].hole_offset, + record->blocks[block_id].bkp_len); } putchar('\n'); } diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 6bcb106..acbbd20 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2282,6 +2282,35 @@ include_dir 'conf.d' </listitem> </varlistentry> + <varlistentry id="guc-wal-compression" xreflabel="wal_compression"> + <term><varname>wal_compression</varname> (<type>boolean</type>) + <indexterm> + <primary><varname>wal_compression</> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + When this parameter is <literal>on</>, the <productname>PostgreSQL</> + server compresses the content of full-page writes when necessary and + inserts in WAL records with smaller sizes, reducing the amount of + WAL stored on disk. + </para> + + <para> + Compression has the advantage of reducing the amount of disk I/O when + doing WAL-logging, at the cost of some extra CPU to perform the + compression of a block image. 
At WAL replay, compressed block images + need extra CPU cycles to perform the decompression of each block + image, but it can also reduce replay time in I/O-bound + environments. + </para> + + <para> + The default value is <literal>off</>. + </para> + </listitem> + </varlistentry> + <varlistentry id="guc-wal-buffers" xreflabel="wal_buffers"> <term><varname>wal_buffers</varname> (<type>integer</type>) <indexterm> diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index e5dddd4..d68d9e3 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -88,6 +88,7 @@ char *XLogArchiveCommand = NULL; bool EnableHotStandby = false; bool fullPageWrites = true; bool wal_log_hints = false; +bool wal_compression = false; bool log_checkpoints = false; int sync_method = DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index f3d610f..a1496aa 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -24,12 +24,16 @@ #include "access/xlog_internal.h" #include "access/xloginsert.h" #include "catalog/pg_control.h" +#include "common/pg_lzcompress.h" #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "utils/memutils.h" #include "pg_trace.h" +/* maximum size for compression buffer of block image */ +#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) + /* * For each block reference registered with XLogRegisterBuffer, we fill in * a registered_buffer struct. 
@@ -50,6 +54,8 @@ typedef struct XLogRecData bkp_rdatas[2]; /* temporary rdatas used to hold references to * backup block data in XLogRecordAssemble() */ + char compressed_page[PGLZ_MAX_BLCKSZ]; /* recipient for compressed + * page */ } registered_buffer; static registered_buffer *registered_buffers; @@ -81,6 +87,9 @@ static char *hdr_scratch = NULL; MaxSizeOfXLogRecordBlockHeader * (XLR_MAX_BLOCK_ID + 1) + \ SizeOfXLogRecordDataHeaderLong) +/* Scratch buffer holding block image data to be compressed */ +static char *compression_scratch = NULL; + /* * An array of XLogRecData structs, to hold registered data. */ @@ -97,6 +106,9 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecPtr RedoRecPtr, bool doPageWrites, XLogRecPtr *fpw_lsn); +static bool XLogCompressBackupBlock(char *page, uint32 orig_len, + char *dest, uint16 *len); + /* * Begin constructing a WAL record. This must be called before the * XLogRegister* functions and XLogInsert(). @@ -529,9 +541,14 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, if (needs_backup) { Page page = regbuf->page; + uint16 hole_length; + uint16 hole_offset; + uint16 compress_len = 0; + bool is_compressed = false; /* - * The page needs to be backed up, so set up *bimg + * The page needs to be backed up, so calculate its hole length + * and offset. 
*/ if (regbuf->flags & REGBUF_STANDARD) { @@ -543,49 +560,107 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, upper > lower && upper <= BLCKSZ) { - bimg.hole_offset = lower; - bimg.hole_length = upper - lower; + hole_offset = lower; + hole_length = upper - lower; } else { /* No "hole" to compress out */ - bimg.hole_offset = 0; - bimg.hole_length = 0; + hole_offset = 0; + hole_length = 0; } } else { /* Not a standard page header, don't try to eliminate "hole" */ - bimg.hole_offset = 0; - bimg.hole_length = 0; + hole_offset = 0; + hole_length = 0; } - /* Fill in the remaining fields in the XLogRecordBlockData struct */ - bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE; + /* + * First try to compress block without its hole to improve the + * compression of the whole. If the block is considered as + * not compressible, complete the block header information + * accordingly. + */ + if (wal_compression) + { + int page_len = BLCKSZ - hole_length; + char *scratch_buf; + + if (hole_length != 0) + { + scratch_buf = compression_scratch; + memcpy(scratch_buf, page, hole_offset); + memcpy(scratch_buf + hole_offset, + page + (hole_offset + hole_length), + BLCKSZ - (hole_length + hole_offset)); + } + else + scratch_buf = page; - total_len += BLCKSZ - bimg.hole_length; + /* Perform compression of block */ + if (XLogCompressBackupBlock(scratch_buf, + page_len, + regbuf->compressed_page, + &compress_len)) + { + /* compression is done, add record */ + is_compressed = true; + } + } /* - * Construct XLogRecData entries for the page content. + * Fill in the remaining fields in the XLogRecordBlockImageHeader + * struct and add new entries in the record chain. 
*/ - rdt_datas_last->next = ®buf->bkp_rdatas[0]; - rdt_datas_last = rdt_datas_last->next; - if (bimg.hole_length == 0) + bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE; + + /* hole offset length should be 15-bit long */ + Assert((hole_offset & 0x8000) == 0); + + if (is_compressed) { - rdt_datas_last->data = page; - rdt_datas_last->len = BLCKSZ; + /* compressed block information */ + bimg.length = compress_len; + bimg.hole_offset = hole_offset; + bimg.is_compressed = 1; + + /* record entry for compressed block */ + rdt_datas_last->next = ®buf->bkp_rdatas[0]; + rdt_datas_last = rdt_datas_last->next; + rdt_datas_last->data = regbuf->compressed_page; + rdt_datas_last->len = compress_len; + total_len += bimg.length; } else { - /* must skip the hole */ - rdt_datas_last->data = page; - rdt_datas_last->len = bimg.hole_offset; - - rdt_datas_last->next = ®buf->bkp_rdatas[1]; + /* uncompressed block information */ + bimg.length = BLCKSZ - hole_length; + bimg.hole_offset = hole_offset; + bimg.is_compressed = 0; + total_len += bimg.length; + + /* record entries for uncompressed block */ + rdt_datas_last->next = ®buf->bkp_rdatas[0]; rdt_datas_last = rdt_datas_last->next; + if (hole_length == 0) + { + rdt_datas_last->data = page; + rdt_datas_last->len = BLCKSZ; + } + else + { + /* must skip the hole */ + rdt_datas_last->data = page; + rdt_datas_last->len = hole_offset; + + rdt_datas_last->next = ®buf->bkp_rdatas[1]; + rdt_datas_last = rdt_datas_last->next; - rdt_datas_last->data = page + (bimg.hole_offset + bimg.hole_length); - rdt_datas_last->len = BLCKSZ - (bimg.hole_offset + bimg.hole_length); + rdt_datas_last->data = page + (hole_offset + hole_length); + rdt_datas_last->len = BLCKSZ - (hole_offset + hole_length); + } } } @@ -681,6 +756,35 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, } /* + * Create a compressed version of a backup block. If successful, return + * true and set 'len' to its length. If block cannot be compressed or if + * compression failed return false. 
+ */ +static bool +XLogCompressBackupBlock(char *page, uint32 orig_len, char *dest, uint16 *len) +{ + /* leave if data cannot be compressed */ + if (!pglz_compress(page, orig_len, (PGLZ_Header *) dest, + PGLZ_strategy_default)) + return false; + + /* + * We recheck the actual size even if pglz_compress() reports success, + * because it might be satisfied with having saved as little as one byte + * in the compressed data --- which could turn into a net loss once you + * consider header and alignment padding. Worst case, the compressed + * format might require three padding bytes (plus header, which is + * included in VARSIZE(buf)), whereas the uncompressed format would take + * only one header byte and no padding if the value is short enough. So + * we insist on a savings of more than 2 bytes to ensure we have a gain. + */ + *len = VARSIZE((struct varlena *) dest); + if (*len >= orig_len - 2) + return false; + return true; +} + +/* + * Determine whether the buffer referenced has to be backed up. 
 * * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites @@ -893,4 +997,9 @@ InitXLogInsert(void) if (hdr_scratch == NULL) hdr_scratch = MemoryContextAllocZero(xloginsert_cxt, HEADER_SCRATCH_SIZE); + + /* allocate scratch buffer used for compression of block images */ + if (compression_scratch == NULL) + compression_scratch = MemoryContextAllocZero(xloginsert_cxt, + BLCKSZ); } diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 67d6223..e483288 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -20,6 +20,7 @@ #include "access/xlog_internal.h" #include "access/xlogreader.h" #include "catalog/pg_control.h" +#include "common/pg_lzcompress.h" static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength); @@ -74,13 +75,15 @@ XLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data) state->max_block_id = -1; /* - * Permanently allocate readBuf. We do it this way, rather than just - * making a static array, for two reasons: (1) no need to waste the - * storage in most instantiations of the backend; (2) a static char array - * isn't guaranteed to have any particular alignment, whereas palloc() - * will provide MAXALIGN'd storage. + * Permanently allocate readBuf and compressBuf. We do it this way, + * rather than just making a static array, for two reasons: + * (1) no need to waste the storage in most instantiations of the + * backend; (2) a static char array isn't guaranteed to have any + * particular alignment, whereas palloc() will provide MAXALIGN'd + * storage. 
*/ state->readBuf = (char *) palloc(XLOG_BLCKSZ); + state->compressBuf = (char *) palloc(BLCKSZ); state->read_page = pagereadfunc; /* system_identifier initialized to zeroes above */ @@ -98,6 +101,7 @@ XLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data) { pfree(state->errormsg_buf); pfree(state->readBuf); + pfree(state->compressBuf); pfree(state); return NULL; } @@ -125,6 +129,7 @@ XLogReaderFree(XLogReaderState *state) if (state->readRecordBuf) pfree(state->readRecordBuf); pfree(state->readBuf); + pfree(state->compressBuf); pfree(state); } @@ -923,6 +928,7 @@ ResetDecoder(XLogReaderState *state) state->blocks[block_id].in_use = false; state->blocks[block_id].has_image = false; state->blocks[block_id].has_data = false; + state->blocks[block_id].is_compressed = false; } state->max_block_id = -1; } @@ -1032,9 +1038,12 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) if (blk->has_image) { - COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); - COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16)); - datatotal += BLCKSZ - blk->hole_length; + XLogRecordBlockImageHeader bkp_info; + COPY_HEADER_FIELD(&bkp_info, sizeof(XLogRecordBlockImageHeader)); + blk->is_compressed = bkp_info.is_compressed; + blk->bkp_len = bkp_info.length; + blk->hole_offset = bkp_info.hole_offset; + datatotal += blk->bkp_len; } if (!(fork_flags & BKPBLOCK_SAME_REL)) { @@ -1088,8 +1097,17 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) continue; if (blk->has_image) { + /* + * Peek into the block image to grab the original length of the + * compressed image. 
+ */ + if (blk->is_compressed) + blk->bkp_uncompress_len = PGLZ_RAW_SIZE((PGLZ_Header *) ptr); + else + blk->bkp_uncompress_len = blk->bkp_len; + blk->bkp_image = ptr; - ptr += BLCKSZ - blk->hole_length; + ptr += blk->bkp_len; } if (blk->has_data) { @@ -1195,6 +1213,8 @@ bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) { DecodedBkpBlock *bkpb; + char *block_image; + int hole_length; if (!record->blocks[block_id].in_use) return false; @@ -1202,19 +1222,44 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) return false; bkpb = &record->blocks[block_id]; + block_image = bkpb->bkp_image; + + /* + * Fetch page data, with different processing depending on if the + * page is compressed or not. + */ + if (bkpb->is_compressed) + { + PGLZ_Header *header = (PGLZ_Header *) block_image; + + if (!pglz_decompress(header, record->compressBuf)) + { + report_invalid_record(record, "invalid compressed image at %X/%X, block %d", + (uint32) (record->ReadRecPtr >> 32), + (uint32) record->ReadRecPtr, + block_id); + return false; + } + + block_image = record->compressBuf; + hole_length = BLCKSZ - PGLZ_RAW_SIZE(header); + } + else + hole_length = BLCKSZ - bkpb->bkp_len; - if (bkpb->hole_length == 0) + /* generate page, taking into account hole if necessary */ + if (hole_length == 0) { - memcpy(page, bkpb->bkp_image, BLCKSZ); + memcpy(page, block_image, BLCKSZ); } else { - memcpy(page, bkpb->bkp_image, bkpb->hole_offset); + memcpy(page, block_image, bkpb->hole_offset); /* must zero-fill the hole */ - MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length); - memcpy(page + (bkpb->hole_offset + bkpb->hole_length), - bkpb->bkp_image + bkpb->hole_offset, - BLCKSZ - (bkpb->hole_offset + bkpb->hole_length)); + MemSet(page + bkpb->hole_offset, 0, hole_length); + memcpy(page + (bkpb->hole_offset + hole_length), + block_image + bkpb->hole_offset, + BLCKSZ - (bkpb->hole_offset + hole_length)); } return true; diff --git a/src/backend/utils/misc/guc.c 
b/src/backend/utils/misc/guc.c index 77c3494..de17e29 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -927,6 +927,15 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"wal_compression", PGC_USERSET, WAL_SETTINGS, + gettext_noop("Compresses full-page writes written in WAL file."), + NULL + }, + &wal_compression, + false, + NULL, NULL, NULL + }, { {"log_checkpoints", PGC_SIGHUP, LOGGING_WHAT, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index b053659..b367e2c 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -186,6 +186,7 @@ # fsync_writethrough # open_sync #full_page_writes = on # recover from partial page writes +#wal_compression = off # enable compression of full-page writes #wal_log_hints = off # also do full page writes of non-critical updates # (change requires restart) #wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d06fbc0..6bdfa4a 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -98,6 +98,7 @@ extern char *XLogArchiveCommand; extern bool EnableHotStandby; extern bool fullPageWrites; extern bool wal_log_hints; +extern bool wal_compression; extern bool log_checkpoints; /* WAL levels */ diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index eb6cc89..e13b796 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -52,9 +52,11 @@ typedef struct /* Information on full-page image, if any */ bool has_image; + bool is_compressed; char *bkp_image; + uint16 bkp_len; + uint16 bkp_uncompress_len; uint16 hole_offset; - uint16 hole_length; /* Buffer holding the rmgr-specific data associated with this block */ bool has_data; @@ -138,6 +140,9 @@ struct XLogReaderState /* Buffer for currently read page (XLOG_BLCKSZ 
bytes) */ char *readBuf; + /* Scratch buffer used for uncompressed pages */ + char *compressBuf; + /* last read segment, segment offset, read length, TLI */ XLogSegNo readSegNo; uint32 readOff; diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index e5826ee..3ccd69d 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -98,17 +98,37 @@ typedef struct XLogRecordBlockHeader * Additional header information when a full-page image is included * (i.e. when BKPBLOCK_HAS_IMAGE is set). * - * As a trivial form of data compression, the XLOG code is aware that - * PG data pages usually contain an unused "hole" in the middle, which - * contains only zero bytes. If hole_length > 0 then we have removed - * such a "hole" from the stored data (and it's not counted in the - * XLOG record's CRC, either). Hence, the amount of block data actually - * present is BLCKSZ - hole_length bytes. + * Block images are able to do several types of compression: + * - When wal_compression is off, as a trivial form of compression, the + * XLOG code is aware that PG data pages usually contain an unused "hole" + * in the middle, which contains only zero bytes. If length < BLCKSZ + * then we have removed such a "hole" from the stored data (and it is + * not counted in the XLOG record's CRC, either). Hence, the amount + * of block data actually present is "length" bytes. The hole "offset" + * on page is defined using "hole_offset". + * - When wal_compression is enabled, block images are compressed + * using a compression algorithm without their hole to improve + * compression process of the page. "length" corresponds in this case + * to the length of the block compressed, the original length of the + * block without its page hole being deducible from the compressed data + * itself. "hole_offset" is the hole offset of the page. + * + * "is_compressed" is used to identify if a given block image is compressed + * or not. 
Maximum page size allowed on the system being 32k, the hole + offset cannot be more than 15-bit long so the last free bit is used to + store the compression state of block image. If the maximum page size + allowed is increased to a value higher than that, we should consider + increasing this structure size as well, but this would increase the + length of block header in WAL records with alignment. */ typedef struct XLogRecordBlockImageHeader { - uint16 hole_offset; /* number of bytes before "hole" */ - uint16 hole_length; /* number of bytes in "hole" */ + uint16 length; /* length of block data in record. If compressed + * this is the length of compressed block. If not + * compressed, this is the length of page without + * its hole */ + uint16 hole_offset:15, /* number of bytes before "hole" */ + is_compressed:1; /* compression status of image */ } XLogRecordBlockImageHeader; #define SizeOfXLogRecordBlockImageHeader sizeof(XLogRecordBlockImageHeader) diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 465281c..f86f028 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -35,7 +35,9 @@ to have large tuples, since fields can be spread across multiple tuples). BLCKSZ must be a power of 2. The maximum possible value of BLCKSZ is currently 2^15 (32768). This is determined by the 15-bit widths of the - lp_off and lp_len fields in ItemIdData (see include/storage/itemid.h). + lp_off and lp_len fields in ItemIdData (see include/storage/itemid.h) and + XLogRecordBlockImageHeader where page hole offset is limited to 15-bit + length (see src/include/access/xlogrecord.h). Changing BLCKSZ requires an initdb. */ #undef BLCKSZ -- 2.2.1
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers