Re: pg_walinspect - a new extension to get raw WAL data and WAL stats

Andres Freund Fri, 18 Mar 2022 16:48:54 -0700

Hi,

First look at this patch, so I might be repeating stuff already commented on /
discussed.


On 2022-03-17 13:25:35 +0530, Bharath Rupireddy wrote:
> +--
> +-- pg_get_raw_wal_record()
> +--
> +CREATE FUNCTION pg_get_raw_wal_record(IN in_lsn pg_lsn,
> +    OUT start_lsn pg_lsn,
> +    OUT end_lsn pg_lsn,
> +    OUT prev_lsn pg_lsn,
> +    OUT record_length int4,
> +    OUT record bytea
> +)
> +AS 'MODULE_PATHNAME', 'pg_get_raw_wal_record'
> +LANGUAGE C CALLED ON NULL INPUT PARALLEL SAFE;

What is raw about the function?

Why "CALLED ON NULL INPUT"? It doesn't make sense to call the function with a
NULL lsn, does it?  Also, that's the default, why is it restated here?


> +REVOKE EXECUTE ON FUNCTION pg_get_raw_wal_record(pg_lsn) FROM PUBLIC;
> +GRANT EXECUTE ON FUNCTION pg_get_raw_wal_record(pg_lsn) TO 
> pg_read_server_files;
> +
> +--
> +-- pg_get_wal_record_info()
> +--
> +CREATE FUNCTION pg_get_wal_record_info(IN in_lsn pg_lsn,
> +    OUT start_lsn pg_lsn,
> +    OUT end_lsn pg_lsn,
> +    OUT prev_lsn pg_lsn,
> +    OUT xid xid,
> +    OUT resource_manager text,
> +    OUT record_length int4,
> +    OUT fpi_length int4,
> +     OUT description text,
> +    OUT block_ref text,
> +    OUT data_length int4,
> +    OUT data bytea
> +)
> +AS 'MODULE_PATHNAME', 'pg_get_wal_record_info'
> +LANGUAGE C CALLED ON NULL INPUT PARALLEL SAFE;
> +
> +REVOKE EXECUTE ON FUNCTION pg_get_wal_record_info(pg_lsn) FROM PUBLIC;
> +GRANT EXECUTE ON FUNCTION pg_get_wal_record_info(pg_lsn) TO pg_monitor;

I don't think it's appropriate for pg_monitor to see all the data in the WAL.

> +--
> +-- pg_get_wal_stats()
> +--
> +CREATE FUNCTION pg_get_wal_stats(IN start_lsn pg_lsn,
> +    IN end_lsn pg_lsn DEFAULT NULL,
> +    OUT resource_manager text,
> +    OUT count int8,
> +    OUT count_percentage float4,
> +    OUT record_length int8,
> +    OUT record_length_percentage float4,
> +    OUT fpi_length int8,
> +    OUT fpi_length_percentage float4
> +    )
> +RETURNS SETOF record AS $$
> +SELECT resource_manager,
> +    count(*) AS cnt,
> +    CASE WHEN count(*) > 0 THEN (count(*) * 100 / sum(count(*)) OVER 
> total)::numeric(5,2) ELSE 0 END AS "count_%",
> +    sum(record_length) AS trecl,
> +    CASE WHEN sum(record_length) > 0 THEN (sum(record_length) * 100 / 
> sum(sum(record_length)) OVER total)::numeric(5,2) ELSE 0 END AS "trecl_%",
> +    sum(fpi_length) AS tfpil,
> +    CASE WHEN sum(fpi_length) > 0 THEN (sum(fpi_length) * 100 / 
> sum(sum(fpi_length)) OVER total)::numeric(5,2) ELSE 0 END AS "tfpil_%"
> +FROM pg_get_wal_records_info(start_lsn, end_lsn)
> +GROUP BY resource_manager
> +WINDOW total AS ();
> +$$ LANGUAGE SQL CALLED ON NULL INPUT PARALLEL SAFE;

This seems like an exceedingly expensive way to compute this. Not just because
of doing the grouping, window etc, but also because it's serializing the
"data" field from pg_get_wal_records_info() just to never use it. With any
appreciatable amount of data the return value pg_get_wal_records_info() will
be serialized into a on-disk tuplestore.

This is probably close to an order of magnitude slower than pg_waldump
--stats. Which imo renders this largely useless.

The column names don't seem great either. "tfpil"?


> +/*
> + * Module load callback.
> + */
> +void
> +_PG_init(void)
> +{
> +     /* Define custom GUCs and install hooks here, if any. */
> +
> +     /*
> +      * Have EmitWarningsOnPlaceholders("pg_walinspect"); if custom GUCs are
> +      * defined.
> +      */
> +}
> +
> +/*
> + * Module unload callback.
> + */
> +void
> +_PG_fini(void)
> +{
> +     /* Uninstall hooks, if any. */
> +}

Why have this stuff if it's not used?


> +/*
> + * Validate given LSN and return the LSN up to which the server has WAL.
> + */
> +static XLogRecPtr
> +ValidateInputLSN(XLogRecPtr lsn)
> +{
> +     XLogRecPtr curr_lsn;
> +
> +     /* Validate input WAL LSN. */
> +     if (XLogRecPtrIsInvalid(lsn))
> +             ereport(ERROR,
> +                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> +                              errmsg("invalid WAL LSN")));
> +
> +     /*
> +      * We determine the current LSN of the server similar to how page_read
> +      * callback read_local_xlog_page does.
> +      */
> +     if (!RecoveryInProgress())
> +             curr_lsn = GetFlushRecPtr(NULL);
> +     else
> +             curr_lsn = GetXLogReplayRecPtr(NULL);
> +
> +     Assert(!XLogRecPtrIsInvalid(curr_lsn));
> +
> +     if (lsn >= curr_lsn)
> +     {
> +             /*
> +              * GetFlushRecPtr or GetXLogReplayRecPtr gives "end+1" LSN of 
> the last
> +              * record flushed or replayed respectively. But let's use the 
> LSN up
> +              * to "end" in user facing message.
> +              */
> +             ereport(ERROR,
> +                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> +                              errmsg("cannot accept future input LSN"),
> +                              errdetail("Last known WAL LSN on the database 
> system is %X/%X.",
> +                                                LSN_FORMAT_ARGS(curr_lsn - 
> 1))));
> +     }

> +     return curr_lsn;
> +}
> +
> +/*
> + * Validate given start LSN and end LSN, return the new end LSN in case user
> + * hasn't specified one.
> + */
> +static XLogRecPtr
> +ValidateStartAndEndLSNs(XLogRecPtr start_lsn, XLogRecPtr end_lsn)
> +{
> +     XLogRecPtr curr_lsn;
> +
> +     /* Validate WAL start LSN. */
> +     if (XLogRecPtrIsInvalid(start_lsn))
> +             ereport(ERROR,
> +                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> +                              errmsg("invalid WAL start LSN")));
> +
> +     if (!RecoveryInProgress())
> +             curr_lsn = GetFlushRecPtr(NULL);
> +     else
> +             curr_lsn = GetXLogReplayRecPtr(NULL);
> +
> +     if (start_lsn >= curr_lsn)
> +     {
> +             /*
> +              * GetFlushRecPtr or GetXLogReplayRecPtr gives "end+1" LSN of 
> the last
> +              * record flushed or replayed respectively. But let's use the 
> LSN up
> +              * to "end" in user facing message.
> +              */
> +             ereport(ERROR,
> +                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> +                              errmsg("cannot accept future start LSN"),
> +                              errdetail("Last known WAL LSN on the database 
> system is %X/%X.",
> +                                                LSN_FORMAT_ARGS(curr_lsn - 
> 1))));
> +     }

> +     /*
> +      * If end_lsn is specified, let's ensure that it's not a future LSN i.e.
> +      * something the database system doesn't know about.
> +      */
> +     if (!XLogRecPtrIsInvalid(end_lsn) &&
> +             (end_lsn >= curr_lsn))
> +     {
> +             /*
> +              * GetFlushRecPtr or GetXLogReplayRecPtr gives "end+1" LSN of 
> the last
> +              * record flushed or replayed respectively. But let's use the 
> LSN up
> +              * to "end" in user facing message.
> +              */
> +             ereport(ERROR,
> +                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> +                              errmsg("cannot accept future end LSN"),
> +                              errdetail("Last known WAL LSN on the database 
> system is %X/%X.",
> +                                                LSN_FORMAT_ARGS(curr_lsn - 
> 1))));
> +     }
> +
> +     /*
> +      * When end_lsn is not specified let's read up to the last WAL position
> +      * known to be on the server.
> +      */
> +     if (XLogRecPtrIsInvalid(end_lsn))
> +             end_lsn = curr_lsn;
> +
> +     Assert(!XLogRecPtrIsInvalid(end_lsn));
> +
> +     if (start_lsn >= end_lsn)
> +             ereport(ERROR,
> +                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> +                              errmsg("WAL start LSN must be less than end 
> LSN")));
> +
> +     return end_lsn;
> +}

These two functions are largely redundant, that doesn't seem great.


> +Datum
> +pg_get_raw_wal_record(PG_FUNCTION_ARGS)
> +{
> +#define PG_GET_RAW_WAL_RECORD_COLS 5
> +     XLogRecPtr      lsn;
> +     XLogRecord *record;
> +     XLogRecPtr      first_record;
> +     XLogReaderState *xlogreader;
> +     bytea   *raw_record;
> +     uint32  rec_len;
> +     char    *raw_record_data;
> +     TupleDesc       tupdesc;
> +     Datum   result;
> +     HeapTuple       tuple;
> +     Datum   values[PG_GET_RAW_WAL_RECORD_COLS];
> +     bool    nulls[PG_GET_RAW_WAL_RECORD_COLS];
> +     int     i = 0;
> +
> +     lsn = PG_GETARG_LSN(0);
> +
> +     (void) ValidateInputLSN(lsn);
> +
> +     /* Build a tuple descriptor for our result type. */
> +     if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
> +             elog(ERROR, "return type must be a row type");
> +
> +     xlogreader = InitXLogReaderState(lsn, &first_record);
> +
> +     Assert(xlogreader);
> +
> +     record = ReadNextXLogRecord(xlogreader, first_record);
> +
> +     rec_len = XLogRecGetTotalLen(xlogreader);
> +
> +     Assert(rec_len > 0);
> +

Most of this has another copy in pg_get_wal_record_info(). Can more of this be
deduplicated?


> +/*
> + * Get WAL record info.
> + */
> +static void
> +GetXLogRecordInfo(XLogReaderState *record, XLogRecPtr lsn,
> +                               Datum *values, bool *nulls, uint32 ncols)
> +{
> +     const char *id;
> +     const RmgrData *desc;
> +     uint32  fpi_len = 0;
> +     RelFileNode rnode;
> +     ForkNumber      forknum;
> +     BlockNumber blk;
> +     int     block_id;
> +     StringInfoData rec_desc;
> +     StringInfoData rec_blk_ref;
> +     StringInfoData temp;
> +     bytea   *data;
> +     char    *main_data;
> +     uint32  main_data_len;
> +     int     i = 0;
> +
> +     desc = &RmgrTable[XLogRecGetRmid(record)];
> +     initStringInfo(&rec_desc);
> +     id = desc->rm_identify(XLogRecGetInfo(record));
> +
> +     if (id == NULL)
> +             appendStringInfo(&rec_desc, "UNKNOWN (%x) ", 
> XLogRecGetInfo(record) & ~XLR_INFO_MASK);
> +     else
> +             appendStringInfo(&rec_desc, "%s ", id);
> +
> +     initStringInfo(&temp);
> +     desc->rm_desc(&temp, record);
> +     appendStringInfo(&rec_desc, "%s", temp.data);
> +     pfree(temp.data);
> +     initStringInfo(&rec_blk_ref);

This seems unnecessarily wasteful. You serialize into one stringinfo, just to
then copy that stringinfo into another stringinfo. Just to then allocate yet
another stringinfo.


> +     /* Block references (detailed format). */

This comment seems copied from pg_waldump, but doesn't make sense here,
because there's no short format.


> +     for (block_id = 0; block_id <= record->max_block_id; block_id++)
> +     {
> +             if (!XLogRecHasBlockRef(record, block_id))
> +                     continue;
> +
> +             XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
> +
> +             if (forknum != MAIN_FORKNUM)
> +                     appendStringInfo(&rec_blk_ref,
> +                                                     "blkref #%u: rel 
> %u/%u/%u fork %s blk %u",
> +                                                     block_id, 
> rnode.spcNode, rnode.dbNode,
> +                                                     rnode.relNode, 
> get_forkname(forknum), blk);
> +             else
> +                     appendStringInfo(&rec_blk_ref,
> +                                                     "blkref #%u: rel 
> %u/%u/%u blk %u",
> +                                                     block_id, 
> rnode.spcNode, rnode.dbNode,
> +                                                     rnode.relNode, blk);
> +
> +             if (XLogRecHasBlockImage(record, block_id))
> +             {
> +                     uint8           bimg_info = 
> record->blocks[block_id].bimg_info;
> +
> +                     /* Calculate the amount of FPI data in the record. */
> +                     fpi_len += record->blocks[block_id].bimg_len;
> +
> +                     if (BKPIMAGE_COMPRESSED(bimg_info))
> +                     {
> +                             const char *method;
> +
> +                             if ((bimg_info & BKPIMAGE_COMPRESS_PGLZ) != 0)
> +                                     method = "pglz";
> +                             else if ((bimg_info & BKPIMAGE_COMPRESS_LZ4) != 
> 0)
> +                                     method = "lz4";
> +                             else
> +                                     method = "unknown";
> +
> +                             appendStringInfo(&rec_blk_ref, " (FPW%s); hole: 
> offset: %u, length: %u, "
> +                                                              "compression 
> saved: %u, method: %s",
> +                                                              
> XLogRecBlockImageApply(record, block_id) ?
> +                                                              "" : " for WAL 
> verification",
> +                                                              
> record->blocks[block_id].hole_offset,
> +                                                              
> record->blocks[block_id].hole_length,
> +                                                              BLCKSZ -
> +                                                              
> record->blocks[block_id].hole_length -
> +                                                              
> record->blocks[block_id].bimg_len,
> +                                                              method);
> +                     }
> +                     else
> +                     {
> +                             appendStringInfo(&rec_blk_ref, " (FPW%s); hole: 
> offset: %u, length: %u",
> +                                                              
> XLogRecBlockImageApply(record, block_id) ?
> +                                                              "" : " for WAL 
> verification",
> +                                                              
> record->blocks[block_id].hole_offset,
> +                                                              
> record->blocks[block_id].hole_length);
> +                     }
> +             }
> +     }

To me duplicating this much code from waldump seems like a bad idea from a
maintainability POV.



> +/*
> + * Get info and data of all WAL records between start LSN and end LSN.
> + */
> +static void
> +GetWALRecordsInfoInternal(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
> +                                               XLogRecPtr end_lsn)
> +{
> +#define PG_GET_WAL_RECORDS_INFO_COLS 11
> +     XLogRecPtr      first_record;
> +     XLogReaderState *xlogreader;
> +     ReturnSetInfo *rsinfo;
> +     TupleDesc       tupdesc;
> +     Tuplestorestate *tupstore;
> +     MemoryContext per_query_ctx;
> +     MemoryContext oldcontext;
> +     Datum   values[PG_GET_WAL_RECORDS_INFO_COLS];
> +     bool    nulls[PG_GET_WAL_RECORDS_INFO_COLS];
> +
> +     rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
> +
> +     /* Check to see if caller supports us returning a tuplestore. */
> +     if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
> +             ereport(ERROR,
> +                             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
> +                              errmsg("set-valued function called in context 
> that cannot accept a set")));
> +     if (!(rsinfo->allowedModes & SFRM_Materialize))
> +             ereport(ERROR,
> +                             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
> +                              errmsg("materialize mode required, but it is 
> not allowed in this context")));
> +
> +     /* Build a tuple descriptor for our result type. */
> +     if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
> +             elog(ERROR, "return type must be a row type");
> +
> +     /* Build tuplestore to hold the result rows. */
> +     per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
> +     oldcontext = MemoryContextSwitchTo(per_query_ctx);
> +     tupstore = tuplestore_begin_heap(true, false, work_mem);
> +     rsinfo->returnMode = SFRM_Materialize;
> +     rsinfo->setResult = tupstore;
> +     rsinfo->setDesc = tupdesc;

This should likely use the infrastructure introduced in 
5b81703787bfc1e6072c8e37125eba0c5598b807.


> +     for (;;)
> +     {
> +             (void) ReadNextXLogRecord(xlogreader, first_record);
> +
> +             /*
> +              * Let's not show the record info if it is spanning more than 
> the
> +              * end_lsn. EndRecPtr is "end+1" of the last read record, hence
> +              * use "end" here.
> +              */
> +             if ((xlogreader->EndRecPtr - 1) <= end_lsn)
> +             {
> +                     GetXLogRecordInfo(xlogreader, xlogreader->currRecPtr, 
> values, nulls,
> +                                                       
> PG_GET_WAL_RECORDS_INFO_COLS);
> +
> +                     tuplestore_putvalues(tupstore, tupdesc, values, nulls);
> +             }
> +
> +             /* Exit loop if read up to end_lsn. */
> +             if (xlogreader->EndRecPtr >= end_lsn)
> +                     break;

Seems weird to have both of these conditions separately.


Greetings,

Andres Freund

Re: pg_walinspect - a new extension to get raw WAL data and WAL stats

Reply via email to