Hi, Recently I've heard people complaining that Postgres doesn't expose any statistics about how many full page writes happened during a given time frame. Indeed, I couldn't find any easy way to get that number, and judging from my understanding of xloginsert.c, it can actually be tracked per database with the attached PoC patch.
I guess it can be implemented in a more efficient and optimized way, but with what I have right now, the first naive pgbench tests show a slowdown of about 3%. Before I dig into it more, it would be nice to hear your opinion about this idea - does it make sense to have something like this?
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 5bea073..64e450c 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -31,6 +31,7 @@ #include "storage/proc.h" #include "utils/memutils.h" #include "pg_trace.h" +#include "pgstat.h" /* Buffer size required to store a compressed version of backup block image */ #define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) @@ -103,6 +104,15 @@ static int max_rdatas; /* allocated size */ static bool begininsert_called = false; +#define FPW_COUNTER_HASH_SIZE 100 + +typedef struct { + Oid db; + PgStat_Counter counter; +} FpwCounterEntry; + +static HTAB *fpwCounterStatHash = NULL; + /* Memory context to hold the registered buffer and data references. */ static MemoryContext xloginsert_cxt; @@ -192,7 +202,9 @@ XLogEnsureRecordSpace(int max_block_id, int ndatas) void XLogResetInsertion(void) { - int i; + int i; + HASH_SEQ_STATUS fstat; + FpwCounterEntry *fpwEntry; for (i = 0; i < max_registered_block_id; i++) registered_buffers[i].in_use = false; @@ -203,6 +215,14 @@ XLogResetInsertion(void) mainrdata_last = (XLogRecData *) &mainrdata_head; curinsert_flags = 0; begininsert_called = false; + + + hash_seq_init(&fstat, fpwCounterStatHash); + while ((fpwEntry = (FpwCounterEntry *) hash_seq_search(&fstat)) != NULL) + { + pgstat_report_fpw(fpwEntry->db, fpwEntry->counter); + fpwEntry->counter = 0; + } } /* @@ -584,7 +604,20 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, if (include_image) { Page page = regbuf->page; + bool found = false; uint16 compressed_len = 0; + FpwCounterEntry *fpwEntry; + Oid dbOid = regbuf->rnode.dbNode; + + fpwEntry = (FpwCounterEntry *) hash_search(fpwCounterStatHash, + &dbOid, HASH_ENTER, &found); + if (!found) + { + fpwEntry->counter = 0; + fpwEntry->db = dbOid; + } + + fpwEntry->counter++; /* * The page needs to be backed up, so calculate its hole length @@ -1055,4 +1088,16 @@ InitXLogInsert(void) if (hdr_scratch == 
NULL) hdr_scratch = MemoryContextAllocZero(xloginsert_cxt, HEADER_SCRATCH_SIZE); + + if (fpwCounterStatHash == NULL) + { + HASHCTL hash_ctl; + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(FpwCounterEntry); + + fpwCounterStatHash = hash_create("Full page write counter hask", + FPW_COUNTER_HASH_SIZE, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_PREALLOC | HASH_FIXED_SIZE); + } } diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 8cd8bf4..6ee5692 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -822,7 +822,8 @@ CREATE VIEW pg_stat_database AS pg_stat_get_db_deadlocks(D.oid) AS deadlocks, pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time, pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time, - pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset + pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset, + pg_stat_get_db_fpw(D.oid) AS fpw FROM pg_database D; CREATE VIEW pg_stat_database_conflicts AS diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 084573e..9606bb7 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -336,6 +336,7 @@ static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len); static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len); static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len); static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len); +static void pgstat_recv_fpw(PgStat_MsgFpw *msg, int len); /* ------------------------------------------------------------ * Public functions called from postmaster follow @@ -3210,6 +3211,26 @@ pgstat_report_xact_timestamp(TimestampTz tstamp) pgstat_increment_changecount_after(beentry); } +/* -------- + * pgstat_report_fpw() - + * + * Tell the collector about full page writes. 
+ * -------- + */ +void +pgstat_report_fpw(Oid dboid, PgStat_Counter fpwCounter) +{ + PgStat_MsgFpw msg; + + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FPW); + msg.m_databaseid = dboid; + msg.m_fpw_counter = fpwCounter; + pgstat_send(&msg, sizeof(msg)); +} + /* ---------- * pgstat_read_current_status() - * @@ -4455,6 +4476,10 @@ PgstatCollectorMain(int argc, char *argv[]) pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len); break; + case PGSTAT_MTYPE_FPW: + pgstat_recv_fpw((PgStat_MsgFpw *) &msg, len); + break; + default: break; } @@ -6197,6 +6222,23 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len) } /* ---------- + * pgstat_recv_fpw() - + * + * Process a FPW message. + * ---------- + */ +static void +pgstat_recv_fpw(PgStat_MsgFpw *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + dbentry->n_fpw += msg->m_fpw_counter; +} + + +/* ---------- * pgstat_recv_tempfile() - * * Process a TEMPFILE message. 
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 6110e40..7f0dcad 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1504,6 +1504,21 @@ pg_stat_get_db_blk_write_time(PG_FUNCTION_ARGS) } Datum +pg_stat_get_db_fpw(PG_FUNCTION_ARGS) +{ + Oid dbid = PG_GETARG_OID(0); + int64 result; + PgStat_StatDBEntry *dbentry; + + if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) + result = 0; + else + result = (int64) (dbentry->n_fpw); + + PG_RETURN_INT64(result); +} + +Datum pg_stat_get_bgwriter_timed_checkpoints(PG_FUNCTION_ARGS) { PG_RETURN_INT64(pgstat_fetch_global()->timed_checkpoints); diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 785e0fa..14c92f8 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -517,6 +517,7 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) * space if the caller correctly estimates a small table size. 
*/ if ((flags & HASH_SHARED_MEM) || + (flags & HASH_PREALLOC) || nelem < hctl->nelem_alloc) { int i, diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index f643f56..5c1b9da 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5344,6 +5344,10 @@ proname => 'pg_stat_get_db_blk_write_time', provolatile => 's', proparallel => 'r', prorettype => 'float8', proargtypes => 'oid', prosrc => 'pg_stat_get_db_blk_write_time' }, +{ oid => '3423', descr => 'statistics: number of full page writes for database', + proname => 'pg_stat_get_db_fpw', provolatile => 's', proparallel => 'r', + prorettype => 'int8', proargtypes => 'oid', + prosrc => 'pg_stat_get_db_fpw' }, { oid => '3195', descr => 'statistics: information about WAL archiver', proname => 'pg_stat_get_archiver', proisstrict => 'f', provolatile => 's', proparallel => 'r', prorettype => 'record', proargtypes => '', diff --git a/src/include/pgstat.h b/src/include/pgstat.h index be2f592..2d67dc3 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -64,7 +64,8 @@ typedef enum StatMsgType PGSTAT_MTYPE_FUNCPURGE, PGSTAT_MTYPE_RECOVERYCONFLICT, PGSTAT_MTYPE_TEMPFILE, - PGSTAT_MTYPE_DEADLOCK + PGSTAT_MTYPE_DEADLOCK, + PGSTAT_MTYPE_FPW } StatMsgType; /* ---------- @@ -530,6 +531,17 @@ typedef struct PgStat_MsgDeadlock Oid m_databaseid; } PgStat_MsgDeadlock; +/* ---------- + * PgStat_MsgFpw Sent by the backend to tell the collector + * about a fpw that occurred. + * ---------- + */ +typedef struct PgStat_MsgFpw +{ + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + PgStat_Counter m_fpw_counter; +} PgStat_MsgFpw; /* ---------- * PgStat_Msg Union over all possible messages. 
@@ -595,6 +607,7 @@ typedef struct PgStat_StatDBEntry PgStat_Counter n_deadlocks; PgStat_Counter n_block_read_time; /* times in microseconds */ PgStat_Counter n_block_write_time; + PgStat_Counter n_fpw; TimestampTz stat_reset_timestamp; TimestampTz stats_timestamp; /* time of db stats file update */ @@ -1196,6 +1209,7 @@ extern void pgstat_report_activity(BackendState state, const char *cmd_str); extern void pgstat_report_tempfile(size_t filesize); extern void pgstat_report_appname(const char *appname); extern void pgstat_report_xact_timestamp(TimestampTz tstamp); +extern void pgstat_report_fpw(Oid dboid, PgStat_Counter fpwCounter); extern const char *pgstat_get_wait_event(uint32 wait_event_info); extern const char *pgstat_get_wait_event_type(uint32 wait_event_info); extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser); diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h index 8357faa..5bf04a2 100644 --- a/src/include/utils/hsearch.h +++ b/src/include/utils/hsearch.h @@ -94,6 +94,7 @@ typedef struct HASHCTL #define HASH_SHARED_MEM 0x0800 /* Hashtable is in shared memory */ #define HASH_ATTACH 0x1000 /* Do not initialize hctl */ #define HASH_FIXED_SIZE 0x2000 /* Initial size is a hard limit */ +#define HASH_PREALLOC 0x4000 /* Preallocate hashtable */ /* max_dsize value to indicate expansible directory */ diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index ae0cd25..acb5bbe 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1811,7 +1811,8 @@ pg_stat_database| SELECT d.oid AS datid, pg_stat_get_db_deadlocks(d.oid) AS deadlocks, pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time, pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time, - pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset + pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset, + pg_stat_get_db_fpw(d.oid) AS fpw FROM pg_database d; pg_stat_database_conflicts| SELECT d.oid 
AS datid, d.datname,