Hi,

Recently I've heard people complain that Postgres doesn't expose any
statistics about how many full page writes happened during a given time frame.
Indeed, I couldn't find any easy way to get this information, and judging from
my understanding of xloginsert.c, it can actually be tracked per database, as
the attached PoC patch does.

I guess it can be implemented in a more efficient and optimized way, but with
what I have right now, the first naive pgbench tests show a slowdown of about 3%.
Before I dig into it further, it would be nice to hear your opinion about this
idea - does it make sense to have something like this?
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 5bea073..64e450c 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -31,6 +31,7 @@
 #include "storage/proc.h"
 #include "utils/memutils.h"
 #include "pg_trace.h"
+#include "pgstat.h"
 
 /* Buffer size required to store a compressed version of backup block image */
 #define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ)
@@ -103,6 +104,15 @@ static int	max_rdatas;			/* allocated size */
 
 static bool begininsert_called = false;
 
+#define FPW_COUNTER_HASH_SIZE 100
+
+typedef struct {
+	Oid				db;			/* hash key: database OID of the logged page */
+	PgStat_Counter	counter;	/* full page images counted since last report */
+} FpwCounterEntry;
+
+static HTAB *fpwCounterStatHash = NULL;
+
 /* Memory context to hold the registered buffer and data references. */
 static MemoryContext xloginsert_cxt;
 
@@ -192,7 +202,9 @@ XLogEnsureRecordSpace(int max_block_id, int ndatas)
 void
 XLogResetInsertion(void)
 {
-	int			i;
+	int				i;
+	HASH_SEQ_STATUS fstat;
+	FpwCounterEntry *fpwEntry;
 
 	for (i = 0; i < max_registered_block_id; i++)
 		registered_buffers[i].in_use = false;
@@ -203,6 +215,14 @@ XLogResetInsertion(void)
 	mainrdata_last = (XLogRecData *) &mainrdata_head;
 	curinsert_flags = 0;
 	begininsert_called = false;
+
+
+	hash_seq_init(&fstat, fpwCounterStatHash);
+	while ((fpwEntry = (FpwCounterEntry *) hash_seq_search(&fstat)) != NULL)
+	{
+		pgstat_report_fpw(fpwEntry->db, fpwEntry->counter);
+		fpwEntry->counter = 0;
+	}
 }
 
 /*
@@ -584,7 +604,20 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
 		if (include_image)
 		{
 			Page		page = regbuf->page;
+			bool		found = false;
 			uint16		compressed_len = 0;
+			FpwCounterEntry *fpwEntry;
+			Oid			dbOid = regbuf->rnode.dbNode;
+
+			fpwEntry = (FpwCounterEntry *) hash_search(fpwCounterStatHash,
+													   &dbOid, HASH_ENTER, &found);
+			if (!found)
+			{
+				fpwEntry->counter = 0;
+				fpwEntry->db = dbOid;
+			}
+
+			fpwEntry->counter++;
 
 			/*
 			 * The page needs to be backed up, so calculate its hole length
@@ -1055,4 +1088,16 @@ InitXLogInsert(void)
 	if (hdr_scratch == NULL)
 		hdr_scratch = MemoryContextAllocZero(xloginsert_cxt,
 											 HEADER_SCRATCH_SIZE);
+
+	if (fpwCounterStatHash == NULL)
+	{
+		HASHCTL hash_ctl;
+		memset(&hash_ctl, 0, sizeof(hash_ctl));
+		hash_ctl.keysize = sizeof(Oid);
+		hash_ctl.entrysize = sizeof(FpwCounterEntry);
+
+		fpwCounterStatHash = hash_create("Full page write counter hask",
+										 FPW_COUNTER_HASH_SIZE, &hash_ctl,
+										 HASH_ELEM | HASH_BLOBS | HASH_PREALLOC | HASH_FIXED_SIZE);
+	}
 }
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 8cd8bf4..6ee5692 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -822,7 +822,8 @@ CREATE VIEW pg_stat_database AS
             pg_stat_get_db_deadlocks(D.oid) AS deadlocks,
             pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time,
             pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time,
-            pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset
+            pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset,
+            pg_stat_get_db_fpw(D.oid) AS fpw
     FROM pg_database D;
 
 CREATE VIEW pg_stat_database_conflicts AS
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 084573e..9606bb7 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -336,6 +336,7 @@ static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
 static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
 static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
+static void pgstat_recv_fpw(PgStat_MsgFpw *msg, int len);
 
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
@@ -3210,6 +3211,26 @@ pgstat_report_xact_timestamp(TimestampTz tstamp)
 	pgstat_increment_changecount_after(beentry);
 }
 
+/* --------
+ * pgstat_report_fpw() -
+ *
+ *	Tell the collector how many full page writes a database had, if any.
+ * --------
+ */
+void
+pgstat_report_fpw(Oid dboid, PgStat_Counter fpwCounter)
+{
+	PgStat_MsgFpw msg;
+
+	if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts || fpwCounter == 0)
+		return;					/* no collector, or a zero delta changes nothing */
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FPW);
+	msg.m_databaseid = dboid;
+	msg.m_fpw_counter = fpwCounter;
+	pgstat_send(&msg, sizeof(msg));
+}
+
 /* ----------
  * pgstat_read_current_status() -
  *
@@ -4455,6 +4476,10 @@ PgstatCollectorMain(int argc, char *argv[])
 					pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len);
 					break;
 
+				case PGSTAT_MTYPE_FPW:
+					pgstat_recv_fpw((PgStat_MsgFpw *) &msg, len);
+					break;
+
 				default:
 					break;
 			}
@@ -6197,6 +6222,23 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
 }
 
 /* ----------
+ * pgstat_recv_fpw() -
+ *
+ *	Add the count carried by a FPW message to its database's n_fpw total.
+ * ----------
+ */
+static void
+pgstat_recv_fpw(PgStat_MsgFpw *msg, int len)
+{
+	Oid			target_db = msg->m_databaseid;
+	PgStat_StatDBEntry *db_stats = pgstat_get_db_entry(target_db, true);
+
+	/* The sender zeroes its counter after reporting, so this is a delta. */
+	db_stats->n_fpw += msg->m_fpw_counter;
+}
+
+
+/* ----------
  * pgstat_recv_tempfile() -
  *
  *	Process a TEMPFILE message.
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 6110e40..7f0dcad 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1504,6 +1504,21 @@ pg_stat_get_db_blk_write_time(PG_FUNCTION_ARGS)
 }
 
 Datum
+pg_stat_get_db_fpw(PG_FUNCTION_ARGS)
+{
+	Oid			dbid = PG_GETARG_OID(0);
+	PgStat_StatDBEntry *stats;
+	int64		fpw_total = 0;
+
+	/* A database with no stats entry reports zero full page writes. */
+	stats = pgstat_fetch_stat_dbentry(dbid);
+	if (stats != NULL)
+		fpw_total = (int64) (stats->n_fpw);
+
+	PG_RETURN_INT64(fpw_total);
+}
+
+Datum
 pg_stat_get_bgwriter_timed_checkpoints(PG_FUNCTION_ARGS)
 {
 	PG_RETURN_INT64(pgstat_fetch_global()->timed_checkpoints);
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 785e0fa..14c92f8 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -517,6 +517,7 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 	 * space if the caller correctly estimates a small table size.
 	 */
 	if ((flags & HASH_SHARED_MEM) ||
+		(flags & HASH_PREALLOC)   ||
 		nelem < hctl->nelem_alloc)
 	{
 		int			i,
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index f643f56..5c1b9da 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5344,6 +5344,10 @@
   proname => 'pg_stat_get_db_blk_write_time', provolatile => 's',
   proparallel => 'r', prorettype => 'float8', proargtypes => 'oid',
   prosrc => 'pg_stat_get_db_blk_write_time' },
+{ oid => '3423', descr => 'statistics: number of full page writes for database',
+  proname => 'pg_stat_get_db_fpw', provolatile => 's', proparallel => 'r',
+  prorettype => 'int8', proargtypes => 'oid',
+  prosrc => 'pg_stat_get_db_fpw' },
 { oid => '3195', descr => 'statistics: information about WAL archiver',
   proname => 'pg_stat_get_archiver', proisstrict => 'f', provolatile => 's',
   proparallel => 'r', prorettype => 'record', proargtypes => '',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index be2f592..2d67dc3 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -64,7 +64,8 @@ typedef enum StatMsgType
 	PGSTAT_MTYPE_FUNCPURGE,
 	PGSTAT_MTYPE_RECOVERYCONFLICT,
 	PGSTAT_MTYPE_TEMPFILE,
-	PGSTAT_MTYPE_DEADLOCK
+	PGSTAT_MTYPE_DEADLOCK,
+	PGSTAT_MTYPE_FPW
 } StatMsgType;
 
 /* ----------
@@ -530,6 +531,17 @@ typedef struct PgStat_MsgDeadlock
 	Oid			m_databaseid;
 } PgStat_MsgDeadlock;
 
+/* ----------
+ * PgStat_MsgFpw			Sent by the backend to tell the collector
+ *								how many full page writes occurred.
+ * ----------
+ */
+typedef struct PgStat_MsgFpw
+{
+	PgStat_MsgHdr	m_hdr;
+	Oid				m_databaseid;	/* database the writes are counted for */
+	PgStat_Counter	m_fpw_counter;	/* count the collector adds to n_fpw */
+} PgStat_MsgFpw;
 
 /* ----------
  * PgStat_Msg					Union over all possible messages.
@@ -595,6 +607,7 @@ typedef struct PgStat_StatDBEntry
 	PgStat_Counter n_deadlocks;
 	PgStat_Counter n_block_read_time;	/* times in microseconds */
 	PgStat_Counter n_block_write_time;
+	PgStat_Counter n_fpw;
 
 	TimestampTz stat_reset_timestamp;
 	TimestampTz stats_timestamp;	/* time of db stats file update */
@@ -1196,6 +1209,7 @@ extern void pgstat_report_activity(BackendState state, const char *cmd_str);
 extern void pgstat_report_tempfile(size_t filesize);
 extern void pgstat_report_appname(const char *appname);
 extern void pgstat_report_xact_timestamp(TimestampTz tstamp);
+extern void pgstat_report_fpw(Oid dboid, PgStat_Counter fpwCounter);
 extern const char *pgstat_get_wait_event(uint32 wait_event_info);
 extern const char *pgstat_get_wait_event_type(uint32 wait_event_info);
 extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser);
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h
index 8357faa..5bf04a2 100644
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -94,6 +94,7 @@ typedef struct HASHCTL
 #define HASH_SHARED_MEM 0x0800	/* Hashtable is in shared memory */
 #define HASH_ATTACH		0x1000	/* Do not initialize hctl */
 #define HASH_FIXED_SIZE 0x2000	/* Initial size is a hard limit */
+#define HASH_PREALLOC   0x4000	/* Preallocate hashtable */
 
 
 /* max_dsize value to indicate expansible directory */
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index ae0cd25..acb5bbe 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1811,7 +1811,8 @@ pg_stat_database| SELECT d.oid AS datid,
     pg_stat_get_db_deadlocks(d.oid) AS deadlocks,
     pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time,
     pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time,
-    pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset
+    pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset,
+    pg_stat_get_db_fpw(d.oid) AS fpw
    FROM pg_database d;
 pg_stat_database_conflicts| SELECT d.oid AS datid,
     d.datname,

Reply via email to