On 2021/04/21 15:08, torikoshia wrote:
> On 2021-04-16 10:27, Masahiro Ikeda wrote:
>> On 2021/04/13 9:33, Fujii Masao wrote:
>>>
>>>
>>> On 2021/03/30 20:37, Masahiro Ikeda wrote:
>>>> OK, I added the condition to the fast-return check. I noticed that I
>>>> misunderstood that the purpose is to avoid expanding a clock check
>>>> using WAL stats counters. But, the purpose is to make the conditions
>>>> stricter, right?
>>>
>>> Yes. Currently if the following condition is false even when the WAL
>>> counters are updated, nothing is sent to the stats collector. But with
>>> your patch, in this case the WAL stats are sent.
>>>
>>> if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) &&
>>> pgStatXactCommit == 0 && pgStatXactRollback == 0 &&
>>> !have_function_stats && !disconnect)
>>>
>>> Thanks for the patch! It now fails to be applied to the master
>>> cleanly. So could you rebase the patch?
>>
>> Thanks for your comments! I rebased it.
>
> Thanks for working on this!
Hi, thanks for your comments!
> I have some minor comments on
> performance-improvements-of-reporting-wal-stats-without-introducing-a-new-variable.patch.
>
>
>
>
> 177 @@ -3094,20 +3066,33 @@ pgstat_report_wal(void) 178 * Return true if
> the message is sent, and false otherwise.
>
> Since you changed the return value to void, it seems the description is not
> necessary anymore.
Right, I fixed it.
> 208 + * generate wal records. 'wal_writes' and 'wal_sync' are zero
> means the
>
> It may be better to change 'wal_writes' to 'wal_write' since single
> quotation seems to mean variable name.
Agreed.
> 234 + * set the counters related to generated WAL data if the
> counters are
>
>
> set -> Set?
Yes, I fixed.
> BTW, is it better to change PgStat_Counter from int64 to uint64 because>
> there aren't any counters with negative number?
Although this is not related to torikoshi-san's comment, the above my
understanding is not right. Some counters like delta_live_tuples,
delta_dead_tuples and changed_tuples can be negative.
Regards,
--
Masahiro Ikeda
NTT DATA CORPORATION
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index f42f07622e..1b925d31d2 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -173,21 +173,21 @@ typedef struct Counters
double sum_var_time[PGSS_NUMKIND]; /* sum of variances in
* planning/execution time in msec */
int64 rows; /* total # of retrieved or affected rows */
- int64 shared_blks_hit; /* # of shared buffer hits */
- int64 shared_blks_read; /* # of shared disk blocks read */
- int64 shared_blks_dirtied; /* # of shared disk blocks dirtied */
- int64 shared_blks_written; /* # of shared disk blocks written */
- int64 local_blks_hit; /* # of local buffer hits */
- int64 local_blks_read; /* # of local disk blocks read */
- int64 local_blks_dirtied; /* # of local disk blocks dirtied */
- int64 local_blks_written; /* # of local disk blocks written */
- int64 temp_blks_read; /* # of temp blocks read */
- int64 temp_blks_written; /* # of temp blocks written */
+ uint64 shared_blks_hit; /* # of shared buffer hits */
+ uint64 shared_blks_read; /* # of shared disk blocks read */
+ uint64 shared_blks_dirtied; /* # of shared disk blocks dirtied */
+ uint64 shared_blks_written; /* # of shared disk blocks written */
+ uint64 local_blks_hit; /* # of local buffer hits */
+ uint64 local_blks_read; /* # of local disk blocks read */
+ uint64 local_blks_dirtied; /* # of local disk blocks dirtied */
+ uint64 local_blks_written; /* # of local disk blocks written */
+ uint64 temp_blks_read; /* # of temp blocks read */
+ uint64 temp_blks_written; /* # of temp blocks written */
double blk_read_time; /* time spent reading, in msec */
double blk_write_time; /* time spent writing, in msec */
double usage; /* usage factor */
- int64 wal_records; /* # of WAL records generated */
- int64 wal_fpi; /* # of WAL full page images generated */
+ uint64 wal_records; /* # of WAL records generated */
+ uint64 wal_fpi; /* # of WAL full page images generated */
uint64 wal_bytes; /* total amount of WAL generated in bytes */
} Counters;
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 237e13361b..75ecd00c23 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -17,6 +17,12 @@
#include "executor/instrument.h"
+/*
+ * Buffer and generated WAL usage counters.
+ *
+ * The counters are accumulated values. There are infrastructures
+ * to add the values and calculate the difference within a specific period.
+ */
BufferUsage pgBufferUsage;
static BufferUsage save_pgBufferUsage;
WalUsage pgWalUsage;
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index e7e6a2a459..1761694a5b 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -505,7 +505,7 @@ CheckpointerMain(void)
pgstat_send_bgwriter();
/* Send WAL statistics to the stats collector. */
- pgstat_report_wal();
+ pgstat_send_wal(true);
/*
* If any checkpoint flags have been set, redo the loop to handle the
@@ -583,7 +583,7 @@ HandleCheckpointerInterrupts(void)
BgWriterStats.m_requested_checkpoints++;
ShutdownXLOG(0, 0);
pgstat_send_bgwriter();
- pgstat_report_wal();
+ pgstat_send_wal(true);
/* Normal exit from the checkpointer is here */
proc_exit(0); /* done */
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index e1ec7d8b7d..940663de20 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -133,8 +133,8 @@ PgStat_MsgWal WalStats;
/*
* WAL usage counters saved from pgWALUsage at the previous call to
- * pgstat_report_wal(). This is used to calculate how much WAL usage
- * happens between pgstat_report_wal() calls, by substracting
+ * pgstat_send_wal(). This is used to calculate how much WAL usage
+ * happens between pgstat_send_wal() calls, by substracting
* the previous counters from the current ones.
*/
static WalUsage prevWalUsage;
@@ -855,9 +855,19 @@ pgstat_report_stat(bool disconnect)
TabStatusArray *tsa;
int i;
- /* Don't expend a clock check if nothing to do */
+ /*
+ * Don't expend a clock check if nothing to do.
+ *
+ * Note: regarding to WAL statistics counters, it's not enough to check
+ * only whether the wal record is generated or not. If a read transaction
+ * is executed, wal records aren't nomally generated (although HOT makes
+ * wal records). But, just writes and syncs the wal data may happen when
+ * to flush buffers.
+ */
if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) &&
pgStatXactCommit == 0 && pgStatXactRollback == 0 &&
+ pgWalUsage.wal_records == prevWalUsage.wal_records &&
+ walStats.wal_write == 0 && walStats.wal_sync == 0 &&
!have_function_stats && !disconnect)
return;
@@ -951,7 +961,7 @@ pgstat_report_stat(bool disconnect)
pgstat_send_funcstats();
/* Send WAL statistics */
- pgstat_report_wal();
+ pgstat_send_wal(true);
/* Finally send SLRU statistics */
pgstat_send_slru();
@@ -2934,7 +2944,7 @@ void
pgstat_initialize(void)
{
/*
- * Initialize prevWalUsage with pgWalUsage so that pgstat_report_wal() can
+ * Initialize prevWalUsage with pgWalUsage so that pgstat_send_wal() can
* calculate how much pgWalUsage counters are increased by substracting
* prevWalUsage from pgWalUsage.
*/
@@ -3046,44 +3056,6 @@ pgstat_send_bgwriter(void)
MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
}
-/* ----------
- * pgstat_report_wal() -
- *
- * Calculate how much WAL usage counters are increased and send
- * WAL statistics to the collector.
- *
- * Must be called by processes that generate WAL.
- * ----------
- */
-void
-pgstat_report_wal(void)
-{
- WalUsage walusage;
-
- /*
- * Calculate how much WAL usage counters are increased by substracting the
- * previous counters from the current ones. Fill the results in WAL stats
- * message.
- */
- MemSet(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &prevWalUsage);
-
- WalStats.m_wal_records = walusage.wal_records;
- WalStats.m_wal_fpi = walusage.wal_fpi;
- WalStats.m_wal_bytes = walusage.wal_bytes;
-
- /*
- * Send WAL stats message to the collector.
- */
- if (!pgstat_send_wal(true))
- return;
-
- /*
- * Save the current counters for the subsequent calculation of WAL usage.
- */
- prevWalUsage = pgWalUsage;
-}
-
/* ----------
* pgstat_send_wal() -
*
@@ -3091,24 +3063,35 @@ pgstat_report_wal(void)
*
* If 'force' is not set, WAL stats message is only sent if enough time has
* passed since last one was sent to reach PGSTAT_STAT_INTERVAL.
- *
- * Return true if the message is sent, and false otherwise.
* ----------
*/
-bool
+void
pgstat_send_wal(bool force)
{
- /* We assume this initializes to zeroes */
- static const PgStat_MsgWal all_zeroes;
static TimestampTz sendTime = 0;
/*
- * This function can be called even if nothing at all has happened. In
- * this case, avoid sending a completely empty message to the stats
- * collector.
+ * First, to check the WAL stats counters were updated.
+ *
+ * Even if the 'force' is true, we don't need to send the stats if the
+ * counters were not updated.
+ *
+ * We can know whether the counters were updated or not to check only
+ * three counters. So, for performance, we don't allocate another memory
+ * spaces and check the all stats like pgstat_send_slru().
+ *
+ * The current 'wal_records' is the same as the previous one means that
+ * wal records weren't generated. So, the counters of 'wal_fpi',
+ * 'wal_bytes', 'm_wal_buffers_full' are not updated neither.
+ *
+ * It's not enough to check the number of generated wal records, for
+ * example the walwriter may write/sync the WAL although it doesn't
+ * generate wal records. 'wal_write' and 'wal_sync' are zero means the
+ * counters of time spent are zero too.
*/
- if (memcmp(&WalStats, &all_zeroes, sizeof(PgStat_MsgWal)) == 0)
- return false;
+ if (pgWalUsage.wal_records == prevWalUsage.wal_records &&
+ walStats.wal_write == 0 && walStats.wal_sync == 0)
+ return;
if (!force)
{
@@ -3116,25 +3099,50 @@ pgstat_send_wal(bool force)
/*
* Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
- * msec since we last sent one.
+ * msec since we last sent one to avoid overloading the stats
+ * collector.
*/
if (!TimestampDifferenceExceeds(sendTime, now, PGSTAT_STAT_INTERVAL))
- return false;
+ return;
sendTime = now;
}
+ /*
+ * Set the counters related to generated WAL data if the counters are
+ * updated.
+ */
+ if (pgWalUsage.wal_records != prevWalUsage.wal_records)
+ {
+ WalUsage walusage;
+
+ /*
+ * Calculate how much WAL usage counters are increased by substracting
+ * the previous counters from the current ones. Fill the results in
+ * WAL stats message.
+ */
+ MemSet(&walusage, 0, sizeof(WalUsage));
+ WalUsageAccumDiff(&walusage, &pgWalUsage, &prevWalUsage);
+
+ WalStats.m_wal_records = walusage.wal_records;
+ WalStats.m_wal_fpi = walusage.wal_fpi;
+ WalStats.m_wal_bytes = walusage.wal_bytes;
+ }
+
/*
* Prepare and send the message
*/
pgstat_setheader(&WalStats.m_hdr, PGSTAT_MTYPE_WAL);
pgstat_send(&WalStats, sizeof(WalStats));
+ /*
+ * Save the current counters for the subsequent calculation of WAL usage.
+ */
+ prevWalUsage = pgWalUsage;
+
/*
* Clear out the statistics buffer, so it can be re-used.
*/
MemSet(&WalStats, 0, sizeof(WalStats));
-
- return true;
}
/* ----------
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index aa8eceda5f..9de29e51d6 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -16,26 +16,41 @@
#include "portability/instr_time.h"
+/*
+ * The accumulated counters for buffer usage.
+ *
+ * The reason the counters are accumulated values is that there may be many
+ * callers to use them, so to avoid unexpected reset.
+ */
typedef struct BufferUsage
{
- long shared_blks_hit; /* # of shared buffer hits */
- long shared_blks_read; /* # of shared disk blocks read */
- long shared_blks_dirtied; /* # of shared blocks dirtied */
- long shared_blks_written; /* # of shared disk blocks written */
- long local_blks_hit; /* # of local buffer hits */
- long local_blks_read; /* # of local disk blocks read */
- long local_blks_dirtied; /* # of local blocks dirtied */
- long local_blks_written; /* # of local disk blocks written */
- long temp_blks_read; /* # of temp blocks read */
- long temp_blks_written; /* # of temp blocks written */
+ uint64 shared_blks_hit; /* # of shared buffer hits */
+ uint64 shared_blks_read; /* # of shared disk blocks read */
+ uint64 shared_blks_dirtied; /* # of shared blocks dirtied */
+ uint64 shared_blks_written; /* # of shared disk blocks written */
+ uint64 local_blks_hit; /* # of local buffer hits */
+ uint64 local_blks_read; /* # of local disk blocks read */
+ uint64 local_blks_dirtied; /* # of local blocks dirtied */
+ uint64 local_blks_written; /* # of local disk blocks written */
+ uint64 temp_blks_read; /* # of temp blocks read */
+ uint64 temp_blks_written; /* # of temp blocks written */
instr_time blk_read_time; /* time spent reading */
instr_time blk_write_time; /* time spent writing */
} BufferUsage;
+/*
+ * The accumulated counters for generated WAL usage.
+ *
+ * The reason the counters are accumulated values is the same as BufferUsage's one.
+ * And the reason to store only generated WAL usage and doesn't store WAL I/O
+ * activity, is that this is assumed for knowing the WAL usage in per query or
+ * transaction. So, common resources for the cluster like WAL I/O activity is
+ * not stored.
+ */
typedef struct WalUsage
{
- long wal_records; /* # of WAL records produced */
- long wal_fpi; /* # of WAL full page images produced */
+ uint64 wal_records; /* # of WAL records produced */
+ uint64 wal_fpi; /* # of WAL full page images produced */
uint64 wal_bytes; /* size of WAL records produced */
} WalUsage;
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 5c5920b0b5..bd7ce9ad99 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -503,8 +503,8 @@ typedef struct PgStat_MsgBgWriter
typedef struct PgStat_MsgWal
{
PgStat_MsgHdr m_hdr;
- PgStat_Counter m_wal_records;
- PgStat_Counter m_wal_fpi;
+ uint64 m_wal_records;
+ uint64 m_wal_fpi;
uint64 m_wal_bytes;
PgStat_Counter m_wal_buffers_full;
PgStat_Counter m_wal_write;
@@ -1114,8 +1114,7 @@ extern void pgstat_twophase_postabort(TransactionId xid, uint16 info,
extern void pgstat_send_archiver(const char *xlog, bool failed);
extern void pgstat_send_bgwriter(void);
extern void pgstat_send_recoveryprefetch(PgStat_RecoveryPrefetchStats *stats);
-extern void pgstat_report_wal(void);
-extern bool pgstat_send_wal(bool force);
+extern void pgstat_send_wal(bool force);
/* ----------
* Support functions for the SQL-callable functions to