On 2021/04/23 16:30, Fujii Masao wrote:
> 
> 
> On 2021/04/23 10:25, Andres Freund wrote:
>> Hi,
>>
>> On 2021-04-23 09:26:17 +0900, Masahiro Ikeda wrote:
>>> On 2021/04/23 0:36, Andres Freund wrote:
>>>> On Thu, Apr 22, 2021, at 06:42, Fujii Masao wrote:
>>>>> On 2021/04/21 18:31, Masahiro Ikeda wrote:
>>>>>>> BTW, is it better to change PgStat_Counter from int64 to uint64
>>>>>>> because there aren't any counters with negative values?
>>>>> On second thought, it's ok even if the counters like wal_records get
>>>>> overflowed.
>>>>> Because they are always used to calculate the diff between the previous 
>>>>> and
>>>>> current counters. Even when the current counters are overflowed and
>>>>> the previous ones are not, WalUsageAccumDiff() seems to return the right
>>>>> diff of them. If this understanding is right, I'd withdraw my comment and
>>>>> it's ok to use "long" type for those counters. Thought?
>>>>
>>>> Why long? It's of a platform dependent size, which doesn't seem useful 
>>>> here?
>>>
>>> I think it's ok to unify uint64. Although it's better to use small size for
>>> cache, the idea works well for only some platform which use 4bytes for 
>>> "long".
>>>
>>>
>>> (Although I'm not familiar with the standardization...)
>>> It seems that we need to adopt unsigned long if we use "long", which may
>>> be 4 bytes.
>>>
>>> I thought WalUsageAccumDiff() seemed to return the right value too. But I
>>> looked into it further and found that ISO/IEC 9899:1999 specifies that
>>> unsigned integer arithmetic never overflows (6.2.5 Types, paragraph 9),
>>> although it doesn't define the behavior on overflow of signed integer types.
>>>
>>> If my understanding is right, the standard guarantees that
>>> WalUsageAccumDiff() returns the right value only if the type is an unsigned
>>> integer. So, it's safe to change "long" to "unsigned long".
>>
>> I think this should just use 64bit counters. Emitting more than 4
>> billion records in one transaction isn't actually particularly rare. And
> 
> Right. I agree to change the types of the counters to int64.
> 
> I think that it's better to make this change as a separate patch from
> the changes for pg_stat_wal view.

Thanks for your comments.
OK, I've split the changes into two patches.

The first patch has only the changes for the pg_stat_wal view.
("v6-0001-performance-improvements-of-reporting-wal-stats-without-introducing-a-new-variable.patch")

The second one has the changes for the type of BufferUsage's and WalUsage's
members. I changed the type from long to int64. Is it better to start a new thread?
("v6-0002-change-the-data-type-of-XXXUsage-from-long-to-int64.patch")

Regards,
-- 
Masahiro Ikeda
NTT DATA CORPORATION
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 237e13361b..75ecd00c23 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -17,6 +17,12 @@
 
 #include "executor/instrument.h"
 
+/*
+ * Buffer and generated WAL usage counters.
+ *
+ * The counters are accumulated values. Helper functions exist to add
+ * the values and to calculate the difference over a specific period.
+ */
 BufferUsage pgBufferUsage;
 static BufferUsage save_pgBufferUsage;
 WalUsage	pgWalUsage;
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index e7e6a2a459..1761694a5b 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -505,7 +505,7 @@ CheckpointerMain(void)
 		pgstat_send_bgwriter();
 
 		/* Send WAL statistics to the stats collector. */
-		pgstat_report_wal();
+		pgstat_send_wal(true);
 
 		/*
 		 * If any checkpoint flags have been set, redo the loop to handle the
@@ -583,7 +583,7 @@ HandleCheckpointerInterrupts(void)
 		BgWriterStats.m_requested_checkpoints++;
 		ShutdownXLOG(0, 0);
 		pgstat_send_bgwriter();
-		pgstat_report_wal();
+		pgstat_send_wal(true);
 
 		/* Normal exit from the checkpointer is here */
 		proc_exit(0);			/* done */
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 6e8dee9784..f7f28673f8 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -133,8 +133,8 @@ PgStat_MsgWal WalStats;
 
 /*
  * WAL usage counters saved from pgWALUsage at the previous call to
- * pgstat_report_wal(). This is used to calculate how much WAL usage
- * happens between pgstat_report_wal() calls, by substracting
+ * pgstat_send_wal(). This is used to calculate how much WAL usage
+ * happens between pgstat_send_wal() calls, by substracting
  * the previous counters from the current ones.
  */
 static WalUsage prevWalUsage;
@@ -855,9 +855,19 @@ pgstat_report_stat(bool disconnect)
 	TabStatusArray *tsa;
 	int			i;
 
-	/* Don't expend a clock check if nothing to do */
+	/*
+	 * Don't expend a clock check if nothing to do.
+	 *
+	 * Note: for the WAL statistics counters, it's not enough to check only
+	 * whether WAL records were generated. A read-only transaction normally
+	 * generates no WAL records (although HOT may generate some), but WAL
+	 * data may still be written and synced when buffers have to be flushed,
+	 * so the WAL write/sync counters are checked as well.
+	 */
 	if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) &&
 		pgStatXactCommit == 0 && pgStatXactRollback == 0 &&
+		pgWalUsage.wal_records == prevWalUsage.wal_records &&
+		WalStats.m_wal_write == 0 && WalStats.m_wal_sync == 0 &&
 		!have_function_stats && !disconnect)
 		return;
 
@@ -951,7 +961,7 @@ pgstat_report_stat(bool disconnect)
 	pgstat_send_funcstats();
 
 	/* Send WAL statistics */
-	pgstat_report_wal();
+	pgstat_send_wal(true);
 
 	/* Finally send SLRU statistics */
 	pgstat_send_slru();
@@ -2934,7 +2944,7 @@ void
 pgstat_initialize(void)
 {
 	/*
-	 * Initialize prevWalUsage with pgWalUsage so that pgstat_report_wal() can
+	 * Initialize prevWalUsage with pgWalUsage so that pgstat_send_wal() can
 	 * calculate how much pgWalUsage counters are increased by substracting
 	 * prevWalUsage from pgWalUsage.
 	 */
@@ -3046,44 +3056,6 @@ pgstat_send_bgwriter(void)
 	MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
 }
 
-/* ----------
- * pgstat_report_wal() -
- *
- * Calculate how much WAL usage counters are increased and send
- * WAL statistics to the collector.
- *
- * Must be called by processes that generate WAL.
- * ----------
- */
-void
-pgstat_report_wal(void)
-{
-	WalUsage	walusage;
-
-	/*
-	 * Calculate how much WAL usage counters are increased by substracting the
-	 * previous counters from the current ones. Fill the results in WAL stats
-	 * message.
-	 */
-	MemSet(&walusage, 0, sizeof(WalUsage));
-	WalUsageAccumDiff(&walusage, &pgWalUsage, &prevWalUsage);
-
-	WalStats.m_wal_records = walusage.wal_records;
-	WalStats.m_wal_fpi = walusage.wal_fpi;
-	WalStats.m_wal_bytes = walusage.wal_bytes;
-
-	/*
-	 * Send WAL stats message to the collector.
-	 */
-	if (!pgstat_send_wal(true))
-		return;
-
-	/*
-	 * Save the current counters for the subsequent calculation of WAL usage.
-	 */
-	prevWalUsage = pgWalUsage;
-}
-
 /* ----------
  * pgstat_send_wal() -
  *
@@ -3091,24 +3063,35 @@ pgstat_report_wal(void)
  *
  * If 'force' is not set, WAL stats message is only sent if enough time has
  * passed since last one was sent to reach PGSTAT_STAT_INTERVAL.
- *
- * Return true if the message is sent, and false otherwise.
  * ----------
  */
-bool
+void
 pgstat_send_wal(bool force)
 {
-	/* We assume this initializes to zeroes */
-	static const PgStat_MsgWal all_zeroes;
 	static TimestampTz sendTime = 0;
 
 	/*
-	 * This function can be called even if nothing at all has happened. In
-	 * this case, avoid sending a completely empty message to the stats
-	 * collector.
+	 * First, check whether the WAL stats counters were updated.
+	 *
+	 * Even if 'force' is true, we don't need to send the stats if the
+	 * counters were not updated.
+	 *
+	 * Checking only three counters is enough to know whether anything was
+	 * updated. So, for performance, we don't allocate extra memory and
+	 * compare all the stats like pgstat_send_slru().
+	 *
+	 * If the current 'wal_records' is the same as the previous one, no WAL
+	 * records were generated, so the 'wal_fpi', 'wal_bytes' and
+	 * 'm_wal_buffers_full' counters were not updated either.
+	 *
+	 * Checking the number of generated WAL records alone is not enough; for
+	 * example, the walwriter may write/sync WAL although it doesn't generate
+	 * WAL records. If 'wal_write' and 'wal_sync' are zero, the counters of
+	 * time spent are zero too.
 	 */
-	if (memcmp(&WalStats, &all_zeroes, sizeof(PgStat_MsgWal)) == 0)
-		return false;
+	if (pgWalUsage.wal_records == prevWalUsage.wal_records &&
+		WalStats.m_wal_write == 0 && WalStats.m_wal_sync == 0)
+		return;
 
 	if (!force)
 	{
@@ -3116,25 +3099,50 @@ pgstat_send_wal(bool force)
 
 		/*
 		 * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
-		 * msec since we last sent one.
+		 * msec since we last sent one to avoid overloading the stats
+		 * collector.
 		 */
 		if (!TimestampDifferenceExceeds(sendTime, now, PGSTAT_STAT_INTERVAL))
-			return false;
+			return;
 		sendTime = now;
 	}
 
+	/*
+	 * Set the counters related to generated WAL data if the counters are
+	 * updated.
+	 */
+	if (pgWalUsage.wal_records != prevWalUsage.wal_records)
+	{
+		WalUsage	walusage;
+
+		/*
+		 * Calculate how much WAL usage counters are increased by substracting
+		 * the previous counters from the current ones. Fill the results in
+		 * WAL stats message.
+		 */
+		MemSet(&walusage, 0, sizeof(WalUsage));
+		WalUsageAccumDiff(&walusage, &pgWalUsage, &prevWalUsage);
+
+		WalStats.m_wal_records = walusage.wal_records;
+		WalStats.m_wal_fpi = walusage.wal_fpi;
+		WalStats.m_wal_bytes = walusage.wal_bytes;
+	}
+
 	/*
 	 * Prepare and send the message
 	 */
 	pgstat_setheader(&WalStats.m_hdr, PGSTAT_MTYPE_WAL);
 	pgstat_send(&WalStats, sizeof(WalStats));
 
+	/*
+	 * Save the current counters for the subsequent calculation of WAL usage.
+	 */
+	prevWalUsage = pgWalUsage;
+
 	/*
 	 * Clear out the statistics buffer, so it can be re-used.
 	 */
 	MemSet(&WalStats, 0, sizeof(WalStats));
-
-	return true;
 }
 
 /* ----------
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index aa8eceda5f..9a0d794a1c 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -16,6 +16,12 @@
 #include "portability/instr_time.h"
 
 
+/*
+ * The accumulated counters for buffer usage.
+ *
+ * The reason the counters are accumulated values is to avoid unexpected
+ * reset because many callers may use them.
+ */
 typedef struct BufferUsage
 {
 	long		shared_blks_hit;	/* # of shared buffer hits */
@@ -32,6 +38,15 @@ typedef struct BufferUsage
 	instr_time	blk_write_time; /* time spent writing */
 } BufferUsage;
 
+/*
+ * The accumulated counters for generated WAL usage.
+ *
+ * The counters are accumulated values for the same reason as BufferUsage's.
+ *
+ * Only generated WAL usage is stored here, not WAL I/O activity, because
+ * these counters are meant for tracking WAL usage per query or transaction.
+ * Cluster-wide shared activity such as WAL I/O is therefore not stored.
+ */
 typedef struct WalUsage
 {
 	long		wal_records;	/* # of WAL records produced */
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 5c5920b0b5..8e048b9172 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -1114,8 +1114,7 @@ extern void pgstat_twophase_postabort(TransactionId xid, uint16 info,
 extern void pgstat_send_archiver(const char *xlog, bool failed);
 extern void pgstat_send_bgwriter(void);
 extern void pgstat_send_recoveryprefetch(PgStat_RecoveryPrefetchStats *stats);
-extern void pgstat_report_wal(void);
-extern bool pgstat_send_wal(bool force);
+extern void pgstat_send_wal(bool force);
 
 /* ----------
  * Support functions for the SQL-callable functions to
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index c3fc12d76c..ad25f72085 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -830,9 +830,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 			}
 			appendStringInfo(&buf, _("system usage: %s\n"), pg_rusage_show(&ru0));
 			appendStringInfo(&buf,
-							 _("WAL usage: %ld records, %ld full page images, %llu bytes"),
-							 walusage.wal_records,
-							 walusage.wal_fpi,
+							 _("WAL usage: %lld records, %lld full page images, %llu bytes"),
+							 (long long) walusage.wal_records,
+							 (long long) walusage.wal_fpi,
 							 (unsigned long long) walusage.wal_bytes);
 
 			ereport(LOG,
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 3d5198e234..4135d88b1a 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3519,17 +3519,17 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 			{
 				appendStringInfoString(es->str, " shared");
 				if (usage->shared_blks_hit > 0)
-					appendStringInfo(es->str, " hit=%ld",
-									 usage->shared_blks_hit);
+					appendStringInfo(es->str, " hit=%lld",
+									 (long long) usage->shared_blks_hit);
 				if (usage->shared_blks_read > 0)
-					appendStringInfo(es->str, " read=%ld",
-									 usage->shared_blks_read);
+					appendStringInfo(es->str, " read=%lld",
+									 (long long) usage->shared_blks_read);
 				if (usage->shared_blks_dirtied > 0)
-					appendStringInfo(es->str, " dirtied=%ld",
-									 usage->shared_blks_dirtied);
+					appendStringInfo(es->str, " dirtied=%lld",
+									 (long long) usage->shared_blks_dirtied);
 				if (usage->shared_blks_written > 0)
-					appendStringInfo(es->str, " written=%ld",
-									 usage->shared_blks_written);
+					appendStringInfo(es->str, " written=%lld",
+									 (long long) usage->shared_blks_written);
 				if (has_local || has_temp)
 					appendStringInfoChar(es->str, ',');
 			}
@@ -3537,17 +3537,17 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 			{
 				appendStringInfoString(es->str, " local");
 				if (usage->local_blks_hit > 0)
-					appendStringInfo(es->str, " hit=%ld",
-									 usage->local_blks_hit);
+					appendStringInfo(es->str, " hit=%lld",
+									 (long long) usage->local_blks_hit);
 				if (usage->local_blks_read > 0)
-					appendStringInfo(es->str, " read=%ld",
-									 usage->local_blks_read);
+					appendStringInfo(es->str, " read=%lld",
+									 (long long) usage->local_blks_read);
 				if (usage->local_blks_dirtied > 0)
-					appendStringInfo(es->str, " dirtied=%ld",
-									 usage->local_blks_dirtied);
+					appendStringInfo(es->str, " dirtied=%lld",
+									 (long long) usage->local_blks_dirtied);
 				if (usage->local_blks_written > 0)
-					appendStringInfo(es->str, " written=%ld",
-									 usage->local_blks_written);
+					appendStringInfo(es->str, " written=%lld",
+									 (long long) usage->local_blks_written);
 				if (has_temp)
 					appendStringInfoChar(es->str, ',');
 			}
@@ -3555,11 +3555,11 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 			{
 				appendStringInfoString(es->str, " temp");
 				if (usage->temp_blks_read > 0)
-					appendStringInfo(es->str, " read=%ld",
-									 usage->temp_blks_read);
+					appendStringInfo(es->str, " read=%lld",
+									 (long long) usage->temp_blks_read);
 				if (usage->temp_blks_written > 0)
-					appendStringInfo(es->str, " written=%ld",
-									 usage->temp_blks_written);
+					appendStringInfo(es->str, " written=%lld",
+									 (long long) usage->temp_blks_written);
 			}
 			appendStringInfoChar(es->str, '\n');
 		}
@@ -3631,11 +3631,11 @@ show_wal_usage(ExplainState *es, const WalUsage *usage)
 			appendStringInfoString(es->str, "WAL:");
 
 			if (usage->wal_records > 0)
-				appendStringInfo(es->str, " records=%ld",
-								 usage->wal_records);
+				appendStringInfo(es->str, " records=%lld",
+								 (long long) usage->wal_records);
 			if (usage->wal_fpi > 0)
-				appendStringInfo(es->str, " fpi=%ld",
-								 usage->wal_fpi);
+				appendStringInfo(es->str, " fpi=%lld",
+								 (long long) usage->wal_fpi);
 			if (usage->wal_bytes > 0)
 				appendStringInfo(es->str, " bytes=" UINT64_FORMAT,
 								 usage->wal_bytes);
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 9a0d794a1c..2e11ab6d4b 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -24,16 +24,16 @@
  */
 typedef struct BufferUsage
 {
-	long		shared_blks_hit;	/* # of shared buffer hits */
-	long		shared_blks_read;	/* # of shared disk blocks read */
-	long		shared_blks_dirtied;	/* # of shared blocks dirtied */
-	long		shared_blks_written;	/* # of shared disk blocks written */
-	long		local_blks_hit; /* # of local buffer hits */
-	long		local_blks_read;	/* # of local disk blocks read */
-	long		local_blks_dirtied; /* # of local blocks dirtied */
-	long		local_blks_written; /* # of local disk blocks written */
-	long		temp_blks_read; /* # of temp blocks read */
-	long		temp_blks_written;	/* # of temp blocks written */
+	int64		shared_blks_hit;	/* # of shared buffer hits */
+	int64		shared_blks_read;	/* # of shared disk blocks read */
+	int64		shared_blks_dirtied;	/* # of shared blocks dirtied */
+	int64		shared_blks_written;	/* # of shared disk blocks written */
+	int64		local_blks_hit; /* # of local buffer hits */
+	int64		local_blks_read;	/* # of local disk blocks read */
+	int64		local_blks_dirtied; /* # of local blocks dirtied */
+	int64		local_blks_written; /* # of local disk blocks written */
+	int64		temp_blks_read; /* # of temp blocks read */
+	int64		temp_blks_written;	/* # of temp blocks written */
 	instr_time	blk_read_time;	/* time spent reading */
 	instr_time	blk_write_time; /* time spent writing */
 } BufferUsage;
@@ -49,8 +49,8 @@ typedef struct BufferUsage
  */
 typedef struct WalUsage
 {
-	long		wal_records;	/* # of WAL records produced */
-	long		wal_fpi;		/* # of WAL full page images produced */
+	int64		wal_records;	/* # of WAL records produced */
+	int64		wal_fpi;		/* # of WAL full page images produced */
 	uint64		wal_bytes;		/* size of WAL records produced */
 } WalUsage;
 

Reply via email to