On 2021/04/23 16:30, Fujii Masao wrote: > > > On 2021/04/23 10:25, Andres Freund wrote: >> Hi, >> >> On 2021-04-23 09:26:17 +0900, Masahiro Ikeda wrote: >>> On 2021/04/23 0:36, Andres Freund wrote: >>>> On Thu, Apr 22, 2021, at 06:42, Fujii Masao wrote: >>>>> On 2021/04/21 18:31, Masahiro Ikeda wrote: >>>>>>> BTW, is it better to change PgStat_Counter from int64 to uint64 >>>>>>> because> there aren't any counters with negative number? >>>>> On second thought, it's ok even if the counters like wal_records get >>>>> overflowed. >>>>> Because they are always used to calculate the diff between the previous >>>>> and >>>>> current counters. Even when the current counters are overflowed and >>>>> the previous ones are not, WalUsageAccumDiff() seems to return the right >>>>> diff of them. If this understanding is right, I'd withdraw my comment and >>>>> it's ok to use "long" type for those counters. Thought? >>>> >>>> Why long? It's of a platform dependent size, which doesn't seem useful >>>> here? >>> >>> I think it's ok to unify uint64. Although it's better to use small size for >>> cache, the idea works well for only some platform which use 4bytes for >>> "long". >>> >>> >>> (Although I'm not familiar with the standardization...) >>> It seems that we need to adopt unsinged long if use "long", which may be >>> 4bytes. >>> >>> I though WalUsageAccumDiff() seems to return the right value too. But, I >>> researched deeply and found that ISO/IEC 9899:1999 defines unsinged integer >>> never overflow(2.6.5 Types 9th section) although it doesn't define overflow >>> of >>> signed integer type. >>> >>> If my understanding is right, the definition only guarantee >>> WalUsageAccumDiff() returns the right value only if the type is unsigned >>> integer. So, it's safe to change "long" to "unsigned long". >> >> I think this should just use 64bit counters. Emitting more than 4 >> billion records in one transaction isn't actually particularly rare. And > > Right. I agree to change the types of the counters to int64. > > I think that it's better to make this change as a separate patch from > the changes for pg_stat_wal view.
Thanks for your comments. OK, I separate two patches. First patch has only the changes for pg_stat_wal view. ("v6-0001-performance-improvements-of-reporting-wal-stats-without-introducing-a-new-variable.patch") Second one has the changes for the type of the BufferUsage's and WalUsage's members. I change the type from long to int64. Is it better to make new thread? ("v6-0002-change-the-data-type-of-XXXUsage-from-long-to-int64.patch") Regards, -- Masahiro Ikeda NTT DATA CORPORATION
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 237e13361b..75ecd00c23 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -17,6 +17,12 @@ #include "executor/instrument.h" +/* + * Buffer and generated WAL usage counters. + * + * The counters are accumulated values. There are infrastructures + * to add the values and calculate the difference within a specific period. + */ BufferUsage pgBufferUsage; static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index e7e6a2a459..1761694a5b 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -505,7 +505,7 @@ CheckpointerMain(void) pgstat_send_bgwriter(); /* Send WAL statistics to the stats collector. */ - pgstat_report_wal(); + pgstat_send_wal(true); /* * If any checkpoint flags have been set, redo the loop to handle the @@ -583,7 +583,7 @@ HandleCheckpointerInterrupts(void) BgWriterStats.m_requested_checkpoints++; ShutdownXLOG(0, 0); pgstat_send_bgwriter(); - pgstat_report_wal(); + pgstat_send_wal(true); /* Normal exit from the checkpointer is here */ proc_exit(0); /* done */ diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 6e8dee9784..f7f28673f8 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -133,8 +133,8 @@ PgStat_MsgWal WalStats; /* * WAL usage counters saved from pgWALUsage at the previous call to - * pgstat_report_wal(). This is used to calculate how much WAL usage - * happens between pgstat_report_wal() calls, by substracting + * pgstat_send_wal(). This is used to calculate how much WAL usage + * happens between pgstat_send_wal() calls, by substracting * the previous counters from the current ones. */ static WalUsage prevWalUsage; @@ -855,9 +855,19 @@ pgstat_report_stat(bool disconnect) TabStatusArray *tsa; int i; - /* Don't expend a clock check if nothing to do */ + /* + * Don't expend a clock check if nothing to do. + * + * Note: regarding to WAL statistics counters, it's not enough to check + * only whether the wal record is generated or not. If a read transaction + * is executed, wal records aren't nomally generated (although HOT makes + * wal records). But, just writes and syncs the wal data may happen when + * to flush buffers. + */ if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) && pgStatXactCommit == 0 && pgStatXactRollback == 0 && + pgWalUsage.wal_records == prevWalUsage.wal_records && + walStats.wal_write == 0 && walStats.wal_sync == 0 && !have_function_stats && !disconnect) return; @@ -951,7 +961,7 @@ pgstat_report_stat(bool disconnect) pgstat_send_funcstats(); /* Send WAL statistics */ - pgstat_report_wal(); + pgstat_send_wal(true); /* Finally send SLRU statistics */ pgstat_send_slru(); @@ -2934,7 +2944,7 @@ void pgstat_initialize(void) { /* - * Initialize prevWalUsage with pgWalUsage so that pgstat_report_wal() can + * Initialize prevWalUsage with pgWalUsage so that pgstat_send_wal() can * calculate how much pgWalUsage counters are increased by substracting * prevWalUsage from pgWalUsage. */ @@ -3046,44 +3056,6 @@ pgstat_send_bgwriter(void) MemSet(&BgWriterStats, 0, sizeof(BgWriterStats)); } -/* ---------- - * pgstat_report_wal() - - * - * Calculate how much WAL usage counters are increased and send - * WAL statistics to the collector. - * - * Must be called by processes that generate WAL. - * ---------- - */ -void -pgstat_report_wal(void) -{ - WalUsage walusage; - - /* - * Calculate how much WAL usage counters are increased by substracting the - * previous counters from the current ones. Fill the results in WAL stats - * message. - */ - MemSet(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &prevWalUsage); - - WalStats.m_wal_records = walusage.wal_records; - WalStats.m_wal_fpi = walusage.wal_fpi; - WalStats.m_wal_bytes = walusage.wal_bytes; - - /* - * Send WAL stats message to the collector. - */ - if (!pgstat_send_wal(true)) - return; - - /* - * Save the current counters for the subsequent calculation of WAL usage. - */ - prevWalUsage = pgWalUsage; -} - /* ---------- * pgstat_send_wal() - * @@ -3091,24 +3063,35 @@ pgstat_report_wal(void) * * If 'force' is not set, WAL stats message is only sent if enough time has * passed since last one was sent to reach PGSTAT_STAT_INTERVAL. - * - * Return true if the message is sent, and false otherwise. * ---------- */ -bool +void pgstat_send_wal(bool force) { - /* We assume this initializes to zeroes */ - static const PgStat_MsgWal all_zeroes; static TimestampTz sendTime = 0; /* - * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. + * First, to check the WAL stats counters were updated. + * + * Even if the 'force' is true, we don't need to send the stats if the + * counters were not updated. + * + * We can know whether the counters were updated or not to check only + * three counters. So, for performance, we don't allocate another memory + * spaces and check the all stats like pgstat_send_slru(). + * + * The current 'wal_records' is the same as the previous one means that + * wal records weren't generated. So, the counters of 'wal_fpi', + * 'wal_bytes', 'm_wal_buffers_full' are not updated neither. + * + * It's not enough to check the number of generated wal records, for + * example the walwriter may write/sync the WAL although it doesn't + * generate wal records. 'wal_write' and 'wal_sync' are zero means the + * counters of time spent are zero too. */ - if (memcmp(&WalStats, &all_zeroes, sizeof(PgStat_MsgWal)) == 0) - return false; + if (pgWalUsage.wal_records == prevWalUsage.wal_records && + walStats.wal_write == 0 && walStats.wal_sync == 0) + return; if (!force) { @@ -3116,25 +3099,50 @@ pgstat_send_wal(bool force) /* * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL - * msec since we last sent one. + * msec since we last sent one to avoid overloading the stats + * collector. */ if (!TimestampDifferenceExceeds(sendTime, now, PGSTAT_STAT_INTERVAL)) - return false; + return; sendTime = now; } + /* + * Set the counters related to generated WAL data if the counters are + * updated. + */ + if (pgWalUsage.wal_records != prevWalUsage.wal_records) + { + WalUsage walusage; + + /* + * Calculate how much WAL usage counters are increased by substracting + * the previous counters from the current ones. Fill the results in + * WAL stats message. + */ + MemSet(&walusage, 0, sizeof(WalUsage)); + WalUsageAccumDiff(&walusage, &pgWalUsage, &prevWalUsage); + + WalStats.m_wal_records = walusage.wal_records; + WalStats.m_wal_fpi = walusage.wal_fpi; + WalStats.m_wal_bytes = walusage.wal_bytes; + } + /* * Prepare and send the message */ pgstat_setheader(&WalStats.m_hdr, PGSTAT_MTYPE_WAL); pgstat_send(&WalStats, sizeof(WalStats)); + /* + * Save the current counters for the subsequent calculation of WAL usage. + */ + prevWalUsage = pgWalUsage; + /* * Clear out the statistics buffer, so it can be re-used. */ MemSet(&WalStats, 0, sizeof(WalStats)); - - return true; } /* ---------- diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index aa8eceda5f..9a0d794a1c 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -16,6 +16,12 @@ #include "portability/instr_time.h" +/* + * The accumulated counters for buffer usage. + * + * The reason the counters are accumulated values is to avoid unexpected + * reset because many callers may use them. + */ typedef struct BufferUsage { long shared_blks_hit; /* # of shared buffer hits */ @@ -32,6 +38,15 @@ typedef struct BufferUsage instr_time blk_write_time; /* time spent writing */ } BufferUsage; +/* + * The accumulated counters for generated WAL usage. + * + * The reason the counters are accumulated values is the same as BufferUsage's one. + * And the reason to store only generated WAL usage and doesn't store WAL I/O + * activity, is that this is assumed for knowing the WAL usage in per query or + * transaction. So, common resources for the cluster like WAL I/O activity is + * not stored. + */ typedef struct WalUsage { long wal_records; /* # of WAL records produced */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 5c5920b0b5..8e048b9172 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -1114,8 +1114,7 @@ extern void pgstat_twophase_postabort(TransactionId xid, uint16 info, extern void pgstat_send_archiver(const char *xlog, bool failed); extern void pgstat_send_bgwriter(void); extern void pgstat_send_recoveryprefetch(PgStat_RecoveryPrefetchStats *stats); -extern void pgstat_report_wal(void); -extern bool pgstat_send_wal(bool force); +extern void pgstat_send_wal(bool force); /* ---------- * Support functions for the SQL-callable functions to
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index c3fc12d76c..ad25f72085 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -830,9 +830,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, } appendStringInfo(&buf, _("system usage: %s\n"), pg_rusage_show(&ru0)); appendStringInfo(&buf, - _("WAL usage: %ld records, %ld full page images, %llu bytes"), - walusage.wal_records, - walusage.wal_fpi, + _("WAL usage: %lld records, %lld full page images, %llu bytes"), + (long long) walusage.wal_records, + (long long) walusage.wal_fpi, (unsigned long long) walusage.wal_bytes); ereport(LOG, diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 3d5198e234..4135d88b1a 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3519,17 +3519,17 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning) { appendStringInfoString(es->str, " shared"); if (usage->shared_blks_hit > 0) - appendStringInfo(es->str, " hit=%ld", - usage->shared_blks_hit); + appendStringInfo(es->str, " hit=%lld", + (long long) usage->shared_blks_hit); if (usage->shared_blks_read > 0) - appendStringInfo(es->str, " read=%ld", - usage->shared_blks_read); + appendStringInfo(es->str, " read=%lld", + (long long) usage->shared_blks_read); if (usage->shared_blks_dirtied > 0) - appendStringInfo(es->str, " dirtied=%ld", - usage->shared_blks_dirtied); + appendStringInfo(es->str, " dirtied=%lld", + (long long) usage->shared_blks_dirtied); if (usage->shared_blks_written > 0) - appendStringInfo(es->str, " written=%ld", - usage->shared_blks_written); + appendStringInfo(es->str, " written=%lld", + (long long) usage->shared_blks_written); if (has_local || has_temp) appendStringInfoChar(es->str, ','); } @@ -3537,17 +3537,17 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning) { appendStringInfoString(es->str, " local"); if (usage->local_blks_hit > 0) - appendStringInfo(es->str, " hit=%ld", - usage->local_blks_hit); + appendStringInfo(es->str, " hit=%lld", + (long long) usage->local_blks_hit); if (usage->local_blks_read > 0) - appendStringInfo(es->str, " read=%ld", - usage->local_blks_read); + appendStringInfo(es->str, " read=%lld", + (long long) usage->local_blks_read); if (usage->local_blks_dirtied > 0) - appendStringInfo(es->str, " dirtied=%ld", - usage->local_blks_dirtied); + appendStringInfo(es->str, " dirtied=%lld", + (long long) usage->local_blks_dirtied); if (usage->local_blks_written > 0) - appendStringInfo(es->str, " written=%ld", - usage->local_blks_written); + appendStringInfo(es->str, " written=%lld", + (long long) usage->local_blks_written); if (has_temp) appendStringInfoChar(es->str, ','); } @@ -3555,11 +3555,11 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning) { appendStringInfoString(es->str, " temp"); if (usage->temp_blks_read > 0) - appendStringInfo(es->str, " read=%ld", - usage->temp_blks_read); + appendStringInfo(es->str, " read=%lld", + (long long) usage->temp_blks_read); if (usage->temp_blks_written > 0) - appendStringInfo(es->str, " written=%ld", - usage->temp_blks_written); + appendStringInfo(es->str, " written=%lld", + (long long) usage->temp_blks_written); } appendStringInfoChar(es->str, '\n'); } @@ -3631,11 +3631,11 @@ show_wal_usage(ExplainState *es, const WalUsage *usage) appendStringInfoString(es->str, "WAL:"); if (usage->wal_records > 0) - appendStringInfo(es->str, " records=%ld", - usage->wal_records); + appendStringInfo(es->str, " records=%lld", + (long long) usage->wal_records); if (usage->wal_fpi > 0) - appendStringInfo(es->str, " fpi=%ld", - usage->wal_fpi); + appendStringInfo(es->str, " fpi=%lld", + (long long) usage->wal_fpi); if (usage->wal_bytes > 0) appendStringInfo(es->str, " bytes=" UINT64_FORMAT, usage->wal_bytes); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 9a0d794a1c..2e11ab6d4b 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -24,16 +24,16 @@ */ typedef struct BufferUsage { - long shared_blks_hit; /* # of shared buffer hits */ - long shared_blks_read; /* # of shared disk blocks read */ - long shared_blks_dirtied; /* # of shared blocks dirtied */ - long shared_blks_written; /* # of shared disk blocks written */ - long local_blks_hit; /* # of local buffer hits */ - long local_blks_read; /* # of local disk blocks read */ - long local_blks_dirtied; /* # of local blocks dirtied */ - long local_blks_written; /* # of local disk blocks written */ - long temp_blks_read; /* # of temp blocks read */ - long temp_blks_written; /* # of temp blocks written */ + int64 shared_blks_hit; /* # of shared buffer hits */ + int64 shared_blks_read; /* # of shared disk blocks read */ + int64 shared_blks_dirtied; /* # of shared blocks dirtied */ + int64 shared_blks_written; /* # of shared disk blocks written */ + int64 local_blks_hit; /* # of local buffer hits */ + int64 local_blks_read; /* # of local disk blocks read */ + int64 local_blks_dirtied; /* # of local blocks dirtied */ + int64 local_blks_written; /* # of local disk blocks written */ + int64 temp_blks_read; /* # of temp blocks read */ + int64 temp_blks_written; /* # of temp blocks written */ instr_time blk_read_time; /* time spent reading */ instr_time blk_write_time; /* time spent writing */ } BufferUsage; @@ -49,8 +49,8 @@ typedef struct BufferUsage */ typedef struct WalUsage { - long wal_records; /* # of WAL records produced */ - long wal_fpi; /* # of WAL full page images produced */ + int64 wal_records; /* # of WAL records produced */ + int64 wal_fpi; /* # of WAL full page images produced */ uint64 wal_bytes; /* size of WAL records produced */ } WalUsage;