Hi,
On 2/10/21 11:10 PM, Stephen Frost wrote:
> Greetings,
>
> * Heikki Linnakangas ([email protected]) wrote:
>> On 05/02/2021 23:22, Stephen Frost wrote:
>>> Unless there's anything else on this, I'll commit these sometime next
>>> week.
>>
>> One more thing: Instead of using 'effective_io_concurrency' GUC directly,
>> should call get_tablespace_maintenance_io_concurrency().
>
> Ah, yeah, of course.
>
> Updated patch attached.
>
A couple minor comments:
1) I think the patch should be split into two parts: one adding the
track_io_timing output, the other adding the prefetching.
2) I haven't tried but I'm pretty sure there'll be a compiler warning
about 'prefetch_maximum' being unused without USE_PREFETCH defined.
3) Is there a way to reduce the amount of #ifdef in acquire_sample_rows?
This makes the code rather hard to read, IMHO. It seems to me we can
move the code around a bit and merge some of the #ifdef blocks - see the
attached patch. Most of this is fairly trivial, with the exception of
moving PrefetchBuffer before table_scan_analyze_next_block - AFAIK this
does not materially change the behavior, but perhaps I'm wrong.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 967de73596..492ed78e1c 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -7457,9 +7457,11 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
I/O timing information is
displayed in <link linkend="monitoring-pg-stat-database-view">
<structname>pg_stat_database</structname></link>, in the output of
- <xref linkend="sql-explain"/> when the <literal>BUFFERS</literal> option is
- used, and by <xref linkend="pgstatstatements"/>. Only superusers can
- change this setting.
+ <xref linkend="sql-explain"/> when the <literal>BUFFERS</literal> option
+ is used, by autovacuum for auto-vacuums and auto-analyzes, when
+ <xref linkend="guc-log-autovacuum-min-duration"/> is set and by
+ <xref linkend="pgstatstatements"/>. Only superusers can change this
+ setting.
</para>
</listitem>
</varlistentry>
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index d8f847b0e6..ab40be0771 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -440,6 +440,8 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
TransactionId new_frozen_xid;
MultiXactId new_min_multi;
ErrorContextCallback errcallback;
+ PgStat_Counter startreadtime = 0;
+ PgStat_Counter startwritetime = 0;
Assert(params != NULL);
Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT);
@@ -454,6 +456,11 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
{
pg_rusage_init(&ru0);
starttime = GetCurrentTimestamp();
+ if (track_io_timing)
+ {
+ startreadtime = pgStatBlockReadTime;
+ startwritetime = pgStatBlockWriteTime;
+ }
}
if (params->options & VACOPT_VERBOSE)
@@ -674,6 +681,17 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
(long long) VacuumPageDirty);
appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
read_rate, write_rate);
+ if (track_io_timing)
+ {
+ appendStringInfoString(&buf, _("I/O Timings:"));
+ if (pgStatBlockReadTime - startreadtime > 0)
+ appendStringInfo(&buf, _(" read=%.3f"),
+ (double) (pgStatBlockReadTime - startreadtime) / 1000);
+ if (pgStatBlockWriteTime - startwritetime > 0)
+ appendStringInfo(&buf, _(" write=%.3f"),
+ (double) (pgStatBlockWriteTime - startwritetime) / 1000);
+ appendStringInfoChar(&buf, '\n');
+ }
appendStringInfo(&buf, _("system usage: %s\n"), pg_rusage_show(&ru0));
appendStringInfo(&buf,
_("WAL usage: %ld records, %ld full page images, %llu bytes"),
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 7295cf0215..fc32598639 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -63,6 +63,7 @@
#include "utils/pg_rusage.h"
#include "utils/sampling.h"
#include "utils/sortsupport.h"
+#include "utils/spccache.h"
#include "utils/syscache.h"
#include "utils/timestamp.h"
@@ -312,6 +313,11 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
Oid save_userid;
int save_sec_context;
int save_nestlevel;
+ int64 AnalyzePageHit = VacuumPageHit;
+ int64 AnalyzePageMiss = VacuumPageMiss;
+ int64 AnalyzePageDirty = VacuumPageDirty;
+ PgStat_Counter startreadtime = 0;
+ PgStat_Counter startwritetime = 0;
if (inh)
ereport(elevel,
@@ -346,8 +352,14 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
/* measure elapsed time iff autovacuum logging requires it */
if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
{
+ if (track_io_timing)
+ {
+ startreadtime = pgStatBlockReadTime;
+ startwritetime = pgStatBlockWriteTime;
+ }
+
pg_rusage_init(&ru0);
- if (params->log_min_duration > 0)
+ if (params->log_min_duration >= 0)
starttime = GetCurrentTimestamp();
}
@@ -682,15 +694,90 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
/* Log the action if appropriate */
if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
{
+ TimestampTz endtime = GetCurrentTimestamp();
+
if (params->log_min_duration == 0 ||
- TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(),
+ TimestampDifferenceExceeds(starttime, endtime,
params->log_min_duration))
+ {
+ long delay_in_ms;
+ double read_rate = 0;
+ double write_rate = 0;
+ StringInfoData buf;
+
+ /*
+ * Calculate the difference in the Page Hit/Miss/Dirty that
+ * happened as part of the analyze by subtracting out the
+ * pre-analyze values which we saved above.
+ */
+ AnalyzePageHit = VacuumPageHit - AnalyzePageHit;
+ AnalyzePageMiss = VacuumPageMiss - AnalyzePageMiss;
+ AnalyzePageDirty = VacuumPageDirty - AnalyzePageDirty;
+
+ /*
+ * We do not expect an analyze to take > 25 days and it simplifies
+ * things a bit to use TimestampDifferenceMilliseconds.
+ */
+ delay_in_ms = TimestampDifferenceMilliseconds(starttime, endtime);
+
+ /*
+ * Note that we are reporting these read/write rates in the same
+ * manner as VACUUM does, which means that while the 'average read
+ * rate' here actually corresponds to page misses and resulting
+ * reads which are also picked up by track_io_timing, if enabled,
+ * the 'average write rate' is actually talking about the rate of
+ * pages being dirtied, not being written out, so it's typical to
+ * have a non-zero 'avg write rate' while I/O Timings only reports
+ * reads.
+ *
+ * It's not clear that an ANALYZE will ever result in
+ * FlushBuffer() being called, but we track and support reporting
+ * on I/O write time in case that changes as it's practically free
+ * to do so anyway.
+ */
+
+ if (delay_in_ms > 0)
+ {
+ read_rate = (double) BLCKSZ * AnalyzePageMiss / (1024 * 1024) /
+ (delay_in_ms / 1000.0);
+ write_rate = (double) BLCKSZ * AnalyzePageDirty / (1024 * 1024) /
+ (delay_in_ms / 1000.0);
+ }
+
+ /*
+ * We split this up so we don't emit empty I/O timing values when
+ * track_io_timing isn't enabled.
+ */
+
+ initStringInfo(&buf);
+ appendStringInfo(&buf, _("automatic analyze of table \"%s.%s.%s\"\n"),
+ get_database_name(MyDatabaseId),
+ get_namespace_name(RelationGetNamespace(onerel)),
+ RelationGetRelationName(onerel));
+ appendStringInfo(&buf, _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
+ (long long) AnalyzePageHit,
+ (long long) AnalyzePageMiss,
+ (long long) AnalyzePageDirty);
+ appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
+ read_rate, write_rate);
+ if (track_io_timing)
+ {
+ appendStringInfoString(&buf, _("I/O Timings:"));
+ if (pgStatBlockReadTime - startreadtime > 0)
+ appendStringInfo(&buf, _(" read=%.3f"),
+ (double) (pgStatBlockReadTime - startreadtime) / 1000);
+ if (pgStatBlockWriteTime - startwritetime > 0)
+ appendStringInfo(&buf, _(" write=%.3f"),
+ (double) (pgStatBlockWriteTime - startwritetime) / 1000);
+ appendStringInfoChar(&buf, '\n');
+ }
+ appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));
+
ereport(LOG,
- (errmsg("automatic analyze of table \"%s.%s.%s\" system usage: %s",
- get_database_name(MyDatabaseId),
- get_namespace_name(RelationGetNamespace(onerel)),
- RelationGetRelationName(onerel),
- pg_rusage_show(&ru0))));
+ (errmsg_internal("%s", buf.data)));
+
+ pfree(buf.data);
+ }
}
/* Roll back any GUC changes executed by index functions */
@@ -1031,6 +1118,7 @@ acquire_sample_rows(Relation onerel, int elevel,
double liverows = 0; /* # live rows seen */
double deadrows = 0; /* # dead rows seen */
double rowstoskip = -1; /* -1 means not set yet */
+ long randseed; /* Seed for block sampler(s) */
BlockNumber totalblocks;
TransactionId OldestXmin;
BlockSamplerData bs;
@@ -1039,6 +1127,10 @@ acquire_sample_rows(Relation onerel, int elevel,
TableScanDesc scan;
BlockNumber nblocks;
BlockNumber blksdone = 0;
+#ifdef USE_PREFETCH
+ int prefetch_maximum = 0; /* blocks to prefetch if enabled */
+ BlockSamplerData prefetch_bs;
+#endif
Assert(targrows > 0);
@@ -1048,7 +1140,8 @@ acquire_sample_rows(Relation onerel, int elevel,
OldestXmin = GetOldestNonRemovableTransactionId(onerel);
/* Prepare for sampling block numbers */
- nblocks = BlockSampler_Init(&bs, totalblocks, targrows, random());
+ randseed = random();
+ nblocks = BlockSampler_Init(&bs, totalblocks, targrows, randseed);
/* Report sampling block numbers */
pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_TOTAL,
@@ -1060,13 +1153,71 @@ acquire_sample_rows(Relation onerel, int elevel,
scan = table_beginscan_analyze(onerel);
slot = table_slot_create(onerel, NULL);
+#ifdef USE_PREFETCH
+
+	prefetch_maximum = get_tablespace_maintenance_io_concurrency(onerel->rd_rel->reltablespace);
+ /* Create another BlockSampler, using the same seed, for prefetching */
+ if (prefetch_maximum)
+ (void) BlockSampler_Init(&prefetch_bs, totalblocks, targrows, randseed);
+
+ /*
+ * If we are doing prefetching, then go ahead and tell the kernel about
+ * the first set of pages we are going to want. This also moves our
+ * iterator out ahead of the main one being used, where we will keep it so
+ * that we're always pre-fetching out prefetch_maximum number of blocks
+ * ahead.
+ */
+ if (prefetch_maximum)
+ {
+ for (int i = 0; i < prefetch_maximum; i++)
+ {
+ BlockNumber prefetch_block;
+
+ if (!BlockSampler_HasMore(&prefetch_bs))
+ break;
+
+ prefetch_block = BlockSampler_Next(&prefetch_bs);
+ PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, prefetch_block);
+ }
+ }
+
+#endif
+
/* Outer loop over blocks to sample */
while (BlockSampler_HasMore(&bs))
{
BlockNumber targblock = BlockSampler_Next(&bs);
+#ifdef USE_PREFETCH
+
+ BlockNumber prefetch_targblock = InvalidBlockNumber;
+
+ /*
+ * Make sure that every time the main BlockSampler is moved forward
+ * that our prefetch BlockSampler also gets moved forward, so that we
+ * always stay out ahead.
+ */
+ if (prefetch_maximum && BlockSampler_HasMore(&prefetch_bs))
+ prefetch_targblock = BlockSampler_Next(&prefetch_bs);
+
+ /*
+ * When pre-fetching, after we get a block, tell the kernel about the
+ * next one we will want, if there's any left.
+ *
+ * We want to do this even if the table_scan_analyze_next_block() call
+ * below decides against analyzing the block it picked.
+ */
+ if (prefetch_targblock != InvalidBlockNumber)
+ PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, prefetch_targblock);
+
+#endif
+
vacuum_delay_point();
+ /*
+ * Don't analyze if table_scan_analyze_next_block() indicated this
+ * block is unsuitable for analyzing.
+ */
if (!table_scan_analyze_next_block(scan, targblock, vac_strategy))
continue;