On Thu, Mar 19, 2020 at 03:44:49PM -0700, Andres Freund wrote:
> But uh, unfortunately the vacuum delay code just sleeps without setting
> a wait event:
...
> Seems like it should instead use a new wait event in the PG_WAIT_TIMEOUT
> class?
>
> Given how frequently we run into trouble with [auto]vacuum throttling
> being a problem, and there not being any way to monitor that currently,
> that seems like it'd be a significant improvement, given the effort?
I see that pg_sleep sets WAIT_EVENT_PG_SLEEP, but pg_usleep doesn't, I guess
because the overhead is significant for a small number of usecs, and because it
doesn't work for pg_usleep to set a generic event if callers want to be able to
set a more specific wait event.
Also, I noticed that the SLEEP_ON_ASSERT comment (31338352b) wants to use pg_usleep,
"which seems too short". Surely it should use pg_sleep, added at 782eefc58 a
few years later?
Also, there was a suggestion recently that this should have a separate
vacuum_progress phase:
|vacuumlazy.c:#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
|vacuumlazy.c:pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
I was planning to look at that eventually; should it have a wait event instead,
or in addition?
> It'd probably also be helpful to report the total time [auto]vacuum
> spent being delayed for vacuum verbose/autovacuum logging, but imo
> that'd be a parallel feature to a wait event, not a replacement.
This requires wider changes than I anticipated.
2020-03-20 22:35:51.308 CDT [16534] LOG: automatic aggressive vacuum of table
"template1.pg_catalog.pg_class": index scans: 1
pages: 0 removed, 11 remain, 0 skipped due to pins, 0 skipped frozen
tuples: 6 removed, 405 remain, 0 are dead but not yet removable, oldest
xmin: 1574
buffer usage: 76 hits, 7 misses, 8 dirtied
avg read rate: 16.378 MB/s, avg write rate: 18.718 MB/s
Cost-based delay: 2 msec
system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
VACUUM VERBOSE wouldn't normally be run with cost_delay > 0, so that field will
typically be zero; for that reason I made the output conditional, which is
supposedly why the existing code is written the way it is, even though that
hasn't otherwise been used since 17eaae98.
Added at https://commitfest.postgresql.org/28/2515/
--
Justin
>From 68c5ad8c7a9feb0c68afad310e3f52c21c3cdbaf Mon Sep 17 00:00:00 2001
From: Justin Pryzby <[email protected]>
Date: Fri, 20 Mar 2020 20:47:30 -0500
Subject: [PATCH v1 1/2] Report wait event for cost-based vacuum delay
---
doc/src/sgml/monitoring.sgml | 2 ++
src/backend/commands/vacuum.c | 2 ++
src/backend/postmaster/pgstat.c | 3 +++
src/include/pgstat.h | 3 ++-
4 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 5bffdcce10..46c99a55b7 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -1507,6 +1507,8 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
(<filename>pg_wal</filename>, archive or stream) before trying
again to retrieve WAL data, at recovery.
</entry>
+ <entry><literal>VacuumDelay</literal></entry>
+ <entry>Waiting in a cost-based vacuum delay point.</entry>
</row>
<row>
<entry morerows="68"><literal>IO</literal></entry>
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index d625d17bf4..59731d687f 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -2019,7 +2019,9 @@ vacuum_delay_point(void)
if (msec > VacuumCostDelay * 4)
msec = VacuumCostDelay * 4;
+ pgstat_report_wait_start(WAIT_EVENT_VACUUM_DELAY);
pg_usleep((long) (msec * 1000));
+ pgstat_report_wait_end();
VacuumCostBalance = 0;
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index d29c211a76..742ec07b59 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -3824,6 +3824,9 @@ pgstat_get_wait_timeout(WaitEventTimeout w)
case WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL:
event_name = "RecoveryRetrieveRetryInterval";
break;
+ case WAIT_EVENT_VACUUM_DELAY:
+ event_name = "VacuumDelay";
+ break;
/* no default case, so that compiler will warn */
}
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 851d0a7246..4db40e23cc 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -848,7 +848,8 @@ typedef enum
WAIT_EVENT_BASE_BACKUP_THROTTLE = PG_WAIT_TIMEOUT,
WAIT_EVENT_PG_SLEEP,
WAIT_EVENT_RECOVERY_APPLY_DELAY,
- WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL
+ WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL,
+ WAIT_EVENT_VACUUM_DELAY,
} WaitEventTimeout;
/* ----------
--
2.17.0
>From 8153d909cabb94474890f0b55c7733f33923e3c5 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <[email protected]>
Date: Fri, 20 Mar 2020 22:08:09 -0500
Subject: [PATCH v1 2/2] vacuum to report time spent in cost-based delay
---
src/backend/access/gin/ginfast.c | 6 +++---
src/backend/access/gin/ginvacuum.c | 6 +++---
src/backend/access/gist/gistvacuum.c | 2 +-
src/backend/access/hash/hash.c | 2 +-
src/backend/access/heap/vacuumlazy.c | 17 +++++++++++------
src/backend/access/nbtree/nbtree.c | 2 +-
src/backend/access/spgist/spgvacuum.c | 4 ++--
src/backend/commands/vacuum.c | 8 ++++++--
src/include/access/genam.h | 1 +
src/include/commands/vacuum.h | 2 +-
10 files changed, 30 insertions(+), 20 deletions(-)
diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
index 11d7ec067a..c99dc4a8be 100644
--- a/src/backend/access/gin/ginfast.c
+++ b/src/backend/access/gin/ginfast.c
@@ -892,7 +892,7 @@ ginInsertCleanup(GinState *ginstate, bool full_clean,
*/
processPendingPage(&accum, &datums, page, FirstOffsetNumber);
- vacuum_delay_point();
+ stats->delay_msec += vacuum_delay_point();
/*
* Is it time to flush memory to disk? Flush if we are at the end of
@@ -929,7 +929,7 @@ ginInsertCleanup(GinState *ginstate, bool full_clean,
{
ginEntryInsert(ginstate, attnum, key, category,
list, nlist, NULL);
- vacuum_delay_point();
+ stats->delay_msec += vacuum_delay_point();
}
/*
@@ -1002,7 +1002,7 @@ ginInsertCleanup(GinState *ginstate, bool full_clean,
/*
* Read next page in pending list
*/
- vacuum_delay_point();
+ stats->delay_msec += vacuum_delay_point();
buffer = ReadBuffer(index, blkno);
LockBuffer(buffer, GIN_SHARE);
page = BufferGetPage(buffer);
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index 8ae4fd95a7..3ab88153cb 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -662,12 +662,12 @@ ginbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
UnlockReleaseBuffer(buffer);
}
- vacuum_delay_point();
+ stats->delay_msec += vacuum_delay_point();
for (i = 0; i < nRoot; i++)
{
ginVacuumPostingTree(&gvs, rootOfPostingTree[i]);
- vacuum_delay_point();
+ stats->delay_msec += vacuum_delay_point();
}
if (blkno == InvalidBlockNumber) /* rightmost page */
@@ -748,7 +748,7 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Buffer buffer;
Page page;
- vacuum_delay_point();
+ stats->delay_msec += vacuum_delay_point();
buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
RBM_NORMAL, info->strategy);
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index a9c616c772..d58b11486c 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -265,7 +265,7 @@ restart:
recurse_to = InvalidBlockNumber;
/* call vacuum_delay_point while not holding any buffer lock */
- vacuum_delay_point();
+ vstate->stats->delay_msec += vacuum_delay_point();
buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
info->strategy);
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 4871b7ff4d..86a9c7fdaa 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -709,7 +709,7 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
bool retain_pin = false;
bool clear_dead_marking = false;
- vacuum_delay_point();
+ // XXX stats->delay_msec += vacuum_delay_point();
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 03c43efc32..b03b066aa4 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -285,6 +285,7 @@ typedef struct LVRelStats
double new_dead_tuples; /* new estimated total # of dead tuples */
BlockNumber pages_removed;
double tuples_deleted;
+ double delay_msec; /* milliseconds delay by vacuum_delay_point() */
BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
LVDeadTuples *dead_tuples;
int num_index_scans;
@@ -464,6 +465,7 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
vacrelstats->old_live_tuples = onerel->rd_rel->reltuples;
vacrelstats->num_index_scans = 0;
vacrelstats->pages_removed = 0;
+ vacrelstats->delay_msec = 0;
vacrelstats->lock_waiter_detected = false;
/* Open all indexes of the relation */
@@ -620,8 +622,8 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
(long long) VacuumPageDirty);
appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
read_rate, write_rate);
- appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));
-
+ appendStringInfo(&buf, _("system usage: %s, cost-based delay: %.0f msec"),
+ pg_rusage_show(&ru0), vacrelstats->delay_msec);
ereport(LOG,
(errmsg_internal("%s", buf.data)));
pfree(buf.data);
@@ -860,7 +862,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
break;
}
- vacuum_delay_point();
+ vacrelstats->delay_msec += vacuum_delay_point();
next_unskippable_block++;
}
}
@@ -916,7 +918,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
break;
}
- vacuum_delay_point();
+ vacrelstats->delay_msec += vacuum_delay_point();
next_unskippable_block++;
}
}
@@ -966,7 +968,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
all_visible_according_to_vm = true;
}
- vacuum_delay_point();
+ vacrelstats->delay_msec += vacuum_delay_point();
/*
* If we are close to overrunning the available space for dead-tuple
@@ -1690,6 +1692,9 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
"%u pages are entirely empty.\n",
empty_pages),
empty_pages);
+ if (vacrelstats->delay_msec > 0)
+ appendStringInfo(&buf, "Cost-based delay: %.0f msec\n",
+ vacrelstats->delay_msec);
appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));
ereport(elevel,
@@ -1788,7 +1793,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
Page page;
Size freespace;
- vacuum_delay_point();
+ vacrelstats->delay_msec += vacuum_delay_point();
tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples->itemptrs[tupindex]);
buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 4bb16297c3..7ac8f6df3c 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1098,7 +1098,7 @@ restart:
recurse_to = P_NONE;
/* call vacuum_delay_point while not holding any buffer lock */
- vacuum_delay_point();
+ vstate->stats->delay_msec += vacuum_delay_point();
/*
* We can't use _bt_getbuf() here because it always applies
diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c
index bd98707f3c..d531718a89 100644
--- a/src/backend/access/spgist/spgvacuum.c
+++ b/src/backend/access/spgist/spgvacuum.c
@@ -611,7 +611,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
Page page;
/* call vacuum_delay_point while not holding any buffer lock */
- vacuum_delay_point();
+ bds->stats->delay_msec += vacuum_delay_point();
buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
RBM_NORMAL, bds->info->strategy);
@@ -690,7 +690,7 @@ spgprocesspending(spgBulkDeleteState *bds)
continue; /* ignore already-done items */
/* call vacuum_delay_point while not holding any buffer lock */
- vacuum_delay_point();
+ bds->stats->delay_msec += vacuum_delay_point();
/* examine the referenced page */
blkno = ItemPointerGetBlockNumber(&pitem->tid);
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 59731d687f..314513512c 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -1990,10 +1990,11 @@ vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode)
/*
* vacuum_delay_point --- check for interrupts and cost-based delay.
*
+ * Return the number of milliseconds delayed.
* This should be called in each major loop of VACUUM processing,
* typically once per page processed.
*/
-void
+double
vacuum_delay_point(void)
{
double msec = 0;
@@ -2002,7 +2003,7 @@ vacuum_delay_point(void)
CHECK_FOR_INTERRUPTS();
if (!VacuumCostActive || InterruptPending)
- return;
+ return 0;
/*
* For parallel vacuum, the delay is computed based on the shared cost
@@ -2030,7 +2031,10 @@ vacuum_delay_point(void)
/* Might have gotten an interrupt while sleeping */
CHECK_FOR_INTERRUPTS();
+ return msec;
}
+
+ return 0;
}
/*
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 7e9364a50c..1cbd651173 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -78,6 +78,7 @@ typedef struct IndexBulkDeleteResult
double tuples_removed; /* # removed during vacuum operation */
BlockNumber pages_deleted; /* # unused pages in index */
BlockNumber pages_free; /* # pages available for reuse */
+ double delay_msec; /* milliseconds delay by vacuum_delay_point() */
} IndexBulkDeleteResult;
/* Typedef for callback function to determine if a tuple is bulk-deletable */
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 2779bea5c9..856714034b 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -273,7 +273,7 @@ extern void vacuum_set_xid_limits(Relation rel,
MultiXactId *multiXactCutoff,
MultiXactId *mxactFullScanLimit);
extern void vac_update_datfrozenxid(void);
-extern void vacuum_delay_point(void);
+extern double vacuum_delay_point(void);
extern bool vacuum_is_relation_owner(Oid relid, Form_pg_class reltuple,
int options);
extern Relation vacuum_open_relation(Oid relid, RangeVar *relation,
--
2.17.0