Hi, On Wed, Jan 28, 2026 at 03:35:21PM +0900, Michael Paquier wrote: > On Mon, Jan 26, 2026 at 06:59:28AM +0000, Bertrand Drouvot wrote: > > The attached, to apply on top of 0001, fix the issue. However it handles > > only the > > WaitLatch in ProcSleep() case and I start to have concern about the others > > WaitLatch() > > that would/could be "woken up" every 1s. > > > Using disable_timeout() and enable_timeout_after() in WaitEventSetWait() > > does not > > look like a great answer to this concern, so I wonder if we should use a > > larger > > flush frequency instead (as proposed up-thread), thoughts? > > Only a larger frequency is not the correct answer here. It would just > reduce the frequency of the extra lock wait messages for one: these > should never appear more than necessary.
Right. The fix in fix_ProcSleep.txt shared up-thread solves that and has been added to the attached 0001. Also, the attached is now split into 4 sub-patches, with 0002 introducing a new GUC to control the flush interval (default is 10s). Note that 0001 to 0003 could be merged into one patch, but I kept them separate to ease the review. The new version also adds more documentation and takes care of Sami's comments shared up-thread. > And how about for example extension code? I think that, depending on how they write their code around WaitLatch (if any), they could see messages being reported (if they do) in the logs at stats_flush_interval frequency. That said, the default value is 10s, and that looks pretty long for a latch to wait on. In any case, they should already take care of a latch being woken by WL_LATCH_SET. What do you think? Regards, -- Bertrand Drouvot PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
>From 07ac566644abd0ee01ab76cb96b0b176f3d25c5a Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot <[email protected]> Date: Mon, 5 Jan 2026 09:41:39 +0000 Subject: [PATCH v4 1/4] Add pgstat_report_anytime_stat() for periodic stats flushing Long-running transactions can accumulate significant statistics (WAL, IO, ...) that remain unflushed until the transaction ends. This delays visibility of resource usage in monitoring views like pg_stat_io and pg_stat_wal. This commit introduces pgstat_report_anytime_stat(), which flushes non-transactional statistics even inside active transactions. A new timeout handler fires every second to call this function, ensuring timely stats visibility without waiting for transaction completion. Implementation details: - Add PgStat_FlushMode enum to classify stats kinds: * FLUSH_ANYTIME: Stats that can always be flushed (WAL, IO, ...) * FLUSH_AT_TXN_BOUNDARY: Stats requiring transaction boundaries - Modify pgstat_flush_pending_entries() and add pgstat_flush_fixed_stats(), both accepting a boolean anytime_only parameter: * When false: flushes all stats (existing behavior) * When true: flushes only FLUSH_ANYTIME stats and skips FLUSH_AT_TXN_BOUNDARY stats - The new timeout relies on the existing PGSTAT_MIN_INTERVAL to fire every second, calling pgstat_report_anytime_stat(false) The force parameter in pgstat_report_anytime_stat() is currently unused (always called with force=false) but reserved for future use cases requiring immediate flushing. 
--- src/backend/storage/lmgr/proc.c | 10 +++ src/backend/tcop/postgres.c | 16 ++++ src/backend/utils/activity/pgstat.c | 111 +++++++++++++++++++++++----- src/backend/utils/init/globals.c | 1 + src/backend/utils/init/postinit.c | 15 ++++ src/include/miscadmin.h | 1 + src/include/pgstat.h | 4 + src/include/utils/pgstat_internal.h | 20 +++++ src/include/utils/timeout.h | 1 + src/tools/pgindent/typedefs.list | 1 + 10 files changed, 162 insertions(+), 18 deletions(-) 7.0% src/backend/storage/lmgr/ 7.3% src/backend/tcop/ 61.1% src/backend/utils/activity/ 8.6% src/backend/utils/init/ 11.8% src/include/utils/ 3.6% src/include/ diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 063826ae576..012705a2ee6 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -1322,6 +1322,7 @@ ProcSleep(LOCALLOCK *locallock) bool allow_autovacuum_cancel = true; bool logged_recovery_conflict = false; ProcWaitStatus myWaitStatus; + bool anytime_timeout_was_active = false; /* The caller must've armed the on-error cleanup mechanism */ Assert(GetAwaitedLock() == locallock); @@ -1398,6 +1399,12 @@ ProcSleep(LOCALLOCK *locallock) standbyWaitStart = GetCurrentTimestamp(); } + anytime_timeout_was_active = get_timeout_active(ANYTIME_STATS_UPDATE_TIMEOUT); + + /* No need to try to flush the statistics while the process is sleeping */ + if (anytime_timeout_was_active) + disable_timeout(ANYTIME_STATS_UPDATE_TIMEOUT, false); + /* * If somebody wakes us between LWLockRelease and WaitLatch, the latch * will not wait. But a set latch does not necessarily mean that the lock @@ -1661,6 +1668,9 @@ ProcSleep(LOCALLOCK *locallock) } } while (myWaitStatus == PROC_WAIT_STATUS_WAITING); + if (anytime_timeout_was_active) + enable_timeout_after(ANYTIME_STATS_UPDATE_TIMEOUT, PGSTAT_MIN_INTERVAL); + /* * Disable the timers, if they are still running. 
As in LockErrorCleanup, * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index e54bf1e760f..132fae61423 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3530,6 +3530,22 @@ ProcessInterrupts(void) pgstat_report_stat(true); } + /* + * Flush stats outside of transaction boundary if the timeout fired. + * Unlike transactional stats, these can be flushed even inside a running + * transaction. + */ + if (AnytimeStatsUpdateTimeoutPending) + { + AnytimeStatsUpdateTimeoutPending = false; + + pgstat_report_anytime_stat(false); + + /* Schedule next timeout */ + enable_timeout_after(ANYTIME_STATS_UPDATE_TIMEOUT, + PGSTAT_MIN_INTERVAL); + } + if (ProcSignalBarrierPending) ProcessProcSignalBarrier(); diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c index 11bb71cad5a..ab4d9088a9a 100644 --- a/src/backend/utils/activity/pgstat.c +++ b/src/backend/utils/activity/pgstat.c @@ -122,8 +122,6 @@ * ---------- */ -/* minimum interval non-forced stats flushes.*/ -#define PGSTAT_MIN_INTERVAL 1000 /* how long until to block flushing pending stats updates */ #define PGSTAT_MAX_INTERVAL 60000 /* when to call pgstat_report_stat() again, even when idle */ @@ -187,7 +185,8 @@ static void pgstat_init_snapshot_fixed(void); static void pgstat_reset_after_failure(void); -static bool pgstat_flush_pending_entries(bool nowait); +static bool pgstat_flush_pending_entries(bool nowait, bool anytime_only); +static bool pgstat_flush_fixed_stats(bool nowait, bool anytime_only); static void pgstat_prep_snapshot(void); static void pgstat_build_snapshot(void); @@ -288,6 +287,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = false, .write_to_file = true, + .flush_mode = FLUSH_AT_TXN_BOUNDARY, /* so pg_stat_database entries can be seen in all databases */ .accessed_across_databases = true, @@ -305,6 
+305,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = false, .write_to_file = true, + .flush_mode = FLUSH_AT_TXN_BOUNDARY, .shared_size = sizeof(PgStatShared_Relation), .shared_data_off = offsetof(PgStatShared_Relation, stats), @@ -321,6 +322,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = false, .write_to_file = true, + .flush_mode = FLUSH_AT_TXN_BOUNDARY, .shared_size = sizeof(PgStatShared_Function), .shared_data_off = offsetof(PgStatShared_Function, stats), @@ -336,6 +338,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = false, .write_to_file = true, + .flush_mode = FLUSH_AT_TXN_BOUNDARY, .accessed_across_databases = true, @@ -353,6 +356,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = false, .write_to_file = true, + .flush_mode = FLUSH_AT_TXN_BOUNDARY, /* so pg_stat_subscription_stats entries can be seen in all databases */ .accessed_across_databases = true, @@ -370,6 +374,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = false, .write_to_file = false, + .flush_mode = FLUSH_ANYTIME, .accessed_across_databases = true, @@ -388,6 +393,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = true, .write_to_file = true, + .flush_mode = FLUSH_ANYTIME, .snapshot_ctl_off = offsetof(PgStat_Snapshot, archiver), .shared_ctl_off = offsetof(PgStat_ShmemControl, archiver), @@ -404,6 +410,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = true, .write_to_file = true, + .flush_mode = FLUSH_ANYTIME, .snapshot_ctl_off = offsetof(PgStat_Snapshot, bgwriter), .shared_ctl_off = offsetof(PgStat_ShmemControl, bgwriter), @@ -420,6 +427,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] 
.fixed_amount = true, .write_to_file = true, + .flush_mode = FLUSH_ANYTIME, .snapshot_ctl_off = offsetof(PgStat_Snapshot, checkpointer), .shared_ctl_off = offsetof(PgStat_ShmemControl, checkpointer), @@ -436,6 +444,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = true, .write_to_file = true, + .flush_mode = FLUSH_ANYTIME, .snapshot_ctl_off = offsetof(PgStat_Snapshot, io), .shared_ctl_off = offsetof(PgStat_ShmemControl, io), @@ -453,6 +462,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = true, .write_to_file = true, + .flush_mode = FLUSH_ANYTIME, .snapshot_ctl_off = offsetof(PgStat_Snapshot, slru), .shared_ctl_off = offsetof(PgStat_ShmemControl, slru), @@ -470,6 +480,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = true, .write_to_file = true, + .flush_mode = FLUSH_ANYTIME, .snapshot_ctl_off = offsetof(PgStat_Snapshot, wal), .shared_ctl_off = offsetof(PgStat_ShmemControl, wal), @@ -775,23 +786,11 @@ pgstat_report_stat(bool force) partial_flush = false; /* flush of variable-numbered stats tracked in pending entries list */ - partial_flush |= pgstat_flush_pending_entries(nowait); + partial_flush |= pgstat_flush_pending_entries(nowait, false); /* flush of other stats kinds */ if (pgstat_report_fixed) - { - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) - { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - - if (!kind_info) - continue; - if (!kind_info->flush_static_cb) - continue; - - partial_flush |= kind_info->flush_static_cb(nowait); - } - } + partial_flush |= pgstat_flush_fixed_stats(nowait, false); last_flush = now; @@ -1345,9 +1344,14 @@ pgstat_delete_pending_entry(PgStat_EntryRef *entry_ref) /* * Flush out pending variable-numbered stats. + * + * If anytime_only is true, only flushes FLUSH_ANYTIME entries. + * This is safe to call inside transactions. 
+ * + * If anytime_only is false, flushes all entries. */ static bool -pgstat_flush_pending_entries(bool nowait) +pgstat_flush_pending_entries(bool nowait, bool anytime_only) { bool have_pending = false; dlist_node *cur = NULL; @@ -1377,6 +1381,20 @@ pgstat_flush_pending_entries(bool nowait) Assert(!kind_info->fixed_amount); Assert(kind_info->flush_pending_cb != NULL); + /* Skip transactional stats if we're in anytime_only mode */ + if (anytime_only && kind_info->flush_mode == FLUSH_AT_TXN_BOUNDARY) + { + have_pending = true; + + if (dlist_has_next(&pgStatPending, cur)) + next = dlist_next_node(&pgStatPending, cur); + else + next = NULL; + + cur = next; + continue; + } + /* flush the stats, if possible */ did_flush = kind_info->flush_pending_cb(entry_ref, nowait); @@ -1402,6 +1420,33 @@ pgstat_flush_pending_entries(bool nowait) return have_pending; } +/* + * Flush fixed-amount stats. + * + * If anytime_only is true, only flushes FLUSH_ANYTIME stats (safe inside transactions). + * If anytime_only is false, flushes all stats with flush_static_cb. + */ +static bool +pgstat_flush_fixed_stats(bool nowait, bool anytime_only) +{ + bool partial_flush = false; + + for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + if (!kind_info || !kind_info->flush_static_cb) + continue; + + /* Skip transactional stats if we're in anytime_only mode */ + if (anytime_only && kind_info->flush_mode == FLUSH_AT_TXN_BOUNDARY) + continue; + + partial_flush |= kind_info->flush_static_cb(nowait); + } + + return partial_flush; +} /* ------------------------------------------------------------ * Helper / infrastructure functions @@ -2119,3 +2164,33 @@ assign_stats_fetch_consistency(int newval, void *extra) if (pgstat_fetch_consistency != newval) force_stats_snapshot_clear = true; } + +/* + * Flushes only FLUSH_ANYTIME stats using non-blocking locks. 
Transactional + * stats (FLUSH_AT_TXN_BOUNDARY) remain pending until transaction boundary. + * Safe to call inside transactions. + */ +void +pgstat_report_anytime_stat(bool force) +{ + bool nowait = !force; + + pgstat_assert_is_up(); + + /* + * Exit if no pending stats at all. This avoids unnecessary work when + * backends are idle or in sessions without stats accumulation. + * + * Note: This check isn't precise as there might be only transactional + * stats pending, which we'll skip during the flush. However, maintaining + * precise tracking would add complexity that does not seem worth it from + * a performance point of view (no noticeable performance regression has + * been observed with the current implementation). + */ + if (dlist_is_empty(&pgStatPending) && !pgstat_report_fixed) + return; + + /* Flush stats outside of transaction boundary */ + pgstat_flush_pending_entries(nowait, true); + pgstat_flush_fixed_stats(nowait, true); +} diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 36ad708b360..ad44826c39e 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -40,6 +40,7 @@ volatile sig_atomic_t IdleSessionTimeoutPending = false; volatile sig_atomic_t ProcSignalBarrierPending = false; volatile sig_atomic_t LogMemoryContextPending = false; volatile sig_atomic_t IdleStatsUpdateTimeoutPending = false; +volatile sig_atomic_t AnytimeStatsUpdateTimeoutPending = false; volatile uint32 InterruptHoldoffCount = 0; volatile uint32 QueryCancelHoldoffCount = 0; volatile uint32 CritSectionCount = 0; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 3f401faf3de..6076f531c4a 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -82,6 +82,7 @@ static void TransactionTimeoutHandler(void); static void IdleSessionTimeoutHandler(void); static void IdleStatsUpdateTimeoutHandler(void); static void ClientCheckTimeoutHandler(void); +static 
void AnytimeStatsUpdateTimeoutHandler(void); static bool ThereIsAtLeastOneRole(void); static void process_startup_options(Port *port, bool am_superuser); static void process_settings(Oid databaseid, Oid roleid); @@ -765,6 +766,9 @@ InitPostgres(const char *in_dbname, Oid dboid, RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler); RegisterTimeout(IDLE_STATS_UPDATE_TIMEOUT, IdleStatsUpdateTimeoutHandler); + RegisterTimeout(ANYTIME_STATS_UPDATE_TIMEOUT, + AnytimeStatsUpdateTimeoutHandler); + enable_timeout_after(ANYTIME_STATS_UPDATE_TIMEOUT, PGSTAT_MIN_INTERVAL); } /* @@ -1446,3 +1450,14 @@ ThereIsAtLeastOneRole(void) return result; } + +/* + * Timeout handler for flushing non-transactional stats. + */ +static void +AnytimeStatsUpdateTimeoutHandler(void) +{ + AnytimeStatsUpdateTimeoutPending = true; + InterruptPending = true; + SetLatch(MyLatch); +} diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index db559b39c4d..8aeb9628871 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -96,6 +96,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending; extern PGDLLIMPORT volatile sig_atomic_t LogMemoryContextPending; extern PGDLLIMPORT volatile sig_atomic_t IdleStatsUpdateTimeoutPending; +extern PGDLLIMPORT volatile sig_atomic_t AnytimeStatsUpdateTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending; extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index fff7ecc2533..1651f16f966 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -35,6 +35,9 @@ /* Default directory to store temporary statistics data in */ #define PG_STAT_TMP_DIR "pg_stat_tmp" +/* Minimum interval non-forced stats flushes */ +#define PGSTAT_MIN_INTERVAL 1000 + /* Values for track_functions GUC variable --- order is significant! 
*/ typedef enum TrackFunctionsLevel { @@ -533,6 +536,7 @@ extern void pgstat_initialize(void); /* Functions called from backends */ extern long pgstat_report_stat(bool force); +extern void pgstat_report_anytime_stat(bool force); extern void pgstat_force_next_flush(void); extern void pgstat_reset_counters(void); diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h index 9b8fbae00ed..9ca39ea9a9a 100644 --- a/src/include/utils/pgstat_internal.h +++ b/src/include/utils/pgstat_internal.h @@ -224,6 +224,19 @@ typedef struct PgStat_SubXactStatus PgStat_TableXactStatus *first; /* head of list for this subxact */ } PgStat_SubXactStatus; +/* + * Flush mode for statistics kinds. + * + * FLUSH_AT_TXN_BOUNDARY has to be the first because we want it to be the + * default value. + */ +typedef enum PgStat_FlushMode +{ + FLUSH_AT_TXN_BOUNDARY, /* All fields can only be flushed at + * transaction boundary */ + FLUSH_ANYTIME, /* All fields can be flushed anytime, + * including within transactions */ +} PgStat_FlushMode; /* * Metadata for a specific kind of statistics. @@ -251,6 +264,13 @@ typedef struct PgStat_KindInfo */ bool track_entry_count:1; + /* + * Some stats have to be updated only at transaction boundaries (such as + * tuples_inserted updated, deleted), so it's very important to set the + * right flush mode (FLUSH_AT_TXN_BOUNDARY being the default). + */ + PgStat_FlushMode flush_mode; + /* * The size of an entry in the shared stats hash table (pointed to by * PgStatShared_HashEntry->body). 
For fixed-numbered statistics, this is diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h index 0965b590b34..10723bb664c 100644 --- a/src/include/utils/timeout.h +++ b/src/include/utils/timeout.h @@ -35,6 +35,7 @@ typedef enum TimeoutId IDLE_SESSION_TIMEOUT, IDLE_STATS_UPDATE_TIMEOUT, CLIENT_CONNECTION_CHECK_TIMEOUT, + ANYTIME_STATS_UPDATE_TIMEOUT, STARTUP_PROGRESS_TIMEOUT, /* First user-definable timeout reason */ USER_TIMEOUT, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index ddbe4c64971..af21c87234a 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2268,6 +2268,7 @@ PgStat_Counter PgStat_EntryRef PgStat_EntryRefHashEntry PgStat_FetchConsistency +PgStat_FlushMode PgStat_FunctionCallUsage PgStat_FunctionCounts PgStat_HashKey -- 2.34.1
>From 02ce5e90269558cf481a49e557509eed68d6a9dc Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot <[email protected]> Date: Wed, 28 Jan 2026 07:53:13 +0000 Subject: [PATCH v4 2/4] Add GUC to specify non-transactional statistics flush interval Add stats_flush_interval, a new GUC to set the interval between flushes of non-transactional statistics. --- doc/src/sgml/config.sgml | 32 +++++++++++++++++++ src/backend/storage/lmgr/proc.c | 2 +- src/backend/tcop/postgres.c | 2 +- src/backend/utils/activity/pgstat.c | 15 +++++++++ src/backend/utils/init/postinit.c | 2 +- src/backend/utils/misc/guc_parameters.dat | 10 ++++++ src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/pgstat.h | 1 + src/include/utils/guc_hooks.h | 1 + 9 files changed, 63 insertions(+), 3 deletions(-) 55.5% doc/src/sgml/ 5.3% src/backend/storage/lmgr/ 12.6% src/backend/utils/activity/ 5.3% src/backend/utils/init/ 14.9% src/backend/utils/misc/ 3.9% src/include/ diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 5560b95ee60..3136816a933 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -8834,6 +8834,38 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; </listitem> </varlistentry> + <varlistentry id="guc-stats-flush-interval" xreflabel="stats_flush_interval"> + <term><varname>stats_flush_interval</varname> (<type>integer</type>) + <indexterm> + <primary><varname>stats_flush_interval</varname> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + Sets the interval at which non-transactional statistics are made visible + during running transactions. Non-transactional statistics include, for + example, WAL activity and I/O operations. + They become visible at that interval in monitoring views such as + <link linkend="monitoring-pg-stat-io-view"> <structname>pg_stat_io</structname></link> + and <link linkend="monitoring-pg-stat-wal-view"> <structname>pg_stat_wal</structname></link> + during running transactions. 
+ If this value is specified without units, it is taken as milliseconds. + The default is 10 seconds (<literal>10s</literal>), which is probably + about the smallest value you would want in practice for long running + transactions. + </para> + <note> + <para> + This parameter does not affect transactional statistics such as + <structname>pg_stat_all_tables</structname> columns (like + <structfield>n_tup_ins</structfield>, <structfield>n_tup_upd</structfield>, + <structfield>n_tup_del</structfield>), which are always flushed at transaction + boundaries to maintain consistency. + </para> + </note> + </listitem> + </varlistentry> + </variablelist> </sect2> diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 012705a2ee6..caa6eecca88 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -1669,7 +1669,7 @@ ProcSleep(LOCALLOCK *locallock) } while (myWaitStatus == PROC_WAIT_STATUS_WAITING); if (anytime_timeout_was_active) - enable_timeout_after(ANYTIME_STATS_UPDATE_TIMEOUT, PGSTAT_MIN_INTERVAL); + enable_timeout_after(ANYTIME_STATS_UPDATE_TIMEOUT, pgstat_flush_interval); /* * Disable the timers, if they are still running. 
As in LockErrorCleanup, diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 132fae61423..c0e81cb13d0 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3543,7 +3543,7 @@ ProcessInterrupts(void) /* Schedule next timeout */ enable_timeout_after(ANYTIME_STATS_UPDATE_TIMEOUT, - PGSTAT_MIN_INTERVAL); + pgstat_flush_interval); } if (ProcSignalBarrierPending) diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c index ab4d9088a9a..ca08dd49cd7 100644 --- a/src/backend/utils/activity/pgstat.c +++ b/src/backend/utils/activity/pgstat.c @@ -113,6 +113,7 @@ #include "utils/memutils.h" #include "utils/pgstat_internal.h" #include "utils/timestamp.h" +#include "utils/timeout.h" /* ---------- @@ -202,6 +203,7 @@ static inline bool pgstat_is_kind_valid(PgStat_Kind kind); bool pgstat_track_counts = false; int pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_CACHE; +int pgstat_flush_interval = 10000; /* ---------- @@ -2165,6 +2167,19 @@ assign_stats_fetch_consistency(int newval, void *extra) force_stats_snapshot_clear = true; } +/* + * GUC assign_hook for stats_flush_interval. + */ +void +assign_stats_flush_interval(int newval, void *extra) +{ + if (get_timeout_active(ANYTIME_STATS_UPDATE_TIMEOUT)) + { + disable_timeout(ANYTIME_STATS_UPDATE_TIMEOUT, false); + enable_timeout_after(ANYTIME_STATS_UPDATE_TIMEOUT, newval); + } +} + /* * Flushes only FLUSH_ANYTIME stats using non-blocking locks. Transactional * stats (FLUSH_AT_TXN_BOUNDARY) remain pending until transaction boundary. 
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 6076f531c4a..c7c0d618671 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -768,7 +768,7 @@ InitPostgres(const char *in_dbname, Oid dboid, IdleStatsUpdateTimeoutHandler); RegisterTimeout(ANYTIME_STATS_UPDATE_TIMEOUT, AnytimeStatsUpdateTimeoutHandler); - enable_timeout_after(ANYTIME_STATS_UPDATE_TIMEOUT, PGSTAT_MIN_INTERVAL); + enable_timeout_after(ANYTIME_STATS_UPDATE_TIMEOUT, pgstat_flush_interval); } /* diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index f0260e6e412..3bb43362e51 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -2782,6 +2782,16 @@ assign_hook => 'assign_stats_fetch_consistency', }, +{ name => 'stats_flush_interval', type => 'int', context => 'PGC_USERSET', group => 'STATS_CUMULATIVE', + short_desc => 'Sets the interval between flushes of non-transactional statistics.', + flags => 'GUC_UNIT_MS', + variable => 'pgstat_flush_interval', + boot_val => '10000', + min => '1000', + max => 'INT_MAX', + assign_hook => 'assign_stats_flush_interval' +}, + { name => 'subtransaction_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', short_desc => 'Sets the size of the dedicated buffer pool used for the subtransaction cache.', long_desc => '0 means use a fraction of "shared_buffers".', diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index c4f92fcdac8..6ce5a250170 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -669,6 +669,7 @@ #track_wal_io_timing = off #track_functions = none # none, pl, all #stats_fetch_consistency = cache # cache, none, snapshot +#stats_flush_interval = 10s # in milliseconds # - Monitoring - diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 
1651f16f966..e0f222695bf 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -816,6 +816,7 @@ extern PgStat_WalStats *pgstat_fetch_stat_wal(void); extern PGDLLIMPORT bool pgstat_track_counts; extern PGDLLIMPORT int pgstat_track_functions; extern PGDLLIMPORT int pgstat_fetch_consistency; +extern PGDLLIMPORT int pgstat_flush_interval; /* diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index b6ecb0e769f..3a2ae6c41cd 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -132,6 +132,7 @@ extern bool check_session_authorization(char **newval, void **extra, GucSource s extern void assign_session_authorization(const char *newval, void *extra); extern void assign_session_replication_role(int newval, void *extra); extern void assign_stats_fetch_consistency(int newval, void *extra); +extern void assign_stats_flush_interval(int newval, void *extra); extern bool check_ssl(bool *newval, void **extra, GucSource source); extern bool check_stage_log_stats(bool *newval, void **extra, GucSource source); extern bool check_standard_conforming_strings(bool *newval, void **extra, -- 2.34.1
>From fe85fd375c89a85a84e47a88f01a9d629a8abb09 Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot <[email protected]> Date: Tue, 6 Jan 2026 11:06:31 +0000 Subject: [PATCH v4 3/4] Remove useless calls to flush some stats Now that some stats can be flushed outside of transaction boundaries, remove useless calls to report/flush some stats. Those calls were in place because before commit <XXXX> stats were flushed only at transaction boundaries. Note that: - it reverts 039549d70f6 (it just keeps its tests) - it can't be done for checkpointer and bgwriter for example because they don't have a flush callback to call - it can't be done for auxiliary processes (walsummarizer for example) because they currently do not register the new timeout handler --- src/backend/replication/walreceiver.c | 10 ------ src/backend/replication/walsender.c | 36 ++------------------ src/backend/utils/activity/pgstat_relation.c | 13 ------- src/test/recovery/t/001_stream_rep.pl | 1 + src/test/subscription/t/001_rep_changes.pl | 1 + 5 files changed, 4 insertions(+), 57 deletions(-) 69.9% src/backend/replication/ 22.8% src/backend/utils/activity/ 3.5% src/test/recovery/t/ 3.6% src/test/subscription/t/ diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 6970af3f3ff..dcbe3517b46 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -565,16 +565,6 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len) */ bool requestReply = false; - /* - * Report pending statistics to the cumulative stats - * system. This location is useful for the report as it - * is not within a tight loop in the WAL receiver, to - * avoid bloating pgstats with requests, while also making - * sure that the reports happen each time a status update - * is sent. - */ - pgstat_report_wal(false); - /* * Check if time since last receive from primary has * reached the configured limit. 
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index a0e6a3d200c..74102def9c7 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -94,14 +94,10 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_lsn.h" -#include "utils/pgstat_internal.h" #include "utils/ps_status.h" #include "utils/timeout.h" #include "utils/timestamp.h" -/* Minimum interval used by walsender for stats flushes, in ms */ -#define WALSENDER_STATS_FLUSH_INTERVAL 1000 - /* * Maximum data payload in a WAL data message. Must be >= XLOG_BLCKSZ. * @@ -1825,7 +1821,6 @@ WalSndWaitForWal(XLogRecPtr loc) int wakeEvents; uint32 wait_event = 0; static XLogRecPtr RecentFlushPtr = InvalidXLogRecPtr; - TimestampTz last_flush = 0; /* * Fast path to avoid acquiring the spinlock in case we already know we @@ -1846,7 +1841,6 @@ WalSndWaitForWal(XLogRecPtr loc) { bool wait_for_standby_at_stop = false; long sleeptime; - TimestampTz now; /* Clear any already-pending wakeups */ ResetLatch(MyLatch); @@ -1957,8 +1951,7 @@ WalSndWaitForWal(XLogRecPtr loc) * new WAL to be generated. (But if we have nothing to send, we don't * want to wake on socket-writable.) */ - now = GetCurrentTimestamp(); - sleeptime = WalSndComputeSleeptime(now); + sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp()); wakeEvents = WL_SOCKET_READABLE; @@ -1967,15 +1960,6 @@ WalSndWaitForWal(XLogRecPtr loc) Assert(wait_event != 0); - /* Report IO statistics, if needed */ - if (TimestampDifferenceExceeds(last_flush, now, - WALSENDER_STATS_FLUSH_INTERVAL)) - { - pgstat_flush_io(false); - (void) pgstat_flush_backend(false, PGSTAT_BACKEND_FLUSH_IO); - last_flush = now; - } - WalSndWait(wakeEvents, sleeptime, wait_event); } @@ -2878,8 +2862,6 @@ WalSndCheckTimeOut(void) static void WalSndLoop(WalSndSendDataCallback send_data) { - TimestampTz last_flush = 0; - /* * Initialize the last reply timestamp. That enables timeout processing * from hereon. 
@@ -2974,9 +2956,6 @@ WalSndLoop(WalSndSendDataCallback send_data) * WalSndWaitForWal() handle any other blocking; idle receivers need * its additional actions. For physical replication, also block if * caught up; its send_data does not block. - * - * The IO statistics are reported in WalSndWaitForWal() for the - * logical WAL senders. */ if ((WalSndCaughtUp && send_data != XLogSendLogical && !streamingDoneSending) || @@ -2984,7 +2963,6 @@ WalSndLoop(WalSndSendDataCallback send_data) { long sleeptime; int wakeEvents; - TimestampTz now; if (!streamingDoneReceiving) wakeEvents = WL_SOCKET_READABLE; @@ -2995,21 +2973,11 @@ WalSndLoop(WalSndSendDataCallback send_data) * Use fresh timestamp, not last_processing, to reduce the chance * of reaching wal_sender_timeout before sending a keepalive. */ - now = GetCurrentTimestamp(); - sleeptime = WalSndComputeSleeptime(now); + sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp()); if (pq_is_send_pending()) wakeEvents |= WL_SOCKET_WRITEABLE; - /* Report IO statistics, if needed */ - if (TimestampDifferenceExceeds(last_flush, now, - WALSENDER_STATS_FLUSH_INTERVAL)) - { - pgstat_flush_io(false); - (void) pgstat_flush_backend(false, PGSTAT_BACKEND_FLUSH_IO); - last_flush = now; - } - /* Sleep until something happens or we time out */ WalSndWait(wakeEvents, sleeptime, WAIT_EVENT_WAL_SENDER_MAIN); } diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c index bc8c43b96aa..feae2ae5f44 100644 --- a/src/backend/utils/activity/pgstat_relation.c +++ b/src/backend/utils/activity/pgstat_relation.c @@ -260,15 +260,6 @@ pgstat_report_vacuum(Relation rel, PgStat_Counter livetuples, } pgstat_unlock_entry(entry_ref); - - /* - * Flush IO statistics now. 
pgstat_report_stat() will flush IO stats, - * however this will not be called until after an entire autovacuum cycle - * is done -- which will likely vacuum many relations -- or until the - * VACUUM command has processed all tables and committed. - */ - pgstat_flush_io(false); - (void) pgstat_flush_backend(false, PGSTAT_BACKEND_FLUSH_IO); } /* @@ -360,10 +351,6 @@ pgstat_report_analyze(Relation rel, } pgstat_unlock_entry(entry_ref); - - /* see pgstat_report_vacuum() */ - pgstat_flush_io(false); - (void) pgstat_flush_backend(false, PGSTAT_BACKEND_FLUSH_IO); } /* diff --git a/src/test/recovery/t/001_stream_rep.pl b/src/test/recovery/t/001_stream_rep.pl index e9ac67813c7..c058a5f9b1f 100644 --- a/src/test/recovery/t/001_stream_rep.pl +++ b/src/test/recovery/t/001_stream_rep.pl @@ -15,6 +15,7 @@ my $node_primary = PostgreSQL::Test::Cluster->new('primary'); $node_primary->init( allows_streaming => 1, auth_extra => [ '--create-role' => 'repl_role' ]); +$node_primary->append_conf('postgresql.conf', "stats_flush_interval= '1s'"); $node_primary->start; my $backup_name = 'my_backup'; diff --git a/src/test/subscription/t/001_rep_changes.pl b/src/test/subscription/t/001_rep_changes.pl index d7e62e4d488..dda872f7074 100644 --- a/src/test/subscription/t/001_rep_changes.pl +++ b/src/test/subscription/t/001_rep_changes.pl @@ -11,6 +11,7 @@ use Test::More; # Initialize publisher node my $node_publisher = PostgreSQL::Test::Cluster->new('publisher'); $node_publisher->init(allows_streaming => 'logical'); +$node_publisher->append_conf('postgresql.conf', "stats_flush_interval= '1s'"); $node_publisher->start; # Create subscriber node -- 2.34.1
>From fe3d6b69d8a9eb91d74c195c0d8617096ef56297 Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot <[email protected]> Date: Mon, 19 Jan 2026 06:27:55 +0000 Subject: [PATCH v4 4/4] Add FLUSH_MIXED support and implement it for RELATION stats This commit extends the non-transactional stats infrastructure to support statistics kinds with mixed transaction behavior: some fields are transactional (e.g., tuple inserts/updates/deletes) while others are non-transactional (e.g., sequential scans, blocks read, ...). It introduces FLUSH_MIXED as a third flush mode, alongside FLUSH_ANYTIME and FLUSH_AT_TXN_BOUNDARY. For FLUSH_MIXED kinds, a new flush_anytime_cb callback enables partial flushing of only the non-transactional fields during running transactions. Some tests are also added. Implementation details: - Add FLUSH_MIXED to PgStat_FlushMode enum - Add flush_anytime_cb to PgStat_KindInfo for partial flushing callback - Update pgstat_flush_pending_entries() to call flush_anytime_cb for FLUSH_MIXED entries when in anytime_only mode - Keep FLUSH_MIXED entries in the pending list after partial flush, as transactional fields still need to be flushed at transaction boundary RELATION stats make use of FLUSH_MIXED: - Change RELATION from FLUSH_AT_TXN_BOUNDARY to FLUSH_MIXED - Implement pgstat_relation_flush_anytime_cb() to flush only read-related stats: numscans, tuples_returned, tuples_fetched, blocks_fetched, blocks_hit - Clear these fields after flushing to prevent double counting when pgstat_relation_flush_cb() runs at transaction commit - Transactional stats (tuples_inserted, tuples_updated, tuples_deleted, live_tuples, dead_tuples) remain pending until transaction boundary The DATABASE kind is also changed from FLUSH_AT_TXN_BOUNDARY to FLUSH_ANYTIME, so that some stats inherited from relation stats are also visible while the transaction is in progress. 
Remark: We could also imagine adding a new flush_anytime_static_cb() callback for future FLUSH_MIXED fixed amount stats. --- doc/src/sgml/monitoring.sgml | 29 ++++++++ src/backend/utils/activity/pgstat.c | 32 ++++++-- src/backend/utils/activity/pgstat_relation.c | 78 ++++++++++++++++++++ src/include/utils/pgstat_internal.h | 9 +++ src/test/isolation/expected/stats.out | 40 ++++++++++ src/test/isolation/expected/stats_1.out | 40 ++++++++++ src/test/isolation/specs/stats.spec | 17 ++++- 7 files changed, 236 insertions(+), 9 deletions(-) 15.3% doc/src/sgml/ 44.8% src/backend/utils/activity/ 4.6% src/include/utils/ 29.4% src/test/isolation/expected/ 5.6% src/test/isolation/specs/ diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index b77d189a500..581d6ea7811 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -3767,6 +3767,19 @@ description | Waiting for a newly initialized WAL file to reach durable storage </tgroup> </table> + <note> + <para> + Some statistics are updated while a transaction is in progress (for example, + <structfield>blks_read</structfield>, <structfield>blks_hit</structfield>, + <structfield>tup_returned</structfield> and <structfield>tup_fetched</structfield>). + Statistics that either do not depend on transactions or require transactional + consistency are updated only when the transaction ends. Statistics that require + transactional consistency include <structfield>xact_commit</structfield>, + <structfield>xact_rollback</structfield>, <structfield>tup_inserted</structfield>, + <structfield>tup_updated</structfield> and <structfield>tup_deleted</structfield>. 
+ </para> + </note> + </sect2> <sect2 id="monitoring-pg-stat-database-conflicts-view"> @@ -4223,6 +4236,15 @@ description | Waiting for a newly initialized WAL file to reach durable storage </tgroup> </table> + <note> + <para> + The <structfield>seq_scan</structfield>, <structfield>last_seq_scan</structfield>, + <structfield>seq_tup_read</structfield>, <structfield>idx_scan</structfield>, + <structfield>last_idx_scan</structfield> and <structfield>idx_tup_fetch</structfield> + are updated while the transactions are in progress. + </para> + </note> + </sect2> <sect2 id="monitoring-pg-stat-all-indexes-view"> @@ -4404,6 +4426,13 @@ description | Waiting for a newly initialized WAL file to reach durable storage tuples (see <xref linkend="indexes-multicolumn"/>). </para> </note> + <note> + <para> + The <structfield>idx_scan</structfield>, <structfield>last_idx_scan</structfield>, + <structfield>idx_tup_read</structfield> and <structfield>idx_tup_fetch</structfield> + are updated while the transactions are in progress. 
+ </para> + </note> <tip> <para> <command>EXPLAIN ANALYZE</command> outputs the total number of index diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c index ca08dd49cd7..987bf1c0f6e 100644 --- a/src/backend/utils/activity/pgstat.c +++ b/src/backend/utils/activity/pgstat.c @@ -289,7 +289,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = false, .write_to_file = true, - .flush_mode = FLUSH_AT_TXN_BOUNDARY, + .flush_mode = FLUSH_ANYTIME, /* so pg_stat_database entries can be seen in all databases */ .accessed_across_databases = true, @@ -307,7 +307,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .fixed_amount = false, .write_to_file = true, - .flush_mode = FLUSH_AT_TXN_BOUNDARY, + .flush_mode = FLUSH_MIXED, .shared_size = sizeof(PgStatShared_Relation), .shared_data_off = offsetof(PgStatShared_Relation, stats), @@ -315,6 +315,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .pending_size = sizeof(PgStat_TableStatus), .flush_pending_cb = pgstat_relation_flush_cb, + .flush_anytime_cb = pgstat_relation_flush_anytime_cb, .delete_pending_cb = pgstat_relation_delete_pending_cb, .reset_timestamp_cb = pgstat_relation_reset_timestamp_cb, }, @@ -1347,10 +1348,11 @@ pgstat_delete_pending_entry(PgStat_EntryRef *entry_ref) /* * Flush out pending variable-numbered stats. * - * If anytime_only is true, only flushes FLUSH_ANYTIME entries. + * If anytime_only is true, only flushes FLUSH_ANYTIME and FLUSH_MIXED entries, + * using flush_anytime_cb for FLUSH_MIXED. * This is safe to call inside transactions. * - * If anytime_only is false, flushes all entries. + * If anytime_only is false, flushes all entries using flush_pending_cb. 
*/ static bool pgstat_flush_pending_entries(bool nowait, bool anytime_only) @@ -1378,6 +1380,7 @@ pgstat_flush_pending_entries(bool nowait, bool anytime_only) PgStat_Kind kind = key.kind; const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); bool did_flush; + bool is_partial_flush = false; dlist_node *next; Assert(!kind_info->fixed_amount); @@ -1397,8 +1400,21 @@ pgstat_flush_pending_entries(bool nowait, bool anytime_only) continue; } - /* flush the stats, if possible */ - did_flush = kind_info->flush_pending_cb(entry_ref, nowait); + /* flush the stats (with the appropriate callback), if possible */ + if (anytime_only && + kind_info->flush_mode == FLUSH_MIXED && + kind_info->flush_anytime_cb != NULL) + { + /* Partial flush of non-transactional fields only */ + did_flush = kind_info->flush_anytime_cb(entry_ref, nowait); + is_partial_flush = true; + } + else + { + /* Full flush */ + did_flush = kind_info->flush_pending_cb(entry_ref, nowait); + is_partial_flush = false; + } Assert(did_flush || nowait); @@ -1408,8 +1424,8 @@ pgstat_flush_pending_entries(bool nowait, bool anytime_only) else next = NULL; - /* if successfully flushed, remove entry */ - if (did_flush) + /* if successful non-partial flush, remove entry */ + if (did_flush && !is_partial_flush) pgstat_delete_pending_entry(entry_ref); else have_pending = true; diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c index feae2ae5f44..d6b799c4354 100644 --- a/src/backend/utils/activity/pgstat_relation.c +++ b/src/backend/utils/activity/pgstat_relation.c @@ -887,6 +887,84 @@ pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) return true; } +/* + * Flush only non-transactional relation stats. + * + * This is called periodically during running transactions to make some + * statistics visible without waiting for the transaction to finish. 
+ * + * Transactional stats (inserts/updates/deletes and their effects on live/dead + * tuple counts) remain in pending until the transaction ends, at which point + * pgstat_relation_flush_cb() will flush them. + * + * If nowait is true and the lock could not be immediately acquired, returns + * false without flushing the entry. Otherwise returns true. + */ +bool +pgstat_relation_flush_anytime_cb(PgStat_EntryRef *entry_ref, bool nowait) +{ + Oid dboid; + PgStat_TableStatus *lstats; /* pending stats entry */ + PgStatShared_Relation *shtabstats; + PgStat_StatTabEntry *tabentry; /* table entry of shared stats */ + PgStat_StatDBEntry *dbentry; /* pending database entry */ + + dboid = entry_ref->shared_entry->key.dboid; + lstats = (PgStat_TableStatus *) entry_ref->pending; + shtabstats = (PgStatShared_Relation *) entry_ref->shared_stats; + + /* + * Check if there are any non-transactional stats to flush. Avoid + * unnecessarily locking the entry if nothing accumulated. + */ + if (!(lstats->counts.numscans > 0 || + lstats->counts.tuples_returned > 0 || + lstats->counts.tuples_fetched > 0 || + lstats->counts.blocks_fetched > 0 || + lstats->counts.blocks_hit > 0)) + return true; + + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; + + /* Add only the non-transactional values to the shared entry */ + tabentry = &shtabstats->stats; + + tabentry->numscans += lstats->counts.numscans; + if (lstats->counts.numscans) + { + TimestampTz t = GetCurrentTimestamp(); + + if (t > tabentry->lastscan) + tabentry->lastscan = t; + } + tabentry->tuples_returned += lstats->counts.tuples_returned; + tabentry->tuples_fetched += lstats->counts.tuples_fetched; + tabentry->blocks_fetched += lstats->counts.blocks_fetched; + tabentry->blocks_hit += lstats->counts.blocks_hit; + + pgstat_unlock_entry(entry_ref); + + /* Also update the corresponding fields in database stats */ + dbentry = pgstat_prep_database_pending(dboid); + dbentry->tuples_returned += lstats->counts.tuples_returned; + 
dbentry->tuples_fetched += lstats->counts.tuples_fetched; + dbentry->blocks_fetched += lstats->counts.blocks_fetched; + dbentry->blocks_hit += lstats->counts.blocks_hit; + + /* + * Clear the flushed fields from pending stats to prevent double-counting + * when pgstat_relation_flush_cb() runs at transaction boundary. + */ + lstats->counts.numscans = 0; + lstats->counts.tuples_returned = 0; + lstats->counts.tuples_fetched = 0; + lstats->counts.blocks_fetched = 0; + lstats->counts.blocks_hit = 0; + + return true; +} + void pgstat_relation_delete_pending_cb(PgStat_EntryRef *entry_ref) { diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h index 9ca39ea9a9a..f91bbfd460f 100644 --- a/src/include/utils/pgstat_internal.h +++ b/src/include/utils/pgstat_internal.h @@ -236,6 +236,8 @@ typedef enum PgStat_FlushMode * transaction boundary */ FLUSH_ANYTIME, /* All fields can be flushed anytime, * including within transactions */ + FLUSH_MIXED, /* Mix of fields that can be flushed anytime + * or only at transaction boundary */ } PgStat_FlushMode; /* @@ -271,6 +273,12 @@ typedef struct PgStat_KindInfo */ PgStat_FlushMode flush_mode; + /* + * For FLUSH_MIXED kinds: callback to flush only some fields. If NULL for + * a MIXED kind, treated as FLUSH_AT_TXN_BOUNDARY. + */ + bool (*flush_anytime_cb) (PgStat_EntryRef *entry_ref, bool nowait); + /* * The size of an entry in the shared stats hash table (pointed to by * PgStatShared_HashEntry->body). 
For fixed-numbered statistics, this is @@ -783,6 +791,7 @@ extern void AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state); extern void PostPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state); extern bool pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); +extern bool pgstat_relation_flush_anytime_cb(PgStat_EntryRef *entry_ref, bool nowait); extern void pgstat_relation_delete_pending_cb(PgStat_EntryRef *entry_ref); extern void pgstat_relation_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); diff --git a/src/test/isolation/expected/stats.out b/src/test/isolation/expected/stats.out index cfad309ccf3..6d62b30e4a7 100644 --- a/src/test/isolation/expected/stats.out +++ b/src/test/isolation/expected/stats.out @@ -2245,6 +2245,46 @@ seq_scan|seq_tup_read|n_tup_ins|n_tup_upd|n_tup_del|n_live_tup|n_dead_tup|vacuum (1 row) +starting permutation: s2_begin s2_table_select s1_sleep s1_table_stats s2_table_drop s2_commit +pg_stat_force_next_flush +------------------------ + +(1 row) + +step s2_begin: BEGIN; +step s2_table_select: SELECT * FROM test_stat_tab ORDER BY key, value; +key|value +---+----- +k0 | 1 +(1 row) + +step s1_sleep: SELECT pg_sleep(1.5); +pg_sleep +-------- + +(1 row) + +step s1_table_stats: + SELECT + pg_stat_get_numscans(tso.oid) AS seq_scan, + pg_stat_get_tuples_returned(tso.oid) AS seq_tup_read, + pg_stat_get_tuples_inserted(tso.oid) AS n_tup_ins, + pg_stat_get_tuples_updated(tso.oid) AS n_tup_upd, + pg_stat_get_tuples_deleted(tso.oid) AS n_tup_del, + pg_stat_get_live_tuples(tso.oid) AS n_live_tup, + pg_stat_get_dead_tuples(tso.oid) AS n_dead_tup, + pg_stat_get_vacuum_count(tso.oid) AS vacuum_count + FROM test_stat_oid AS tso + WHERE tso.name = 'test_stat_tab' + +seq_scan|seq_tup_read|n_tup_ins|n_tup_upd|n_tup_del|n_live_tup|n_dead_tup|vacuum_count +--------+------------+---------+---------+---------+----------+----------+------------ + 1| 1| 1| 0| 0| 1| 0| 0 +(1 row) + +step s2_table_drop: DROP TABLE 
test_stat_tab; +step s2_commit: COMMIT; + starting permutation: s1_track_counts_off s1_table_stats s1_track_counts_on pg_stat_force_next_flush ------------------------ diff --git a/src/test/isolation/expected/stats_1.out b/src/test/isolation/expected/stats_1.out index e1d937784cb..2fade10e817 100644 --- a/src/test/isolation/expected/stats_1.out +++ b/src/test/isolation/expected/stats_1.out @@ -2253,6 +2253,46 @@ seq_scan|seq_tup_read|n_tup_ins|n_tup_upd|n_tup_del|n_live_tup|n_dead_tup|vacuum (1 row) +starting permutation: s2_begin s2_table_select s1_sleep s1_table_stats s2_table_drop s2_commit +pg_stat_force_next_flush +------------------------ + +(1 row) + +step s2_begin: BEGIN; +step s2_table_select: SELECT * FROM test_stat_tab ORDER BY key, value; +key|value +---+----- +k0 | 1 +(1 row) + +step s1_sleep: SELECT pg_sleep(1.5); +pg_sleep +-------- + +(1 row) + +step s1_table_stats: + SELECT + pg_stat_get_numscans(tso.oid) AS seq_scan, + pg_stat_get_tuples_returned(tso.oid) AS seq_tup_read, + pg_stat_get_tuples_inserted(tso.oid) AS n_tup_ins, + pg_stat_get_tuples_updated(tso.oid) AS n_tup_upd, + pg_stat_get_tuples_deleted(tso.oid) AS n_tup_del, + pg_stat_get_live_tuples(tso.oid) AS n_live_tup, + pg_stat_get_dead_tuples(tso.oid) AS n_dead_tup, + pg_stat_get_vacuum_count(tso.oid) AS vacuum_count + FROM test_stat_oid AS tso + WHERE tso.name = 'test_stat_tab' + +seq_scan|seq_tup_read|n_tup_ins|n_tup_upd|n_tup_del|n_live_tup|n_dead_tup|vacuum_count +--------+------------+---------+---------+---------+----------+----------+------------ + 1| 1| 1| 0| 0| 1| 0| 0 +(1 row) + +step s2_table_drop: DROP TABLE test_stat_tab; +step s2_commit: COMMIT; + starting permutation: s1_track_counts_off s1_table_stats s1_track_counts_on pg_stat_force_next_flush ------------------------ diff --git a/src/test/isolation/specs/stats.spec b/src/test/isolation/specs/stats.spec index da16710da0f..a4084efda49 100644 --- a/src/test/isolation/specs/stats.spec +++ b/src/test/isolation/specs/stats.spec 
@@ -50,6 +50,8 @@ step s1_rollback { ROLLBACK; } step s1_prepare_a { PREPARE TRANSACTION 'a'; } step s1_commit_prepared_a { COMMIT PREPARED 'a'; } step s1_rollback_prepared_a { ROLLBACK PREPARED 'a'; } +# Has to be greater than session 2 stats_flush_interval +step s1_sleep { SELECT pg_sleep(1.5); } # Function stats steps step s1_ff { SELECT pg_stat_force_next_flush(); } @@ -132,12 +134,16 @@ step s1_slru_check_stats { session s2 -setup { SET stats_fetch_consistency = 'none'; } +setup { + SET stats_fetch_consistency = 'none'; + SET stats_flush_interval = '1s'; +} step s2_begin { BEGIN; } step s2_commit { COMMIT; } step s2_commit_prepared_a { COMMIT PREPARED 'a'; } step s2_rollback_prepared_a { ROLLBACK PREPARED 'a'; } step s2_ff { SELECT pg_stat_force_next_flush(); } +step s2_table_drop { DROP TABLE test_stat_tab; } # Function stats steps step s2_track_funcs_all { SET track_functions = 'all'; } @@ -435,6 +441,15 @@ permutation s1_table_drop s1_table_stats +### Check that some stats are updated (seq_scan and seq_tup_read) +### while the transaction is still running +permutation + s2_begin + s2_table_select + s1_sleep + s1_table_stats + s2_table_drop + s2_commit ### Check that we don't count changes with track counts off, but allow access ### to prior stats -- 2.34.1
