From bbfc7f5a1b96481943bdced043d2931a2576fd21 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <lukas@fittl.com>
Date: Sun, 1 Jan 2023 16:23:18 -0800
Subject: [PATCH v1] Add TIMING SAMPLING option for EXPLAIN ANALYZE

This introduces a new mode, TIMING SAMPLING, as a lower overhead alternative to
TIMING ON for EXPLAIN ANALYZE and auto_explain.

Instead of measuring the current time when entering and exiting each plan node
(which can get expensive even on modern systems, e.g. when doing many loops),
the sampling timing mode measures the time at a fixed recurring interval using
a wall clock timer.

The sampling frequency can be adjusted from 1 to 1000 Hz by passing the SAMPLEFREQ
option to EXPLAIN, and defaults to 1000 Hz (1ms). Higher frequencies are
technically possible but currently not supported by the Postgres timeout API.

The sampled wall clock time is cached and used by the the plan node instrumentation
to attribute how much sampled time passed whilst a plan node was executed.

Since sampling is likely to undercount the actual time taken, the sampled time
values are scaled to the total execution time of the query (or parallel query
worker), before being displayed.

Additionally, this adds support for sampling based timing to auto_explain by
introducing the new "auto_explain.log_timing = sampling" setting, and
"auto_explain.log_timing_samplefreq" setting to set the frequency.
---
 contrib/auto_explain/auto_explain.c  |  48 ++++++++++--
 src/backend/commands/explain.c       | 113 ++++++++++++++++++++++++++-
 src/backend/executor/execMain.c      |  16 ++++
 src/backend/executor/execParallel.c  |  38 +++++++++
 src/backend/executor/instrument.c    |  89 +++++++++++++++++++++
 src/backend/utils/init/postinit.c    |   2 +
 src/include/commands/explain.h       |   4 +
 src/include/executor/execdesc.h      |   5 +-
 src/include/executor/instrument.h    |   7 ++
 src/include/portability/instr_time.h |   6 ++
 src/include/utils/timeout.h          |   1 +
 11 files changed, 320 insertions(+), 9 deletions(-)

diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index 269a0fa86c..137a41e6e1 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -24,6 +24,13 @@
 
 PG_MODULE_MAGIC;
 
+typedef enum
+{
+	LOG_TIMING_OFF,
+	LOG_TIMING_ON,
+	LOG_TIMING_SAMPLING
+} LogTimingType;
+
 /* GUC variables */
 static int	auto_explain_log_min_duration = -1; /* msec or -1 */
 static int	auto_explain_log_parameter_max_length = -1; /* bytes or -1 */
@@ -32,7 +39,8 @@ static bool auto_explain_log_verbose = false;
 static bool auto_explain_log_buffers = false;
 static bool auto_explain_log_wal = false;
 static bool auto_explain_log_triggers = false;
-static bool auto_explain_log_timing = true;
+static int	auto_explain_log_timing = LOG_TIMING_ON;
+static int	auto_explain_log_timing_samplefreq = 1000;
 static bool auto_explain_log_settings = false;
 static int	auto_explain_log_format = EXPLAIN_FORMAT_TEXT;
 static int	auto_explain_log_level = LOG;
@@ -61,6 +69,19 @@ static const struct config_enum_entry loglevel_options[] = {
 	{NULL, 0, false}
 };
 
+static const struct config_enum_entry log_timing_options[] = {
+	{"sampling", LOG_TIMING_SAMPLING, false},
+	{"on", LOG_TIMING_ON, false},
+	{"off", LOG_TIMING_OFF, false},
+	{"true", LOG_TIMING_ON, true},
+	{"false", LOG_TIMING_OFF, true},
+	{"yes", LOG_TIMING_ON, true},
+	{"no", LOG_TIMING_OFF, true},
+	{"1", LOG_TIMING_ON, true},
+	{"0", LOG_TIMING_OFF, true},
+	{NULL, 0, false}
+};
+
 /* Current nesting depth of ExecutorRun calls */
 static int	nesting_level = 0;
 
@@ -218,17 +239,30 @@ _PG_init(void)
 							 NULL,
 							 NULL);
 
-	DefineCustomBoolVariable("auto_explain.log_timing",
+	DefineCustomEnumVariable("auto_explain.log_timing",
 							 "Collect timing data, not just row counts.",
 							 NULL,
 							 &auto_explain_log_timing,
-							 true,
+							 LOG_TIMING_ON,
+							 log_timing_options,
 							 PGC_SUSET,
 							 0,
 							 NULL,
 							 NULL,
 							 NULL);
 
+	DefineCustomIntVariable("auto_explain.log_timing_samplefreq",
+							"Frequency for sampling-based timing, if used.",
+							"Number of sampling ticks per second (Hz).",
+							&auto_explain_log_timing_samplefreq,
+							1000,
+							1, 1000,
+							PGC_SUSET,
+							0,
+							NULL,
+							NULL,
+							NULL);
+
 	DefineCustomRealVariable("auto_explain.sample_rate",
 							 "Fraction of queries to process.",
 							 NULL,
@@ -284,14 +318,17 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
 		/* Enable per-node instrumentation iff log_analyze is required. */
 		if (auto_explain_log_analyze && (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
 		{
-			if (auto_explain_log_timing)
+			if (auto_explain_log_timing == LOG_TIMING_ON)
 				queryDesc->instrument_options |= INSTRUMENT_TIMER;
+			else if (auto_explain_log_timing == LOG_TIMING_SAMPLING)
+				queryDesc->instrument_options |= INSTRUMENT_TIMER_SAMPLING | INSTRUMENT_ROWS;
 			else
 				queryDesc->instrument_options |= INSTRUMENT_ROWS;
 			if (auto_explain_log_buffers)
 				queryDesc->instrument_options |= INSTRUMENT_BUFFERS;
 			if (auto_explain_log_wal)
 				queryDesc->instrument_options |= INSTRUMENT_WAL;
+			queryDesc->sample_freq_hz = auto_explain_log_timing_samplefreq;
 		}
 	}
 
@@ -394,7 +431,8 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
 			es->verbose = auto_explain_log_verbose;
 			es->buffers = (es->analyze && auto_explain_log_buffers);
 			es->wal = (es->analyze && auto_explain_log_wal);
-			es->timing = (es->analyze && auto_explain_log_timing);
+			es->timing = (es->analyze && auto_explain_log_timing == LOG_TIMING_ON);
+			es->sampling = (es->analyze && auto_explain_log_timing == LOG_TIMING_SAMPLING);
 			es->summary = es->analyze;
 			es->format = auto_explain_log_format;
 			es->settings = auto_explain_log_settings;
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index f86983c660..32d63de67f 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -12,6 +12,7 @@
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
+#include <sys/time.h>
 
 #include "access/xact.h"
 #include "catalog/pg_type.h"
@@ -30,12 +31,14 @@
 #include "storage/bufmgr.h"
 #include "tcop/tcopprot.h"
 #include "utils/builtins.h"
+#include "utils/float.h"
 #include "utils/guc_tables.h"
 #include "utils/json.h"
 #include "utils/lsyscache.h"
 #include "utils/rel.h"
 #include "utils/ruleutils.h"
 #include "utils/snapmgr.h"
+#include "utils/timeout.h"
 #include "utils/tuplesort.h"
 #include "utils/typcache.h"
 #include "utils/xml.h"
@@ -192,8 +195,41 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt,
 			es->settings = defGetBoolean(opt);
 		else if (strcmp(opt->defname, "timing") == 0)
 		{
+			char	   *p = opt->arg != NULL ? defGetString(opt) : NULL;
 			timing_set = true;
-			es->timing = defGetBoolean(opt);
+			if (p == NULL || pg_strcasecmp(p, "1") == 0 || pg_strcasecmp(p, "true") == 0 || pg_strcasecmp(p, "on") == 0)
+			{
+				es->timing = true;
+				es->sampling = false;
+			}
+			else if (pg_strcasecmp(p, "0") == 0 || pg_strcasecmp(p, "false") == 0 || pg_strcasecmp(p, "off") == 0)
+			{
+				es->timing = false;
+				es->sampling = false;
+			}
+			else if (pg_strcasecmp(p, "sampling") == 0)
+			{
+				es->timing = false;
+				es->sampling = true;
+			}
+			else
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("unrecognized value for EXPLAIN option \"%s\": \"%s\"",
+								opt->defname, p),
+						 parser_errposition(pstate, opt->location)));
+			}
+		}
+		else if (strcmp(opt->defname, "samplefreq") == 0)
+		{
+			int32 p = defGetInt32(opt);
+			if (p < 1 || p > 1000)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("EXPLAIN option samplefreq requires value between %d and %d", 1, 1000),
+						 parser_errposition(pstate, opt->location)));
+			es->sample_freq_hz = (uint32) p;
 		}
 		else if (strcmp(opt->defname, "summary") == 0)
 		{
@@ -236,11 +272,20 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt,
 	es->timing = (timing_set) ? es->timing : es->analyze;
 
 	/* check that timing is used with EXPLAIN ANALYZE */
-	if (es->timing && !es->analyze)
+	if ((es->timing || es->sampling) && !es->analyze)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("EXPLAIN option TIMING requires ANALYZE")));
 
+	/* check that sampling is enabled when sample frequency is passed */
+	if (es->sample_freq_hz != 0 && !es->sampling)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("EXPLAIN option SAMPLEFREQ requires TIMING SAMPLING")));
+
+	/* if sampling frequency was not set explicitly, set default value */
+	es->sample_freq_hz = es->sample_freq_hz != 0 ? es->sample_freq_hz : 1000;
+
 	/* if the summary was not set explicitly, set default value */
 	es->summary = (summary_set) ? es->summary : es->analyze;
 
@@ -531,6 +576,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
 
 	if (es->analyze && es->timing)
 		instrument_option |= INSTRUMENT_TIMER;
+	else if (es->analyze && es->sampling)
+		instrument_option |= INSTRUMENT_TIMER_SAMPLING | INSTRUMENT_ROWS;
 	else if (es->analyze)
 		instrument_option |= INSTRUMENT_ROWS;
 
@@ -578,6 +625,21 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
 	/* call ExecutorStart to prepare the plan for execution */
 	ExecutorStart(queryDesc, eflags);
 
+	/*
+	 * If sampling, we initialize executor time tracking for the query,
+	 * in order to scale individual sampled times to the total accurate
+	 * clock time spent for running all plan nodes in ExecutorRun + time
+	 * spent in ExecutorFinish. This also keeps the total sampled time.
+	 *
+	 * Note this is separate from "Execution Time" shown by EXPLAIN, which
+	 * includes ExecutorStart, ExecutorEnd and the EXPLAIN print functions.
+	 */
+	if (es->sampling)
+	{
+		queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+		queryDesc->sample_freq_hz = es->sample_freq_hz;
+	}
+
 	/* Execute the plan for statistics if asked for */
 	if (es->analyze)
 	{
@@ -595,6 +657,10 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
 		/* run cleanup too */
 		ExecutorFinish(queryDesc);
 
+		/* ensure total time gets set, if needed */
+		if (es->sampling)
+			InstrEndLoop(queryDesc->totaltime);
+
 		/* We can't run ExecutorEnd 'till we're done printing the stats... */
 		totaltime += elapsed_time(&starttime);
 	}
@@ -768,6 +834,7 @@ ExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc)
 	es->deparse_cxt = deparse_context_for_plan_tree(queryDesc->plannedstmt,
 													es->rtable_names);
 	es->printed_subplans = NULL;
+	es->totaltime = queryDesc->totaltime;
 
 	/*
 	 * Sometimes we mark a Gather node as "invisible", which means that it's
@@ -1642,6 +1709,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		double		startup_ms = 1000.0 * planstate->instrument->startup / nloops;
 		double		total_ms = 1000.0 * planstate->instrument->total / nloops;
 		double		rows = planstate->instrument->ntuples / nloops;
+		double		sampled_time_ms;
+
+		if (es->totaltime != NULL && es->totaltime->sampled_total != 0)
+		{
+			/* Sampling undercounts time, scale per-node sampled time based on actual full execution time */
+			double sampled_pct = (double) planstate->instrument->sampled_total / es->totaltime->sampled_total;
+			sampled_time_ms = 1000.0 * es->totaltime->total * sampled_pct / nloops;
+		}
+		else
+		{
+			sampled_time_ms = get_float8_nan();
+		}
 
 		if (es->format == EXPLAIN_FORMAT_TEXT)
 		{
@@ -1649,6 +1728,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 				appendStringInfo(es->str,
 								 " (actual time=%.3f..%.3f rows=%.0f loops=%.0f)",
 								 startup_ms, total_ms, rows, nloops);
+			else if (es->sampling)
+				appendStringInfo(es->str,
+								 " (actual sampled time=%.3f rows=%.0f loops=%.0f)",
+								 sampled_time_ms, rows, nloops);
 			else
 				appendStringInfo(es->str,
 								 " (actual rows=%.0f loops=%.0f)",
@@ -1663,6 +1746,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
 				ExplainPropertyFloat("Actual Total Time", "ms", total_ms,
 									 3, es);
 			}
+			else if (es->sampling)
+			{
+				ExplainPropertyFloat("Actual Sampled Time", "ms", sampled_time_ms,
+									 3, es);
+			}
 			ExplainPropertyFloat("Actual Rows", NULL, rows, 0, es);
 			ExplainPropertyFloat("Actual Loops", NULL, nloops, 0, es);
 		}
@@ -1678,6 +1766,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 				ExplainPropertyFloat("Actual Startup Time", "ms", 0.0, 3, es);
 				ExplainPropertyFloat("Actual Total Time", "ms", 0.0, 3, es);
 			}
+			else if (es->sampling)
+			{
+				ExplainPropertyFloat("Actual Sampled Time", "ms", 0.0, 3, es);
+			}
 			ExplainPropertyFloat("Actual Rows", NULL, 0.0, 0, es);
 			ExplainPropertyFloat("Actual Loops", NULL, 0.0, 0, es);
 		}
@@ -1699,6 +1791,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			double		startup_ms;
 			double		total_ms;
 			double		rows;
+			double		sampled_time_ms;
 
 			if (nloops <= 0)
 				continue;
@@ -1706,6 +1799,13 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			total_ms = 1000.0 * instrument->total / nloops;
 			rows = instrument->ntuples / nloops;
 
+			/*
+			 * For parallel query, normalization of sampled times with the total
+			 * time is done in each worker, see ExecParallelNormalizeSampledTiming
+			 * */
+			if (es->sampling)
+				sampled_time_ms = (instrument->sampled_total / 1000000.0) / nloops;
+
 			ExplainOpenWorker(n, es);
 
 			if (es->format == EXPLAIN_FORMAT_TEXT)
@@ -1715,6 +1815,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 					appendStringInfo(es->str,
 									 "actual time=%.3f..%.3f rows=%.0f loops=%.0f\n",
 									 startup_ms, total_ms, rows, nloops);
+				else if (es->sampling)
+					appendStringInfo(es->str,
+								 	 "actual sampled time=%.3f rows=%.0f loops=%.0f\n",
+								 	 sampled_time_ms, rows, nloops);
 				else
 					appendStringInfo(es->str,
 									 "actual rows=%.0f loops=%.0f\n",
@@ -1729,6 +1833,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
 					ExplainPropertyFloat("Actual Total Time", "ms",
 										 total_ms, 3, es);
 				}
+				else if (es->sampling)
+				{
+					ExplainPropertyFloat("Actual Sampled Time", "ms",
+										 sampled_time_ms, 3, es);
+				}
 				ExplainPropertyFloat("Actual Rows", NULL, rows, 0, es);
 				ExplainPropertyFloat("Actual Loops", NULL, nloops, 0, es);
 			}
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 2c2b3a8874..55d4ed9a05 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -333,7 +333,11 @@ standard_ExecutorRun(QueryDesc *queryDesc,
 
 	/* Allow instrumentation of Executor overall runtime */
 	if (queryDesc->totaltime)
+	{
+		if (queryDesc->instrument_options & INSTRUMENT_TIMER_SAMPLING)
+			InstrStartSampling(queryDesc->sample_freq_hz);
 		InstrStartNode(queryDesc->totaltime);
+	}
 
 	/*
 	 * extract information from the query descriptor and the query feature.
@@ -379,7 +383,11 @@ standard_ExecutorRun(QueryDesc *queryDesc,
 		dest->rShutdown(dest);
 
 	if (queryDesc->totaltime)
+	{
 		InstrStopNode(queryDesc->totaltime, estate->es_processed);
+		if (queryDesc->instrument_options & INSTRUMENT_TIMER_SAMPLING)
+			InstrStopSampling();
+	}
 
 	MemoryContextSwitchTo(oldcontext);
 }
@@ -429,7 +437,11 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
 
 	/* Allow instrumentation of Executor overall runtime */
 	if (queryDesc->totaltime)
+	{
+		if (queryDesc->instrument_options & INSTRUMENT_TIMER_SAMPLING)
+			InstrStartSampling(queryDesc->sample_freq_hz);
 		InstrStartNode(queryDesc->totaltime);
+	}
 
 	/* Run ModifyTable nodes to completion */
 	ExecPostprocessPlan(estate);
@@ -439,7 +451,11 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
 		AfterTriggerEndQuery(estate);
 
 	if (queryDesc->totaltime)
+	{
 		InstrStopNode(queryDesc->totaltime, 0);
+		if (queryDesc->instrument_options & INSTRUMENT_TIMER_SAMPLING)
+			InstrStopSampling();
+	}
 
 	MemoryContextSwitchTo(oldcontext);
 
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index a5b8e43ec5..21d1eab908 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -1296,6 +1296,29 @@ ExecParallelReportInstrumentation(PlanState *planstate,
 								 instrumentation);
 }
 
+/*
+ * Normalize sampled timing information collected for EXPLAIN ANALYZE,
+ * based on the parallel worker's executor total time and total sampled time.
+ *
+ * For non-parallel cases this is done when printing each plan node, but for
+ * parallel query we need to do this normalization ahead of time, to avoid
+ * passing a bunch of extra information back to the leader process.
+ */
+static bool
+ExecParallelNormalizeSampledTiming(PlanState *planstate,
+								   Instrumentation *worker_totaltime)
+{
+	if (worker_totaltime->sampled_total != 0)
+	{
+		/* Sampling undercounts time, scale per-node sampled time based on actual full execution time */
+		double sampled_pct = (double) planstate->instrument->sampled_total / worker_totaltime->sampled_total;
+		planstate->instrument->sampled_total = (uint64) (worker_totaltime->total * sampled_pct * 1000000000.0);
+	}
+
+	return planstate_tree_walker(planstate, ExecParallelNormalizeSampledTiming,
+								 worker_totaltime);
+}
+
 /*
  * Initialize the PlanState and its descendants with the information
  * retrieved from shared memory.  This has to be done once the PlanState
@@ -1419,6 +1442,13 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
 										 true);
 	queryDesc = ExecParallelGetQueryDesc(toc, receiver, instrument_options);
 
+	/* Set up instrumentation for sampling based EXPLAIN ANALYZE, if requested */
+	if (instrument_options & INSTRUMENT_TIMER_SAMPLING)
+	{
+		queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+		queryDesc->sample_freq_hz = 1000; // TODO: Pass the correct value here
+	}
+
 	/* Setting debug_query_string for individual workers */
 	debug_query_string = queryDesc->sourceText;
 
@@ -1470,6 +1500,14 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
 	/* Shut down the executor */
 	ExecutorFinish(queryDesc);
 
+	/* Normalize sampled time values whilst we have the query descriptor */
+	if (instrument_options & INSTRUMENT_TIMER_SAMPLING)
+	{
+		InstrEndLoop(queryDesc->totaltime);
+		ExecParallelNormalizeSampledTiming(queryDesc->planstate,
+										   queryDesc->totaltime);
+	}
+
 	/* Report buffer/WAL usage during parallel execution. */
 	buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
 	wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index ceff4727d4..dba321bc1b 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -16,15 +16,96 @@
 #include <unistd.h>
 
 #include "executor/instrument.h"
+#include "nodes/pg_list.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+#include "utils/memutils.h"
 
 BufferUsage pgBufferUsage;
 static BufferUsage save_pgBufferUsage;
 WalUsage	pgWalUsage;
 static WalUsage save_pgWalUsage;
 
+volatile uint64 last_sampled_time;
+static List *sample_rate_stack;
+
 static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
 static void WalUsageAdd(WalUsage *dst, WalUsage *add);
 
+/*
+ * Update the last sampled timestamp
+ *
+ * NB: Runs inside a signal handler, be careful.
+ */
+void
+InstrumentSamplingTimeoutHandler(void)
+{
+	instr_time now;
+	INSTR_TIME_SET_CURRENT(now);
+	last_sampled_time = INSTR_TIME_GET_NANOSEC(now);
+}
+
+static void
+StartSamplingTimeout(int sample_rate_hz, bool disable_old_timeout)
+{
+	int timeout_delay_ms = 1000 / sample_rate_hz;
+	TimestampTz fin_time = TimestampTzPlusMilliseconds(GetCurrentTimestamp(), timeout_delay_ms);
+	instr_time now;
+
+	if (disable_old_timeout)
+		disable_timeout(INSTRUMENT_SAMPLING_TIMEOUT, false);
+
+	INSTR_TIME_SET_CURRENT(now);
+	last_sampled_time = INSTR_TIME_GET_NANOSEC(now);
+
+	enable_timeout_every(INSTRUMENT_SAMPLING_TIMEOUT, fin_time, timeout_delay_ms);
+}
+
+void
+InstrStartSampling(int sample_rate_hz)
+{
+	MemoryContext old_ctx;
+
+	Assert(sample_rate_hz > 0);
+	Assert(sample_rate_hz <= 1000);
+
+	/* In case of errors, a previous timeout may have been stopped without us knowing */
+	if (sample_rate_stack != NIL && !get_timeout_active(INSTRUMENT_SAMPLING_TIMEOUT))
+	{
+		list_free(sample_rate_stack);
+		sample_rate_stack = NIL;
+	}
+
+	if (sample_rate_stack == NIL)
+		StartSamplingTimeout(sample_rate_hz, false);
+	else if (sample_rate_hz > llast_int(sample_rate_stack))
+		/* Reset timeout if a higher sampling frequency is requested */
+		StartSamplingTimeout(sample_rate_hz, true);
+	else
+		sample_rate_hz = llast_int(sample_rate_stack);
+
+	/* Keep sample rate so we can reduce the frequency or stop the timeout */
+	old_ctx = MemoryContextSwitchTo(TopMemoryContext);
+	sample_rate_stack = lappend_int(sample_rate_stack, sample_rate_hz);
+	MemoryContextSwitchTo(old_ctx);
+}
+
+void
+InstrStopSampling()
+{
+	int old_sample_rate_hz;
+
+	Assert(sample_rate_stack != NIL);
+
+	old_sample_rate_hz = llast_int(sample_rate_stack);
+	sample_rate_stack = list_delete_last(sample_rate_stack);
+
+	if (sample_rate_stack == NIL)
+		disable_timeout(INSTRUMENT_SAMPLING_TIMEOUT, false);
+	else if (old_sample_rate_hz > llast_int(sample_rate_stack))
+		/* Reset timeout if we're returning to a lower frequency */
+		StartSamplingTimeout(llast_int(sample_rate_stack), true);
+}
 
 /* Allocate new instrumentation structure(s) */
 Instrumentation *
@@ -77,6 +158,9 @@ InstrStartNode(Instrumentation *instr)
 
 	if (instr->need_walusage)
 		instr->walusage_start = pgWalUsage;
+
+	/* Save sampled start time unconditionally (this is very cheap and not worth a branch) */
+	instr->sampled_starttime = last_sampled_time;
 }
 
 /* Exit from a plan node */
@@ -125,6 +209,9 @@ InstrStopNode(Instrumentation *instr, double nTuples)
 		if (instr->async_mode && save_tuplecount < 1.0)
 			instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter);
 	}
+
+	/* Calculate sampled time unconditionally (this is very cheap and not worth a branch) */
+	instr->sampled_total += last_sampled_time - instr->sampled_starttime;
 }
 
 /* Update tuple count */
@@ -193,6 +280,8 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add)
 
 	if (dst->need_walusage)
 		WalUsageAdd(&dst->walusage, &add->walusage);
+
+	dst->sampled_total += add->sampled_total;
 }
 
 /* note current values during parallel executor startup */
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index a990c833c5..f469a41a7b 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -760,6 +760,8 @@ InitPostgres(const char *in_dbname, Oid dboid,
 		RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
 		RegisterTimeout(IDLE_STATS_UPDATE_TIMEOUT,
 						IdleStatsUpdateTimeoutHandler);
+		RegisterTimeout(INSTRUMENT_SAMPLING_TIMEOUT,
+						InstrumentSamplingTimeoutHandler);
 	}
 
 	/*
diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h
index 9ebde089ae..29b0651f34 100644
--- a/src/include/commands/explain.h
+++ b/src/include/commands/explain.h
@@ -44,6 +44,8 @@ typedef struct ExplainState
 	bool		buffers;		/* print buffer usage */
 	bool		wal;			/* print WAL usage */
 	bool		timing;			/* print detailed node timing */
+	bool		sampling;		/* print per-node timing based on sampling */
+	uint32		sample_freq_hz; /* frequency of sampling for per-node timing */
 	bool		summary;		/* print total planning and execution timing */
 	bool		settings;		/* print modified settings */
 	ExplainFormat format;		/* output format */
@@ -57,6 +59,8 @@ typedef struct ExplainState
 	List	   *deparse_cxt;	/* context list for deparsing expressions */
 	Bitmapset  *printed_subplans;	/* ids of SubPlans we've printed */
 	bool		hide_workers;	/* set if we find an invisible Gather */
+	/* state for the whole query execution */
+	struct Instrumentation *totaltime;	/* totaltime from query descriptor */
 	/* state related to the current plan node */
 	ExplainWorkersState *workers_state; /* needed if parallel plan */
 } ExplainState;
diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h
index e79e2c001f..6e1bb35c3c 100644
--- a/src/include/executor/execdesc.h
+++ b/src/include/executor/execdesc.h
@@ -51,8 +51,9 @@ typedef struct QueryDesc
 	/* This field is set by ExecutorRun */
 	bool		already_executed;	/* true if previously executed */
 
-	/* This is always set NULL by the core system, but plugins can change it */
-	struct Instrumentation *totaltime;	/* total time spent in ExecutorRun */
+	/* This is used by sampling mode for EXPLAIN ANALYZE timings, and plugins */
+	struct Instrumentation *totaltime;	/* total time spent in ExecutorRun and ExecutorFinish */
+	int sample_freq_hz;	/* frequency of sampling (if enabled) */
 } QueryDesc;
 
 /* in pquery.c */
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 2945cce3a9..e08941023a 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -60,6 +60,7 @@ typedef enum InstrumentOption
 	INSTRUMENT_BUFFERS = 1 << 1,	/* needs buffer usage */
 	INSTRUMENT_ROWS = 1 << 2,	/* needs row count */
 	INSTRUMENT_WAL = 1 << 3,	/* needs WAL usage */
+	INSTRUMENT_TIMER_SAMPLING = 1 << 4,	/* needs timer based on sampling */
 	INSTRUMENT_ALL = PG_INT32_MAX
 } InstrumentOption;
 
@@ -78,6 +79,7 @@ typedef struct Instrumentation
 	double		tuplecount;		/* # of tuples emitted so far this cycle */
 	BufferUsage bufusage_start; /* buffer usage at start */
 	WalUsage	walusage_start; /* WAL usage at start */
+	uint64		sampled_starttime; /* sampled start time of node */
 	/* Accumulated statistics across all completed cycles: */
 	double		startup;		/* total startup time (in seconds) */
 	double		total;			/* total time (in seconds) */
@@ -88,6 +90,7 @@ typedef struct Instrumentation
 	double		nfiltered2;		/* # of tuples removed by "other" quals */
 	BufferUsage bufusage;		/* total buffer usage */
 	WalUsage	walusage;		/* total WAL usage */
+	uint64		sampled_total;  /* sampled total time */
 } Instrumentation;
 
 typedef struct WorkerInstrumentation
@@ -99,6 +102,10 @@ typedef struct WorkerInstrumentation
 extern PGDLLIMPORT BufferUsage pgBufferUsage;
 extern PGDLLIMPORT WalUsage pgWalUsage;
 
+extern void InstrStartSampling(int sample_rate_hz);
+extern void InstrStopSampling();
+extern void InstrumentSamplingTimeoutHandler(void);
+
 extern Instrumentation *InstrAlloc(int n, int instrument_options,
 								   bool async_mode);
 extern void InstrInit(Instrumentation *instr, int instrument_options);
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 22bcf3d288..d0f60d91c1 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -138,6 +138,9 @@ typedef struct timespec instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	(((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) ((t).tv_nsec / 1000))
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(((uint64) (t).tv_sec * (uint64) 1000000000) + (uint64) ((t).tv_nsec))
+
 #else							/* WIN32 */
 
 /* Use QueryPerformanceCounter() */
@@ -168,6 +171,9 @@ typedef LARGE_INTEGER instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	((uint64) (((double) (t).QuadPart * 1000000.0) / GetTimerFrequency()))
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	((uint64) (((double) (t).QuadPart * 1000000000.0) / GetTimerFrequency()))
+
 static inline double
 GetTimerFrequency(void)
 {
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index c068986d09..49d4c214dd 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -35,6 +35,7 @@ typedef enum TimeoutId
 	IDLE_STATS_UPDATE_TIMEOUT,
 	CLIENT_CONNECTION_CHECK_TIMEOUT,
 	STARTUP_PROGRESS_TIMEOUT,
+	INSTRUMENT_SAMPLING_TIMEOUT,
 	/* First user-definable timeout reason */
 	USER_TIMEOUT,
 	/* Maximum number of timeout reasons */
-- 
2.34.0

