From 5228ab37a62dfd923f8e60529b5d64c457e26739 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <lukas@fittl.com>
Date: Sun, 1 Jan 2023 16:23:18 -0800
Subject: [PATCH v2] Add TIMING SAMPLING option for EXPLAIN ANALYZE

This introduces a new mode, TIMING SAMPLING, as a lower overhead alternative to
TIMING ON for EXPLAIN ANALYZE and auto_explain.

Instead of measuring the current time when entering and exiting each plan node
(which can get expensive even on modern systems, e.g. when doing many loops),
the sampling timing mode measures the time at a fixed recurring interval using
a wall clock timer.

The sampling frequency can be adjusted from 1 to 1000 Hz by passing the SAMPLEFREQ
option to EXPLAIN, and defaults to 1000 Hz (1ms). Higher frequencies are
technically possible but currently not supported by the Postgres timeout API.

The sampled wall clock time is cached and used by the the plan node instrumentation
to attribute how much sampled time passed whilst a plan node was executed.

Since sampling is likely to undercount the actual time taken, the sampled time
values are scaled to the total execution time of the query (or parallel query
worker), before being displayed.

Additionally, this adds support for sampling based timing to auto_explain by
introducing the new "auto_explain.log_timing = sampling" setting, and
"auto_explain.log_timing_samplefreq" setting to set the frequency.
---
 contrib/auto_explain/auto_explain.c   |  48 +++++++++--
 src/backend/commands/explain.c        | 113 +++++++++++++++++++++++++-
 src/backend/executor/execMain.c       |  18 ++++
 src/backend/executor/execParallel.c   |  40 +++++++++
 src/backend/executor/instrument.c     | 104 ++++++++++++++++++++++++
 src/backend/tcop/pquery.c             |   3 +
 src/backend/utils/init/postinit.c     |   2 +
 src/include/commands/explain.h        |   4 +
 src/include/executor/execdesc.h       |   5 +-
 src/include/executor/instrument.h     |   7 ++
 src/include/nodes/execnodes.h         |   1 +
 src/include/portability/instr_time.h  |   6 ++
 src/include/utils/timeout.h           |   1 +
 src/test/regress/expected/explain.out |  68 ++++++++++++++++
 src/test/regress/sql/explain.sql      |   8 ++
 15 files changed, 419 insertions(+), 9 deletions(-)

diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index c3ac27ae99..b887889571 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -24,6 +24,13 @@
 
 PG_MODULE_MAGIC;
 
+typedef enum
+{
+	LOG_TIMING_OFF,
+	LOG_TIMING_ON,
+	LOG_TIMING_SAMPLING
+} LogTimingType;
+
 /* GUC variables */
 static int	auto_explain_log_min_duration = -1; /* msec or -1 */
 static int	auto_explain_log_parameter_max_length = -1; /* bytes or -1 */
@@ -32,7 +39,8 @@ static bool auto_explain_log_verbose = false;
 static bool auto_explain_log_buffers = false;
 static bool auto_explain_log_wal = false;
 static bool auto_explain_log_triggers = false;
-static bool auto_explain_log_timing = true;
+static int	auto_explain_log_timing = LOG_TIMING_ON;
+static int	auto_explain_log_timing_samplefreq = 1000;
 static bool auto_explain_log_settings = false;
 static int	auto_explain_log_format = EXPLAIN_FORMAT_TEXT;
 static int	auto_explain_log_level = LOG;
@@ -61,6 +69,19 @@ static const struct config_enum_entry loglevel_options[] = {
 	{NULL, 0, false}
 };
 
+static const struct config_enum_entry log_timing_options[] = {
+	{"sampling", LOG_TIMING_SAMPLING, false},
+	{"on", LOG_TIMING_ON, false},
+	{"off", LOG_TIMING_OFF, false},
+	{"true", LOG_TIMING_ON, true},
+	{"false", LOG_TIMING_OFF, true},
+	{"yes", LOG_TIMING_ON, true},
+	{"no", LOG_TIMING_OFF, true},
+	{"1", LOG_TIMING_ON, true},
+	{"0", LOG_TIMING_OFF, true},
+	{NULL, 0, false}
+};
+
 /* Current nesting depth of ExecutorRun calls */
 static int	nesting_level = 0;
 
@@ -218,17 +239,30 @@ _PG_init(void)
 							 NULL,
 							 NULL);
 
-	DefineCustomBoolVariable("auto_explain.log_timing",
+	DefineCustomEnumVariable("auto_explain.log_timing",
 							 "Collect timing data, not just row counts.",
 							 NULL,
 							 &auto_explain_log_timing,
-							 true,
+							 LOG_TIMING_ON,
+							 log_timing_options,
 							 PGC_SUSET,
 							 0,
 							 NULL,
 							 NULL,
 							 NULL);
 
+	DefineCustomIntVariable("auto_explain.log_timing_samplefreq",
+							"Frequency for sampling-based timing, if used.",
+							"Number of sampling ticks per second (Hz).",
+							&auto_explain_log_timing_samplefreq,
+							1000,
+							1, 1000,
+							PGC_SUSET,
+							0,
+							NULL,
+							NULL,
+							NULL);
+
 	DefineCustomRealVariable("auto_explain.sample_rate",
 							 "Fraction of queries to process.",
 							 NULL,
@@ -284,14 +318,17 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
 		/* Enable per-node instrumentation iff log_analyze is required. */
 		if (auto_explain_log_analyze && (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
 		{
-			if (auto_explain_log_timing)
+			if (auto_explain_log_timing == LOG_TIMING_ON)
 				queryDesc->instrument_options |= INSTRUMENT_TIMER;
+			else if (auto_explain_log_timing == LOG_TIMING_SAMPLING)
+				queryDesc->instrument_options |= INSTRUMENT_TIMER_SAMPLING | INSTRUMENT_ROWS;
 			else
 				queryDesc->instrument_options |= INSTRUMENT_ROWS;
 			if (auto_explain_log_buffers)
 				queryDesc->instrument_options |= INSTRUMENT_BUFFERS;
 			if (auto_explain_log_wal)
 				queryDesc->instrument_options |= INSTRUMENT_WAL;
+			queryDesc->sample_freq_hz = auto_explain_log_timing_samplefreq;
 		}
 	}
 
@@ -394,7 +431,8 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
 			es->verbose = auto_explain_log_verbose;
 			es->buffers = (es->analyze && auto_explain_log_buffers);
 			es->wal = (es->analyze && auto_explain_log_wal);
-			es->timing = (es->analyze && auto_explain_log_timing);
+			es->timing = (es->analyze && auto_explain_log_timing == LOG_TIMING_ON);
+			es->sampling = (es->analyze && auto_explain_log_timing == LOG_TIMING_SAMPLING);
 			es->summary = es->analyze;
 			es->format = auto_explain_log_format;
 			es->settings = auto_explain_log_settings;
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index e4621ef8d6..6c07e88efb 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -12,6 +12,7 @@
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
+#include <sys/time.h>
 
 #include "access/xact.h"
 #include "catalog/pg_type.h"
@@ -30,12 +31,14 @@
 #include "storage/bufmgr.h"
 #include "tcop/tcopprot.h"
 #include "utils/builtins.h"
+#include "utils/float.h"
 #include "utils/guc_tables.h"
 #include "utils/json.h"
 #include "utils/lsyscache.h"
 #include "utils/rel.h"
 #include "utils/ruleutils.h"
 #include "utils/snapmgr.h"
+#include "utils/timeout.h"
 #include "utils/tuplesort.h"
 #include "utils/typcache.h"
 #include "utils/xml.h"
@@ -192,8 +195,41 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt,
 			es->settings = defGetBoolean(opt);
 		else if (strcmp(opt->defname, "timing") == 0)
 		{
+			char	   *p = opt->arg != NULL ? defGetString(opt) : NULL;
 			timing_set = true;
-			es->timing = defGetBoolean(opt);
+			if (p == NULL || pg_strcasecmp(p, "1") == 0 || pg_strcasecmp(p, "true") == 0 || pg_strcasecmp(p, "on") == 0)
+			{
+				es->timing = true;
+				es->sampling = false;
+			}
+			else if (pg_strcasecmp(p, "0") == 0 || pg_strcasecmp(p, "false") == 0 || pg_strcasecmp(p, "off") == 0)
+			{
+				es->timing = false;
+				es->sampling = false;
+			}
+			else if (pg_strcasecmp(p, "sampling") == 0)
+			{
+				es->timing = false;
+				es->sampling = true;
+			}
+			else
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("unrecognized value for EXPLAIN option \"%s\": \"%s\"",
+								opt->defname, p),
+						 parser_errposition(pstate, opt->location)));
+			}
+		}
+		else if (strcmp(opt->defname, "samplefreq") == 0)
+		{
+			int32 p = defGetInt32(opt);
+			if (p < 1 || p > 1000)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("EXPLAIN option samplefreq requires value between %d and %d", 1, 1000),
+						 parser_errposition(pstate, opt->location)));
+			es->sample_freq_hz = (uint32) p;
 		}
 		else if (strcmp(opt->defname, "summary") == 0)
 		{
@@ -236,11 +272,20 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt,
 	es->timing = (timing_set) ? es->timing : es->analyze;
 
 	/* check that timing is used with EXPLAIN ANALYZE */
-	if (es->timing && !es->analyze)
+	if ((es->timing || es->sampling) && !es->analyze)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("EXPLAIN option TIMING requires ANALYZE")));
 
+	/* check that sampling is enabled when sample frequency is passed */
+	if (es->sample_freq_hz != 0 && !es->sampling)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("EXPLAIN option SAMPLEFREQ requires TIMING SAMPLING")));
+
+	/* if sampling frequency was not set explicitly, set default value */
+	es->sample_freq_hz = es->sample_freq_hz != 0 ? es->sample_freq_hz : 1000;
+
 	/* if the summary was not set explicitly, set default value */
 	es->summary = (summary_set) ? es->summary : es->analyze;
 
@@ -531,6 +576,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
 
 	if (es->analyze && es->timing)
 		instrument_option |= INSTRUMENT_TIMER;
+	else if (es->analyze && es->sampling)
+		instrument_option |= INSTRUMENT_TIMER_SAMPLING | INSTRUMENT_ROWS;
 	else if (es->analyze)
 		instrument_option |= INSTRUMENT_ROWS;
 
@@ -578,6 +625,21 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
 	/* call ExecutorStart to prepare the plan for execution */
 	ExecutorStart(queryDesc, eflags);
 
+	/*
+	 * If sampling, we initialize executor time tracking for the query,
+	 * in order to scale individual sampled times to the total accurate
+	 * clock time spent for running all plan nodes in ExecutorRun + time
+	 * spent in ExecutorFinish. This also keeps the total sampled time.
+	 *
+	 * Note this is separate from "Execution Time" shown by EXPLAIN, which
+	 * includes ExecutorStart, ExecutorEnd and the EXPLAIN print functions.
+	 */
+	if (es->sampling)
+	{
+		queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+		queryDesc->sample_freq_hz = es->sample_freq_hz;
+	}
+
 	/* Execute the plan for statistics if asked for */
 	if (es->analyze)
 	{
@@ -595,6 +657,10 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
 		/* run cleanup too */
 		ExecutorFinish(queryDesc);
 
+		/* ensure total time gets set, if needed */
+		if (es->sampling)
+			InstrEndLoop(queryDesc->totaltime);
+
 		/* We can't run ExecutorEnd 'till we're done printing the stats... */
 		totaltime += elapsed_time(&starttime);
 	}
@@ -768,6 +834,7 @@ ExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc)
 	es->deparse_cxt = deparse_context_for_plan_tree(queryDesc->plannedstmt,
 													es->rtable_names);
 	es->printed_subplans = NULL;
+	es->totaltime = queryDesc->totaltime;
 
 	/*
 	 * Sometimes we mark a Gather node as "invisible", which means that it's
@@ -1642,6 +1709,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		double		startup_ms = 1000.0 * planstate->instrument->startup / nloops;
 		double		total_ms = 1000.0 * planstate->instrument->total / nloops;
 		double		rows = planstate->instrument->ntuples / nloops;
+		double		sampled_time_ms;
+
+		if (es->totaltime != NULL && es->totaltime->sampled_total != 0)
+		{
+			/* Sampling undercounts time, scale per-node sampled time based on actual full execution time */
+			double sampled_pct = (double) planstate->instrument->sampled_total / es->totaltime->sampled_total;
+			sampled_time_ms = 1000.0 * es->totaltime->total * sampled_pct / nloops;
+		}
+		else
+		{
+			sampled_time_ms = get_float8_nan();
+		}
 
 		if (es->format == EXPLAIN_FORMAT_TEXT)
 		{
@@ -1649,6 +1728,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 				appendStringInfo(es->str,
 								 " (actual time=%.3f..%.3f rows=%.0f loops=%.0f)",
 								 startup_ms, total_ms, rows, nloops);
+			else if (es->sampling)
+				appendStringInfo(es->str,
+								 " (actual sampled time=%.3f rows=%.0f loops=%.0f)",
+								 sampled_time_ms, rows, nloops);
 			else
 				appendStringInfo(es->str,
 								 " (actual rows=%.0f loops=%.0f)",
@@ -1663,6 +1746,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
 				ExplainPropertyFloat("Actual Total Time", "ms", total_ms,
 									 3, es);
 			}
+			else if (es->sampling)
+			{
+				ExplainPropertyFloat("Actual Sampled Time", "ms", sampled_time_ms,
+									 3, es);
+			}
 			ExplainPropertyFloat("Actual Rows", NULL, rows, 0, es);
 			ExplainPropertyFloat("Actual Loops", NULL, nloops, 0, es);
 		}
@@ -1678,6 +1766,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 				ExplainPropertyFloat("Actual Startup Time", "ms", 0.0, 3, es);
 				ExplainPropertyFloat("Actual Total Time", "ms", 0.0, 3, es);
 			}
+			else if (es->sampling)
+			{
+				ExplainPropertyFloat("Actual Sampled Time", "ms", 0.0, 3, es);
+			}
 			ExplainPropertyFloat("Actual Rows", NULL, 0.0, 0, es);
 			ExplainPropertyFloat("Actual Loops", NULL, 0.0, 0, es);
 		}
@@ -1699,6 +1791,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			double		startup_ms;
 			double		total_ms;
 			double		rows;
+			double		sampled_time_ms = 0.0;
 
 			if (nloops <= 0)
 				continue;
@@ -1706,6 +1799,13 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			total_ms = 1000.0 * instrument->total / nloops;
 			rows = instrument->ntuples / nloops;
 
+			/*
+			 * For parallel query, normalization of sampled times with the total
+			 * time is done in each worker, see ExecParallelNormalizeSampledTiming
+			 * */
+			if (es->sampling)
+				sampled_time_ms = (instrument->sampled_total / 1000000.0) / nloops;
+
 			ExplainOpenWorker(n, es);
 
 			if (es->format == EXPLAIN_FORMAT_TEXT)
@@ -1715,6 +1815,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 					appendStringInfo(es->str,
 									 "actual time=%.3f..%.3f rows=%.0f loops=%.0f\n",
 									 startup_ms, total_ms, rows, nloops);
+				else if (es->sampling)
+					appendStringInfo(es->str,
+								 	 "actual sampled time=%.3f rows=%.0f loops=%.0f\n",
+								 	 sampled_time_ms, rows, nloops);
 				else
 					appendStringInfo(es->str,
 									 "actual rows=%.0f loops=%.0f\n",
@@ -1729,6 +1833,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
 					ExplainPropertyFloat("Actual Total Time", "ms",
 										 total_ms, 3, es);
 				}
+				else if (es->sampling)
+				{
+					ExplainPropertyFloat("Actual Sampled Time", "ms",
+										 sampled_time_ms, 3, es);
+				}
 				ExplainPropertyFloat("Actual Rows", NULL, rows, 0, es);
 				ExplainPropertyFloat("Actual Loops", NULL, nloops, 0, es);
 			}
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index a5115b9c1f..0f7badf56d 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -251,6 +251,7 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
 	estate->es_crosscheck_snapshot = RegisterSnapshot(queryDesc->crosscheck_snapshot);
 	estate->es_top_eflags = eflags;
 	estate->es_instrument = queryDesc->instrument_options;
+	estate->es_sample_freq_hz = queryDesc->sample_freq_hz;
 	estate->es_jit_flags = queryDesc->plannedstmt->jitFlags;
 
 	/*
@@ -333,7 +334,11 @@ standard_ExecutorRun(QueryDesc *queryDesc,
 
 	/* Allow instrumentation of Executor overall runtime */
 	if (queryDesc->totaltime)
+	{
+		if (queryDesc->instrument_options & INSTRUMENT_TIMER_SAMPLING)
+			InstrStartSampling(queryDesc->sample_freq_hz);
 		InstrStartNode(queryDesc->totaltime);
+	}
 
 	/*
 	 * extract information from the query descriptor and the query feature.
@@ -379,7 +384,11 @@ standard_ExecutorRun(QueryDesc *queryDesc,
 		dest->rShutdown(dest);
 
 	if (queryDesc->totaltime)
+	{
 		InstrStopNode(queryDesc->totaltime, estate->es_processed);
+		if (queryDesc->instrument_options & INSTRUMENT_TIMER_SAMPLING)
+			InstrStopSampling();
+	}
 
 	MemoryContextSwitchTo(oldcontext);
 }
@@ -429,7 +438,11 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
 
 	/* Allow instrumentation of Executor overall runtime */
 	if (queryDesc->totaltime)
+	{
+		if (queryDesc->instrument_options & INSTRUMENT_TIMER_SAMPLING)
+			InstrStartSampling(queryDesc->sample_freq_hz);
 		InstrStartNode(queryDesc->totaltime);
+	}
 
 	/* Run ModifyTable nodes to completion */
 	ExecPostprocessPlan(estate);
@@ -439,7 +452,11 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
 		AfterTriggerEndQuery(estate);
 
 	if (queryDesc->totaltime)
+	{
 		InstrStopNode(queryDesc->totaltime, 0);
+		if (queryDesc->instrument_options & INSTRUMENT_TIMER_SAMPLING)
+			InstrStopSampling();
+	}
 
 	MemoryContextSwitchTo(oldcontext);
 
@@ -2819,6 +2836,7 @@ EvalPlanQualStart(EPQState *epqstate, Plan *planTree)
 	/* es_trig_target_relations must NOT be copied */
 	rcestate->es_top_eflags = parentestate->es_top_eflags;
 	rcestate->es_instrument = parentestate->es_instrument;
+	rcestate->es_sample_freq_hz = parentestate->es_sample_freq_hz;
 	/* es_auxmodifytables must NOT be copied */
 
 	/*
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index aa3f283453..09dfe7a082 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -78,6 +78,7 @@ typedef struct FixedParallelExecutorState
 	dsa_pointer param_exec;
 	int			eflags;
 	int			jit_flags;
+	int			sample_freq_hz;	/* frequency of sampling mode for EXPLAIN ANALYZE timings, if enabled */
 } FixedParallelExecutorState;
 
 /*
@@ -740,6 +741,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
 	fpes->param_exec = InvalidDsaPointer;
 	fpes->eflags = estate->es_top_eflags;
 	fpes->jit_flags = estate->es_jit_flags;
+	fpes->sample_freq_hz = estate->es_sample_freq_hz;
 	shm_toc_insert(pcxt->toc, PARALLEL_KEY_EXECUTOR_FIXED, fpes);
 
 	/* Store query string */
@@ -1296,6 +1298,29 @@ ExecParallelReportInstrumentation(PlanState *planstate,
 								 instrumentation);
 }
 
+/*
+ * Normalize sampled timing information collected for EXPLAIN ANALYZE,
+ * based on the parallel worker's executor total time and total sampled time.
+ *
+ * For non-parallel cases this is done when printing each plan node, but for
+ * parallel query we need to do this normalization ahead of time, to avoid
+ * passing a bunch of extra information back to the leader process.
+ */
+static bool
+ExecParallelNormalizeSampledTiming(PlanState *planstate,
+								   Instrumentation *worker_totaltime)
+{
+	if (worker_totaltime->sampled_total != 0)
+	{
+		/* Sampling undercounts time, scale per-node sampled time based on actual full execution time */
+		double sampled_pct = (double) planstate->instrument->sampled_total / worker_totaltime->sampled_total;
+		planstate->instrument->sampled_total = (uint64) (worker_totaltime->total * sampled_pct * 1000000000.0);
+	}
+
+	return planstate_tree_walker(planstate, ExecParallelNormalizeSampledTiming,
+								 worker_totaltime);
+}
+
 /*
  * Initialize the PlanState and its descendants with the information
  * retrieved from shared memory.  This has to be done once the PlanState
@@ -1419,6 +1444,13 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
 										 true);
 	queryDesc = ExecParallelGetQueryDesc(toc, receiver, instrument_options);
 
+	/* Set up instrumentation for sampling based EXPLAIN ANALYZE, if requested */
+	if (instrument_options & INSTRUMENT_TIMER_SAMPLING)
+	{
+		queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+		queryDesc->sample_freq_hz = fpes->sample_freq_hz;
+	}
+
 	/* Setting debug_query_string for individual workers */
 	debug_query_string = queryDesc->sourceText;
 
@@ -1470,6 +1502,14 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
 	/* Shut down the executor */
 	ExecutorFinish(queryDesc);
 
+	/* Normalize sampled time values whilst we have the query descriptor */
+	if (instrument_options & INSTRUMENT_TIMER_SAMPLING)
+	{
+		InstrEndLoop(queryDesc->totaltime);
+		ExecParallelNormalizeSampledTiming(queryDesc->planstate,
+										   queryDesc->totaltime);
+	}
+
 	/* Report buffer/WAL usage during parallel execution. */
 	buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
 	wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index ee78a5749d..668cde5ee4 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -16,15 +16,111 @@
 #include <unistd.h>
 
 #include "executor/instrument.h"
+#include "nodes/pg_list.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+#include "utils/memutils.h"
 
 BufferUsage pgBufferUsage;
 static BufferUsage save_pgBufferUsage;
 WalUsage	pgWalUsage;
 static WalUsage save_pgWalUsage;
 
+volatile uint64 last_sampled_time;
+static List *sample_rate_stack;
+
 static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
 static void WalUsageAdd(WalUsage *dst, WalUsage *add);
 
+/*
+ * Update the last sampled timestamp
+ *
+ * NB: Runs inside a signal handler, be careful.
+ */
+void
+InstrumentSamplingTimeoutHandler(void)
+{
+	instr_time now;
+	INSTR_TIME_SET_CURRENT(now);
+	last_sampled_time = INSTR_TIME_GET_NANOSEC(now);
+}
+
+static void
+StartSamplingTimeout(int sample_rate_hz, bool disable_old_timeout)
+{
+	int timeout_delay_ms = 1000 / sample_rate_hz;
+	TimestampTz fin_time = TimestampTzPlusMilliseconds(GetCurrentTimestamp(), timeout_delay_ms);
+	instr_time now;
+
+	if (disable_old_timeout)
+		disable_timeout(INSTRUMENT_SAMPLING_TIMEOUT, false);
+
+	INSTR_TIME_SET_CURRENT(now);
+	last_sampled_time = INSTR_TIME_GET_NANOSEC(now);
+
+	enable_timeout_every(INSTRUMENT_SAMPLING_TIMEOUT, fin_time, timeout_delay_ms);
+}
+
+/*
+ * Start sampling timing information with the given rate (or higher)
+ *
+ * Note this may be called multiple times in a nested manner, for example when
+ * a query calls a function which calls a query. Further, it is possible for
+ * different statement nesting levels to have different sampling rates. In such
+ * cases we pick the highest sampling rate of all currently in effect.
+ */
+void
+InstrStartSampling(int sample_rate_hz)
+{
+	MemoryContext old_ctx;
+
+	Assert(sample_rate_hz > 0);
+	Assert(sample_rate_hz <= 1000);
+
+	/* In case of errors, a previous timeout may have been stopped without us knowing */
+	if (sample_rate_stack != NIL && !get_timeout_active(INSTRUMENT_SAMPLING_TIMEOUT))
+	{
+		list_free(sample_rate_stack);
+		sample_rate_stack = NIL;
+	}
+
+	if (sample_rate_stack == NIL)
+		StartSamplingTimeout(sample_rate_hz, false);
+	else if (sample_rate_hz > llast_int(sample_rate_stack))
+		/* Reset timeout if a higher sampling frequency is requested */
+		StartSamplingTimeout(sample_rate_hz, true);
+	else
+		sample_rate_hz = llast_int(sample_rate_stack);
+
+	/* Keep sample rate so we can reduce the frequency or stop the timeout */
+	old_ctx = MemoryContextSwitchTo(TopMemoryContext);
+	sample_rate_stack = lappend_int(sample_rate_stack, sample_rate_hz);
+	MemoryContextSwitchTo(old_ctx);
+}
+
+/*
+ * Stop sampling timing information at the current rate
+ *
+ * When called in a nested manner this will go back to the lower sampling rate
+ * (if applicable), if this is the highest nesting level it will stop sampling
+ * timing altogether.
+ */
+void
+InstrStopSampling()
+{
+	int old_sample_rate_hz;
+
+	Assert(sample_rate_stack != NIL);
+
+	old_sample_rate_hz = llast_int(sample_rate_stack);
+	sample_rate_stack = list_delete_last(sample_rate_stack);
+
+	if (sample_rate_stack == NIL)
+		disable_timeout(INSTRUMENT_SAMPLING_TIMEOUT, false);
+	else if (old_sample_rate_hz > llast_int(sample_rate_stack))
+		/* Reset timeout if we're returning to a lower frequency */
+		StartSamplingTimeout(llast_int(sample_rate_stack), true);
+}
 
 /* Allocate new instrumentation structure(s) */
 Instrumentation *
@@ -77,6 +173,9 @@ InstrStartNode(Instrumentation *instr)
 
 	if (instr->need_walusage)
 		instr->walusage_start = pgWalUsage;
+
+	/* Save sampled start time unconditionally (this is very cheap and not worth a branch) */
+	instr->sampled_starttime = last_sampled_time;
 }
 
 /* Exit from a plan node */
@@ -125,6 +224,9 @@ InstrStopNode(Instrumentation *instr, double nTuples)
 		if (instr->async_mode && save_tuplecount < 1.0)
 			instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter);
 	}
+
+	/* Calculate sampled time unconditionally (this is very cheap and not worth a branch) */
+	instr->sampled_total += last_sampled_time - instr->sampled_starttime;
 }
 
 /* Update tuple count */
@@ -193,6 +295,8 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add)
 
 	if (dst->need_walusage)
 		WalUsageAdd(&dst->walusage, &add->walusage);
+
+	dst->sampled_total += add->sampled_total;
 }
 
 /* note current values during parallel executor startup */
diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c
index 5f0248acc5..78ee2db00e 100644
--- a/src/backend/tcop/pquery.c
+++ b/src/backend/tcop/pquery.c
@@ -90,7 +90,10 @@ CreateQueryDesc(PlannedStmt *plannedstmt,
 	qd->tupDesc = NULL;
 	qd->estate = NULL;
 	qd->planstate = NULL;
+
+	/* may be set later by EXPLAIN (ANALYZE, TIMING SAMPLING), or plugins */
 	qd->totaltime = NULL;
+	qd->sample_freq_hz = 0;
 
 	/* not yet executed */
 	qd->already_executed = false;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index ae5a85ed65..0b332efb36 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -760,6 +760,8 @@ InitPostgres(const char *in_dbname, Oid dboid,
 		RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
 		RegisterTimeout(IDLE_STATS_UPDATE_TIMEOUT,
 						IdleStatsUpdateTimeoutHandler);
+		RegisterTimeout(INSTRUMENT_SAMPLING_TIMEOUT,
+						InstrumentSamplingTimeoutHandler);
 	}
 
 	/*
diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h
index 7c1071ddd1..6b3d5e8145 100644
--- a/src/include/commands/explain.h
+++ b/src/include/commands/explain.h
@@ -44,6 +44,8 @@ typedef struct ExplainState
 	bool		buffers;		/* print buffer usage */
 	bool		wal;			/* print WAL usage */
 	bool		timing;			/* print detailed node timing */
+	bool		sampling;		/* print per-node timing based on sampling */
+	uint32		sample_freq_hz; /* frequency of sampling for per-node timing */
 	bool		summary;		/* print total planning and execution timing */
 	bool		settings;		/* print modified settings */
 	ExplainFormat format;		/* output format */
@@ -57,6 +59,8 @@ typedef struct ExplainState
 	List	   *deparse_cxt;	/* context list for deparsing expressions */
 	Bitmapset  *printed_subplans;	/* ids of SubPlans we've printed */
 	bool		hide_workers;	/* set if we find an invisible Gather */
+	/* state for the whole query execution */
+	struct Instrumentation *totaltime;	/* totaltime from query descriptor */
 	/* state related to the current plan node */
 	ExplainWorkersState *workers_state; /* needed if parallel plan */
 } ExplainState;
diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h
index af2bf36dfb..6d0c1b4f6f 100644
--- a/src/include/executor/execdesc.h
+++ b/src/include/executor/execdesc.h
@@ -51,8 +51,9 @@ typedef struct QueryDesc
 	/* This field is set by ExecutorRun */
 	bool		already_executed;	/* true if previously executed */
 
-	/* This is always set NULL by the core system, but plugins can change it */
-	struct Instrumentation *totaltime;	/* total time spent in ExecutorRun */
+	/* This is used by sampling mode for EXPLAIN ANALYZE timings, and plugins */
+	struct Instrumentation *totaltime;	/* total time spent in ExecutorRun and ExecutorFinish */
+	int sample_freq_hz;	/* frequency of sampling (if enabled) */
 } QueryDesc;
 
 /* in pquery.c */
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 87e5e2183b..e495952881 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -60,6 +60,7 @@ typedef enum InstrumentOption
 	INSTRUMENT_BUFFERS = 1 << 1,	/* needs buffer usage */
 	INSTRUMENT_ROWS = 1 << 2,	/* needs row count */
 	INSTRUMENT_WAL = 1 << 3,	/* needs WAL usage */
+	INSTRUMENT_TIMER_SAMPLING = 1 << 4,	/* needs timer based on sampling */
 	INSTRUMENT_ALL = PG_INT32_MAX
 } InstrumentOption;
 
@@ -78,6 +79,7 @@ typedef struct Instrumentation
 	double		tuplecount;		/* # of tuples emitted so far this cycle */
 	BufferUsage bufusage_start; /* buffer usage at start */
 	WalUsage	walusage_start; /* WAL usage at start */
+	uint64		sampled_starttime; /* sampled start time of node */
 	/* Accumulated statistics across all completed cycles: */
 	double		startup;		/* total startup time (in seconds) */
 	double		total;			/* total time (in seconds) */
@@ -88,6 +90,7 @@ typedef struct Instrumentation
 	double		nfiltered2;		/* # of tuples removed by "other" quals */
 	BufferUsage bufusage;		/* total buffer usage */
 	WalUsage	walusage;		/* total WAL usage */
+	uint64		sampled_total;  /* sampled total time */
 } Instrumentation;
 
 typedef struct WorkerInstrumentation
@@ -99,6 +102,10 @@ typedef struct WorkerInstrumentation
 extern PGDLLIMPORT BufferUsage pgBufferUsage;
 extern PGDLLIMPORT WalUsage pgWalUsage;
 
+extern void InstrStartSampling(int sample_rate_hz);
+extern void InstrStopSampling(void);
+extern void InstrumentSamplingTimeoutHandler(void);
+
 extern Instrumentation *InstrAlloc(int n, int instrument_options,
 								   bool async_mode);
 extern void InstrInit(Instrumentation *instr, int instrument_options);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 20f4c8b35f..1151397503 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -661,6 +661,7 @@ typedef struct EState
 
 	int			es_top_eflags;	/* eflags passed to ExecutorStart */
 	int			es_instrument;	/* OR of InstrumentOption flags */
+	int			es_sample_freq_hz; 	/* frequency of sampling mode for EXPLAIN ANALYZE timings, if enabled */
 	bool		es_finished;	/* true when ExecutorFinish is done */
 
 	List	   *es_exprcontexts;	/* List of ExprContexts within EState */
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 9ea1a68bd9..0a77d693b4 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -138,6 +138,9 @@ typedef struct timespec instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	(((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) ((t).tv_nsec / 1000))
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(((uint64) (t).tv_sec * (uint64) 1000000000) + (uint64) ((t).tv_nsec))
+
 #else							/* WIN32 */
 
 /* Use QueryPerformanceCounter() */
@@ -168,6 +171,9 @@ typedef LARGE_INTEGER instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	((uint64) (((double) (t).QuadPart * 1000000.0) / GetTimerFrequency()))
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	((uint64) (((double) (t).QuadPart * 1000000000.0) / GetTimerFrequency()))
+
 static inline double
 GetTimerFrequency(void)
 {
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index e561a1cde9..0bf4ecc5f8 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -35,6 +35,7 @@ typedef enum TimeoutId
 	IDLE_STATS_UPDATE_TIMEOUT,
 	CLIENT_CONNECTION_CHECK_TIMEOUT,
 	STARTUP_PROGRESS_TIMEOUT,
+	INSTRUMENT_SAMPLING_TIMEOUT,
 	/* First user-definable timeout reason */
 	USER_TIMEOUT,
 	/* Maximum number of timeout reasons */
diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index 48620edbc2..68b043941d 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -18,6 +18,8 @@ begin
     loop
         -- Replace any numeric word with just 'N'
         ln := regexp_replace(ln, '-?\m\d+\M', 'N', 'g');
+        -- Replace NaN with 'N.N' to support TIMING SAMPLING tests
+        ln := regexp_replace(ln, '-?\mNaN\M', 'N.N', 'g');
         -- In sort output, the above won't match units-suffixed numbers
         ln := regexp_replace(ln, '\m\d+kB', 'NkB', 'g');
         -- Ignore text-mode buffers output because it varies depending
@@ -42,6 +44,8 @@ begin
     loop
         -- Replace any numeric word with just '0'
         ln := regexp_replace(ln, '\m\d+\M', '0', 'g');
+        -- Replace NaN with '0.0' to support TIMING SAMPLING tests
+        ln := regexp_replace(ln, '\mNaN\M', '0.0', 'g');
         data := data || ln;
     end loop;
     return data::jsonb;
@@ -178,6 +182,70 @@ select explain_filter('explain (analyze, buffers, format yaml) select * from int
    Execution Time: N.N
 (1 row)
 
+select explain_filter('explain (analyze, timing sampling) select * from int8_tbl i8');
+                                          explain_filter                                          
+--------------------------------------------------------------------------------------------------
+ Seq Scan on int8_tbl i8  (cost=N.N..N.N rows=N width=N) (actual sampled time=N.N rows=N loops=N)
+ Planning Time: N.N ms
+ Execution Time: N.N ms
+(3 rows)
+
+select explain_filter('explain (analyze, timing sampling, samplefreq 10, format text) select * from int8_tbl i8');
+                                          explain_filter                                          
+--------------------------------------------------------------------------------------------------
+ Seq Scan on int8_tbl i8  (cost=N.N..N.N rows=N width=N) (actual sampled time=N.N rows=N loops=N)
+ Planning Time: N.N ms
+ Execution Time: N.N ms
+(3 rows)
+
+select explain_filter('explain (analyze, timing sampling, samplefreq 100, format xml) select * from int8_tbl i8');
+                    explain_filter                     
+-------------------------------------------------------
+ <explain xmlns="http://www.postgresql.org/N/explain">+
+   <Query>                                            +
+     <Plan>                                           +
+       <Node-Type>Seq Scan</Node-Type>                +
+       <Parallel-Aware>false</Parallel-Aware>         +
+       <Async-Capable>false</Async-Capable>           +
+       <Relation-Name>int8_tbl</Relation-Name>        +
+       <Alias>i8</Alias>                              +
+       <Startup-Cost>N.N</Startup-Cost>               +
+       <Total-Cost>N.N</Total-Cost>                   +
+       <Plan-Rows>N</Plan-Rows>                       +
+       <Plan-Width>N</Plan-Width>                     +
+       <Actual-Sampled-Time>N.N</Actual-Sampled-Time> +
+       <Actual-Rows>N</Actual-Rows>                   +
+       <Actual-Loops>N</Actual-Loops>                 +
+     </Plan>                                          +
+     <Planning-Time>N.N</Planning-Time>               +
+     <Triggers>                                       +
+     </Triggers>                                      +
+     <Execution-Time>N.N</Execution-Time>             +
+   </Query>                                           +
+ </explain>
+(1 row)
+
+select explain_filter('explain (analyze, timing sampling, samplefreq 1000, format yaml) select * from int8_tbl i8');
+        explain_filter         
+-------------------------------
+ - Plan:                      +
+     Node Type: "Seq Scan"    +
+     Parallel Aware: false    +
+     Async Capable: false     +
+     Relation Name: "int8_tbl"+
+     Alias: "i8"              +
+     Startup Cost: N.N        +
+     Total Cost: N.N          +
+     Plan Rows: N             +
+     Plan Width: N            +
+     Actual Sampled Time: N.N +
+     Actual Rows: N           +
+     Actual Loops: N          +
+   Planning Time: N.N         +
+   Triggers:                  +
+   Execution Time: N.N
+(1 row)
+
 select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
                      explain_filter                      
 ---------------------------------------------------------
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index ae3f7a308d..6eaf2d61aa 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -20,6 +20,8 @@ begin
     loop
         -- Replace any numeric word with just 'N'
         ln := regexp_replace(ln, '-?\m\d+\M', 'N', 'g');
+        -- Replace NaN with 'N.N' to support TIMING SAMPLING tests
+        ln := regexp_replace(ln, '-?\mNaN\M', 'N.N', 'g');
         -- In sort output, the above won't match units-suffixed numbers
         ln := regexp_replace(ln, '\m\d+kB', 'NkB', 'g');
         -- Ignore text-mode buffers output because it varies depending
@@ -45,6 +47,8 @@ begin
     loop
         -- Replace any numeric word with just '0'
         ln := regexp_replace(ln, '\m\d+\M', '0', 'g');
+        -- Replace NaN with '0.0' to support TIMING SAMPLING tests
+        ln := regexp_replace(ln, '\mNaN\M', '0.0', 'g');
         data := data || ln;
     end loop;
     return data::jsonb;
@@ -67,6 +71,10 @@ select explain_filter('explain (analyze, verbose) select * from int8_tbl i8');
 select explain_filter('explain (analyze, buffers, format text) select * from int8_tbl i8');
 select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
 select explain_filter('explain (analyze, buffers, format yaml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, timing sampling) select * from int8_tbl i8');
+select explain_filter('explain (analyze, timing sampling, samplefreq 10, format text) select * from int8_tbl i8');
+select explain_filter('explain (analyze, timing sampling, samplefreq 100, format xml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, timing sampling, samplefreq 1000, format yaml) select * from int8_tbl i8');
 select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
 select explain_filter('explain (buffers, format json) select * from int8_tbl i8');
 
-- 
2.34.0

