On 2025-Jul-27, Lukas Fittl wrote:

> See attached v11 (and moved to the PG19-2 commitfest), split into a new set
> of patches:

I rebased (but not reviewed) this patchset now that Michael committed
part of 0001, as seen in another thread.

Quickly looking at 0003, I wonder if adding a separate --fast switch to
pg_test_timing is really what we want.  Why not report both the fast and
legacy measurements in platforms that support both, instead?  If I were
a consultant trying to understand a customer's system, I would have to
ask them to run it twice just in case 'fast' is supported, and I don't
think that's very helpful.  Also, were the doc updates lost somehow, or
were they made irrelevant by other concurrent pg_test_timing
development?

Thanks

-- 
Álvaro Herrera        Breisgau, Deutschland  —  https://www.EnterpriseDB.com/
"Ninguna manada de bestias tiene una voz tan horrible como la humana" (Orual)
>From 3844daeee1f8eac0263f1421929812b4b04fad38 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <[email protected]>
Date: Sun, 27 Jul 2025 10:45:49 -0700
Subject: [PATCH v12 1/3] cpuidex check: Support detecting newer GCC versions
 defining it in cpuid.h

Author: Lukas Fittl <[email protected]>
Discussion: https://postgr.es/m/CAP53Pky-BN0Ui+A9no3TsU=GoMTFpxYSWYtp_LVaDH=y69b...@mail.gmail.com
---
 meson.build                       | 4 ++++
 src/port/pg_crc32c_sse42_choose.c | 4 ++--
 src/port/pg_popcount_avx512.c     | 4 ++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/meson.build b/meson.build
index 395416a6060..007ec30800f 100644
--- a/meson.build
+++ b/meson.build
@@ -2015,7 +2015,11 @@ if cc.links('''
     args: test_c_args)
   cdata.set('HAVE__GET_CPUID_COUNT', 1)
 elif cc.links('''
+    #if defined(_MSC_VER)
     #include <intrin.h>
+    #else
+    #include <cpuid.h>
+    #endif
     int main(int arg, char **argv)
     {
         unsigned int exx[4] = {0, 0, 0, 0};
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 74d2421ba2b..750f390bfdf 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -20,11 +20,11 @@
 
 #include "c.h"
 
-#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) || (defined(HAVE__CPUIDEX) && !defined(_MSC_VER))
 #include <cpuid.h>
 #endif
 
-#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#if defined(HAVE__CPUID) || (defined(HAVE__CPUIDEX) && defined(_MSC_VER))
 #include <intrin.h>
 #endif
 
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
index 80c0aee3e73..80d9a372dd7 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_avx512.c
@@ -14,13 +14,13 @@
 
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 
-#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) || (defined(HAVE__CPUIDEX) && !defined(_MSC_VER))
 #include <cpuid.h>
 #endif
 
 #include <immintrin.h>
 
-#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#if defined(HAVE__CPUID) || (defined(HAVE__CPUIDEX) && defined(_MSC_VER))
 #include <intrin.h>
 #endif
 
-- 
2.47.3

>From d613599d09fe7841c3c6b86a4500e78b77cc3dd2 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <[email protected]>
Date: Fri, 25 Jul 2025 17:57:20 -0700
Subject: [PATCH v12 2/3] Use time stamp counter to measure time on Linux/x86

We switch to using the time stamp counter (TSC) instead of clock_gettime()
to reduce overhead of EXPLAIN (ANALYZE, TIMING ON). Tests showed that runtime
is reduced by around 10% for queries moving lots of rows through the plan.

For now this is only enabled on Linux/x86, when the system clocksource is
reported as TSC. Relying on the Linux kernel simplifies the logic to detect
if the present TSC is usable (frequency invariant, synchronized between
sockets, etc.). In all other cases we fall back to clock_gettime().

Note that we intentionally use RDTSC in the fast paths, rather than RDTSCP.
RDTSCP waits for outstanding instructions to retire on out-of-order CPUs.
This adds noticeable overhead for little benefit in the typical InstrStartNode() /
InstrStopNode() use case. The macro to be used in such cases is called
INSTR_TIME_SET_CURRENT_FAST(). The original macro INSTR_TIME_SET_CURRENT()
uses RDTSCP and is supposed to be used when precision is more important
than performance.

Author: David Geier <[email protected]>
Author: Andres Freund <[email protected]>
Author: Lukas Fittl <[email protected]>
Reviewed-by:
Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de
---
 src/backend/access/heap/vacuumlazy.c |   4 +-
 src/backend/executor/instrument.c    |  12 +-
 src/backend/utils/init/postinit.c    |   3 +
 src/bin/pgbench/pgbench.c            |   3 +
 src/bin/psql/startup.c               |   4 +
 src/common/Makefile                  |   1 +
 src/common/instr_time.c              | 206 +++++++++++++++++++++++++++
 src/common/meson.build               |   1 +
 src/include/portability/instr_time.h | 136 +++++++++++++++---
 9 files changed, 348 insertions(+), 22 deletions(-)
 create mode 100644 src/common/instr_time.c

diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index d2b031fdd06..5027048cac4 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -3409,8 +3409,8 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
 			INSTR_TIME_SET_CURRENT(currenttime);
 			elapsed = currenttime;
 			INSTR_TIME_SUBTRACT(elapsed, starttime);
-			if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
-				>= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
+			if (INSTR_TIME_GET_MILLISEC(elapsed) >=
+				VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
 			{
 				if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
 				{
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 56e635f4700..01f67c5d972 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -67,9 +67,13 @@ InstrInit(Instrumentation *instr, int instrument_options)
 void
 InstrStartNode(Instrumentation *instr)
 {
-	if (instr->need_timer &&
-		!INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
-		elog(ERROR, "InstrStartNode called twice in a row");
+	if (instr->need_timer)
+	{
+		if (!INSTR_TIME_IS_ZERO(instr->starttime))
+			elog(ERROR, "InstrStartNode called twice in a row");
+		else
+			INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
+	}
 
 	/* save buffer usage totals at node entry, if needed */
 	if (instr->need_bufusage)
@@ -95,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples)
 		if (INSTR_TIME_IS_ZERO(instr->starttime))
 			elog(ERROR, "InstrStopNode called without start");
 
-		INSTR_TIME_SET_CURRENT(endtime);
+		INSTR_TIME_SET_CURRENT_FAST(endtime);
 		INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
 
 		INSTR_TIME_SET_ZERO(instr->starttime);
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 641e535a73c..d573409903b 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -810,6 +810,9 @@ InitPostgres(const char *in_dbname, Oid dboid,
 	/* Initialize portal manager */
 	EnablePortalManager();
 
+	/* initialize high-precision interval timing */
+	INSTR_TIME_INITIALIZE();
+
 	/*
 	 * Load relcache entries for the shared system catalogs.  This must create
 	 * at least entries for pg_database and catalogs used for authentication.
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 1515ed405ba..79bef2d2aec 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -7290,6 +7290,9 @@ main(int argc, char **argv)
 		initRandomState(&state[i].cs_func_rs);
 	}
 
+	/* initialize high-precision interval timing */
+	INSTR_TIME_INITIALIZE();
+
 	/* opening connection... */
 	con = doConnect();
 	if (con == NULL)
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c
index 249b6aa5169..d615df593c7 100644
--- a/src/bin/psql/startup.c
+++ b/src/bin/psql/startup.c
@@ -24,6 +24,7 @@
 #include "help.h"
 #include "input.h"
 #include "mainloop.h"
+#include "portability/instr_time.h"
 #include "settings.h"
 
 /*
@@ -327,6 +328,9 @@ main(int argc, char *argv[])
 
 	PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL);
 
+	/* initialize high-precision interval timing */
+	INSTR_TIME_INITIALIZE();
+
 	SyncVariables();
 
 	if (options.list_dbs)
diff --git a/src/common/Makefile b/src/common/Makefile
index 2c720caa509..1a2fbbe887f 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -59,6 +59,7 @@ OBJS_COMMON = \
 	file_perm.o \
 	file_utils.o \
 	hashfn.o \
+	instr_time.o \
 	ip.o \
 	jsonapi.o \
 	keywords.o \
diff --git a/src/common/instr_time.c b/src/common/instr_time.c
new file mode 100644
index 00000000000..fdf47699f20
--- /dev/null
+++ b/src/common/instr_time.c
@@ -0,0 +1,206 @@
+/*-------------------------------------------------------------------------
+ *
+ * instr_time.c
+ *	   Non-inline parts of the portable high-precision interval timing
+ *	 implementation
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/common/instr_time.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#if defined(HAVE__GET_CPUID) || (defined(HAVE__CPUIDEX) && !defined(_MSC_VER))
+#include <cpuid.h>
+#endif
+
+#if defined(HAVE__CPUID) || (defined(HAVE__CPUIDEX) && defined(_MSC_VER))
+#include <intrin.h>
+#endif
+
+#include "portability/instr_time.h"
+
+#ifndef WIN32
+/*
+ * Stores what the number of cycles needs to be multiplied with to end up
+ * with nanoseconds using integer math. See comment in pg_initialize_rdtsc()
+ * for more details.
+ *
+ * By default assume we are using clock_gettime() as a fallback which uses
+ * nanoseconds as ticks. Hence, we set the multiplier to the precision scalar
+ * so that the division in INSTR_TIME_GET_NANOSEC() won't change the nanoseconds.
+ *
+ * When using the RDTSC instruction directly this is filled in during initialization
+ * based on the relevant CPUID fields.
+ */
+int64		ticks_per_ns_scaled = TICKS_TO_NS_PRECISION;
+int64		ticks_per_sec = NS_PER_S;
+int64		max_ticks_no_overflow = PG_INT64_MAX / TICKS_TO_NS_PRECISION;
+
+#if defined(__x86_64__) && defined(__linux__)
+/*
+ * Indicates if RDTSC can be used (Linux/x86 only, when OS uses TSC clocksource)
+ */
+bool		has_rdtsc = false;
+
+/*
+ * Indicates if RDTSCP can be used. True if RDTSC can be used and RDTSCP is available.
+ */
+bool		has_rdtscp = false;
+
+#define CPUID_HYPERVISOR_VMWARE(words) (words[1] == 0x61774d56 && words[2] == 0x4d566572 && words[3] == 0x65726177) /* VMwareVMware */
+#define CPUID_HYPERVISOR_KVM(words) (words[1] == 0x4b4d564b && words[2] == 0x564b4d56 && words[3] == 0x0000004d)	/* KVMKVMKVM */
+
+static bool
+get_tsc_frequency_khz(uint32 *tsc_freq)
+{
+	uint32		r[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(0x15, &r[0] /* denominator */ , &r[1] /* numerator */ , &r[2] /* hz */ , &r[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(r, 0x15);
+#else
+#error cpuid instruction not available
+#endif
+
+	if (r[2] > 0)
+	{
+		if (r[0] == 0 || r[1] == 0)
+			return false;
+
+		*tsc_freq = r[2] / 1000 * r[1] / r[0];
+		return true;
+	}
+
+	/* Some CPUs only report frequency in 16H */
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(0x16, &r[0] /* base_mhz */ , &r[1], &r[2], &r[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(r, 0x16);
+#else
+#error cpuid instruction not available
+#endif
+
+	if (r[0] > 0)
+	{
+		*tsc_freq = r[0] * 1000;
+		return true;
+	}
+
+	/*
+	 * Check if we have a KVM or VMware Hypervisor passing down TSC frequency
+	 * to us in a guest VM
+	 *
+	 * Note that accessing the 0x40000000 leaf for Hypervisor info requires
+	 * use of __cpuidex to set ECX to 0. The similar __get_cpuid_count
+	 * function does not work as expected since it contains a check for
+	 * __get_cpuid_max, which has been observed to be lower than the special
+	 * Hypervisor leaf.
+	 */
+#if defined(HAVE__CPUIDEX)
+	__cpuidex((int32 *) r, 0x40000000, 0);
+	if (r[0] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r)))
+	{
+		__cpuidex((int32 *) r, 0x40000010, 0);
+		if (r[0] > 0)
+		{
+			*tsc_freq = r[0];
+			return true;
+		}
+	}
+#endif
+
+	return false;
+}
+
+static bool
+is_rdtscp_available(void)
+{
+	uint32		r[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	if (!__get_cpuid(0x80000001, &r[0], &r[1], &r[2], &r[3]))
+		return false;
+#elif defined(HAVE__CPUID)
+	__cpuid(r, 0x80000001);
+#else
+#error cpuid instruction not available
+#endif
+
+	return (r[3] & (1 << 27)) != 0;
+}
+
+/*
+ * Decide whether we use the RDTSC instruction at runtime, for Linux/x86,
+ * instead of incurring the overhead of a full clock_gettime() call.
+ *
+ * This can't be reliably determined at compile time, since the
+ * availability of an "invariant" TSC (that is not affected by CPU
+ * frequency changes) is dependent on the CPU architecture. Additionally,
+ * there are cases where TSC availability is impacted by virtualization,
+ * where a simple cpuid feature check would not be enough.
+ *
+ * Since Linux already does a significant amount of work to determine
+ * whether TSC is a viable clock source, decide based on that.
+ */
+void
+pg_initialize_rdtsc(void)
+{
+	FILE	   *fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
+
+	if (fp)
+	{
+		char		buf[128];
+
+		if (fgets(buf, sizeof(buf), fp) != NULL && strcmp(buf, "tsc\n") == 0)
+		{
+			/*
+			 * Determine the baseline TSC frequency, which sets the speed
+			 * at which RDTSC advances.
+			 */
+			uint32		tsc_freq;
+
+			if (get_tsc_frequency_khz(&tsc_freq))
+			{
+				/*
+				 * Ticks to nanoseconds conversion requires floating point
+				 * math, because:
+				 *
+				 * sec = ticks / frequency_hz ns  = ticks / frequency_hz *
+				 * 1,000,000,000 ns  = ticks * (1,000,000,000 / frequency_hz)
+				 * ns  = ticks * (1,000,000 / frequency_khz) <-- now in
+				 * kilohertz
+				 *
+				 * Here, 'ns' is usually a floating number. For example for a
+				 * 2.5 GHz CPU the scaling factor becomes 1,000,000 /
+				 * 2,500,000 = 1.2.
+				 *
+				 * To be able to use integer math we work around the lack of
+				 * precision. We first scale the integer up and after the
+				 * multiplication by the number of ticks in
+				 * INSTR_TIME_GET_NANOSEC() we divide again by the same value.
+				 * We picked the scaler such that it provides enough precision
+				 * and is a power-of-two which allows for shifting instead of
+				 * doing an integer division.
+				 */
+				ticks_per_ns_scaled = INT64CONST(1000000) * TICKS_TO_NS_PRECISION / tsc_freq;
+				ticks_per_sec = (int64) tsc_freq * 1000;	/* KHz->Hz */
+				max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
+
+				has_rdtsc = true;
+				has_rdtscp = is_rdtscp_available();
+			}
+		}
+
+		fclose(fp);
+	}
+}
+#endif							/* defined(__x86_64__) && defined(__linux__) */
+
+#endif							/* WIN32 */
diff --git a/src/common/meson.build b/src/common/meson.build
index 1540ba67cca..62b90b3e609 100644
--- a/src/common/meson.build
+++ b/src/common/meson.build
@@ -13,6 +13,7 @@ common_sources = files(
   'file_perm.c',
   'file_utils.c',
   'hashfn.c',
+  'instr_time.c',
   'ip.c',
   'jsonapi.c',
   'keywords.c',
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index f71a851b18d..e2e339a0c4f 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -4,9 +4,11 @@
  *	  portable high-precision interval timing
  *
  * This file provides an abstraction layer to hide portability issues in
- * interval timing.  On Unix we use clock_gettime(), and on Windows we use
- * QueryPerformanceCounter().  These macros also give some breathing room to
- * use other high-precision-timing APIs.
+ * interval timing. On Linux/x86 we use the rdtsc instruction when a TSC
+ * clocksource is also used on the host OS.  Otherwise, and on other
+ * Unix-like systems we use clock_gettime() and on Windows we use
+ * QueryPerformanceCounter(). These macros also give some breathing
+ * room to use other high-precision-timing APIs.
  *
  * The basic data type is instr_time, which all callers should treat as an
  * opaque typedef.  instr_time can store either an absolute time (of
@@ -17,10 +19,11 @@
  *
  * INSTR_TIME_SET_ZERO(t)			set t to zero (memset is acceptable too)
  *
- * INSTR_TIME_SET_CURRENT(t)		set t to current time
+ * INSTR_TIME_SET_CURRENT_FAST(t)	set t to current time without waiting
+ * 									for instructions in out-of-order window
  *
- * INSTR_TIME_SET_CURRENT_LAZY(t)	set t to current time if t is zero,
- *									evaluates to whether t changed
+ * INSTR_TIME_SET_CURRENT(t)		set t to current time while waiting for
+ * 									instructions in OOO to retire
  *
  * INSTR_TIME_ADD(x, y)				x += y
  *
@@ -81,6 +84,15 @@ typedef struct instr_time
 
 #ifndef WIN32
 
+/*
+ * Make sure this is a power-of-two, so that the compiler can turn the
+ * multiplications and divisions into shifts.
+ */
+#define TICKS_TO_NS_PRECISION (1<<14)
+
+extern int64 ticks_per_ns_scaled;
+extern int64 ticks_per_sec;
+extern int64 max_ticks_no_overflow;
 
 /* Use clock_gettime() */
 
@@ -106,9 +118,18 @@ typedef struct instr_time
 #define PG_INSTR_CLOCK	CLOCK_REALTIME
 #endif
 
-/* helper for INSTR_TIME_SET_CURRENT */
+#if defined(__x86_64__) && defined(__linux__)
+#include <x86intrin.h>
+#include <cpuid.h>
+
+extern bool has_rdtsc;
+extern bool has_rdtscp;
+
+extern void pg_initialize_rdtsc(void);
+#endif
+
 static inline instr_time
-pg_clock_gettime_ns(void)
+pg_clock_gettime(void)
 {
 	instr_time	now;
 	struct timespec tmp;
@@ -119,11 +140,94 @@ pg_clock_gettime_ns(void)
 	return now;
 }
 
+static inline instr_time
+pg_get_ticks_fast(void)
+{
+#if defined(__x86_64__) && defined(__linux__)
+	if (has_rdtsc)
+	{
+		instr_time	now;
+
+		now.ticks = __rdtsc();
+		return now;
+	}
+#endif
+
+	return pg_clock_gettime();
+}
+
+static inline instr_time
+pg_get_ticks(void)
+{
+#if defined(__x86_64__) && defined(__linux__)
+	if (has_rdtscp)
+	{
+		instr_time	now;
+		uint32		unused;
+
+		now.ticks = __rdtscp(&unused);
+		return now;
+	}
+#endif
+
+	return pg_clock_gettime();
+}
+
+static inline int64_t
+pg_ticks_to_ns(instr_time t)
+{
+	/*
+	 * Would multiplication overflow? If so perform computation in two parts.
+	 * Check overflow without actually overflowing via: a * b > max <=> a >
+	 * max / b
+	 */
+	int64		ns = 0;
+
+	if (unlikely(t.ticks > max_ticks_no_overflow))
+	{
+		/*
+		 * Compute how often the maximum number of ticks fits completely into
+		 * the number of elapsed ticks and convert that number into
+		 * nanoseconds. Then multiply by the count to arrive at the final
+		 * value. In a 2nd step we adjust the number of elapsed ticks and
+		 * convert the remaining ticks.
+		 */
+		int64		count = t.ticks / max_ticks_no_overflow;
+		int64		max_ns = max_ticks_no_overflow * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
+
+		ns = max_ns * count;
+
+		/*
+		 * Subtract the ticks that we now already accounted for, so that they
+		 * don't get counted twice.
+		 */
+		t.ticks -= count * max_ticks_no_overflow;
+		Assert(t.ticks >= 0);
+	}
+
+	ns += t.ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
+	return ns;
+}
+
+static inline void
+pg_initialize_get_ticks(void)
+{
+#if defined(__x86_64__) && defined(__linux__)
+	pg_initialize_rdtsc();
+#endif
+}
+
+#define INSTR_TIME_INITIALIZE() \
+	pg_initialize_get_ticks()
+
+#define INSTR_TIME_SET_CURRENT_FAST(t) \
+	((t) = pg_get_ticks_fast())
+
 #define INSTR_TIME_SET_CURRENT(t) \
-	((t) = pg_clock_gettime_ns())
+	((t) = pg_get_ticks())
 
 #define INSTR_TIME_GET_NANOSEC(t) \
-	((int64) (t).ticks)
+	pg_ticks_to_ns(t)
 
 
 #else							/* WIN32 */
@@ -131,7 +235,7 @@ pg_clock_gettime_ns(void)
 
 /* Use QueryPerformanceCounter() */
 
-/* helper for INSTR_TIME_SET_CURRENT */
+/* helper for INSTR_TIME_SET_CURRENT / INSTR_TIME_SET_CURRENT_FAST */
 static inline instr_time
 pg_query_performance_counter(void)
 {
@@ -153,6 +257,11 @@ GetTimerFrequency(void)
 	return (double) f.QuadPart;
 }
 
+#define INSTR_TIME_INITIALIZE()
+
+#define INSTR_TIME_SET_CURRENT_FAST(t) \
+	((t) = pg_query_performance_counter())
+
 #define INSTR_TIME_SET_CURRENT(t) \
 	((t) = pg_query_performance_counter())
 
@@ -168,13 +277,8 @@ GetTimerFrequency(void)
 
 #define INSTR_TIME_IS_ZERO(t)	((t).ticks == 0)
 
-
 #define INSTR_TIME_SET_ZERO(t)	((t).ticks = 0)
 
-#define INSTR_TIME_SET_CURRENT_LAZY(t) \
-	(INSTR_TIME_IS_ZERO(t) ? INSTR_TIME_SET_CURRENT(t), true : false)
-
-
 #define INSTR_TIME_ADD(x,y) \
 	((x).ticks += (y).ticks)
 
-- 
2.47.3

>From bcf61f229e360e96bb936ad08d64b6a43b181bb2 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <[email protected]>
Date: Sun, 27 Jul 2025 08:48:48 -0700
Subject: [PATCH v12 3/3] pg_test_timing: Add --fast flag to test fast timing,
 report time source

In passing, also reduce the per-loop overhead caused by repeated divisions
in INSTR_TIME_GET_NANOSEC when the ticks variable has become very large:
instead of converting each absolute reading, diff the raw ticks first and
then convert only the difference to nanoseconds.
---
 src/bin/pg_test_timing/pg_test_timing.c | 59 +++++++++++++++++++------
 src/include/portability/instr_time.h    | 30 ++++++++-----
 2 files changed, 65 insertions(+), 24 deletions(-)

diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index a5621251afc..b77ef2063b6 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -16,6 +16,7 @@ static const char *progname;
 
 static unsigned int test_duration = 3;
 static double max_rprct = 99.99;
+static bool fast_timing = false;
 
 /* record duration in powers of 2 nanoseconds */
 static long long int histogram[32];
@@ -56,6 +57,7 @@ handle_args(int argc, char *argv[])
 	static struct option long_options[] = {
 		{"duration", required_argument, NULL, 'd'},
 		{"cutoff", required_argument, NULL, 'c'},
+		{"fast", no_argument, NULL, 'f'},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -68,7 +70,7 @@ handle_args(int argc, char *argv[])
 	{
 		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
 		{
-			printf(_("Usage: %s [-d DURATION] [-c CUTOFF]\n"), progname);
+			printf(_("Usage: %s [-d DURATION] [-c CUTOFF] [--fast]\n"), progname);
 			exit(0);
 		}
 		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
@@ -78,7 +80,7 @@ handle_args(int argc, char *argv[])
 		}
 	}
 
-	while ((option = getopt_long(argc, argv, "d:c:",
+	while ((option = getopt_long(argc, argv, "d:c:f",
 								 long_options, &optindex)) != -1)
 	{
 		switch (option)
@@ -125,6 +127,10 @@ handle_args(int argc, char *argv[])
 				}
 				break;
 
+			case 'f':
+				fast_timing = true;
+				break;
+
 			default:
 				fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
 						progname);
@@ -155,11 +161,31 @@ test_timing(unsigned int duration)
 	uint64		total_time;
 	int64		time_elapsed = 0;
 	uint64		loop_count = 0;
-	uint64		prev,
-				cur;
 	instr_time	start_time,
 				end_time,
-				temp;
+				prev,
+				cur;
+	char	   *time_source = NULL;
+	bool		fast_timing_used = false;
+
+	INSTR_TIME_INITIALIZE();
+
+#if !defined(WIN32) && defined(__x86_64__) && defined(__linux__)
+	if (fast_timing && has_rdtsc)
+	{
+		time_source = "RDTSC";
+		fast_timing_used = true;
+	}
+	else if (has_rdtscp)
+		time_source = "RDTSCP";
+	else
+		time_source = PG_INSTR_CLOCK_NAME;
+#else
+	time_source = PG_INSTR_CLOCK_NAME;
+#endif
+	if (fast_timing && !fast_timing_used)
+		printf(_("Warning: Fast timing requested, but not available - regular timing source will be used\n"));
+	printf(_("Time source: %s\n"), time_source);
 
 	/*
 	 * Pre-zero the statistics data structures.  They're already zero by
@@ -173,8 +199,11 @@ test_timing(unsigned int duration)
 
 	total_time = duration > 0 ? duration * INT64CONST(1000000000) : 0;
 
-	INSTR_TIME_SET_CURRENT(start_time);
-	cur = INSTR_TIME_GET_NANOSEC(start_time);
+	if (fast_timing)
+		INSTR_TIME_SET_CURRENT_FAST(start_time);
+	else
+		INSTR_TIME_SET_CURRENT(start_time);
+	cur = start_time;
 
 	while (time_elapsed < total_time)
 	{
@@ -182,9 +211,11 @@ test_timing(unsigned int duration)
 					bits;
 
 		prev = cur;
-		INSTR_TIME_SET_CURRENT(temp);
-		cur = INSTR_TIME_GET_NANOSEC(temp);
-		diff = cur - prev;
+		if (fast_timing)
+			INSTR_TIME_SET_CURRENT_FAST(cur);
+		else
+			INSTR_TIME_SET_CURRENT(cur);
+		diff = INSTR_TIME_DIFF_NANOSEC(cur, prev);
 
 		/* Did time go backwards? */
 		if (unlikely(diff < 0))
@@ -217,11 +248,13 @@ test_timing(unsigned int duration)
 			largest_diff_count++;
 
 		loop_count++;
-		INSTR_TIME_SUBTRACT(temp, start_time);
-		time_elapsed = INSTR_TIME_GET_NANOSEC(temp);
+		time_elapsed = INSTR_TIME_DIFF_NANOSEC(cur, start_time);
 	}
 
-	INSTR_TIME_SET_CURRENT(end_time);
+	if (fast_timing)
+		INSTR_TIME_SET_CURRENT_FAST(end_time);
+	else
+		INSTR_TIME_SET_CURRENT(end_time);
 
 	INSTR_TIME_SUBTRACT(end_time, start_time);
 
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index e2e339a0c4f..f02296f1026 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -112,10 +112,13 @@ extern int64 max_ticks_no_overflow;
  */
 #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW)
 #define PG_INSTR_CLOCK	CLOCK_MONOTONIC_RAW
+#define PG_INSTR_CLOCK_NAME	"clock_gettime (CLOCK_MONOTONIC_RAW)"
 #elif defined(CLOCK_MONOTONIC)
 #define PG_INSTR_CLOCK	CLOCK_MONOTONIC
+#define PG_INSTR_CLOCK_NAME	"clock_gettime (CLOCK_MONOTONIC)"
 #else
 #define PG_INSTR_CLOCK	CLOCK_REALTIME
+#define PG_INSTR_CLOCK_NAME	"clock_gettime (CLOCK_REALTIME)"
 #endif
 
 #if defined(__x86_64__) && defined(__linux__)
@@ -174,7 +177,7 @@ pg_get_ticks(void)
 }
 
 static inline int64_t
-pg_ticks_to_ns(instr_time t)
+pg_ticks_to_ns(int64 ticks)
 {
 	/*
 	 * Would multiplication overflow? If so perform computation in two parts.
@@ -183,7 +186,7 @@ pg_ticks_to_ns(instr_time t)
 	 */
 	int64		ns = 0;
 
-	if (unlikely(t.ticks > max_ticks_no_overflow))
+	if (unlikely(ticks > max_ticks_no_overflow))
 	{
 		/*
 		 * Compute how often the maximum number of ticks fits completely into
@@ -192,7 +195,7 @@ pg_ticks_to_ns(instr_time t)
 		 * value. In a 2nd step we adjust the number of elapsed ticks and
 		 * convert the remaining ticks.
 		 */
-		int64		count = t.ticks / max_ticks_no_overflow;
+		int64		count = ticks / max_ticks_no_overflow;
 		int64		max_ns = max_ticks_no_overflow * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
 
 		ns = max_ns * count;
@@ -201,11 +204,11 @@ pg_ticks_to_ns(instr_time t)
 		 * Subtract the ticks that we now already accounted for, so that they
 		 * don't get counted twice.
 		 */
-		t.ticks -= count * max_ticks_no_overflow;
-		Assert(t.ticks >= 0);
+		ticks -= count * max_ticks_no_overflow;
+		Assert(ticks >= 0);
 	}
 
-	ns += t.ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
+	ns += ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
 	return ns;
 }
 
@@ -226,14 +229,14 @@ pg_initialize_get_ticks()
 #define INSTR_TIME_SET_CURRENT(t) \
 	((t) = pg_get_ticks())
 
-#define INSTR_TIME_GET_NANOSEC(t) \
-	pg_ticks_to_ns(t)
-
+#define INSTR_TIME_TICKS_TO_NANOSEC(ticks) \
+	(pg_ticks_to_ns(ticks))
 
 #else							/* WIN32 */
 
 
 /* Use QueryPerformanceCounter() */
+#define PG_INSTR_CLOCK_NAME	"QueryPerformanceCounter"
 
 /* helper for INSTR_TIME_SET_CURRENT / INSTR_TIME_SET_CURRENT_FAST */
 static inline instr_time
@@ -265,8 +268,8 @@ GetTimerFrequency(void)
 #define INSTR_TIME_SET_CURRENT(t) \
 	((t) = pg_query_performance_counter())
 
-#define INSTR_TIME_GET_NANOSEC(t) \
-	((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency())))
+#define INSTR_TIME_TICKS_TO_NANOSEC(ticks) \
+	((int64) ((ticks) * ((double) NS_PER_S / GetTimerFrequency())))
 
 #endif							/* WIN32 */
 
@@ -285,9 +288,14 @@ GetTimerFrequency(void)
 #define INSTR_TIME_SUBTRACT(x,y) \
 	((x).ticks -= (y).ticks)
 
+#define INSTR_TIME_DIFF_NANOSEC(x,y) \
+	(INSTR_TIME_TICKS_TO_NANOSEC((x).ticks - (y).ticks))
+
 #define INSTR_TIME_ACCUM_DIFF(x,y,z) \
 	((x).ticks += (y).ticks - (z).ticks)
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(INSTR_TIME_TICKS_TO_NANOSEC((t).ticks))
 
 #define INSTR_TIME_GET_DOUBLE(t) \
 	((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S)
-- 
2.47.3

Reply via email to