This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new f198a01edf Extend benchmark comparison script with more detailed statistics (#16262)
f198a01edf is described below

commit f198a01edfd777ba63d2aef7789647bcc1fa3122
Author: Pepijn Van Eeckhoudt <pep...@vaneeckhoudt.net>
AuthorDate: Fri Jun 6 22:17:27 2025 +0200

    Extend benchmark comparison script with more detailed statistics (#16262)
---
 benchmarks/bench.sh   | 17 ++++++++----
 benchmarks/compare.py | 75 ++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 75 insertions(+), 17 deletions(-)

diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index 7fafb751b6..b34c646c5b 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -52,6 +52,7 @@ Usage:
 $0 data [benchmark] [query]
 $0 run [benchmark]
 $0 compare <branch1> <branch2>
+$0 compare_detail <branch1> <branch2>
 $0 venv
 
 **********
@@ -66,10 +67,11 @@ DATAFUSION_DIR=/source/datafusion ./bench.sh run tpch
 **********
 * Commands
 **********
-data:         Generates or downloads data needed for benchmarking
-run:          Runs the named benchmark
-compare:      Compares results from benchmark runs
-venv:         Creates new venv (unless already exists) and installs compare's requirements into it
+data:            Generates or downloads data needed for benchmarking
+run:             Runs the named benchmark
+compare:         Compares fastest results from benchmark runs
+compare_detail:  Compares minimum, average (±stddev), and maximum results from benchmark runs
+venv:            Creates new venv (unless already exists) and installs compare's requirements into it
 
 **********
 * Benchmarks
@@ -360,6 +362,9 @@ main() {
         compare)
             compare_benchmarks "$ARG2" "$ARG3"
             ;;
+        compare_detail)
+            compare_benchmarks "$ARG2" "$ARG3" "--detailed"
+            ;;
         venv)
             setup_venv
             ;;
@@ -958,6 +963,8 @@ compare_benchmarks() {
     BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
     BRANCH1="$1"
     BRANCH2="$2"
+    OPTS="$3"
+
     if [ -z "$BRANCH1" ] ; then
         echo "<branch1> not specified. Available branches:"
         ls -1 "${BASE_RESULTS_DIR}"
@@ -978,7 +985,7 @@ compare_benchmarks() {
             echo "--------------------"
             echo "Benchmark ${BENCH}"
             echo "--------------------"
-            PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py "${RESULTS_FILE1}" "${RESULTS_FILE2}"
+            PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py $OPTS "${RESULTS_FILE1}" "${RESULTS_FILE2}"
         else
             echo "Note: Skipping ${RESULTS_FILE1} as ${RESULTS_FILE2} does not 
exist"
         fi
diff --git a/benchmarks/compare.py b/benchmarks/compare.py
index 0dd067ca9c..7e51a38a92 100755
--- a/benchmarks/compare.py
+++ b/benchmarks/compare.py
@@ -18,7 +18,9 @@
 
 from __future__ import annotations
 
+import argparse
 import json
+import math
 from dataclasses import dataclass
 from typing import Dict, List, Any
 from pathlib import Path
@@ -55,18 +57,57 @@ class QueryRun:
             query=data["query"],
             iterations=[QueryResult(**iteration) for iteration in data["iterations"]],
             start_time=data["start_time"],
-            success=data["success"],
+            success=data.get("success", True),
         )
 
     @property
-    def execution_time(self) -> float:
+    def min_execution_time(self) -> float:
         assert len(self.iterations) >= 1
 
-        # Use minimum execution time to account for variations / other
-        # things the system was doing
         return min(iteration.elapsed for iteration in self.iterations)
 
 
+    @property
+    def max_execution_time(self) -> float:
+        assert len(self.iterations) >= 1
+
+        return max(iteration.elapsed for iteration in self.iterations)
+
+
+    @property
+    def mean_execution_time(self) -> float:
+        assert len(self.iterations) >= 1
+
+        total = sum(iteration.elapsed for iteration in self.iterations)
+        return total / len(self.iterations)
+
+
+    @property
+    def stddev_execution_time(self) -> float:
+        assert len(self.iterations) >= 1
+
+        mean = self.mean_execution_time
+        squared_diffs = [(iteration.elapsed - mean) ** 2 for iteration in self.iterations]
+        variance = sum(squared_diffs) / len(self.iterations)
+        return math.sqrt(variance)
+
+    def execution_time_report(self, detailed = False) -> tuple[float, str]:
+        if detailed:
+            mean_execution_time = self.mean_execution_time
+            return (
+                mean_execution_time,
+                f"{self.min_execution_time:.2f} / {mean_execution_time :.2f} 
±{self.stddev_execution_time:.2f} / {self.max_execution_time:.2f} ms"
+            )
+        else:
+            # Use minimum execution time to account for variations / other
+            # things the system was doing
+            min_execution_time = self.min_execution_time
+            return (
+                min_execution_time,
+                f"{min_execution_time :.2f} ms"
+            )
+
+
 @dataclass
 class Context:
     benchmark_version: str
@@ -108,6 +149,7 @@ def compare(
     baseline_path: Path,
     comparison_path: Path,
     noise_threshold: float,
+    detailed: bool,
 ) -> None:
     baseline = BenchmarkRun.load_from_file(baseline_path)
     comparison = BenchmarkRun.load_from_file(comparison_path)
@@ -142,16 +184,19 @@ def compare(
             failure_count += 1
             table.add_row(
                 f"Q{baseline_result.query}",
-                "FAIL" if base_failed else 
f"{baseline_result.execution_time:.2f}ms",
-                "FAIL" if comp_failed else 
f"{comparison_result.execution_time:.2f}ms",
+                "FAIL" if base_failed else 
baseline_result.execution_time_report(detailed)[1],
+                "FAIL" if comp_failed else 
comparison_result.execution_time_report(detailed)[1],
                 change_text,
             )
             continue
 
-        total_baseline_time += baseline_result.execution_time
-        total_comparison_time += comparison_result.execution_time
+        baseline_value, baseline_text = baseline_result.execution_time_report(detailed)
+        comparison_value, comparison_text = comparison_result.execution_time_report(detailed)
+
+        total_baseline_time += baseline_value
+        total_comparison_time += comparison_value
 
-        change = comparison_result.execution_time / baseline_result.execution_time
+        change = comparison_value / baseline_value
 
         if (1.0 - noise_threshold) <= change <= (1.0 + noise_threshold):
             change_text = "no change"
@@ -165,8 +210,8 @@ def compare(
 
         table.add_row(
             f"Q{baseline_result.query}",
-            f"{baseline_result.execution_time:.2f}ms",
-            f"{comparison_result.execution_time:.2f}ms",
+            baseline_text,
+            comparison_text,
             change_text,
         )
 
@@ -215,10 +260,16 @@ def main() -> None:
         default=0.05,
         help="The threshold for statistically insignificant results (+/- %5).",
     )
+    compare_parser.add_argument(
+        "--detailed",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Show detailed result comparison instead of minimum runtime.",
+    )
 
     options = parser.parse_args()
 
-    compare(options.baseline_path, options.comparison_path, options.noise_threshold)
+    compare(options.baseline_path, options.comparison_path, options.noise_threshold, options.detailed)
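
A quick usage sketch of the new compare_detail mode. This is not taken from the patch itself; the branch names and result file paths are placeholders, and it assumes results for both branches have already been produced with ./bench.sh run:

    # Compare two previously recorded runs, reporting min / mean ±stddev / max per query
    ./bench.sh compare_detail main my-feature-branch

    # Equivalent direct call: bench.sh passes --detailed through to compare.py
    # ahead of the two result files (paths here are illustrative)
    python3 benchmarks/compare.py --detailed \
        benchmarks/results/main/tpch.json \
        benchmarks/results/my-feature-branch/tpch.json

As a worked example of the statistics added above: a query with iteration timings of 10, 12, and 14 ms has mean 12 ms and population standard deviation sqrt((4 + 0 + 4) / 3) ≈ 1.63 ms, so its detailed cell reads "10.00 / 12.00 ±1.63 / 14.00 ms", and the change column is computed from the 12 ms mean rather than the 10 ms minimum used by the plain compare command.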
 
 
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org