This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git

The following commit(s) were added to refs/heads/main by this push:
     new f198a01edf Extend benchmark comparison script with more detailed statistics (#16262)
f198a01edf is described below

commit f198a01edfd777ba63d2aef7789647bcc1fa3122
Author: Pepijn Van Eeckhoudt <pep...@vaneeckhoudt.net>
AuthorDate: Fri Jun 6 22:17:27 2025 +0200

    Extend benchmark comparison script with more detailed statistics (#16262)
---
 benchmarks/bench.sh   | 17 ++++++++----
 benchmarks/compare.py | 75 ++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 75 insertions(+), 17 deletions(-)

diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index 7fafb751b6..b34c646c5b 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -52,6 +52,7 @@ Usage:
 $0 data [benchmark] [query]
 $0 run [benchmark]
 $0 compare <branch1> <branch2>
+$0 compare_detail <branch1> <branch2>
 $0 venv

 **********
@@ -66,10 +67,11 @@ DATAFUSION_DIR=/source/datafusion ./bench.sh run tpch
 **********
 * Commands
 **********
-data: Generates or downloads data needed for benchmarking
-run: Runs the named benchmark
-compare: Compares results from benchmark runs
-venv: Creates new venv (unless already exists) and installs compare's requirements into it
+data:           Generates or downloads data needed for benchmarking
+run:            Runs the named benchmark
+compare:        Compares fastest results from benchmark runs
+compare_detail: Compares minimum, average (±stddev), and maximum results from benchmark runs
+venv:           Creates new venv (unless already exists) and installs compare's requirements into it

 **********
 * Benchmarks
@@ -360,6 +362,9 @@ main() {
         compare)
             compare_benchmarks "$ARG2" "$ARG3"
             ;;
+        compare_detail)
+            compare_benchmarks "$ARG2" "$ARG3" "--detailed"
+            ;;
         venv)
             setup_venv
             ;;
@@ -958,6 +963,8 @@ compare_benchmarks() {
     BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
     BRANCH1="$1"
     BRANCH2="$2"
+    OPTS="$3"
+
     if [ -z "$BRANCH1" ] ; then
         echo "<branch1> not specified. Available branches:"
         ls -1 "${BASE_RESULTS_DIR}"
@@ -978,7 +985,7 @@ compare_benchmarks() {
             echo "--------------------"
             echo "Benchmark ${BENCH}"
             echo "--------------------"
-            PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py "${RESULTS_FILE1}" "${RESULTS_FILE2}"
+            PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py $OPTS "${RESULTS_FILE1}" "${RESULTS_FILE2}"
         else
             echo "Note: Skipping ${RESULTS_FILE1} as ${RESULTS_FILE2} does not exist"
         fi
diff --git a/benchmarks/compare.py b/benchmarks/compare.py
index 0dd067ca9c..7e51a38a92 100755
--- a/benchmarks/compare.py
+++ b/benchmarks/compare.py
@@ -18,7 +18,9 @@

 from __future__ import annotations

+import argparse
 import json
+import math
 from dataclasses import dataclass
 from typing import Dict, List, Any
 from pathlib import Path
@@ -55,18 +57,57 @@ class QueryRun:
             query=data["query"],
             iterations=[QueryResult(**iteration) for iteration in data["iterations"]],
             start_time=data["start_time"],
-            success=data["success"],
+            success=data.get("success", True),
         )

     @property
-    def execution_time(self) -> float:
+    def min_execution_time(self) -> float:
         assert len(self.iterations) >= 1

-        # Use minimum execution time to account for variations / other
-        # things the system was doing
         return min(iteration.elapsed for iteration in self.iterations)


+    @property
+    def max_execution_time(self) -> float:
+        assert len(self.iterations) >= 1
+
+        return max(iteration.elapsed for iteration in self.iterations)
+
+
+    @property
+    def mean_execution_time(self) -> float:
+        assert len(self.iterations) >= 1
+
+        total = sum(iteration.elapsed for iteration in self.iterations)
+        return total / len(self.iterations)
+
+
+    @property
+    def stddev_execution_time(self) -> float:
+        assert len(self.iterations) >= 1
+
+        mean = self.mean_execution_time
+        squared_diffs = [(iteration.elapsed - mean) ** 2 for iteration in self.iterations]
+        variance = sum(squared_diffs) / len(self.iterations)
+        return math.sqrt(variance)
+
+    def execution_time_report(self, detailed = False) -> tuple[float, str]:
+        if detailed:
+            mean_execution_time = self.mean_execution_time
+            return (
+                mean_execution_time,
+                f"{self.min_execution_time:.2f} / {mean_execution_time :.2f} ±{self.stddev_execution_time:.2f} / {self.max_execution_time:.2f} ms"
+            )
+        else:
+            # Use minimum execution time to account for variations / other
+            # things the system was doing
+            min_execution_time = self.min_execution_time
+            return (
+                min_execution_time,
+                f"{min_execution_time :.2f} ms"
+            )
+
+
 @dataclass
 class Context:
     benchmark_version: str
@@ -108,6 +149,7 @@ def compare(
     baseline_path: Path,
     comparison_path: Path,
     noise_threshold: float,
+    detailed: bool,
 ) -> None:
     baseline = BenchmarkRun.load_from_file(baseline_path)
     comparison = BenchmarkRun.load_from_file(comparison_path)
@@ -142,16 +184,19 @@ def compare(
             failure_count += 1
             table.add_row(
                 f"Q{baseline_result.query}",
-                "FAIL" if base_failed else f"{baseline_result.execution_time:.2f}ms",
-                "FAIL" if comp_failed else f"{comparison_result.execution_time:.2f}ms",
+                "FAIL" if base_failed else baseline_result.execution_time_report(detailed)[1],
+                "FAIL" if comp_failed else comparison_result.execution_time_report(detailed)[1],
                 change_text,
             )
             continue

-        total_baseline_time += baseline_result.execution_time
-        total_comparison_time += comparison_result.execution_time
+        baseline_value, baseline_text = baseline_result.execution_time_report(detailed)
+        comparison_value, comparison_text = comparison_result.execution_time_report(detailed)
+
+        total_baseline_time += baseline_value
+        total_comparison_time += comparison_value

-        change = comparison_result.execution_time / baseline_result.execution_time
+        change = comparison_value / baseline_value

         if (1.0 - noise_threshold) <= change <= (1.0 + noise_threshold):
             change_text = "no change"
@@ -165,8 +210,8 @@ def compare(

         table.add_row(
             f"Q{baseline_result.query}",
-            f"{baseline_result.execution_time:.2f}ms",
-            f"{comparison_result.execution_time:.2f}ms",
+            baseline_text,
+            comparison_text,
             change_text,
         )

@@ -215,10 +260,16 @@ def main() -> None:
         default=0.05,
         help="The threshold for statistically insignificant results (+/- %5).",
     )
+    compare_parser.add_argument(
+        "--detailed",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Show detailed result comparison instead of minimum runtime.",
+    )

     options = parser.parse_args()

-    compare(options.baseline_path, options.comparison_path, options.noise_threshold)
+    compare(options.baseline_path, options.comparison_path, options.noise_threshold, options.detailed)
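
For a quick illustration of what the new --detailed mode reports per query, here is a minimal standalone sketch. The elapsed_ms values below are hypothetical stand-ins for the per-iteration times compare.py reads from a benchmark's JSON output, and statistics.pstdev is used as an equivalent of the hand-rolled population standard deviation in QueryRun.stddev_execution_time:

import statistics

# Hypothetical per-iteration elapsed times (ms) for a single query.
elapsed_ms = [101.4, 98.7, 103.9, 99.2, 100.5]

minimum = min(elapsed_ms)
maximum = max(elapsed_ms)
mean = statistics.fmean(elapsed_ms)
# Population standard deviation, i.e. sqrt(sum((x - mean)^2) / n), matching
# the variance computed over all iterations in stddev_execution_time.
stddev = statistics.pstdev(elapsed_ms)

# Same "min / mean ±stddev / max" layout as execution_time_report(detailed=True).
print(f"{minimum:.2f} / {mean:.2f} ±{stddev:.2f} / {maximum:.2f} ms")

Note that the default mode still compares the minimum times, while --detailed computes the change column from the means.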