This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-benchmarks.git
The following commit(s) were added to refs/heads/main by this push:
new be69b68 Add script to generate comparison of two benchmark runs (#5)
be69b68 is described below
commit be69b68df97a33d878093cfd039fca1f2b36d70b
Author: Andy Grove <[email protected]>
AuthorDate: Thu May 30 19:12:13 2024 -0600
Add script to generate comparison of two benchmark runs (#5)
* Add script to generate comparison of two benchmark runs
* rename script
* rename script
* updates
* add another chart style
* improve script
* support more than 2 files
* update README
---
 .gitignore                     |   4 +-
 README.md                      |  31 +++++----
 scripts/generate-comparison.py | 152 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 173 insertions(+), 14 deletions(-)
diff --git a/.gitignore b/.gitignore
index 2a2f6e2..f4e2059 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
+venv
.idea
-*.json
\ No newline at end of file
+*.json
+*.png
diff --git a/README.md b/README.md
index b772f3e..e405e6c 100644
--- a/README.md
+++ b/README.md
@@ -48,17 +48,12 @@ prohibited by the TPC.
## Data Generation
-See the benchmark-specific instructions for generating the CSV data for [TPC-H](tpch) and [TPC-DS](tpcds).
-
-## Converting CSV data to Parquet
-
-Although it is valid to run benchmarks against CSV data, this does not really represent how most of the world is
-running OLAP queries, especially when dealing with large datasets. When benchmarking DataFusion, we typically want
-to be querying Parquet data, so first we must convert the generated datasets to Parquet. Also, we typically do not
-want a single file per table, so we also need to repartition the data.
-
-We plan on adding Python scripts in this repository to perform this conversion and repartitioning. Until then you may
-want to write your own scripts using DataFusion or Spark. Another option is to use [tpc-tools](https://crates.io/crates/tpctools).
+See the benchmark-specific instructions for generating the CSV data for [TPC-H](tpch) and [TPC-DS](tpcds) and for
+converting that data to Parquet format. Although it is valid to run benchmarks against CSV data, this does not really
+represent how most of the world is running OLAP queries, especially when dealing with large datasets. When benchmarking
+DataFusion and its subprojects, we typically want to be querying Parquet data. Also, we typically do not
+want a single file per table, so we also need to repartition the data. The provided scripts take care of this conversion
+and repartitioning.
## Running the Benchmarks with DataFusion
@@ -67,10 +62,20 @@ Scripts are available for the following DataFusion projects:
- [DataFusion Python](./runners/datafusion-python)
- [DataFusion Comet](./runners/datafusion-comet)
+These benchmarking scripts produce JSON files containing query timings.
+
## Comparing Results
-Coming soon. The plan is to add some Python scripts for comparing results from different runs and producing charts
-that we can use in blog posts.
+The Python script [scripts/generate-comparison.py](scripts/generate-comparison.py) can be used to produce charts
+comparing results from different benchmark runs.
+
+For example:
+
+```shell
+python scripts/generate-comparison.py file1.json file2.json --labels "Spark" "Comet" --benchmark "TPC-H 100GB"
+```
+
+This will create image files in the current directory in PNG format.
## Legal Notices
diff --git a/scripts/generate-comparison.py b/scripts/generate-comparison.py
new file mode 100644
index 0000000..b13904e
--- /dev/null
+++ b/scripts/generate-comparison.py
@@ -0,0 +1,152 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import json
+import matplotlib.pyplot as plt
+import numpy as np
+import sys
+
+def geomean(data):
+    return np.prod(data) ** (1 / len(data))
+
+def generate_query_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str):
+    results = []
+    for query in range(1, 23):
+        a = np.median(np.array(baseline[str(query)]))
+        b = np.median(np.array(comparison[str(query)]))
+        if a > b:
+            speedup = a/b-1
+        else:
+            speedup = -(1/(a/b)-1)
+        results.append(("q" + str(query), round(speedup*100, 0)))
+
+    results = sorted(results, key=lambda x: -x[1])
+
+    queries, speedups = zip(*results)
+
+    # Create figure and axis
+    fig, ax = plt.subplots(figsize=(10, 6))
+
+    # Create bar chart
+    bars = ax.bar(queries, speedups, color='skyblue')
+
+    # Add text annotations
+    for bar, speedup in zip(bars, speedups):
+        yval = bar.get_height()
+        if yval >= 0:
+            ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+5), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8,
+                    color='blue', rotation=90)
+        else:
+            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval:.0f}%', va='top', ha='center', fontsize=8,
+                    color='blue', rotation=90)
+
+    # Add title and labels
+    ax.set_title(label2 + " speedup over " + label1 + " (" + benchmark + ")")
+    ax.set_ylabel('Speedup (100% speedup = 2x faster)')
+    ax.set_xlabel('Query')
+
+    # Customize the y-axis to handle both positive and negative values better
+    ax.axhline(0, color='black', linewidth=0.8)
+    min_value = (min(speedups) // 100) * 100
+    max_value = ((max(speedups) // 100) + 1) * 100 + 50
+    ax.set_ylim(min_value, max_value)
+
+    # Show grid for better readability
+    ax.yaxis.grid(True)
+
+    # Save the plot as an image file
+    plt.savefig('tpch_queries_speedup.png', format='png')
+
+
+def generate_query_comparison_chart(results, labels, benchmark: str):
+    queries = []
+    benches = []
+    for _ in results:
+        benches.append([])
+    for query in range(1, 23):
+        queries.append("q" + str(query))
+        for i in range(0, len(results)):
+            benches[i].append(np.median(np.array(results[i][str(query)])))
+
+    # Define the width of the bars
+    bar_width = 0.3
+
+    # Define the positions of the bars on the x-axis
+    index = np.arange(len(queries)) * 1.5
+
+    # Create a bar chart
+    fig, ax = plt.subplots(figsize=(15, 6))
+    for i in range(0, len(results)):
+        ax.bar(index + i * bar_width, benches[i], bar_width, label=labels[i])
+
+    # Add labels, title, and legend
+    ax.set_title(benchmark)
+    ax.set_xlabel('Queries')
+    ax.set_ylabel('Query Time (seconds)')
+    ax.set_xticks(index + bar_width / 2)
+    ax.set_xticklabels(queries)
+    ax.legend()
+
+    # Save the plot as an image file
+    plt.savefig('tpch_queries_compare.png', format='png')
+
+def generate_summary(results, labels, benchmark: str):
+    timings = []
+    for _ in results:
+        timings.append(0)
+
+    for query in range(1, 23):
+        for i in range(0, len(results)):
+            timings[i] += np.median(np.array(results[i][str(query)]))
+
+    # Create figure and axis
+    fig, ax = plt.subplots()
+
+    # Add title and labels
+    ax.set_title(benchmark)
+    ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)')
+
+    times = [round(x, 0) for x in timings]
+
+    # Create bar chart
+    bars = ax.bar(labels, times, color='skyblue')
+
+    # Add text annotations
+    for bar in bars:
+        yval = bar.get_height()
+        ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom')  # va: vertical alignment
+
+    plt.savefig('tpch_allqueries.png', format='png')
+
+def main(files, labels, benchmark: str):
+    results = []
+    for filename in files:
+        with open(filename) as f:
+            results.append(json.load(f))
+    generate_summary(results, labels, benchmark)
+    generate_query_comparison_chart(results, labels, benchmark)
+    if len(files) == 2:
+        generate_query_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Generate comparison')
+    parser.add_argument('filenames', nargs='+', type=str, help='JSON result files')
+    parser.add_argument('--labels', nargs='+', type=str, help='Labels')
+    parser.add_argument('--benchmark', type=str, help='Benchmark description')
+    args = parser.parse_args()
+    main(args.filenames, args.labels, args.benchmark)
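
For context on the README's conversion step, here is a minimal sketch of converting one generated CSV table to
repartitioned Parquet using PySpark (the README's earlier text suggested DataFusion or Spark for this). The table
name, paths, delimiter, and partition count are illustrative assumptions, not the repository's provided scripts.

```python
# Minimal sketch only: converts one TPC table from CSV to repartitioned
# Parquet. "lineitem", both paths, and the partition count 64 are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("csv-to-parquet").getOrCreate()

# TPC-H dbgen output is '|'-delimited with no header row
df = spark.read.option("delimiter", "|").csv("data/csv/lineitem")

# Repartition so each table is written as multiple Parquet files
df.repartition(64).write.mode("overwrite").parquet("data/parquet/lineitem")
```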
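
The comparison script's expected input shape can be inferred from the code above: each JSON file is an object keyed
by query number ("1" through "22"), each value a list of per-run timings in seconds, of which the script takes the
median. A hypothetical smoke test that fabricates two such files (real files come from the benchmark runners and may
carry additional fields):

```python
# Writes two synthetic result files matching the shape the script reads:
# {"1": [t1, t2, t3], ..., "22": [...]} with timings in seconds.
import json
import random

for name, scale in [("baseline.json", 1.0), ("comparison.json", 0.5)]:
    timings = {str(q): [scale * random.uniform(1.0, 10.0) for _ in range(3)]
               for q in range(1, 23)}
    with open(name, "w") as f:
        json.dump(timings, f, indent=2)

# Then, for example:
#   python scripts/generate-comparison.py baseline.json comparison.json \
#       --labels Baseline Comparison --benchmark "TPC-H (synthetic)"
```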
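
The speedup convention in generate_query_speedup_chart is symmetric: with baseline median a and comparison median b,
a faster comparison scores a/b - 1 and a slower one scores -(b/a - 1), so 2x faster reads as +100% and 2x slower as
-100%. A standalone restatement for clarity (speedup_pct is a name introduced here for illustration, not part of the
script):

```python
def speedup_pct(a, b):
    # a: baseline median, b: comparison median; same branch logic as the script
    return (a / b - 1) * 100 if a > b else -(b / a - 1) * 100

assert speedup_pct(10, 5) == 100.0   # comparison is 2x faster than baseline
assert speedup_pct(5, 10) == -100.0  # comparison is 2x slower than baseline
```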
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]