This is an automated email from the ASF dual-hosted git repository.
andygrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new f65226a7d docs: Add benchmark results for 0.16.0 (#4272)
f65226a7d is described below
commit f65226a7d92cc5280336c06e3869d0f23297829e
Author: Andy Grove <[email protected]>
AuthorDate: Tue May 12 08:01:08 2026 -0600
docs: Add benchmark results for 0.16.0 (#4272)
---
.../results/0.16.0/comet-tpcds.json | 206 ++++++++++-----------
.../results/0.16.0/comet-tpch-hashjoin.json | 44 ++---
.../results/0.16.0/comet-tpch.json | 44 ++---
.../results/0.16.0/spark-tpcds.json | 206 ++++++++++-----------
.../results/0.16.0/spark-tpch.json | 44 ++---
benchmarks/tpc/generate-comparison.py | 58 +++---
benchmarks/tpc/tpcbench.py | 89 +++++----
.../benchmark-results/0.16.0/tpcds_allqueries.png | Bin 0 -> 25524 bytes
.../0.16.0/tpcds_queries_compare.png | Bin 0 -> 37679 bytes
.../0.16.0/tpcds_queries_speedup_abs.png | Bin 0 -> 64879 bytes
.../0.16.0/tpcds_queries_speedup_rel.png | Bin 0 -> 92557 bytes
.../benchmark-results/0.16.0/tpch_allqueries.png | Bin 0 -> 23705 bytes
.../0.16.0/tpch_queries_compare.png | Bin 0 -> 25488 bytes
.../0.16.0/tpch_queries_speedup_abs.png | Bin 0 -> 32108 bytes
.../0.16.0/tpch_queries_speedup_rel.png | Bin 0 -> 36118 bytes
.../contributor-guide/benchmark-results/tpc-ds.md | 34 ++--
.../contributor-guide/benchmark-results/tpc-h.md | 48 +++--
17 files changed, 398 insertions(+), 375 deletions(-)
diff --git
a/docs/source/contributor-guide/benchmark-results/spark-3.5.8-tpcds.json
b/benchmarks/results/0.16.0/comet-tpcds.json
similarity index 55%
rename from
docs/source/contributor-guide/benchmark-results/spark-3.5.8-tpcds.json
rename to benchmarks/results/0.16.0/comet-tpcds.json
index 556d26dd9..ff3872ac2 100644
--- a/docs/source/contributor-guide/benchmark-results/spark-3.5.8-tpcds.json
+++ b/benchmarks/results/0.16.0/comet-tpcds.json
@@ -14,312 +14,312 @@
"spark_profile": "3.5_2.12"
},
"1": [
- 6.6325
+ 5.1795
],
"2": [
- 11.296
+ 5.5120000000000005
],
"3": [
- 3.5389999999999997
+ 1.6395
],
"4": [
- 42.13
+ 28.048499999999997
],
"5": [
- 16.0765
+ 5.8245000000000005
],
"6": [
- 2.5905
+ 1.788
],
"7": [
- 7.6775
+ 2.832
],
"8": [
- 2.7664999999999997
+ 1.9335
],
"9": [
- 41.19
+ 9.186
],
"10": [
- 4.798500000000001
+ 3.1355
],
"11": [
- 16.6935
+ 14.4275
],
"12": [
- 1.9075000000000002
+ 1.3115
],
"13": [
- 8.3805
+ 3.0445
],
"15": [
- 2.657
+ 2.0875
],
"16": [
- 15.8
+ 7.5585
],
"17": [
- 5.198
+ 3.8885
],
"18": [
- 4.775499999999999
+ 4.176
],
"19": [
- 2.0535
+ 1.7650000000000001
],
"20": [
- 1.951
+ 1.222
],
"21": [
- 1.6165
+ 1.05
],
"22": [
- 10.1985
+ 2.936
],
"25": [
- 4.18
+ 2.9595000000000002
],
"26": [
- 3.6559999999999997
+ 1.722
],
"27": [
- 5.875500000000001
+ 2.704
],
"28": [
- 41.400000000000006
+ 10.931000000000001
],
"29": [
- 12.814499999999999
+ 6.866
],
"30": [
- 3.683
+ 2.6035000000000004
],
"31": [
- 6.7405
+ 5.3685
],
"32": [
- 1.5375
+ 0.928
],
"33": [
- 3.2175000000000002
+ 1.484
],
"34": [
- 4.367
+ 1.8795000000000002
],
"35": [
- 6.285500000000001
+ 5.1385
],
"36": [
- 6.1295
+ 2.67
],
"37": [
- 8.221
+ 2.474
],
"38": [
- 13.931999999999999
+ 7.744
],
"40": [
- 5.8815
+ 3.1055
],
"41": [
- 1.1164999999999998
+ 0.4105
],
"42": [
- 2.5195
+ 0.7605
],
"43": [
- 5.170999999999999
+ 1.9024999999999999
],
"44": [
- 25.598
+ 6.7085
],
"45": [
- 2.5675
+ 1.998
],
"46": [
- 7.5
+ 3.548
],
"47": [
- 8.979500000000002
+ 5.522
],
"48": [
- 8.464500000000001
+ 2.217
],
"49": [
- 8.951
+ 4.5725
],
"50": [
- 31.622500000000002
+ 18.654
],
"51": [
- 10.7525
+ 8.218
],
"52": [
- 1.5
+ 0.89
],
"53": [
- 6.2330000000000005
+ 2.2885
],
"54": [
- 3.2664999999999997
+ 5.673
],
"55": [
- 1.295
+ 1.1235
],
"56": [
- 3.4320000000000004
+ 1.5915
],
"57": [
- 5.213
+ 4.1205
],
"58": [
- 2.982
+ 1.3315000000000001
],
"59": [
- 17.828
+ 8.3505
],
"60": [
- 3.588
+ 1.741
],
"61": [
- 4.023
+ 1.7810000000000001
],
"62": [
- 10.6945
+ 3.405
],
"63": [
- 5.3575
+ 1.9024999999999999
],
"64": [
- 39.058
+ 13.6375
],
"65": [
- 13.0225
+ 6.718
],
"66": [
- 5.6965
+ 2.86
],
"67": [
- 58.116
+ 38.2625
],
"68": [
- 2.9850000000000003
+ 5.2355
],
"69": [
- 4.6395
+ 2.375
],
"70": [
- 11.1165
+ 4.8575
],
"71": [
- 2.0905
+ 1.319
],
"72": [
- 14.392
+ 10.3335
],
"73": [
- 1.939
+ 1.205
],
"74": [
- 14.888
+ 10.2715
],
"75": [
- 20.826500000000003
+ 17.2075
],
"76": [
- 19.8905
+ 7.3555
],
"77": [
- 1.883
+ 1.452
],
"78": [
- 33.607
+ 22.3005
],
"79": [
- 4.1065
+ 2.1355
],
"80": [
- 9.498999999999999
+ 5.6899999999999995
],
"81": [
- 5.681
+ 3.8565000000000005
],
"82": [
- 16.321
+ 3.6550000000000002
],
"83": [
- 2.0175
+ 1.012
],
"84": [
- 3.9284999999999997
+ 2.3385
],
"85": [
- 6.912
+ 4.9245
],
"86": [
- 4.362
+ 1.52
],
"87": [
- 13.2085
+ 7.6884999999999994
],
"88": [
- 57.921499999999995
+ 20.369999999999997
],
"89": [
- 4.8945
+ 1.9765
],
"90": [
- 11.091000000000001
+ 3.0755
],
"91": [
- 1.6255000000000002
+ 3.1205
],
"92": [
- 1.565
+ 1.0354999999999999
],
"93": [
- 51.4765
+ 23.848
],
"94": [
- 12.693
+ 5.6185
],
"95": [
- 30.4705
+ 22.183500000000002
],
"96": [
- 12.237
+ 4.0295000000000005
],
"97": [
- 12.588000000000001
+ 5.724
],
"98": [
- 2.2845
+ 1.5735000000000001
],
"99": [
- 11.4715
+ 3.9865000000000004
],
"14a": [
- 49.5975
+ 34.26049999999999
],
"14b": [
- 47.707499999999996
+ 32.920500000000004
],
"23a": [
- 85.717
+ 41.715
],
"23b": [
- 119.332
+ 57.44
],
"24a": [
- 47.7055
+ 26.122
],
"24b": [
- 44.661
+ 26.015500000000003
],
"39a": [
- 5.699
+ 8.402999999999999
],
"39b": [
- 5.2835
+ 7.2705
]
}
\ No newline at end of file
diff --git
a/docs/source/contributor-guide/benchmark-results/comet-0.15.0-tpch-hashjoin.json
b/benchmarks/results/0.16.0/comet-tpch-hashjoin.json
similarity index 67%
rename from
docs/source/contributor-guide/benchmark-results/comet-0.15.0-tpch-hashjoin.json
rename to benchmarks/results/0.16.0/comet-tpch-hashjoin.json
index 003636ad1..86f1dbda3 100644
---
a/docs/source/contributor-guide/benchmark-results/comet-0.15.0-tpch-hashjoin.json
+++ b/benchmarks/results/0.16.0/comet-tpch-hashjoin.json
@@ -15,69 +15,69 @@
"spark_profile": "3.5_2.12"
},
"1": [
- 12.007
+ 12.5465
],
"2": [
- 24.2505
+ 22.102
],
"3": [
- 16.8625
+ 16.917
],
"4": [
- 14.686499999999999
+ 14.571000000000002
],
"5": [
- 32.248999999999995
+ 32.952
],
"6": [
- 0.7415
+ 0.6825
],
"7": [
- 15.712499999999999
+ 15.8275
],
"8": [
- 39.1135
+ 38.9715
],
"9": [
- 47.789
+ 47.0205
],
"10": [
- 28.569
+ 28.0815
],
"11": [
- 14.600999999999999
+ 15.011
],
"12": [
- 7.8575
+ 7.832000000000001
],
"13": [
- 10.9605
+ 10.588000000000001
],
"14": [
- 2.607
+ 2.643
],
"15": [
- 11.962
+ 12.073
],
"16": [
- 15.587499999999999
+ 11.6235
],
"17": [
- 36.6875
+ 36.1635
],
"18": [
- 76.5035
+ 75.21950000000001
],
"19": [
- 9.379
+ 9.403500000000001
],
"20": [
- 8.841000000000001
+ 9.2045
],
"21": [
- 102.9005
+ 92.664
],
"22": [
- 10.042
+ 9.611
]
}
\ No newline at end of file
diff --git
a/docs/source/contributor-guide/benchmark-results/spark-3.5.8-tpch.json
b/benchmarks/results/0.16.0/comet-tpch.json
similarity index 65%
rename from
docs/source/contributor-guide/benchmark-results/spark-3.5.8-tpch.json
rename to benchmarks/results/0.16.0/comet-tpch.json
index 8fc3772d7..c8a94f5d4 100644
--- a/docs/source/contributor-guide/benchmark-results/spark-3.5.8-tpch.json
+++ b/benchmarks/results/0.16.0/comet-tpch.json
@@ -14,69 +14,69 @@
"spark_profile": "3.5_2.12"
},
"1": [
- 101.212
+ 12.224499999999999
],
"2": [
- 38.5525
+ 23.6055
],
"3": [
- 36.48
+ 27.3585
],
"4": [
- 31.700499999999998
+ 14.620999999999999
],
"5": [
- 68.3175
+ 63.3855
],
"6": [
- 1.6925
+ 0.861
],
"7": [
- 28.233
+ 23.738
],
"8": [
- 63.251000000000005
+ 69.8215
],
"9": [
- 86.0805
+ 83.149
],
"10": [
- 40.848
+ 30.7035
],
"11": [
- 33.468999999999994
+ 16.0325
],
"12": [
- 16.343
+ 10.6155
],
"13": [
- 27.7675
+ 12.280000000000001
],
"14": [
- 8.062000000000001
+ 2.9764999999999997
],
"15": [
- 26.322499999999998
+ 12.7395
],
"16": [
- 27.36
+ 12.495000000000001
],
"17": [
- 97.70949999999999
+ 38.936
],
"18": [
- 154.4655
+ 67.9055
],
"19": [
- 14.817
+ 11.100999999999999
],
"20": [
- 18.521
+ 9.4275
],
"21": [
- 134.6705
+ 83.303
],
"22": [
- 17.3075
+ 10.29
]
}
\ No newline at end of file
diff --git
a/docs/source/contributor-guide/benchmark-results/comet-0.15.0-tpcds.json
b/benchmarks/results/0.16.0/spark-tpcds.json
similarity index 54%
rename from
docs/source/contributor-guide/benchmark-results/comet-0.15.0-tpcds.json
rename to benchmarks/results/0.16.0/spark-tpcds.json
index f84bbfce4..2b64e889a 100644
--- a/docs/source/contributor-guide/benchmark-results/comet-0.15.0-tpcds.json
+++ b/benchmarks/results/0.16.0/spark-tpcds.json
@@ -14,312 +14,312 @@
"spark_profile": "3.5_2.12"
},
"1": [
- 7.220000000000001
+ 7.167
],
"2": [
- 5.5635
+ 13.923
],
"3": [
- 3.2104999999999997
+ 4.2364999999999995
],
"4": [
- 48.129999999999995
+ 43.514
],
"5": [
- 6.2745
+ 16.4775
],
"6": [
- 1.9304999999999999
+ 4.067
],
"7": [
- 7.42
+ 8.856
],
"8": [
- 2.644
+ 3.09
],
"9": [
- 9.9525
+ 42.6285
],
"10": [
- 4.0455000000000005
+ 4.7945
],
"11": [
- 20.5745
+ 16.853
],
"12": [
- 1.5710000000000002
+ 2.0755
],
"13": [
- 8.305499999999999
+ 8.653
],
"15": [
- 2.293
+ 2.9574999999999996
],
"16": [
- 6.834
+ 16.53
],
"17": [
- 5.15
+ 5.3765
],
"18": [
- 4.7525
+ 5.1055
],
"19": [
- 1.9245
+ 2.1015
],
"20": [
- 1.529
+ 2.058
],
"21": [
- 1.4005
+ 1.8235000000000001
],
"22": [
- 8.411000000000001
+ 9.503499999999999
],
"25": [
- 4.8605
+ 4.1765
],
"26": [
- 3.6575
+ 5.429
],
"27": [
- 6.09
+ 6.763999999999999
],
"28": [
- 11.378499999999999
+ 44.5585
],
"29": [
- 12.758
+ 13.5135
],
"30": [
- 3.346
+ 3.671
],
"31": [
- 7.4355
+ 6.894
],
"32": [
- 1.2095
+ 1.669
],
"33": [
- 3.035
+ 3.0945
],
"34": [
- 4.402
+ 4.410500000000001
],
"35": [
- 6.5415
+ 6.3294999999999995
],
"36": [
- 5.6705000000000005
+ 5.4935
],
"37": [
- 3.2424999999999997
+ 10.256
],
"38": [
- 11.7975
+ 13.1195
],
"40": [
- 3.5564999999999998
+ 6.274
],
"41": [
- 0.4645
+ 1.272
],
"42": [
- 0.9555
+ 1.3410000000000002
],
"43": [
- 4.8149999999999995
+ 4.9745
],
"44": [
- 7.016
+ 26.9015
],
"45": [
- 2.567
+ 2.5075000000000003
],
"46": [
- 6.1690000000000005
+ 6.579000000000001
],
"47": [
- 7.805999999999999
+ 8.629
],
"48": [
- 6.811999999999999
+ 8.0245
],
"49": [
- 4.8355
+ 10.067499999999999
],
"50": [
- 21.1985
+ 33.194
],
"51": [
- 10.940000000000001
+ 11.3055
],
"52": [
- 1.0964999999999998
+ 1.297
],
"53": [
- 5.8635
+ 5.3425
],
"54": [
- 3.2199999999999998
+ 3.349
],
"55": [
- 1.0135
+ 1.2945
],
"56": [
- 2.948
+ 3.0195
],
"57": [
- 5.361000000000001
+ 5.2780000000000005
],
"58": [
- 2.4905
+ 2.798
],
"59": [
- 7.554
+ 18.6845
],
"60": [
- 3.2135
+ 3.419
],
"61": [
- 2.383
+ 3.394
],
"62": [
- 3.6390000000000002
+ 10.23
],
"63": [
- 4.607
+ 5.6935
],
"64": [
- 23.2
+ 42.333
],
"65": [
- 13.316500000000001
+ 12.4815
],
"66": [
- 5.2635000000000005
+ 4.963
],
"67": [
- 62.164500000000004
+ 62.204499999999996
],
"68": [
- 6.563000000000001
+ 3.158
],
"69": [
- 3.784
+ 4.324
],
"70": [
- 9.7715
+ 11.595500000000001
],
"71": [
- 1.9004999999999999
+ 2.0025
],
"72": [
- 10.899999999999999
+ 14.5035
],
"73": [
- 1.709
+ 2.148
],
"74": [
- 17.8455
+ 14.9735
],
"75": [
- 18.0295
+ 20.9365
],
"76": [
- 7.417
+ 18.448
],
"77": [
- 1.7635
+ 1.707
],
"78": [
- 32.436
+ 33.5505
],
"79": [
- 3.9115
+ 4.267
],
"80": [
- 6.602
+ 9.6695
],
"81": [
- 5.242
+ 4.689
],
"82": [
- 5.0135000000000005
+ 16.3265
],
"83": [
- 1.5425
+ 1.831
],
"84": [
- 1.734
+ 3.6265
],
"85": [
- 5.2865
+ 6.4815000000000005
],
"86": [
- 2.8755
+ 4.405
],
"87": [
- 13.177
+ 13.663
],
"88": [
- 21.369
+ 58.489999999999995
],
"89": [
- 4.142
+ 5.5785
],
"90": [
- 3.5090000000000003
+ 21.299
],
"91": [
- 1.302
+ 2.094
],
"92": [
- 1.104
+ 1.9665
],
"93": [
- 25.0165
+ 46.0995
],
"94": [
- 5.6465
+ 13.7265
],
"95": [
- 22.176499999999997
+ 29.724
],
"96": [
- 4.4205000000000005
+ 12.8995
],
"97": [
- 12.806000000000001
+ 14.062000000000001
],
"98": [
- 2.0945
+ 2.3645
],
"99": [
- 3.9445
+ 12.163
],
"14a": [
- 51.093999999999994
+ 53.1815
],
"14b": [
- 44.180499999999995
+ 44.912
],
"23a": [
- 64.36500000000001
+ 91.458
],
"23b": [
- 76.929
+ 112.96950000000001
],
"24a": [
- 26.692
+ 49.903499999999994
],
"24b": [
- 26.1125
+ 48.6445
],
"39a": [
- 8.219
+ 5.496
],
"39b": [
- 7.592499999999999
+ 5.5565
]
}
\ No newline at end of file
diff --git
a/docs/source/contributor-guide/benchmark-results/comet-0.15.0-tpch.json
b/benchmarks/results/0.16.0/spark-tpch.json
similarity index 66%
rename from
docs/source/contributor-guide/benchmark-results/comet-0.15.0-tpch.json
rename to benchmarks/results/0.16.0/spark-tpch.json
index 2eb1ade06..6c53a0fd2 100644
--- a/docs/source/contributor-guide/benchmark-results/comet-0.15.0-tpch.json
+++ b/benchmarks/results/0.16.0/spark-tpch.json
@@ -14,69 +14,69 @@
"spark_profile": "3.5_2.12"
},
"1": [
- 11.9865
+ 99.91
],
"2": [
- 22.6175
+ 40.828500000000005
],
"3": [
- 26.852
+ 36.4985
],
"4": [
- 14.3615
+ 34.4655
],
"5": [
- 65.00649999999999
+ 68.779
],
"6": [
- 0.6839999999999999
+ 1.759
],
"7": [
- 23.7965
+ 27.957
],
"8": [
- 69.518
+ 62.8085
],
"9": [
- 84.6605
+ 80.4875
],
"10": [
- 29.8585
+ 44.2515
],
"11": [
- 15.7475
+ 33.549499999999995
],
"12": [
- 10.299
+ 16.2305
],
"13": [
- 12.3625
+ 26.3355
],
"14": [
- 2.982
+ 7.8815
],
"15": [
- 12.059999999999999
+ 27.697
],
"16": [
- 16.506
+ 28.218
],
"17": [
- 39.364000000000004
+ 97.5495
],
"18": [
- 69.58349999999999
+ 150.066
],
"19": [
- 11.554
+ 16.0935
],
"20": [
- 9.604500000000002
+ 17.506
],
"21": [
- 85.452
+ 110.69
],
"22": [
- 10.0525
+ 16.811
]
}
\ No newline at end of file
diff --git a/benchmarks/tpc/generate-comparison.py
b/benchmarks/tpc/generate-comparison.py
index e5058a3bf..35b2b9cb4 100644
--- a/benchmarks/tpc/generate-comparison.py
+++ b/benchmarks/tpc/generate-comparison.py
@@ -18,12 +18,16 @@
import argparse
import json
import logging
+import os
import matplotlib.pyplot as plt
import numpy as np
+import re
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
+QUERY_KEY_RE = re.compile(r'^(\d+)([a-z]*)$')
+
def geomean(data):
return np.prod(data) ** (1 / len(data))
@@ -34,19 +38,20 @@ def get_durations(result, query_key):
return value["durations"]
return value
+def query_sort_key(key):
+ """Sort key for query labels like "14", "14a", "14b" so sub-queries sit between 14 and 15."""
+ m = QUERY_KEY_RE.match(str(key))
+ if m:
+ return (int(m.group(1)), m.group(2))
+ return (float('inf'), str(key))
+
def get_all_queries(results):
- """Return the sorted union of all query keys across all result sets."""
+ """Return the sorted union of query keys across all result sets, as strings."""
all_keys = set()
for result in results:
all_keys.update(result.keys())
- # Filter to numeric query keys and sort numerically
- numeric_keys = []
- for k in all_keys:
- try:
- numeric_keys.append(int(k))
- except ValueError:
- pass
- return sorted(numeric_keys)
+ query_keys = [str(k) for k in all_keys if QUERY_KEY_RE.match(str(k))]
+ return sorted(query_keys, key=query_sort_key)
def get_common_queries(results, labels):
"""Return queries present in ALL result sets, warning about queries missing from some files."""
@@ -92,7 +97,7 @@ def check_result_consistency(results, labels, benchmark):
details = ", ".join(f"{label}={h}" for label, h in hashes)
logger.warning(f"Query {query}: result hash mismatch: {details}")
-def generate_query_rel_speedup_chart(baseline, comparison, label1: str,
label2: str, benchmark: str, title: str, common_queries=None):
+def generate_query_rel_speedup_chart(baseline, comparison, label1: str,
label2: str, benchmark: str, title: str, common_queries=None, output_dir: str =
'.'):
if common_queries is None:
common_queries = range(1, query_count(benchmark)+1)
results = []
@@ -137,19 +142,15 @@ def generate_query_rel_speedup_chart(baseline,
comparison, label1: str, label2:
ax.axhline(0, color='black', linewidth=0.8)
min_value = (min(speedups) // 100) * 100
max_value = ((max(speedups) // 100) + 1) * 100 + 50
- if benchmark == "tpch":
- ax.set_ylim(min_value, max_value)
- else:
- # TODO improve this
- ax.set_ylim(-250, 300)
+ ax.set_ylim(min_value, max_value)
# Show grid for better readability
ax.yaxis.grid(True)
# Save the plot as an image file
- plt.savefig(f'{benchmark}_queries_speedup_rel.png', format='png')
+ plt.savefig(os.path.join(output_dir,
f'{benchmark}_queries_speedup_rel.png'), format='png')
-def generate_query_abs_speedup_chart(baseline, comparison, label1: str,
label2: str, benchmark: str, title: str, common_queries=None):
+def generate_query_abs_speedup_chart(baseline, comparison, label1: str,
label2: str, benchmark: str, title: str, common_queries=None, output_dir: str =
'.'):
if common_queries is None:
common_queries = range(1, query_count(benchmark)+1)
results = []
@@ -197,9 +198,9 @@ def generate_query_abs_speedup_chart(baseline, comparison,
label1: str, label2:
ax.yaxis.grid(True)
# Save the plot as an image file
- plt.savefig(f'{benchmark}_queries_speedup_abs.png', format='png')
+ plt.savefig(os.path.join(output_dir,
f'{benchmark}_queries_speedup_abs.png'), format='png')
-def generate_query_comparison_chart(results, labels, benchmark: str, title:
str, common_queries=None):
+def generate_query_comparison_chart(results, labels, benchmark: str, title:
str, common_queries=None, output_dir: str = '.'):
if common_queries is None:
common_queries = range(1, query_count(benchmark)+1)
queries = []
@@ -235,9 +236,9 @@ def generate_query_comparison_chart(results, labels,
benchmark: str, title: str,
ax.legend()
# Save the plot as an image file
- plt.savefig(f'{benchmark}_queries_compare.png', format='png')
+ plt.savefig(os.path.join(output_dir, f'{benchmark}_queries_compare.png'),
format='png')
-def generate_summary(results, labels, benchmark: str, title: str,
common_queries=None):
+def generate_summary(results, labels, benchmark: str, title: str,
common_queries=None, output_dir: str = '.'):
if common_queries is None:
common_queries = range(1, query_count(benchmark)+1)
timings = []
@@ -267,7 +268,7 @@ def generate_summary(results, labels, benchmark: str,
title: str, common_queries
yval = bar.get_height()
ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}',
va='bottom') # va: vertical alignment
- plt.savefig(f'{benchmark}_allqueries.png', format='png')
+ plt.savefig(os.path.join(output_dir, f'{benchmark}_allqueries.png'),
format='png')
def query_count(benchmark: str):
if benchmark == "tpch":
@@ -277,7 +278,7 @@ def query_count(benchmark: str):
else:
raise "invalid benchmark name"
-def main(files, labels, benchmark: str, title: str):
+def main(files, labels, benchmark: str, title: str, output_dir: str = '.'):
results = []
for filename in files:
with open(filename) as f:
@@ -287,11 +288,11 @@ def main(files, labels, benchmark: str, title: str):
if not common_queries:
logger.error("No queries found in common across all result files")
return
- generate_summary(results, labels, benchmark, title, common_queries)
- generate_query_comparison_chart(results, labels, benchmark, title,
common_queries)
+ generate_summary(results, labels, benchmark, title, common_queries,
output_dir=output_dir)
+ generate_query_comparison_chart(results, labels, benchmark, title,
common_queries, output_dir=output_dir)
if len(files) == 2:
- generate_query_abs_speedup_chart(results[0], results[1], labels[0],
labels[1], benchmark, title, common_queries)
- generate_query_rel_speedup_chart(results[0], results[1], labels[0],
labels[1], benchmark, title, common_queries)
+ generate_query_abs_speedup_chart(results[0], results[1], labels[0],
labels[1], benchmark, title, common_queries, output_dir=output_dir)
+ generate_query_rel_speedup_chart(results[0], results[1], labels[0],
labels[1], benchmark, title, common_queries, output_dir=output_dir)
if __name__ == '__main__':
argparse = argparse.ArgumentParser(description='Generate comparison')
@@ -299,5 +300,6 @@ if __name__ == '__main__':
argparse.add_argument('--labels', nargs='+', type=str, help='Labels')
argparse.add_argument('--benchmark', type=str, help='Benchmark name (tpch or tpcds)')
argparse.add_argument('--title', type=str, help='Chart title')
+ argparse.add_argument('--output-dir', type=str, default='.',
help='Directory to write PNGs to (default: cwd)')
args = argparse.parse_args()
- main(args.filenames, args.labels, args.benchmark, args.title)
+ main(args.filenames, args.labels, args.benchmark, args.title,
output_dir=args.output_dir)
diff --git a/benchmarks/tpc/tpcbench.py b/benchmarks/tpc/tpcbench.py
index 036d7b0e9..a654474a6 100644
--- a/benchmarks/tpc/tpcbench.py
+++ b/benchmarks/tpc/tpcbench.py
@@ -56,6 +56,21 @@ def result_hash(rows):
return h.hexdigest()
+def first_keyword(sql):
+ """Return the first non-comment, non-whitespace keyword in lowercase."""
+ for line in sql.splitlines():
+ stripped = line.strip()
+ if not stripped or stripped.startswith("--"):
+ continue
+ return stripped.split(None, 1)[0].lstrip("(").lower()
+ return ""
+
+
+def is_select_statement(sql):
+ """Classify a SQL statement: True if it is a SELECT/WITH query, False if DDL."""
+ return first_keyword(sql) in ("select", "with")
+
+
def main(
benchmark: str,
data_path: str,
@@ -147,44 +162,56 @@ def main(
queries_to_run = range(1, num_queries + 1)
for query in queries_to_run:
- spark.sparkContext.setJobDescription(f"{benchmark} q{query}")
-
path = f"{query_path}/q{query}.sql"
print(f"\nRunning query {query} from {path}")
with open(path, "r") as f:
text = f.read()
- queries = text.split(";")
+
+ statements = [s.strip() for s in text.split(";") if s.strip()]
+ select_indices = [i for i, s in enumerate(statements) if
is_select_statement(s)]
+ multi = len(select_indices) > 1
+
+ for stmt_idx, sql in enumerate(statements):
+ sql = sql.replace("create view", "create temp view")
+ is_query = stmt_idx in select_indices
+ if is_query:
+ if multi:
+ suffix = chr(ord("a") + select_indices.index(stmt_idx))
+ query_label = f"{query}{suffix}"
+ else:
+ query_label = str(query)
+ spark.sparkContext.setJobDescription(f"{benchmark} q{query_label}")
+ print(f"Executing query {query_label}: {sql[:100]}...")
+ else:
+ print(f"Executing DDL (not timed): {sql[:100]}...")
start_time = time.time()
- for sql in queries:
- sql = sql.strip().replace("create view", "create temp view")
- if len(sql) > 0:
- print(f"Executing: {sql[:100]}...")
- df = spark.sql(sql)
- df.explain("formatted")
-
- if write_path is not None:
- if len(df.columns) > 0:
- output_path = f"{write_path}/q{query}"
- deduped = dedup_columns(df)
- deduped.orderBy(*deduped.columns).coalesce(1).write.mode("overwrite").parquet(output_path)
- print(f"Results written to {output_path}")
- else:
- rows = df.collect()
- row_count = len(rows)
- row_hash = result_hash(rows)
- print(f"Query {query} returned {row_count} rows, hash={row_hash}")
-
- end_time = time.time()
- elapsed = end_time - start_time
- print(f"Query {query} took {elapsed:.2f} seconds")
-
- query_result = results.setdefault(query, {"durations": []})
- query_result["durations"].append(round(elapsed, 3))
- if "row_count" not in query_result and not write_path:
- query_result["row_count"] = row_count
- query_result["result_hash"] = row_hash
+ df = spark.sql(sql)
+ df.explain("formatted")
+
+ if is_query and write_path is not None:
+ if len(df.columns) > 0:
+ output_path = f"{write_path}/q{query_label}"
+ deduped = dedup_columns(df)
+ deduped.orderBy(*deduped.columns).coalesce(1).write.mode("overwrite").parquet(output_path)
+ print(f"Results written to {output_path}")
+ elapsed = time.time() - start_time
+ print(f"Query {query_label} took {elapsed:.2f} seconds")
+ query_result = results.setdefault(query_label,
{"durations": []})
+ query_result["durations"].append(round(elapsed, 3))
+ else:
+ rows = df.collect()
+ elapsed = time.time() - start_time
+ if is_query:
+ row_count = len(rows)
+ row_hash = result_hash(rows)
+ print(f"Query {query_label} took {elapsed:.2f} seconds, returned {row_count} rows, hash={row_hash}")
+ query_result = results.setdefault(query_label,
{"durations": []})
+ query_result["durations"].append(round(elapsed, 3))
+ if "row_count" not in query_result:
+ query_result["row_count"] = row_count
+ query_result["result_hash"] = row_hash
iter_end_time = time.time()
print(f"\nIteration {iteration + 1} took {iter_end_time - iter_start_time:.2f} seconds")
diff --git
a/docs/source/_static/images/benchmark-results/0.16.0/tpcds_allqueries.png
b/docs/source/_static/images/benchmark-results/0.16.0/tpcds_allqueries.png
new file mode 100644
index 000000000..211309a76
Binary files /dev/null and
b/docs/source/_static/images/benchmark-results/0.16.0/tpcds_allqueries.png
differ
diff --git
a/docs/source/_static/images/benchmark-results/0.16.0/tpcds_queries_compare.png
b/docs/source/_static/images/benchmark-results/0.16.0/tpcds_queries_compare.png
new file mode 100644
index 000000000..22820b81e
Binary files /dev/null and
b/docs/source/_static/images/benchmark-results/0.16.0/tpcds_queries_compare.png
differ
diff --git
a/docs/source/_static/images/benchmark-results/0.16.0/tpcds_queries_speedup_abs.png
b/docs/source/_static/images/benchmark-results/0.16.0/tpcds_queries_speedup_abs.png
new file mode 100644
index 000000000..6e49ae4a2
Binary files /dev/null and
b/docs/source/_static/images/benchmark-results/0.16.0/tpcds_queries_speedup_abs.png
differ
diff --git
a/docs/source/_static/images/benchmark-results/0.16.0/tpcds_queries_speedup_rel.png
b/docs/source/_static/images/benchmark-results/0.16.0/tpcds_queries_speedup_rel.png
new file mode 100644
index 000000000..4f25fa863
Binary files /dev/null and
b/docs/source/_static/images/benchmark-results/0.16.0/tpcds_queries_speedup_rel.png
differ
diff --git
a/docs/source/_static/images/benchmark-results/0.16.0/tpch_allqueries.png
b/docs/source/_static/images/benchmark-results/0.16.0/tpch_allqueries.png
new file mode 100644
index 000000000..ff4d79b6a
Binary files /dev/null and
b/docs/source/_static/images/benchmark-results/0.16.0/tpch_allqueries.png differ
diff --git
a/docs/source/_static/images/benchmark-results/0.16.0/tpch_queries_compare.png
b/docs/source/_static/images/benchmark-results/0.16.0/tpch_queries_compare.png
new file mode 100644
index 000000000..e86d43ab3
Binary files /dev/null and
b/docs/source/_static/images/benchmark-results/0.16.0/tpch_queries_compare.png
differ
diff --git
a/docs/source/_static/images/benchmark-results/0.16.0/tpch_queries_speedup_abs.png
b/docs/source/_static/images/benchmark-results/0.16.0/tpch_queries_speedup_abs.png
new file mode 100644
index 000000000..016e7226e
Binary files /dev/null and
b/docs/source/_static/images/benchmark-results/0.16.0/tpch_queries_speedup_abs.png
differ
diff --git
a/docs/source/_static/images/benchmark-results/0.16.0/tpch_queries_speedup_rel.png
b/docs/source/_static/images/benchmark-results/0.16.0/tpch_queries_speedup_rel.png
new file mode 100644
index 000000000..bf4ba2f3b
Binary files /dev/null and
b/docs/source/_static/images/benchmark-results/0.16.0/tpch_queries_speedup_rel.png
differ
diff --git a/docs/source/contributor-guide/benchmark-results/tpc-ds.md
b/docs/source/contributor-guide/benchmark-results/tpc-ds.md
index 23e54612d..7a988a508 100644
--- a/docs/source/contributor-guide/benchmark-results/tpc-ds.md
+++ b/docs/source/contributor-guide/benchmark-results/tpc-ds.md
@@ -21,7 +21,23 @@ under the License.
The following benchmarks were performed on an EKS cluster (`r6i.24xlarge`
instances with EBS storage) with data stored in S3.
-The tracking issue for improving TPC-DS performance is
[#858](https://github.com/apache/datafusion-comet/issues/858).
+## Benchmark Results
+
+Total time to run all queries (lower is better).
+
+
+
+Per-query breakdown showing the relative performance of Spark and Comet.
+
+
+
+How much Comet accelerates each query in relative terms.
+
+
+
+How much Comet accelerates each query in absolute terms.
+
+
## Configuration
@@ -53,19 +69,3 @@ spark.memory.offHeap.enabled=true
spark.memory.offHeap.size=32G
spark.comet.memoryPool.fraction=0.8
```
-
-## Benchmark Results
-
-
-
-Here is a breakdown showing relative performance of Spark and Comet for each
query.
-
-
-
-The following chart shows how much Comet currently accelerates each query from
the benchmark in relative terms.
-
-
-
-The following chart shows how much Comet currently accelerates each query from
the benchmark in absolute terms.
-
-
diff --git a/docs/source/contributor-guide/benchmark-results/tpc-h.md
b/docs/source/contributor-guide/benchmark-results/tpc-h.md
index 09ef122e4..5a3ca2141 100644
--- a/docs/source/contributor-guide/benchmark-results/tpc-h.md
+++ b/docs/source/contributor-guide/benchmark-results/tpc-h.md
@@ -21,6 +21,26 @@ under the License.
The following benchmarks were performed on an EKS cluster (`r6i.24xlarge`
instances with EBS storage) with data stored in S3.
+## Benchmark Results
+
+Total time to run all queries (lower is better).
+
+
+
+The following charts are based on the tuned run using hash join.
+
+Per-query breakdown showing the relative performance of Spark and Comet.
+
+
+
+How much Comet accelerates each query in relative terms.
+
+
+
+How much Comet accelerates each query in absolute terms.
+
+
+
## Configuration
Common:
@@ -51,35 +71,9 @@ spark.memory.offHeap.enabled=true
spark.memory.offHeap.size=32G
```
-Comet (Tuned):
+### Comet (Tuned)
```properties
-spark.executor.memory=32G
-spark.executor.memoryOverhead=10G
-spark.memory.offHeap.enabled=true
-spark.memory.offHeap.size=32G
spark.comet.exec.replaceSortMergeJoin=true
spark.comet.memoryPool.fraction=0.8
```
-
-## Benchmark Results
-
-The following chart shows benchmark results comparing Spark to Comet, both
with Comet's default settings, and with Hash Join enabled in Comet.
-
-Comet's Hash Join does not support spilling yet, so it isn't suitable for all
workloads.
-
-
-
-## Comet (with Hash Join enabled)
-
-Here is a breakdown showing relative performance of Spark and Comet for each
query.
-
-
-
-The following chart shows how much Comet currently accelerates each query from
the benchmark in relative terms.
-
-
-
-The following chart shows how much Comet currently accelerates each query from
the benchmark in absolute terms.
-
-
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]