This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/asf-site by this push:
new dea407c9 Publish built docs triggered by
1f81c3812490d35b488795ad597e4e6d3f9e114d
dea407c9 is described below
commit dea407c9d8cd0b0179a578bb23514fa35a8c3ff7
Author: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri May 31 16:36:44 2024 +0000
Publish built docs triggered by 1f81c3812490d35b488795ad597e4e6d3f9e114d
---
.../spark-8-exec-5-runs.json | 184 +++++++++++++++++++
.../datafusion-python-8-cores.json | 73 ++++++++
.../comet-8-exec-5-runs.json | 201 +++++++++++++++++++++
_sources/contributor-guide/benchmarking.md.txt | 113 ++++++++----
_static/images/tpch_allqueries.png | Bin 0 -> 23677 bytes
_static/images/tpch_queries_compare.png | Bin 0 -> 27388 bytes
_static/images/tpch_queries_speedup.png | Bin 0 -> 38821 bytes
contributor-guide/benchmarking.html | 130 +++++++++----
searchindex.js | 2 +-
9 files changed, 631 insertions(+), 72 deletions(-)
diff --git
a/_downloads/070a7cb1baf1f521ddb4801111e33a5b/spark-8-exec-5-runs.json
b/_downloads/070a7cb1baf1f521ddb4801111e33a5b/spark-8-exec-5-runs.json
new file mode 100644
index 00000000..012b05c3
--- /dev/null
+++ b/_downloads/070a7cb1baf1f521ddb4801111e33a5b/spark-8-exec-5-runs.json
@@ -0,0 +1,184 @@
+{
+ "engine": "datafusion-comet",
+ "benchmark": "tpch",
+ "data_path": "/mnt/bigdata/tpch/sf100/",
+ "query_path": "../../tpch/queries",
+ "spark_conf": {
+ "spark.driver.extraJavaOptions": "-Djava.net.preferIPv6Addresses=false
-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED
--add-opens=java.base/java.io=ALL-UNNAMED
--add-opens=java.base/java.net=ALL-UNNAMED
--add-opens=java.base/java.nio=ALL-UNNAMED
--add-opens=java.base/java.util=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add- [...]
+ "spark.sql.warehouse.dir":
"file:/home/andy/git/apache/datafusion-benchmarks/runners/datafusion-comet/spark-warehouse",
+ "spark.app.id": "app-20240528090804-0041",
+ "spark.app.submitTime": "1716908883258",
+ "spark.executor.memory": "8G",
+ "spark.master": "spark://woody:7077",
+ "spark.executor.id": "driver",
+ "spark.executor.instances": "8",
+ "spark.app.name": "DataFusion Comet Benchmark derived from TPC-H /
TPC-DS",
+ "spark.driver.memory": "8G",
+ "spark.rdd.compress": "True",
+ "spark.executor.extraJavaOptions":
"-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions
--add-opens=java.base/java.lang=ALL-UNNAMED
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED
--add-opens=java.base/java.io=ALL-UNNAMED
--add-opens=java.base/java.net=ALL-UNNAMED
--add-opens=java.base/java.nio=ALL-UNNAMED
--add-opens=java.base/java.util=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED --ad [...]
+ "spark.serializer.objectStreamReset": "100",
+ "spark.cores.max": "8",
+ "spark.submit.pyFiles": "",
+ "spark.executor.cores": "1",
+ "spark.submit.deployMode": "client",
+ "spark.sql.autoBroadcastJoinThreshold": "-1",
+ "spark.eventLog.enabled": "false",
+ "spark.app.startTime": "1716908883579",
+ "spark.driver.port": "33725",
+ "spark.driver.host": "woody.lan"
+ },
+ "1": [
+ 76.91316103935242,
+ 79.55859923362732,
+ 81.10397529602051,
+ 79.01998662948608,
+ 79.1286551952362
+ ],
+ "2": [
+ 23.977370262145996,
+ 22.214473247528076,
+ 22.686659812927246,
+ 22.016682386398315,
+ 21.766324520111084
+ ],
+ "3": [
+ 22.700742721557617,
+ 21.980144739151,
+ 21.876065969467163,
+ 21.661516189575195,
+ 21.69345998764038
+ ],
+ "4": [
+ 17.377647638320923,
+ 16.249598264694214,
+ 16.15747308731079,
+ 16.128843069076538,
+ 16.04338026046753
+ ],
+ "5": [
+ 44.38863182067871,
+ 45.47764492034912,
+ 45.76063895225525,
+ 45.16393995285034,
+ 60.848369121551514
+ ],
+ "6": [
+ 3.2041075229644775,
+ 2.970944881439209,
+ 2.891291856765747,
+ 2.9719409942626953,
+ 3.0702600479125977
+ ],
+ "7": [
+ 24.369274377822876,
+ 24.684266567230225,
+ 24.146574020385742,
+ 24.023175716400146,
+ 30.56047773361206
+ ],
+ "8": [
+ 46.46081209182739,
+ 45.9838604927063,
+ 46.341185092926025,
+ 45.833823919296265,
+ 46.61182403564453
+ ],
+ "9": [
+ 67.67960548400879,
+ 67.34667444229126,
+ 70.34601259231567,
+ 71.24095153808594,
+ 84.38811421394348
+ ],
+ "10": [
+ 19.16477870941162,
+ 19.081010580062866,
+ 19.501060009002686,
+ 19.165698528289795,
+ 20.216782331466675
+ ],
+ "11": [
+ 17.158706426620483,
+ 17.05184030532837,
+ 17.714542150497437,
+ 17.004602909088135,
+ 17.700096130371094
+ ],
+ "12": [
+ 11.654477834701538,
+ 11.805298805236816,
+ 11.822469234466553,
+ 12.79678750038147,
+ 13.64478850364685
+ ],
+ "13": [
+ 20.430822372436523,
+ 20.18759250640869,
+ 21.26596975326538,
+ 21.234288454055786,
+ 20.189200162887573
+ ],
+ "14": [
+ 5.60215950012207,
+ 5.160705089569092,
+ 5.080057382583618,
+ 4.937625408172607,
+ 5.853632688522339
+ ],
+ "15": [
+ 14.17775845527649,
+ 13.898571729660034,
+ 14.215840578079224,
+ 14.316090106964111,
+ 14.356236457824707
+ ],
+ "16": [
+ 6.252386808395386,
+ 6.010213375091553,
+ 6.054978370666504,
+ 5.886059522628784,
+ 5.923115253448486
+ ],
+ "17": [
+ 71.41593313217163,
+ 70.25399804115295,
+ 72.07622528076172,
+ 72.27566242218018,
+ 72.20579051971436
+ ],
+ "18": [
+ 65.72738265991211,
+ 65.47461080551147,
+ 67.14260482788086,
+ 65.95489883422852,
+ 69.51795554161072
+ ],
+ "19": [
+ 7.1520891189575195,
+ 6.516514301300049,
+ 6.580992698669434,
+ 6.486274242401123,
+ 6.418147087097168
+ ],
+ "20": [
+ 12.619760036468506,
+ 12.235978126525879,
+ 12.116347551345825,
+ 12.161245584487915,
+ 12.30910348892212
+ ],
+ "21": [
+ 60.795483350753784,
+ 60.484593629837036,
+ 61.27316427230835,
+ 60.475560426712036,
+ 81.21473670005798
+ ],
+ "22": [
+ 8.926804065704346,
+ 8.113754034042358,
+ 8.029133796691895,
+ 7.99291467666626,
+ 8.439452648162842
+ ]
+}
\ No newline at end of file
diff --git
a/_downloads/3713390967a196004589150584f165d3/datafusion-python-8-cores.json
b/_downloads/3713390967a196004589150584f165d3/datafusion-python-8-cores.json
new file mode 100644
index 00000000..f032d536
--- /dev/null
+++ b/_downloads/3713390967a196004589150584f165d3/datafusion-python-8-cores.json
@@ -0,0 +1,73 @@
+{
+ "engine": "datafusion-python",
+ "datafusion-version": "38.0.1",
+ "benchmark": "tpch",
+ "data_path": "/mnt/bigdata/tpch/sf100/",
+ "query_path": "../../tpch/queries/",
+ "1": [
+ 7.410699844360352
+ ],
+ "2": [
+ 2.966364622116089
+ ],
+ "3": [
+ 3.988652467727661
+ ],
+ "4": [
+ 1.8821499347686768
+ ],
+ "5": [
+ 6.957948684692383
+ ],
+ "6": [
+ 1.779731273651123
+ ],
+ "7": [
+ 14.559604167938232
+ ],
+ "8": [
+ 7.062309265136719
+ ],
+ "9": [
+ 14.908353805541992
+ ],
+ "10": [
+ 7.73533296585083
+ ],
+ "11": [
+ 2.346423387527466
+ ],
+ "12": [
+ 2.7248904705047607
+ ],
+ "13": [
+ 6.38663387298584
+ ],
+ "14": [
+ 2.4675676822662354
+ ],
+ "15": [
+ 4.799000024795532
+ ],
+ "16": [
+ 1.9091999530792236
+ ],
+ "17": [
+ 19.230653762817383
+ ],
+ "18": [
+ 25.15683078765869
+ ],
+ "19": [
+ 4.2268781661987305
+ ],
+ "20": [
+ 8.66620659828186
+ ],
+ "21": [
+ 17.696006059646606
+ ],
+ "22": [
+ 1.3805692195892334
+ ]
+}
\ No newline at end of file
diff --git
a/_downloads/83125e9b621c9cfd1edf3b330eb34508/comet-8-exec-5-runs.json
b/_downloads/83125e9b621c9cfd1edf3b330eb34508/comet-8-exec-5-runs.json
new file mode 100644
index 00000000..38142151
--- /dev/null
+++ b/_downloads/83125e9b621c9cfd1edf3b330eb34508/comet-8-exec-5-runs.json
@@ -0,0 +1,201 @@
+{
+ "engine": "datafusion-comet",
+ "benchmark": "tpch",
+ "data_path": "/mnt/bigdata/tpch/sf100/",
+ "query_path": "../../tpch/queries",
+ "spark_conf": {
+ "spark.comet.explainFallback.enabled": "true",
+ "spark.jars":
"file:///home/andy/git/apache/datafusion-comet/spark/target/comet-spark-spark3.4_2.12-0.1.0-SNAPSHOT.jar",
+ "spark.comet.cast.allowIncompatible": "true",
+ "spark.executor.extraClassPath":
"/home/andy/git/apache/datafusion-comet/spark/target/comet-spark-spark3.4_2.12-0.1.0-SNAPSHOT.jar",
+ "spark.executor.memory": "8G",
+ "spark.comet.exec.shuffle.enabled": "true",
+ "spark.app.name": "DataFusion Comet Benchmark derived from TPC-H /
TPC-DS",
+ "spark.driver.port": "36573",
+ "spark.sql.adaptive.coalescePartitions.enabled": "false",
+ "spark.app.startTime": "1716923498046",
+ "spark.comet.batchSize": "8192",
+ "spark.app.id": "app-20240528131138-0043",
+ "spark.serializer.objectStreamReset": "100",
+ "spark.app.initial.jar.urls":
"spark://woody.lan:36573/jars/comet-spark-spark3.4_2.12-0.1.0-SNAPSHOT.jar",
+ "spark.submit.deployMode": "client",
+ "spark.sql.autoBroadcastJoinThreshold": "-1",
+ "spark.comet.exec.all.enabled": "true",
+ "spark.eventLog.enabled": "false",
+ "spark.driver.host": "woody.lan",
+ "spark.driver.extraJavaOptions": "-Djava.net.preferIPv6Addresses=false
-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED
--add-opens=java.base/java.io=ALL-UNNAMED
--add-opens=java.base/java.net=ALL-UNNAMED
--add-opens=java.base/java.nio=ALL-UNNAMED
--add-opens=java.base/java.util=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add- [...]
+ "spark.sql.warehouse.dir":
"file:/home/andy/git/apache/datafusion-benchmarks/runners/datafusion-comet/spark-warehouse",
+ "spark.shuffle.manager":
"org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager",
+ "spark.comet.exec.enabled": "true",
+ "spark.repl.local.jars":
"file:///home/andy/git/apache/datafusion-comet/spark/target/comet-spark-spark3.4_2.12-0.1.0-SNAPSHOT.jar",
+ "spark.executor.id": "driver",
+ "spark.master": "spark://woody:7077",
+ "spark.executor.instances": "8",
+ "spark.comet.exec.shuffle.mode": "auto",
+ "spark.sql.extensions": "org.apache.comet.CometSparkSessionExtensions",
+ "spark.driver.memory": "8G",
+ "spark.driver.extraClassPath":
"/home/andy/git/apache/datafusion-comet/spark/target/comet-spark-spark3.4_2.12-0.1.0-SNAPSHOT.jar",
+ "spark.rdd.compress": "True",
+ "spark.executor.extraJavaOptions":
"-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions
--add-opens=java.base/java.lang=ALL-UNNAMED
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED
--add-opens=java.base/java.io=ALL-UNNAMED
--add-opens=java.base/java.net=ALL-UNNAMED
--add-opens=java.base/java.nio=ALL-UNNAMED
--add-opens=java.base/java.util=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED --ad [...]
+ "spark.cores.max": "8",
+ "spark.comet.enabled": "true",
+ "spark.app.submitTime": "1716923497738",
+ "spark.submit.pyFiles": "",
+ "spark.executor.cores": "1",
+ "spark.comet.parquet.io.enabled": "false"
+ },
+ "1": [
+ 32.121661901474,
+ 27.997092485427856,
+ 27.756758451461792,
+ 28.55236315727234,
+ 28.332542181015015
+ ],
+ "2": [
+ 18.269107580184937,
+ 16.200955629348755,
+ 16.194639682769775,
+ 16.745808839797974,
+ 16.59864115715027
+ ],
+ "3": [
+ 17.265466690063477,
+ 17.069786310195923,
+ 17.12887978553772,
+ 19.33678102493286,
+ 18.182055234909058
+ ],
+ "4": [
+ 8.367004156112671,
+ 8.172023296356201,
+ 8.023266077041626,
+ 8.350765228271484,
+ 8.258736610412598
+ ],
+ "5": [
+ 34.10048794746399,
+ 32.69314408302307,
+ 33.21383595466614,
+ 36.391114473342896,
+ 39.00048065185547
+ ],
+ "6": [
+ 3.1693499088287354,
+ 3.044705390930176,
+ 3.047694206237793,
+ 3.2817511558532715,
+ 3.274174928665161
+ ],
+ "7": [
+ 25.369214296340942,
+ 24.020941257476807,
+ 24.0787034034729,
+ 28.47402787208557,
+ 28.23443365097046
+ ],
+ "8": [
+ 40.06126809120178,
+ 39.828824281692505,
+ 45.250510454177856,
+ 44.406742572784424,
+ 48.98451232910156
+ ],
+ "9": [
+ 62.822797775268555,
+ 61.26328158378601,
+ 64.95581865310669,
+ 69.51708793640137,
+ 73.52380013465881
+ ],
+ "10": [
+ 20.55334782600403,
+ 20.546096324920654,
+ 20.57452392578125,
+ 22.84211039543152,
+ 23.724371671676636
+ ],
+ "11": [
+ 11.068235158920288,
+ 10.715423822402954,
+ 11.353424310684204,
+ 11.37632942199707,
+ 11.530814170837402
+ ],
+ "12": [
+ 10.264788389205933,
+ 8.67864990234375,
+ 8.845952033996582,
+ 8.593009233474731,
+ 8.540803909301758
+ ],
+ "13": [
+ 9.603406190872192,
+ 9.648627042770386,
+ 13.040799140930176,
+ 10.154011249542236,
+ 9.716034412384033
+ ],
+ "14": [
+ 6.20926308631897,
+ 6.0385496616363525,
+ 7.674488544464111,
+ 10.53052043914795,
+ 7.661675691604614
+ ],
+ "15": [
+ 11.466301918029785,
+ 11.473632097244263,
+ 11.279382228851318,
+ 13.291078329086304,
+ 12.81026816368103
+ ],
+ "16": [
+ 8.096073865890503,
+ 7.73410701751709,
+ 7.742897272109985,
+ 8.477537631988525,
+ 7.821273326873779
+ ],
+ "17": [
+ 43.69264578819275,
+ 43.33040428161621,
+ 46.291987657547,
+ 54.654345989227295,
+ 54.37124800682068
+ ],
+ "18": [
+ 27.205485105514526,
+ 26.785916090011597,
+ 27.331408262252808,
+ 29.946768760681152,
+ 28.037617444992065
+ ],
+ "19": [
+ 8.100102186203003,
+ 7.845783472061157,
+ 8.52329158782959,
+ 8.907397985458374,
+ 9.13755488395691
+ ],
+ "20": [
+ 13.09695029258728,
+ 12.683861255645752,
+ 15.612725019454956,
+ 13.361177206039429,
+ 16.614356517791748
+ ],
+ "21": [
+ 43.69623780250549,
+ 43.26758122444153,
+ 46.91650056838989,
+ 47.875754833221436,
+ 57.9763662815094
+ ],
+ "22": [
+ 4.5090577602386475,
+ 4.420571804046631,
+ 4.639787673950195,
+ 5.118046998977661,
+ 5.017346143722534
+ ]
+}
\ No newline at end of file
diff --git a/_sources/contributor-guide/benchmarking.md.txt
b/_sources/contributor-guide/benchmarking.md.txt
index 502b35c2..3e9a61ef 100644
--- a/_sources/contributor-guide/benchmarking.md.txt
+++ b/_sources/contributor-guide/benchmarking.md.txt
@@ -19,44 +19,87 @@ under the License.
# Comet Benchmarking Guide
-To track progress on performance, we regularly run benchmarks derived from
TPC-H and TPC-DS. Benchmarking scripts are
-available in the [DataFusion
Benchmarks](https://github.com/apache/datafusion-benchmarks) GitHub repository.
+To track progress on performance, we regularly run benchmarks derived from
TPC-H and TPC-DS. Data generation and
+benchmarking documentation and scripts are available in the [DataFusion
Benchmarks](https://github.com/apache/datafusion-benchmarks) GitHub repository.
-Here is an example command for running the benchmarks. This command will need
to be adapted based on the Spark
-environment and location of data files.
+Here are example commands for running the benchmarks against a Spark cluster.
These commands will need to be
+adapted based on the Spark environment and location of data files.
-This command assumes that `datafusion-benchmarks` is checked out in a parallel
directory to `datafusion-comet`.
+These commands are intended to be run from the `runners/datafusion-comet`
directory in the `datafusion-benchmarks`
+repository.
+
+## Running Benchmarks Against Apache Spark
```shell
-$SPARK_HOME/bin/spark-submit \
- --master "local[*]" \
- --conf spark.driver.memory=8G \
- --conf spark.executor.memory=64G \
- --conf spark.executor.cores=16 \
- --conf spark.cores.max=16 \
- --conf spark.eventLog.enabled=true \
- --conf spark.sql.autoBroadcastJoinThreshold=-1 \
- --jars $COMET_JAR \
- --conf spark.driver.extraClassPath=$COMET_JAR \
- --conf spark.executor.extraClassPath=$COMET_JAR \
- --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
- --conf spark.comet.enabled=true \
- --conf spark.comet.exec.enabled=true \
- --conf spark.comet.exec.all.enabled=true \
- --conf spark.comet.cast.allowIncompatible=true \
- --conf spark.comet.explainFallback.enabled=true \
- --conf spark.comet.parquet.io.enabled=false \
- --conf spark.comet.batchSize=8192 \
- --conf spark.comet.columnar.shuffle.enabled=false \
- --conf spark.comet.exec.shuffle.enabled=true \
- --conf
spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager
\
- --conf spark.sql.adaptive.coalescePartitions.enabled=false \
- --conf spark.comet.shuffle.enforceMode.enabled=true \
- ../datafusion-benchmarks/runners/datafusion-comet/tpcbench.py \
- --benchmark tpch \
- --data /mnt/bigdata/tpch/sf100-parquet/ \
- --queries ../datafusion-benchmarks/tpch/queries
+$SPARK_HOME/bin/spark-submit \
+ --master $SPARK_MASTER \
+ --conf spark.driver.memory=8G \
+ --conf spark.executor.memory=32G \
+ --conf spark.executor.cores=8 \
+ --conf spark.cores.max=8 \
+ --conf spark.sql.autoBroadcastJoinThreshold=-1 \
+ tpcbench.py \
+ --benchmark tpch \
+ --data /mnt/bigdata/tpch/sf100/ \
+ --queries ../../tpch/queries \
+ --iterations 5
```
-Comet performance can be compared to regular Spark performance by running the
benchmark twice, once with
-`spark.comet.enabled` set to `true` and once with it set to `false`.
\ No newline at end of file
+## Running Benchmarks Against Apache Spark with Apache DataFusion Comet Enabled
+
+```shell
+$SPARK_HOME/bin/spark-submit \
+ --master $SPARK_MASTER \
+ --conf spark.driver.memory=8G \
+ --conf spark.executor.memory=64G \
+ --conf spark.executor.cores=8 \
+ --conf spark.cores.max=8 \
+ --conf spark.sql.autoBroadcastJoinThreshold=-1 \
+ --jars $COMET_JAR \
+ --conf spark.driver.extraClassPath=$COMET_JAR \
+ --conf spark.executor.extraClassPath=$COMET_JAR \
+ --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
+ --conf spark.comet.enabled=true \
+ --conf spark.comet.exec.enabled=true \
+ --conf spark.comet.exec.all.enabled=true \
+ --conf spark.comet.cast.allowIncompatible=true \
+ --conf spark.comet.explainFallback.enabled=true \
+ --conf spark.comet.parquet.io.enabled=false \
+ --conf spark.comet.batchSize=8192 \
+ --conf spark.comet.exec.shuffle.enabled=true \
+ --conf spark.comet.exec.shuffle.mode=auto \
+ --conf
spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager
\
+ --conf spark.sql.adaptive.coalescePartitions.enabled=false \
+ tpcbench.py \
+ --benchmark tpch \
+ --data /mnt/bigdata/tpch/sf100/ \
+ --queries ../../tpch/queries \
+ --iterations 5
+```
+
+## Current Performance
+
+Comet is not yet achieving full DataFusion speeds in all cases, but with
future work we aim to provide a 2x-4x speedup
+for many use cases.
+
+The following benchmarks were performed on a Linux workstation with PCIe 5,
AMD 7950X CPU (16 cores), 128 GB RAM, and
+data stored locally on NVMe storage. Performance characteristics will vary in
different environments and we encourage
+you to run these benchmarks in your own environments.
+
+
+
+Here is a breakdown showing relative performance of Spark, Comet, and
DataFusion for each TPC-H query.
+
+
+
+The following chart shows how much Comet currently accelerates each query from
the benchmark. Performance optimization
+is an ongoing task, and we welcome contributions from the community to help
achieve even greater speedups in the future.
+
+
+
+The raw results of these benchmarks in JSON format are available here:
+
+- [Spark](./benchmark-results/2024-05-30/spark-8-exec-5-runs.json)
+- [Comet](./benchmark-results/2024-05-30/comet-8-exec-5-runs.json)
+- [DataFusion](./benchmark-results/2024-05-30/datafusion-python-8-cores.json)
+
diff --git a/_static/images/tpch_allqueries.png
b/_static/images/tpch_allqueries.png
new file mode 100644
index 00000000..a6788d5a
Binary files /dev/null and b/_static/images/tpch_allqueries.png differ
diff --git a/_static/images/tpch_queries_compare.png
b/_static/images/tpch_queries_compare.png
new file mode 100644
index 00000000..92768061
Binary files /dev/null and b/_static/images/tpch_queries_compare.png differ
diff --git a/_static/images/tpch_queries_speedup.png
b/_static/images/tpch_queries_speedup.png
new file mode 100644
index 00000000..fb417ff1
Binary files /dev/null and b/_static/images/tpch_queries_speedup.png differ
diff --git a/contributor-guide/benchmarking.html
b/contributor-guide/benchmarking.html
index 1d8ce7de..f4fcd2de 100644
--- a/contributor-guide/benchmarking.html
+++ b/contributor-guide/benchmarking.html
@@ -255,9 +255,29 @@ under the License.
<div class="toc-item">
+<div class="tocsection onthispage pt-5 pb-3">
+ <i class="fas fa-list"></i> On this page
+</div>
<nav id="bd-toc-nav">
-
+ <ul class="visible nav section-nav flex-column">
+ <li class="toc-h2 nav-item toc-entry">
+ <a class="reference internal nav-link"
href="#running-benchmarks-against-apache-spark">
+ Running Benchmarks Against Apache Spark
+ </a>
+ </li>
+ <li class="toc-h2 nav-item toc-entry">
+ <a class="reference internal nav-link"
href="#running-benchmarks-against-apache-spark-with-apache-datafusion-comet-enabled">
+ Running Benchmarks Against Apache Spark with Apache DataFusion Comet Enabled
+ </a>
+ </li>
+ <li class="toc-h2 nav-item toc-entry">
+ <a class="reference internal nav-link" href="#current-performance">
+ Current Performance
+ </a>
+ </li>
+</ul>
+
</nav>
</div>
@@ -304,43 +324,81 @@ under the License.
-->
<section id="comet-benchmarking-guide">
<h1>Comet Benchmarking Guide<a class="headerlink"
href="#comet-benchmarking-guide" title="Link to this heading">¶</a></h1>
-<p>To track progress on performance, we regularly run benchmarks derived from
TPC-H and TPC-DS. Benchmarking scripts are
-available in the <a class="reference external"
href="https://github.com/apache/datafusion-benchmarks">DataFusion
Benchmarks</a> GitHub repository.</p>
-<p>Here is an example command for running the benchmarks. This command will
need to be adapted based on the Spark
-environment and location of data files.</p>
-<p>This command assumes that <code class="docutils literal notranslate"><span
class="pre">datafusion-benchmarks</span></code> is checked out in a parallel
directory to <code class="docutils literal notranslate"><span
class="pre">datafusion-comet</span></code>.</p>
-<div class="highlight-shell notranslate"><div
class="highlight"><pre><span></span><span
class="nv">$SPARK_HOME</span>/bin/spark-submit<span class="w"> </span><span
class="se">\ </span>
-<span class="w"> </span>--master<span class="w"> </span><span
class="s2">"local[*]"</span><span class="w"> </span><span
class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.driver.memory<span class="o">=</span>8G<span class="w">
</span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.executor.memory<span class="o">=</span>64G<span class="w">
</span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.executor.cores<span class="o">=</span><span
class="m">16</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w"> </span>spark.cores.max<span
class="o">=</span><span class="m">16</span><span class="w"> </span><span
class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.eventLog.enabled<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.sql.autoBroadcastJoinThreshold<span class="o">=</span>-1<span
class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--jars<span class="w"> </span><span
class="nv">$COMET_JAR</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.driver.extraClassPath<span class="o">=</span><span
class="nv">$COMET_JAR</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.executor.extraClassPath<span class="o">=</span><span
class="nv">$COMET_JAR</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.sql.extensions<span
class="o">=</span>org.apache.comet.CometSparkSessionExtensions<span class="w">
</span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.comet.enabled<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.comet.exec.enabled<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.comet.exec.all.enabled<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.comet.cast.allowIncompatible<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.comet.explainFallback.enabled<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.comet.parquet.io.enabled<span class="o">=</span><span
class="nb">false</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.comet.batchSize<span class="o">=</span><span
class="m">8192</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.comet.columnar.shuffle.enabled<span class="o">=</span><span
class="nb">false</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.comet.exec.shuffle.enabled<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.shuffle.manager<span
class="o">=</span>org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager<span
class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.sql.adaptive.coalescePartitions.enabled<span
class="o">=</span><span class="nb">false</span><span class="w"> </span><span
class="se">\ </span>
-<span class="w"> </span>--conf<span class="w">
</span>spark.comet.shuffle.enforceMode.enabled<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\</span>
-<span class="w">
</span>../datafusion-benchmarks/runners/datafusion-comet/tpcbench.py<span
class="w"> </span><span class="se">\</span>
-<span class="w"> </span>--benchmark<span class="w"> </span>tpch<span
class="w"> </span><span class="se">\ </span>
-<span class="w"> </span>--data<span class="w">
</span>/mnt/bigdata/tpch/sf100-parquet/<span class="w"> </span><span
class="se">\ </span>
-<span class="w"> </span>--queries<span class="w">
</span>../datafusion-benchmarks/tpch/queries<span class="w"> </span>
+<p>To track progress on performance, we regularly run benchmarks derived from
TPC-H and TPC-DS. Data generation and
+benchmarking documentation and scripts are available in the <a
class="reference external"
href="https://github.com/apache/datafusion-benchmarks">DataFusion
Benchmarks</a> GitHub repository.</p>
+<p>Here are example commands for running the benchmarks against a Spark
cluster. These commands will need to be
+adapted based on the Spark environment and location of data files.</p>
+<p>These commands are intended to be run from the <code class="docutils
literal notranslate"><span class="pre">runners/datafusion-comet</span></code>
directory in the <code class="docutils literal notranslate"><span
class="pre">datafusion-benchmarks</span></code>
+repository.</p>
+<section id="running-benchmarks-against-apache-spark">
+<h2>Running Benchmarks Against Apache Spark<a class="headerlink"
href="#running-benchmarks-against-apache-spark" title="Link to this
heading">¶</a></h2>
+<div class="highlight-shell notranslate"><div
class="highlight"><pre><span></span><span
class="nv">$SPARK_HOME</span>/bin/spark-submit<span class="w"> </span><span
class="se">\</span>
+<span class="w"> </span>--master<span class="w"> </span><span
class="nv">$SPARK_MASTER</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.driver.memory<span class="o">=</span>8G<span class="w">
</span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.executor.memory<span class="o">=</span>32G<span class="w">
</span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.executor.cores<span class="o">=</span><span
class="m">8</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w"> </span>spark.cores.max<span
class="o">=</span><span class="m">8</span><span class="w"> </span><span
class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.sql.autoBroadcastJoinThreshold<span class="o">=</span>-1<span
class="w"> </span><span class="se">\</span>
+<span class="w"> </span>tpcbench.py<span class="w"> </span><span
class="se">\</span>
+<span class="w"> </span>--benchmark<span class="w"> </span>tpch<span
class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--data<span class="w">
</span>/mnt/bigdata/tpch/sf100/<span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--queries<span class="w">
</span>../../tpch/queries<span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--iterations<span class="w"> </span><span
class="m">5</span>
+</pre></div>
+</div>
+</section>
+<section
id="running-benchmarks-against-apache-spark-with-apache-datafusion-comet-enabled">
+<h2>Running Benchmarks Against Apache Spark with Apache DataFusion Comet
Enabled<a class="headerlink"
href="#running-benchmarks-against-apache-spark-with-apache-datafusion-comet-enabled"
title="Link to this heading">¶</a></h2>
+<div class="highlight-shell notranslate"><div
class="highlight"><pre><span></span><span
class="nv">$SPARK_HOME</span>/bin/spark-submit<span class="w"> </span><span
class="se">\</span>
+<span class="w"> </span>--master<span class="w"> </span><span
class="nv">$SPARK_MASTER</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.driver.memory<span class="o">=</span>8G<span class="w">
</span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.executor.memory<span class="o">=</span>64G<span class="w">
</span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.executor.cores<span class="o">=</span><span
class="m">8</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w"> </span>spark.cores.max<span
class="o">=</span><span class="m">8</span><span class="w"> </span><span
class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.sql.autoBroadcastJoinThreshold<span class="o">=</span>-1<span
class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--jars<span class="w"> </span><span
class="nv">$COMET_JAR</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.driver.extraClassPath<span class="o">=</span><span
class="nv">$COMET_JAR</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.executor.extraClassPath<span class="o">=</span><span
class="nv">$COMET_JAR</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.sql.extensions<span
class="o">=</span>org.apache.comet.CometSparkSessionExtensions<span class="w">
</span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.comet.enabled<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.comet.exec.enabled<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.comet.exec.all.enabled<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.comet.cast.allowIncompatible<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.comet.explainFallback.enabled<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.comet.parquet.io.enabled<span class="o">=</span><span
class="nb">false</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.comet.batchSize<span class="o">=</span><span
class="m">8192</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.comet.exec.shuffle.enabled<span class="o">=</span><span
class="nb">true</span><span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.comet.exec.shuffle.mode<span class="o">=</span>auto<span
class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.shuffle.manager<span
class="o">=</span>org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager<span
class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--conf<span class="w">
</span>spark.sql.adaptive.coalescePartitions.enabled<span
class="o">=</span><span class="nb">false</span><span class="w"> </span><span
class="se">\</span>
+<span class="w"> </span>tpcbench.py<span class="w"> </span><span
class="se">\</span>
+<span class="w"> </span>--benchmark<span class="w"> </span>tpch<span
class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--data<span class="w">
</span>/mnt/bigdata/tpch/sf100/<span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--queries<span class="w">
</span>../../tpch/queries<span class="w"> </span><span class="se">\</span>
+<span class="w"> </span>--iterations<span class="w"> </span><span
class="m">5</span>
</pre></div>
</div>
-<p>Comet performance can be compared to regular Spark performance by running
the benchmark twice, once with
-<code class="docutils literal notranslate"><span
class="pre">spark.comet.enabled</span></code> set to <code class="docutils
literal notranslate"><span class="pre">true</span></code> and once with it set
to <code class="docutils literal notranslate"><span
class="pre">false</span></code>.</p>
+</section>
+<section id="current-performance">
+<h2>Current Performance<a class="headerlink" href="#current-performance"
title="Link to this heading">¶</a></h2>
+<p>Comet is not yet achieving full DataFusion speeds in all cases, but with
future work we aim to provide a 2x-4x speedup
+for many use cases.</p>
+<p>The following benchmarks were performed on a Linux workstation with PCIe 5,
AMD 7950X CPU (16 cores), 128 GB RAM, and
+data stored locally on NVMe storage. Performance characteristics will vary in
different environments and we encourage
+you to run these benchmarks in your own environments.</p>
+<p><img alt="" src="../_static/images/tpch_allqueries.png" /></p>
+<p>Here is a breakdown showing relative performance of Spark, Comet, and
DataFusion for each TPC-H query.</p>
+<p><img alt="" src="../_static/images/tpch_queries_compare.png" /></p>
+<p>The following chart shows how much Comet currently accelerates each query
from the benchmark. Performance optimization
+is an ongoing task, and we welcome contributions from the community to help
achieve even greater speedups in the future.</p>
+<p><img alt="" src="../_static/images/tpch_queries_speedup.png" /></p>
+<p>The raw results of these benchmarks in JSON format are available here:</p>
+<ul class="simple">
+<li><p><a class="reference download internal" download=""
href="../_downloads/070a7cb1baf1f521ddb4801111e33a5b/spark-8-exec-5-runs.json"><span
class="xref download myst">Spark</span></a></p></li>
+<li><p><a class="reference download internal" download=""
href="../_downloads/83125e9b621c9cfd1edf3b330eb34508/comet-8-exec-5-runs.json"><span
class="xref download myst">Comet</span></a></p></li>
+<li><p><a class="reference download internal" download=""
href="../_downloads/3713390967a196004589150584f165d3/datafusion-python-8-cores.json"><span
class="xref download myst">DataFusion</span></a></p></li>
+</ul>
+</section>
</section>
diff --git a/searchindex.js b/searchindex.js
index 3b9b5384..63c1069c 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"alltitles": {"ANSI mode": [[8, "ansi-mode"]], "API
Differences Between Spark Versions": [[0,
"api-differences-between-spark-versions"]], "ASF Links": [[7, null]], "Adding
Spark-side Tests for the New Expression": [[0,
"adding-spark-side-tests-for-the-new-expression"]], "Adding a New Expression":
[[0, "adding-a-new-expression"]], "Adding a New Scalar Function Expression":
[[0, "adding-a-new-scalar-function-expression"]], "Adding the Expression To the
Protobuf Definition" [...]
\ No newline at end of file
+Search.setIndex({"alltitles": {"ANSI mode": [[8, "ansi-mode"]], "API
Differences Between Spark Versions": [[0,
"api-differences-between-spark-versions"]], "ASF Links": [[7, null]], "Adding
Spark-side Tests for the New Expression": [[0,
"adding-spark-side-tests-for-the-new-expression"]], "Adding a New Expression":
[[0, "adding-a-new-expression"]], "Adding a New Scalar Function Expression":
[[0, "adding-a-new-scalar-function-expression"]], "Adding the Expression To the
Protobuf Definition" [...]
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]