(datafusion-comet) branch asf-site updated: Publish built docs triggered by 1f81c3812490d35b488795ad597e4e6d3f9e114d

github-bot Fri, 31 May 2024 09:37:45 -0700

This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git



The following commit(s) were added to refs/heads/asf-site by this push:
     new dea407c9 Publish built docs triggered by 
1f81c3812490d35b488795ad597e4e6d3f9e114d
dea407c9 is described below

commit dea407c9d8cd0b0179a578bb23514fa35a8c3ff7
Author: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri May 31 16:36:44 2024 +0000

    Publish built docs triggered by 1f81c3812490d35b488795ad597e4e6d3f9e114d
---
 .../spark-8-exec-5-runs.json                       | 184 +++++++++++++++++++
 .../datafusion-python-8-cores.json                 |  73 ++++++++
 .../comet-8-exec-5-runs.json                       | 201 +++++++++++++++++++++
 _sources/contributor-guide/benchmarking.md.txt     | 113 ++++++++----
 _static/images/tpch_allqueries.png                 | Bin 0 -> 23677 bytes
 _static/images/tpch_queries_compare.png            | Bin 0 -> 27388 bytes
 _static/images/tpch_queries_speedup.png            | Bin 0 -> 38821 bytes
 contributor-guide/benchmarking.html                | 130 +++++++++----
 searchindex.js                                     |   2 +-
 9 files changed, 631 insertions(+), 72 deletions(-)

diff --git 
a/_downloads/070a7cb1baf1f521ddb4801111e33a5b/spark-8-exec-5-runs.json 
b/_downloads/070a7cb1baf1f521ddb4801111e33a5b/spark-8-exec-5-runs.json
new file mode 100644
index 00000000..012b05c3
--- /dev/null
+++ b/_downloads/070a7cb1baf1f521ddb4801111e33a5b/spark-8-exec-5-runs.json
@@ -0,0 +1,184 @@
+{
+    "engine": "datafusion-comet",
+    "benchmark": "tpch",
+    "data_path": "/mnt/bigdata/tpch/sf100/",
+    "query_path": "../../tpch/queries",
+    "spark_conf": {
+        "spark.driver.extraJavaOptions": "-Djava.net.preferIPv6Addresses=false 
-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED 
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED 
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED 
--add-opens=java.base/java.io=ALL-UNNAMED 
--add-opens=java.base/java.net=ALL-UNNAMED 
--add-opens=java.base/java.nio=ALL-UNNAMED 
--add-opens=java.base/java.util=ALL-UNNAMED 
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add- [...]
+        "spark.sql.warehouse.dir": 
"file:/home/andy/git/apache/datafusion-benchmarks/runners/datafusion-comet/spark-warehouse",
+        "spark.app.id": "app-20240528090804-0041",
+        "spark.app.submitTime": "1716908883258",
+        "spark.executor.memory": "8G",
+        "spark.master": "spark://woody:7077",
+        "spark.executor.id": "driver",
+        "spark.executor.instances": "8",
+        "spark.app.name": "DataFusion Comet Benchmark derived from TPC-H / 
TPC-DS",
+        "spark.driver.memory": "8G",
+        "spark.rdd.compress": "True",
+        "spark.executor.extraJavaOptions": 
"-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions 
--add-opens=java.base/java.lang=ALL-UNNAMED 
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED 
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED 
--add-opens=java.base/java.io=ALL-UNNAMED 
--add-opens=java.base/java.net=ALL-UNNAMED 
--add-opens=java.base/java.nio=ALL-UNNAMED 
--add-opens=java.base/java.util=ALL-UNNAMED 
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED --ad [...]
+        "spark.serializer.objectStreamReset": "100",
+        "spark.cores.max": "8",
+        "spark.submit.pyFiles": "",
+        "spark.executor.cores": "1",
+        "spark.submit.deployMode": "client",
+        "spark.sql.autoBroadcastJoinThreshold": "-1",
+        "spark.eventLog.enabled": "false",
+        "spark.app.startTime": "1716908883579",
+        "spark.driver.port": "33725",
+        "spark.driver.host": "woody.lan"
+    },
+    "1": [
+        76.91316103935242,
+        79.55859923362732,
+        81.10397529602051,
+        79.01998662948608,
+        79.1286551952362
+    ],
+    "2": [
+        23.977370262145996,
+        22.214473247528076,
+        22.686659812927246,
+        22.016682386398315,
+        21.766324520111084
+    ],
+    "3": [
+        22.700742721557617,
+        21.980144739151,
+        21.876065969467163,
+        21.661516189575195,
+        21.69345998764038
+    ],
+    "4": [
+        17.377647638320923,
+        16.249598264694214,
+        16.15747308731079,
+        16.128843069076538,
+        16.04338026046753
+    ],
+    "5": [
+        44.38863182067871,
+        45.47764492034912,
+        45.76063895225525,
+        45.16393995285034,
+        60.848369121551514
+    ],
+    "6": [
+        3.2041075229644775,
+        2.970944881439209,
+        2.891291856765747,
+        2.9719409942626953,
+        3.0702600479125977
+    ],
+    "7": [
+        24.369274377822876,
+        24.684266567230225,
+        24.146574020385742,
+        24.023175716400146,
+        30.56047773361206
+    ],
+    "8": [
+        46.46081209182739,
+        45.9838604927063,
+        46.341185092926025,
+        45.833823919296265,
+        46.61182403564453
+    ],
+    "9": [
+        67.67960548400879,
+        67.34667444229126,
+        70.34601259231567,
+        71.24095153808594,
+        84.38811421394348
+    ],
+    "10": [
+        19.16477870941162,
+        19.081010580062866,
+        19.501060009002686,
+        19.165698528289795,
+        20.216782331466675
+    ],
+    "11": [
+        17.158706426620483,
+        17.05184030532837,
+        17.714542150497437,
+        17.004602909088135,
+        17.700096130371094
+    ],
+    "12": [
+        11.654477834701538,
+        11.805298805236816,
+        11.822469234466553,
+        12.79678750038147,
+        13.64478850364685
+    ],
+    "13": [
+        20.430822372436523,
+        20.18759250640869,
+        21.26596975326538,
+        21.234288454055786,
+        20.189200162887573
+    ],
+    "14": [
+        5.60215950012207,
+        5.160705089569092,
+        5.080057382583618,
+        4.937625408172607,
+        5.853632688522339
+    ],
+    "15": [
+        14.17775845527649,
+        13.898571729660034,
+        14.215840578079224,
+        14.316090106964111,
+        14.356236457824707
+    ],
+    "16": [
+        6.252386808395386,
+        6.010213375091553,
+        6.054978370666504,
+        5.886059522628784,
+        5.923115253448486
+    ],
+    "17": [
+        71.41593313217163,
+        70.25399804115295,
+        72.07622528076172,
+        72.27566242218018,
+        72.20579051971436
+    ],
+    "18": [
+        65.72738265991211,
+        65.47461080551147,
+        67.14260482788086,
+        65.95489883422852,
+        69.51795554161072
+    ],
+    "19": [
+        7.1520891189575195,
+        6.516514301300049,
+        6.580992698669434,
+        6.486274242401123,
+        6.418147087097168
+    ],
+    "20": [
+        12.619760036468506,
+        12.235978126525879,
+        12.116347551345825,
+        12.161245584487915,
+        12.30910348892212
+    ],
+    "21": [
+        60.795483350753784,
+        60.484593629837036,
+        61.27316427230835,
+        60.475560426712036,
+        81.21473670005798
+    ],
+    "22": [
+        8.926804065704346,
+        8.113754034042358,
+        8.029133796691895,
+        7.99291467666626,
+        8.439452648162842
+    ]
+}
\ No newline at end of file
diff --git 
a/_downloads/3713390967a196004589150584f165d3/datafusion-python-8-cores.json 
b/_downloads/3713390967a196004589150584f165d3/datafusion-python-8-cores.json
new file mode 100644
index 00000000..f032d536
--- /dev/null
+++ b/_downloads/3713390967a196004589150584f165d3/datafusion-python-8-cores.json
@@ -0,0 +1,73 @@
+{
+    "engine": "datafusion-python",
+    "datafusion-version": "38.0.1",
+    "benchmark": "tpch",
+    "data_path": "/mnt/bigdata/tpch/sf100/",
+    "query_path": "../../tpch/queries/",
+    "1": [
+        7.410699844360352
+    ],
+    "2": [
+        2.966364622116089
+    ],
+    "3": [
+        3.988652467727661
+    ],
+    "4": [
+        1.8821499347686768
+    ],
+    "5": [
+        6.957948684692383
+    ],
+    "6": [
+        1.779731273651123
+    ],
+    "7": [
+        14.559604167938232
+    ],
+    "8": [
+        7.062309265136719
+    ],
+    "9": [
+        14.908353805541992
+    ],
+    "10": [
+        7.73533296585083
+    ],
+    "11": [
+        2.346423387527466
+    ],
+    "12": [
+        2.7248904705047607
+    ],
+    "13": [
+        6.38663387298584
+    ],
+    "14": [
+        2.4675676822662354
+    ],
+    "15": [
+        4.799000024795532
+    ],
+    "16": [
+        1.9091999530792236
+    ],
+    "17": [
+        19.230653762817383
+    ],
+    "18": [
+        25.15683078765869
+    ],
+    "19": [
+        4.2268781661987305
+    ],
+    "20": [
+        8.66620659828186
+    ],
+    "21": [
+        17.696006059646606
+    ],
+    "22": [
+        1.3805692195892334
+    ]
+}
\ No newline at end of file
diff --git 
a/_downloads/83125e9b621c9cfd1edf3b330eb34508/comet-8-exec-5-runs.json 
b/_downloads/83125e9b621c9cfd1edf3b330eb34508/comet-8-exec-5-runs.json
new file mode 100644
index 00000000..38142151
--- /dev/null
+++ b/_downloads/83125e9b621c9cfd1edf3b330eb34508/comet-8-exec-5-runs.json
@@ -0,0 +1,201 @@
+{
+    "engine": "datafusion-comet",
+    "benchmark": "tpch",
+    "data_path": "/mnt/bigdata/tpch/sf100/",
+    "query_path": "../../tpch/queries",
+    "spark_conf": {
+        "spark.comet.explainFallback.enabled": "true",
+        "spark.jars": 
"file:///home/andy/git/apache/datafusion-comet/spark/target/comet-spark-spark3.4_2.12-0.1.0-SNAPSHOT.jar",
+        "spark.comet.cast.allowIncompatible": "true",
+        "spark.executor.extraClassPath": 
"/home/andy/git/apache/datafusion-comet/spark/target/comet-spark-spark3.4_2.12-0.1.0-SNAPSHOT.jar",
+        "spark.executor.memory": "8G",
+        "spark.comet.exec.shuffle.enabled": "true",
+        "spark.app.name": "DataFusion Comet Benchmark derived from TPC-H / 
TPC-DS",
+        "spark.driver.port": "36573",
+        "spark.sql.adaptive.coalescePartitions.enabled": "false",
+        "spark.app.startTime": "1716923498046",
+        "spark.comet.batchSize": "8192",
+        "spark.app.id": "app-20240528131138-0043",
+        "spark.serializer.objectStreamReset": "100",
+        "spark.app.initial.jar.urls": 
"spark://woody.lan:36573/jars/comet-spark-spark3.4_2.12-0.1.0-SNAPSHOT.jar",
+        "spark.submit.deployMode": "client",
+        "spark.sql.autoBroadcastJoinThreshold": "-1",
+        "spark.comet.exec.all.enabled": "true",
+        "spark.eventLog.enabled": "false",
+        "spark.driver.host": "woody.lan",
+        "spark.driver.extraJavaOptions": "-Djava.net.preferIPv6Addresses=false 
-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED 
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED 
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED 
--add-opens=java.base/java.io=ALL-UNNAMED 
--add-opens=java.base/java.net=ALL-UNNAMED 
--add-opens=java.base/java.nio=ALL-UNNAMED 
--add-opens=java.base/java.util=ALL-UNNAMED 
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add- [...]
+        "spark.sql.warehouse.dir": 
"file:/home/andy/git/apache/datafusion-benchmarks/runners/datafusion-comet/spark-warehouse",
+        "spark.shuffle.manager": 
"org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager",
+        "spark.comet.exec.enabled": "true",
+        "spark.repl.local.jars": 
"file:///home/andy/git/apache/datafusion-comet/spark/target/comet-spark-spark3.4_2.12-0.1.0-SNAPSHOT.jar",
+        "spark.executor.id": "driver",
+        "spark.master": "spark://woody:7077",
+        "spark.executor.instances": "8",
+        "spark.comet.exec.shuffle.mode": "auto",
+        "spark.sql.extensions": "org.apache.comet.CometSparkSessionExtensions",
+        "spark.driver.memory": "8G",
+        "spark.driver.extraClassPath": 
"/home/andy/git/apache/datafusion-comet/spark/target/comet-spark-spark3.4_2.12-0.1.0-SNAPSHOT.jar",
+        "spark.rdd.compress": "True",
+        "spark.executor.extraJavaOptions": 
"-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions 
--add-opens=java.base/java.lang=ALL-UNNAMED 
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED 
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED 
--add-opens=java.base/java.io=ALL-UNNAMED 
--add-opens=java.base/java.net=ALL-UNNAMED 
--add-opens=java.base/java.nio=ALL-UNNAMED 
--add-opens=java.base/java.util=ALL-UNNAMED 
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED --ad [...]
+        "spark.cores.max": "8",
+        "spark.comet.enabled": "true",
+        "spark.app.submitTime": "1716923497738",
+        "spark.submit.pyFiles": "",
+        "spark.executor.cores": "1",
+        "spark.comet.parquet.io.enabled": "false"
+    },
+    "1": [
+        32.121661901474,
+        27.997092485427856,
+        27.756758451461792,
+        28.55236315727234,
+        28.332542181015015
+    ],
+    "2": [
+        18.269107580184937,
+        16.200955629348755,
+        16.194639682769775,
+        16.745808839797974,
+        16.59864115715027
+    ],
+    "3": [
+        17.265466690063477,
+        17.069786310195923,
+        17.12887978553772,
+        19.33678102493286,
+        18.182055234909058
+    ],
+    "4": [
+        8.367004156112671,
+        8.172023296356201,
+        8.023266077041626,
+        8.350765228271484,
+        8.258736610412598
+    ],
+    "5": [
+        34.10048794746399,
+        32.69314408302307,
+        33.21383595466614,
+        36.391114473342896,
+        39.00048065185547
+    ],
+    "6": [
+        3.1693499088287354,
+        3.044705390930176,
+        3.047694206237793,
+        3.2817511558532715,
+        3.274174928665161
+    ],
+    "7": [
+        25.369214296340942,
+        24.020941257476807,
+        24.0787034034729,
+        28.47402787208557,
+        28.23443365097046
+    ],
+    "8": [
+        40.06126809120178,
+        39.828824281692505,
+        45.250510454177856,
+        44.406742572784424,
+        48.98451232910156
+    ],
+    "9": [
+        62.822797775268555,
+        61.26328158378601,
+        64.95581865310669,
+        69.51708793640137,
+        73.52380013465881
+    ],
+    "10": [
+        20.55334782600403,
+        20.546096324920654,
+        20.57452392578125,
+        22.84211039543152,
+        23.724371671676636
+    ],
+    "11": [
+        11.068235158920288,
+        10.715423822402954,
+        11.353424310684204,
+        11.37632942199707,
+        11.530814170837402
+    ],
+    "12": [
+        10.264788389205933,
+        8.67864990234375,
+        8.845952033996582,
+        8.593009233474731,
+        8.540803909301758
+    ],
+    "13": [
+        9.603406190872192,
+        9.648627042770386,
+        13.040799140930176,
+        10.154011249542236,
+        9.716034412384033
+    ],
+    "14": [
+        6.20926308631897,
+        6.0385496616363525,
+        7.674488544464111,
+        10.53052043914795,
+        7.661675691604614
+    ],
+    "15": [
+        11.466301918029785,
+        11.473632097244263,
+        11.279382228851318,
+        13.291078329086304,
+        12.81026816368103
+    ],
+    "16": [
+        8.096073865890503,
+        7.73410701751709,
+        7.742897272109985,
+        8.477537631988525,
+        7.821273326873779
+    ],
+    "17": [
+        43.69264578819275,
+        43.33040428161621,
+        46.291987657547,
+        54.654345989227295,
+        54.37124800682068
+    ],
+    "18": [
+        27.205485105514526,
+        26.785916090011597,
+        27.331408262252808,
+        29.946768760681152,
+        28.037617444992065
+    ],
+    "19": [
+        8.100102186203003,
+        7.845783472061157,
+        8.52329158782959,
+        8.907397985458374,
+        9.13755488395691
+    ],
+    "20": [
+        13.09695029258728,
+        12.683861255645752,
+        15.612725019454956,
+        13.361177206039429,
+        16.614356517791748
+    ],
+    "21": [
+        43.69623780250549,
+        43.26758122444153,
+        46.91650056838989,
+        47.875754833221436,
+        57.9763662815094
+    ],
+    "22": [
+        4.5090577602386475,
+        4.420571804046631,
+        4.639787673950195,
+        5.118046998977661,
+        5.017346143722534
+    ]
+}
\ No newline at end of file
diff --git a/_sources/contributor-guide/benchmarking.md.txt 
b/_sources/contributor-guide/benchmarking.md.txt
index 502b35c2..3e9a61ef 100644
--- a/_sources/contributor-guide/benchmarking.md.txt
+++ b/_sources/contributor-guide/benchmarking.md.txt
@@ -19,44 +19,87 @@ under the License.
 
 # Comet Benchmarking Guide
 
-To track progress on performance, we regularly run benchmarks derived from 
TPC-H and TPC-DS. Benchmarking scripts are
-available in the [DataFusion 
Benchmarks](https://github.com/apache/datafusion-benchmarks) GitHub repository.
+To track progress on performance, we regularly run benchmarks derived from 
TPC-H and TPC-DS. Data generation and 
+benchmarking documentation and scripts are available in the [DataFusion 
Benchmarks](https://github.com/apache/datafusion-benchmarks) GitHub repository.
 
-Here is an example command for running the benchmarks. This command will need 
to be adapted based on the Spark 
-environment and location of data files.
+Here are example commands for running the benchmarks against a Spark cluster. 
These commands will need to be 
+adapted based on the Spark environment and location of data files.
 
-This command assumes that `datafusion-benchmarks` is checked out in a parallel 
directory to `datafusion-comet`.
+These commands are intended to be run from the `runners/datafusion-comet` 
directory in the `datafusion-benchmarks` 
+repository.
+
+## Running Benchmarks Against Apache Spark
 
 ```shell
-$SPARK_HOME/bin/spark-submit \ 
-    --master "local[*]" \ 
-    --conf spark.driver.memory=8G \ 
-    --conf spark.executor.memory=64G \ 
-    --conf spark.executor.cores=16 \ 
-    --conf spark.cores.max=16 \ 
-    --conf spark.eventLog.enabled=true \ 
-    --conf spark.sql.autoBroadcastJoinThreshold=-1 \ 
-    --jars $COMET_JAR \ 
-    --conf spark.driver.extraClassPath=$COMET_JAR \ 
-    --conf spark.executor.extraClassPath=$COMET_JAR \ 
-    --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \ 
-    --conf spark.comet.enabled=true \ 
-    --conf spark.comet.exec.enabled=true \ 
-    --conf spark.comet.exec.all.enabled=true \ 
-    --conf spark.comet.cast.allowIncompatible=true \ 
-    --conf spark.comet.explainFallback.enabled=true \ 
-    --conf spark.comet.parquet.io.enabled=false \ 
-    --conf spark.comet.batchSize=8192 \ 
-    --conf spark.comet.columnar.shuffle.enabled=false \ 
-    --conf spark.comet.exec.shuffle.enabled=true \ 
-    --conf 
spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager
 \ 
-    --conf spark.sql.adaptive.coalescePartitions.enabled=false \ 
-    --conf spark.comet.shuffle.enforceMode.enabled=true \
-    ../datafusion-benchmarks/runners/datafusion-comet/tpcbench.py \
-    --benchmark tpch \ 
-    --data /mnt/bigdata/tpch/sf100-parquet/ \ 
-    --queries ../datafusion-benchmarks/tpch/queries 
+$SPARK_HOME/bin/spark-submit \
+    --master $SPARK_MASTER \
+    --conf spark.driver.memory=8G \
+    --conf spark.executor.memory=32G \
+    --conf spark.executor.cores=8 \
+    --conf spark.cores.max=8 \
+    --conf spark.sql.autoBroadcastJoinThreshold=-1 \
+    tpcbench.py \
+    --benchmark tpch \
+    --data /mnt/bigdata/tpch/sf100/ \
+    --queries ../../tpch/queries \
+    --iterations 5
 ```
 
-Comet performance can be compared to regular Spark performance by running the 
benchmark twice, once with 
-`spark.comet.enabled` set to `true` and once with it set to `false`. 
\ No newline at end of file
+## Running Benchmarks Against Apache Spark with Apache DataFusion Comet Enabled
+
+```shell
+$SPARK_HOME/bin/spark-submit \
+    --master $SPARK_MASTER \
+    --conf spark.driver.memory=8G \
+    --conf spark.executor.memory=64G \
+    --conf spark.executor.cores=8 \
+    --conf spark.cores.max=8 \
+    --conf spark.sql.autoBroadcastJoinThreshold=-1 \
+    --jars $COMET_JAR \
+    --conf spark.driver.extraClassPath=$COMET_JAR \
+    --conf spark.executor.extraClassPath=$COMET_JAR \
+    --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
+    --conf spark.comet.enabled=true \
+    --conf spark.comet.exec.enabled=true \
+    --conf spark.comet.exec.all.enabled=true \
+    --conf spark.comet.cast.allowIncompatible=true \
+    --conf spark.comet.explainFallback.enabled=true \
+    --conf spark.comet.parquet.io.enabled=false \
+    --conf spark.comet.batchSize=8192 \
+    --conf spark.comet.exec.shuffle.enabled=true \
+    --conf spark.comet.exec.shuffle.mode=auto \
+    --conf 
spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager
 \
+    --conf spark.sql.adaptive.coalescePartitions.enabled=false \
+    tpcbench.py \
+    --benchmark tpch \
+    --data /mnt/bigdata/tpch/sf100/ \
+    --queries ../../tpch/queries \
+    --iterations 5
+```
+
+## Current Performance
+
+Comet is not yet achieving full DataFusion speeds in all cases, but with 
future work we aim to provide a 2x-4x speedup
+for many use cases.
+
+The following benchmarks were performed on a Linux workstation with PCIe 5, 
AMD 7950X CPU (16 cores), 128 GB RAM, and 
+data stored locally on NVMe storage. Performance characteristics will vary in 
different environments and we encourage 
+you to run these benchmarks in your own environments.
+
+![](../../_static/images/tpch_allqueries.png)
+
+Here is a breakdown showing relative performance of Spark, Comet, and 
DataFusion for each TPC-H query.
+
+![](../../_static/images/tpch_queries_compare.png)
+
+The following chart shows how much Comet currently accelerates each query from 
the benchmark. Performance optimization
+is an ongoing task, and we welcome contributions from the community to help 
achieve even greater speedups in the future.
+
+![](../../_static/images/tpch_queries_speedup.png)
+
+The raw results of these benchmarks in JSON format are available here:
+
+- [Spark](./benchmark-results/2024-05-30/spark-8-exec-5-runs.json)
+- [Comet](./benchmark-results/2024-05-30/comet-8-exec-5-runs.json)
+- [DataFusion](./benchmark-results/2024-05-30/datafusion-python-8-cores.json)
+ 
diff --git a/_static/images/tpch_allqueries.png 
b/_static/images/tpch_allqueries.png
new file mode 100644
index 00000000..a6788d5a
Binary files /dev/null and b/_static/images/tpch_allqueries.png differ
diff --git a/_static/images/tpch_queries_compare.png 
b/_static/images/tpch_queries_compare.png
new file mode 100644
index 00000000..92768061
Binary files /dev/null and b/_static/images/tpch_queries_compare.png differ
diff --git a/_static/images/tpch_queries_speedup.png 
b/_static/images/tpch_queries_speedup.png
new file mode 100644
index 00000000..fb417ff1
Binary files /dev/null and b/_static/images/tpch_queries_speedup.png differ
diff --git a/contributor-guide/benchmarking.html 
b/contributor-guide/benchmarking.html
index 1d8ce7de..f4fcd2de 100644
--- a/contributor-guide/benchmarking.html
+++ b/contributor-guide/benchmarking.html
@@ -255,9 +255,29 @@ under the License.
               
               <div class="toc-item">
                 
+<div class="tocsection onthispage pt-5 pb-3">
+    <i class="fas fa-list"></i> On this page
+</div>
 
 <nav id="bd-toc-nav">
-    
+    <ul class="visible nav section-nav flex-column">
+ <li class="toc-h2 nav-item toc-entry">
+  <a class="reference internal nav-link" 
href="#running-benchmarks-against-apache-spark">
+   Running Benchmarks Against Apache Spark
+  </a>
+ </li>
+ <li class="toc-h2 nav-item toc-entry">
+  <a class="reference internal nav-link" 
href="#running-benchmarks-against-apache-spark-with-apache-datafusion-comet-enabled">
+   Running Benchmarks Against Apache Spark with Apache DataFusion Comet Enabled
+  </a>
+ </li>
+ <li class="toc-h2 nav-item toc-entry">
+  <a class="reference internal nav-link" href="#current-performance">
+   Current Performance
+  </a>
+ </li>
+</ul>
+
 </nav>
               </div>
               
@@ -304,43 +324,81 @@ under the License.
 -->
 <section id="comet-benchmarking-guide">
 <h1>Comet Benchmarking Guide<a class="headerlink" 
href="#comet-benchmarking-guide" title="Link to this heading">¶</a></h1>
-<p>To track progress on performance, we regularly run benchmarks derived from 
TPC-H and TPC-DS. Benchmarking scripts are
-available in the <a class="reference external" 
href="https://github.com/apache/datafusion-benchmarks">DataFusion 
Benchmarks</a> GitHub repository.</p>
-<p>Here is an example command for running the benchmarks. This command will 
need to be adapted based on the Spark
-environment and location of data files.</p>
-<p>This command assumes that <code class="docutils literal notranslate"><span 
class="pre">datafusion-benchmarks</span></code> is checked out in a parallel 
directory to <code class="docutils literal notranslate"><span 
class="pre">datafusion-comet</span></code>.</p>
-<div class="highlight-shell notranslate"><div 
class="highlight"><pre><span></span><span 
class="nv">$SPARK_HOME</span>/bin/spark-submit<span class="w"> </span><span 
class="se">\ </span>
-<span class="w">    </span>--master<span class="w"> </span><span 
class="s2">&quot;local[*]&quot;</span><span class="w"> </span><span 
class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.driver.memory<span class="o">=</span>8G<span class="w"> 
</span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.executor.memory<span class="o">=</span>64G<span class="w"> 
</span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.executor.cores<span class="o">=</span><span 
class="m">16</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> </span>spark.cores.max<span 
class="o">=</span><span class="m">16</span><span class="w"> </span><span 
class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.eventLog.enabled<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.sql.autoBroadcastJoinThreshold<span class="o">=</span>-1<span 
class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--jars<span class="w"> </span><span 
class="nv">$COMET_JAR</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.driver.extraClassPath<span class="o">=</span><span 
class="nv">$COMET_JAR</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.executor.extraClassPath<span class="o">=</span><span 
class="nv">$COMET_JAR</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.sql.extensions<span 
class="o">=</span>org.apache.comet.CometSparkSessionExtensions<span class="w"> 
</span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.enabled<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.exec.enabled<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.exec.all.enabled<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.cast.allowIncompatible<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.explainFallback.enabled<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.parquet.io.enabled<span class="o">=</span><span 
class="nb">false</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.batchSize<span class="o">=</span><span 
class="m">8192</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.columnar.shuffle.enabled<span class="o">=</span><span 
class="nb">false</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.exec.shuffle.enabled<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.shuffle.manager<span 
class="o">=</span>org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager<span
 class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.sql.adaptive.coalescePartitions.enabled<span 
class="o">=</span><span class="nb">false</span><span class="w"> </span><span 
class="se">\ </span>
-<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.shuffle.enforceMode.enabled<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\</span>
-<span class="w">    
</span>../datafusion-benchmarks/runners/datafusion-comet/tpcbench.py<span 
class="w"> </span><span class="se">\</span>
-<span class="w">    </span>--benchmark<span class="w"> </span>tpch<span 
class="w"> </span><span class="se">\ </span>
-<span class="w">    </span>--data<span class="w"> 
</span>/mnt/bigdata/tpch/sf100-parquet/<span class="w"> </span><span 
class="se">\ </span>
-<span class="w">    </span>--queries<span class="w"> 
</span>../datafusion-benchmarks/tpch/queries<span class="w"> </span>
+<p>To track progress on performance, we regularly run benchmarks derived from 
TPC-H and TPC-DS. Data generation and
+benchmarking documentation and scripts are available in the <a 
class="reference external" 
href="https://github.com/apache/datafusion-benchmarks">DataFusion 
Benchmarks</a> GitHub repository.</p>
+<p>Here are example commands for running the benchmarks against a Spark 
cluster. These commands will need to be
+adapted based on the Spark environment and location of data files.</p>
+<p>These commands are intended to be run from the <code class="docutils 
literal notranslate"><span class="pre">runners/datafusion-comet</span></code> 
directory in the <code class="docutils literal notranslate"><span 
class="pre">datafusion-benchmarks</span></code>
+repository.</p>
+<section id="running-benchmarks-against-apache-spark">
+<h2>Running Benchmarks Against Apache Spark<a class="headerlink" 
href="#running-benchmarks-against-apache-spark" title="Link to this 
heading">¶</a></h2>
+<div class="highlight-shell notranslate"><div 
class="highlight"><pre><span></span><span 
class="nv">$SPARK_HOME</span>/bin/spark-submit<span class="w"> </span><span 
class="se">\</span>
+<span class="w">    </span>--master<span class="w"> </span><span 
class="nv">$SPARK_MASTER</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.driver.memory<span class="o">=</span>8G<span class="w"> 
</span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.executor.memory<span class="o">=</span>32G<span class="w"> 
</span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.executor.cores<span class="o">=</span><span 
class="m">8</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> </span>spark.cores.max<span 
class="o">=</span><span class="m">8</span><span class="w"> </span><span 
class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.sql.autoBroadcastJoinThreshold<span class="o">=</span>-1<span 
class="w"> </span><span class="se">\</span>
+<span class="w">    </span>tpcbench.py<span class="w"> </span><span 
class="se">\</span>
+<span class="w">    </span>--benchmark<span class="w"> </span>tpch<span 
class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--data<span class="w"> 
</span>/mnt/bigdata/tpch/sf100/<span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--queries<span class="w"> 
</span>../../tpch/queries<span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--iterations<span class="w"> </span><span 
class="m">5</span>
+</pre></div>
+</div>
+</section>
+<section 
id="running-benchmarks-against-apache-spark-with-apache-datafusion-comet-enabled">
+<h2>Running Benchmarks Against Apache Spark with Apache DataFusion Comet 
Enabled<a class="headerlink" 
href="#running-benchmarks-against-apache-spark-with-apache-datafusion-comet-enabled"
 title="Link to this heading">¶</a></h2>
+<div class="highlight-shell notranslate"><div 
class="highlight"><pre><span></span><span 
class="nv">$SPARK_HOME</span>/bin/spark-submit<span class="w"> </span><span 
class="se">\</span>
+<span class="w">    </span>--master<span class="w"> </span><span 
class="nv">$SPARK_MASTER</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.driver.memory<span class="o">=</span>8G<span class="w"> 
</span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.executor.memory<span class="o">=</span>64G<span class="w"> 
</span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.executor.cores<span class="o">=</span><span 
class="m">8</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> </span>spark.cores.max<span 
class="o">=</span><span class="m">8</span><span class="w"> </span><span 
class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.sql.autoBroadcastJoinThreshold<span class="o">=</span>-1<span 
class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--jars<span class="w"> </span><span 
class="nv">$COMET_JAR</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.driver.extraClassPath<span class="o">=</span><span 
class="nv">$COMET_JAR</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.executor.extraClassPath<span class="o">=</span><span 
class="nv">$COMET_JAR</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.sql.extensions<span 
class="o">=</span>org.apache.comet.CometSparkSessionExtensions<span class="w"> 
</span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.enabled<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.exec.enabled<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.exec.all.enabled<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.cast.allowIncompatible<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.explainFallback.enabled<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.parquet.io.enabled<span class="o">=</span><span 
class="nb">false</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.batchSize<span class="o">=</span><span 
class="m">8192</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.exec.shuffle.enabled<span class="o">=</span><span 
class="nb">true</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.comet.exec.shuffle.mode<span class="o">=</span>auto<span 
class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.shuffle.manager<span 
class="o">=</span>org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager<span
 class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--conf<span class="w"> 
</span>spark.sql.adaptive.coalescePartitions.enabled<span 
class="o">=</span><span class="nb">false</span><span class="w"> </span><span 
class="se">\</span>
+<span class="w">    </span>tpcbench.py<span class="w"> </span><span 
class="se">\</span>
+<span class="w">    </span>--benchmark<span class="w"> </span>tpch<span 
class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--data<span class="w"> 
</span>/mnt/bigdata/tpch/sf100/<span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--queries<span class="w"> 
</span>../../tpch/queries<span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--iterations<span class="w"> </span><span 
class="m">5</span>
 </pre></div>
 </div>
-<p>Comet performance can be compared to regular Spark performance by running 
the benchmark twice, once with
-<code class="docutils literal notranslate"><span 
class="pre">spark.comet.enabled</span></code> set to <code class="docutils 
literal notranslate"><span class="pre">true</span></code> and once with it set 
to <code class="docutils literal notranslate"><span 
class="pre">false</span></code>.</p>
+</section>
+<section id="current-performance">
+<h2>Current Performance<a class="headerlink" href="#current-performance" 
title="Link to this heading">¶</a></h2>
+<p>Comet is not yet achieving full DataFusion speeds in all cases, but with 
future work we aim to provide a 2x-4x speedup
+for many use cases.</p>
+<p>The following benchmarks were performed on a Linux workstation with PCIe 5, 
AMD 7950X CPU (16 cores), 128 GB RAM, and
+data stored locally on NVMe storage. Performance characteristics will vary in 
different environments and we encourage
+you to run these benchmarks in your own environments.</p>
+<p><img alt="" src="../_static/images/tpch_allqueries.png" /></p>
+<p>Here is a breakdown showing relative performance of Spark, Comet, and 
DataFusion for each TPC-H query.</p>
+<p><img alt="" src="../_static/images/tpch_queries_compare.png" /></p>
+<p>The following chart shows how much Comet currently accelerates each query 
from the benchmark. Performance optimization
+is an ongoing task, and we welcome contributions from the community to help 
achieve even greater speedups in the future.</p>
+<p><img alt="" src="../_static/images/tpch_queries_speedup.png" /></p>
+<p>The raw results of these benchmarks in JSON format are available here:</p>
+<ul class="simple">
+<li><p><a class="reference download internal" download="" 
href="../_downloads/070a7cb1baf1f521ddb4801111e33a5b/spark-8-exec-5-runs.json"><span
 class="xref download myst">Spark</span></a></p></li>
+<li><p><a class="reference download internal" download="" 
href="../_downloads/83125e9b621c9cfd1edf3b330eb34508/comet-8-exec-5-runs.json"><span
 class="xref download myst">Comet</span></a></p></li>
+<li><p><a class="reference download internal" download="" 
href="../_downloads/3713390967a196004589150584f165d3/datafusion-python-8-cores.json"><span
 class="xref download myst">DataFusion</span></a></p></li>
+</ul>
+</section>
 </section>
 
 
diff --git a/searchindex.js b/searchindex.js
index 3b9b5384..63c1069c 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"alltitles": {"ANSI mode": [[8, "ansi-mode"]], "API 
Differences Between Spark Versions": [[0, 
"api-differences-between-spark-versions"]], "ASF Links": [[7, null]], "Adding 
Spark-side Tests for the New Expression": [[0, 
"adding-spark-side-tests-for-the-new-expression"]], "Adding a New Expression": 
[[0, "adding-a-new-expression"]], "Adding a New Scalar Function Expression": 
[[0, "adding-a-new-scalar-function-expression"]], "Adding the Expression To the 
Protobuf Definition" [...]
\ No newline at end of file
+Search.setIndex({"alltitles": {"ANSI mode": [[8, "ansi-mode"]], "API 
Differences Between Spark Versions": [[0, 
"api-differences-between-spark-versions"]], "ASF Links": [[7, null]], "Adding 
Spark-side Tests for the New Expression": [[0, 
"adding-spark-side-tests-for-the-new-expression"]], "Adding a New Expression": 
[[0, "adding-a-new-expression"]], "Adding a New Scalar Function Expression": 
[[0, "adding-a-new-scalar-function-expression"]], "Adding the Expression To the 
Protobuf Definition" [...]
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datafusion-comet) branch asf-site updated: Publish built docs triggered by 1f81c3812490d35b488795ad597e4e6d3f9e114d

Reply via email to