This is an automated email from the ASF dual-hosted git repository.
andygrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 8f3cee591 docs: Documentation updates in preparation for 0.16 release
(#4244)
8f3cee591 is described below
commit 8f3cee591eab4b1a78be7223eb3f87d1e7e58905
Author: Andy Grove <[email protected]>
AuthorDate: Thu May 7 15:09:55 2026 -0600
docs: Documentation updates in preparation for 0.16 release (#4244)
---
.../main/scala/org/apache/comet/CometConf.scala | 9 +--
dev/release/build-release-comet.sh | 5 +-
docs/source/about/gluten_comparison.md | 4 +-
docs/source/about/index.md | 73 ----------------------
docs/source/asf/index.md | 5 ++
docs/source/conf.py | 9 +--
docs/source/contributor-guide/benchmarking.md | 10 +++
docs/source/contributor-guide/index.md | 13 ++++
docs/source/contributor-guide/roadmap.md | 10 +--
.../contributor-guide/spark_expressions_support.md | 14 ++---
.../contributor-guide/sql_error_propagation.md | 5 +-
docs/source/index.md | 33 +++++-----
docs/source/user-guide/index.md | 8 +++
.../latest/compatibility/expressions/index.md | 1 +
.../user-guide/latest/compatibility/index.md | 1 +
.../user-guide/latest/compatibility/scans.md | 17 ++++-
.../latest/compatibility/spark-versions.md | 21 ++++---
docs/source/user-guide/latest/datasources.md | 17 +++--
docs/source/user-guide/latest/iceberg.md | 14 +++--
docs/source/user-guide/latest/index.rst | 10 +++
docs/source/user-guide/latest/installation.md | 50 ++++++++-------
docs/source/user-guide/latest/kubernetes.md | 6 +-
docs/source/user-guide/latest/source.md | 11 ++++
.../main/scala/org/apache/comet/GenerateDocs.scala | 2 +-
24 files changed, 187 insertions(+), 161 deletions(-)
diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala
b/common/src/main/scala/org/apache/comet/CometConf.scala
index d3f51dfbe..9b376837f 100644
--- a/common/src/main/scala/org/apache/comet/CometConf.scala
+++ b/common/src/main/scala/org/apache/comet/CometConf.scala
@@ -94,12 +94,9 @@ object CometConf extends ShimCometConf {
.createWithEnvVarOrDefault("ENABLE_COMET", true)
val COMET_NATIVE_SCAN_ENABLED: ConfigEntry[Boolean] =
conf("spark.comet.scan.enabled")
- .category(CATEGORY_SCAN)
- .doc(
- "Whether to enable native scans. When this is turned on, Spark will use
Comet to " +
- "read supported data sources (currently only Parquet is supported
natively). Note " +
- "that to enable native vectorized execution, both this config and " +
- "`spark.comet.exec.enabled` need to be enabled.")
+ .category(CATEGORY_TESTING)
+ .doc("Whether to enable native scans. Intended for use in Comet's own test
suites to " +
+ "selectively disable native scans; not intended for production use.")
.booleanConf
.createWithDefault(true)
diff --git a/dev/release/build-release-comet.sh
b/dev/release/build-release-comet.sh
index d0171eb98..23363c876 100755
--- a/dev/release/build-release-comet.sh
+++ b/dev/release/build-release-comet.sh
@@ -209,7 +209,10 @@ LOCAL_REPO=$(mktemp -d /tmp/comet-staging-repo-XXXXX)
./mvnw "-Dmaven.repo.local=${LOCAL_REPO}" -P spark-3.4 -P scala-2.13
-DskipTests install
./mvnw "-Dmaven.repo.local=${LOCAL_REPO}" -P spark-3.5 -P scala-2.12
-DskipTests install
./mvnw "-Dmaven.repo.local=${LOCAL_REPO}" -P spark-3.5 -P scala-2.13
-DskipTests install
-./mvnw "-Dmaven.repo.local=${LOCAL_REPO}" -P spark-4.0 -P scala-2.13
-DskipTests install
+# The spark-4.x profiles pin their own Scala 2.13.x patch versions to match the
+# corresponding Spark release, so the scala-2.13 profile is not used here.
+./mvnw "-Dmaven.repo.local=${LOCAL_REPO}" -P spark-4.0
-DskipTests install
+./mvnw "-Dmaven.repo.local=${LOCAL_REPO}" -P spark-4.1
-DskipTests install
echo "Installed to local repo: ${LOCAL_REPO}"
diff --git a/docs/source/about/gluten_comparison.md
b/docs/source/about/gluten_comparison.md
index 86dcad56a..c7807eabc 100644
--- a/docs/source/about/gluten_comparison.md
+++ b/docs/source/about/gluten_comparison.md
@@ -62,8 +62,8 @@ code, then we suggest benchmarking with both solutions and
choosing the fastest
Both projects target a similar set of Spark releases.
-Comet supports Spark 3.4, 3.5, and 4.0 in production builds, with experimental
builds also published for
-Spark 4.1 and the Spark 4.2 preview. See the [Spark version compatibility
guide] for the exact patch versions and
+Comet supports Spark 3.4, 3.5, 4.0, and 4.1 in production builds, with an
experimental build also published for
+the Spark 4.2 preview. See the [Spark version compatibility guide] for the
exact patch versions and
JDK/Scala combinations.
[Spark version compatibility guide]:
/user-guide/latest/compatibility/spark-versions.md
diff --git a/docs/source/about/index.md b/docs/source/about/index.md
deleted file mode 100644
index 43d25f8db..000000000
--- a/docs/source/about/index.md
+++ /dev/null
@@ -1,73 +0,0 @@
-<!--
-Licensed to the Apache Software Foundation (ASF) under one
-or more contributor license agreements. See the NOTICE file
-distributed with this work for additional information
-regarding copyright ownership. The ASF licenses this file
-to you under the Apache License, Version 2.0 (the
-"License"); you may not use this file except in compliance
-with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing,
-software distributed under the License is distributed on an
-"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-KIND, either express or implied. See the License for the
-specific language governing permissions and limitations
-under the License.
--->
-
-# Comet Overview
-
-Apache DataFusion Comet is a high-performance accelerator for Apache Spark,
built on top of the powerful
-[Apache DataFusion] query engine. Comet is designed to significantly enhance
the
-performance of Apache Spark workloads while leveraging commodity hardware and
seamlessly integrating with the
-Spark ecosystem without requiring any code changes.
-
-[Apache DataFusion]: https://datafusion.apache.org
-
-The following diagram provides an overview of Comet's architecture.
-
-
-
-## Architecture
-
-The following diagram shows how Comet integrates with Apache Spark.
-
-
-
-## Feature Parity with Apache Spark
-
-The project strives to keep feature parity with Apache Spark, that is,
-users should expect the same behavior (w.r.t features, configurations,
-query results, etc) with Comet turned on or turned off in their Spark
-jobs. In addition, Comet extension should automatically detect unsupported
-features and fallback to Spark engine.
-
-## Comparison with other open-source Spark accelerators
-
-There are two other major open-source Spark accelerators:
-
-- [Apache Gluten (incubating)](https://github.com/apache/incubator-gluten)
-- [NVIDIA Spark RAPIDS](https://github.com/NVIDIA/spark-rapids)
-
-We have a detailed guide [comparing Apache DataFusion Comet with Apache
Gluten].
-
-Spark RAPIDS is a solution that provides hardware acceleration on NVIDIA GPUs.
Comet does not require specialized
-hardware.
-
-[comparing Apache DataFusion Comet with Apache Gluten]: gluten_comparison.md
-
-## Getting Started
-
-Refer to the [Comet Installation Guide] to get started.
-
-[Comet Installation Guide]: /user-guide/latest/installation.md
-
-```{toctree}
-:maxdepth: 1
-:caption: About
-:hidden:
-
-Comparison with Gluten <gluten_comparison>
-```
diff --git a/docs/source/asf/index.md b/docs/source/asf/index.md
index 3d7c9810d..e461f68d4 100644
--- a/docs/source/asf/index.md
+++ b/docs/source/asf/index.md
@@ -19,9 +19,14 @@ under the License.
# ASF Links
+Apache DataFusion Comet is part of the Apache Software Foundation. The links
below point to ASF
+resources covering licensing, donations, security reporting, and the
Foundation's code of conduct.
+Select a link from the navigation menu.
+
```{toctree}
:maxdepth: 1
:caption: ASF Links
+:hidden:
Apache Software Foundation <https://apache.org>
License <https://www.apache.org/licenses/>
diff --git a/docs/source/conf.py b/docs/source/conf.py
index faf670973..311e6fd75 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -134,16 +134,17 @@ html_sidebars = {
"**": ["docs-sidebar.html"],
}
-# tell myst_parser to auto-generate anchor links for headers h1, h2, h3
-myst_heading_anchors = 3
+# tell myst_parser to auto-generate anchor links for headers h1, h2, h3, h4
+myst_heading_anchors = 4
# enable nice rendering of checkboxes for the task lists
myst_enable_extensions = ["colon_fence", "deflist", "tasklist"]
redirects = {
- "overview.html": "about/index.html",
+ "overview.html": "index.html",
+ "about/index.html": "../index.html",
"gluten_comparison.html": "about/gluten_comparison.html",
- "user-guide/overview.html": "../about/overview.html",
+ "user-guide/overview.html": "../index.html",
"user-guide/gluten_comparison.html": "../about/gluten_comparison.html",
"user-guide/compatibility.html": "latest/compatibility.html",
"user-guide/configs.html": "latest/configs.html",
diff --git a/docs/source/contributor-guide/benchmarking.md
b/docs/source/contributor-guide/benchmarking.md
index ecc4fcf27..4bfd5ae57 100644
--- a/docs/source/contributor-guide/benchmarking.md
+++ b/docs/source/contributor-guide/benchmarking.md
@@ -39,3 +39,13 @@ Available benchmarking guides:
- [TPC-DS Benchmarking with spark-sql-perf](benchmarking_spark_sql_perf.md)
We also have many micro benchmarks that can be run from an IDE located
[here](https://github.com/apache/datafusion-comet/tree/main/spark/src/test/scala/org/apache/spark/sql/benchmark).
+
+```{toctree}
+:hidden:
+
+benchmark-results/tpc-h
+benchmark-results/tpc-ds
+benchmarking_macos
+benchmarking_aws_ec2
+benchmarking_spark_sql_perf
+```
diff --git a/docs/source/contributor-guide/index.md
b/docs/source/contributor-guide/index.md
index 20e73c742..77c73d68d 100644
--- a/docs/source/contributor-guide/index.md
+++ b/docs/source/contributor-guide/index.md
@@ -19,9 +19,21 @@ under the License.
# Comet Contributor Guide
+The Comet contributor guide is for developers working on Comet itself. It
covers the project
+architecture, the JVM and native code layout, the Arrow FFI bridge, JVM and
native shuffle, and
+how data and plans flow between Spark and the DataFusion execution engine.
+
+It also documents day-to-day workflows including building and testing locally,
debugging,
+benchmarking, profiling, tracing, running the SQL test suites, adding new
operators and
+expressions, triaging bugs, and the Comet release process.
+
+New contributors should start with the Getting Started page. Select a topic
from the navigation
+menu to read more.
+
```{toctree}
:maxdepth: 2
:caption: Contributor Guide
+:hidden:
Getting Started <contributing>
Comet Plugin Overview <plugin_overview>
@@ -30,6 +42,7 @@ JVM Shuffle <jvm_shuffle>
Native Shuffle <native_shuffle>
Development Guide <development>
Debugging Guide <debugging>
+ANSI Error Propagation <sql_error_propagation>
Benchmarking Guide <benchmarking>
Adding a New Operator <adding_a_new_operator>
Adding a New Expression <adding_a_new_expression>
diff --git a/docs/source/contributor-guide/roadmap.md
b/docs/source/contributor-guide/roadmap.md
index 8d293c068..bae4ddba5 100644
--- a/docs/source/contributor-guide/roadmap.md
+++ b/docs/source/contributor-guide/roadmap.md
@@ -43,14 +43,16 @@ significant family of Spark expressions in one effort.
## Dynamic Partition Pruning
-Both Iceberg table scans and Parquet V1 native scans (`CometNativeScanExec`)
support non-AQE Dynamic Partition Pruning
-(DPP) filters generated by Spark's `PlanDynamicPruningFilters` optimizer rule
([#3349], [#3511]). However, Spark's
-`PlanAdaptiveDynamicPruningFilters` optimizer rule runs after Comet's rules,
so DPP with Adaptive Query Execution
-requires a redesign of Comet's plan translation. This effort can be tracked at
[#3510].
+Native Parquet scans (`CometNativeScanExec`) support Dynamic Partition Pruning
(DPP) both with and without
+Adaptive Query Execution. Non-AQE DPP landed in [#4011] and AQE DPP with
broadcast reuse landed in [#4112].
+Iceberg native scans currently support non-AQE DPP only ([#3349], [#3511]);
extending broadcast reuse to AQE
+DPP for Iceberg is tracked at [#3510].
[#3349]: https://github.com/apache/datafusion-comet/pull/3349
[#3510]: https://github.com/apache/datafusion-comet/issues/3510
[#3511]: https://github.com/apache/datafusion-comet/pull/3511
+[#4011]: https://github.com/apache/datafusion-comet/pull/4011
+[#4112]: https://github.com/apache/datafusion-comet/pull/4112
## TPC-H and TPC-DS Performance
diff --git a/docs/source/contributor-guide/spark_expressions_support.md
b/docs/source/contributor-guide/spark_expressions_support.md
index 71f4310a8..588ae5b45 100644
--- a/docs/source/contributor-guide/spark_expressions_support.md
+++ b/docs/source/contributor-guide/spark_expressions_support.md
@@ -356,15 +356,15 @@
- [x] `/`
- [x] abs
- [x] acos
-- [ ] acosh
+- [x] acosh
- [x] asin
-- [ ] asinh
+- [x] asinh
- [x] atan
- [x] atan2
-- [ ] atanh
+- [x] atanh
- [x] bin
- [ ] bround
-- [ ] cbrt
+- [x] cbrt
- [x] ceil
- [x] ceiling
- [ ] conv
@@ -372,7 +372,7 @@
- [x] cosh
- [x] cot
- [ ] csc
-- [ ] degrees
+- [x] degrees
- [ ] div
- [ ] e
- [x] exp
@@ -390,12 +390,12 @@
- [x] log2
- [x] mod
- [x] negative
-- [ ] pi
+- [x] pi
- [ ] pmod
- [x] positive
- [x] pow
- [x] power
-- [ ] radians
+- [x] radians
- [x] rand
- [x] randn
- [ ] random
diff --git a/docs/source/contributor-guide/sql_error_propagation.md
b/docs/source/contributor-guide/sql_error_propagation.md
index 7becfabe7..a27408510 100644
--- a/docs/source/contributor-guide/sql_error_propagation.md
+++ b/docs/source/contributor-guide/sql_error_propagation.md
@@ -398,8 +398,9 @@ def convertToSparkException(e:
CometQueryExecutionException): Throwable = {
### `ShimSparkErrorConverter` calls the real Spark API
-Because Spark's `QueryExecutionErrors` API changes between Spark versions
(3.4, 3.5, 4.0),
-there is a separate implementation per version (in `spark-3.4/`, `spark-3.5/`,
`spark-4.0/`).
+Because Spark's `QueryExecutionErrors` API changes between Spark versions
(3.4, 3.5, and the 4.x line),
+there is a separate implementation per branch (in `spark-3.4/`, `spark-3.5/`,
and `spark-4.x/`, which is
+shared by Spark 4.0, 4.1, and 4.2).

diff --git a/docs/source/index.md b/docs/source/index.md
index 75c5db07b..ba421a603 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -40,21 +40,14 @@ Comet also accelerates Apache Iceberg, when performing
Parquet scans from Spark.
Comet delivers a performance speedup for many queries, enabling faster data
processing and shorter time-to-insights.
-The following chart shows the time it takes to run the 22 TPC-H queries
against 100 GB of data in Parquet format
-using a single executor with 8 cores. See the [Comet Benchmarking
Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html)
-for details of the environment used for these benchmarks.
+The following charts show Comet accelerating the TPC-H benchmark at the 1 TB scale. See the
[Comet Benchmarking
Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html)
+for details.
-When using Comet, the overall run time is reduced from 687 seconds to 302
seconds, a 2.2x speedup.
-
-
+
Here is a breakdown showing relative performance of Spark and Comet for each
TPC-H query.
-
-
-These benchmarks can be reproduced in any environment using the documentation
in the
-[Comet Benchmarking Guide](/contributor-guide/benchmarking.md). We encourage
-you to run your own benchmarks.
+
## Use Commodity Hardware
@@ -68,12 +61,26 @@ Comet aims for 100% compatibility with all supported
versions of Apache Spark, a
your existing Spark deployments and workflows seamlessly. With no code changes
required, you can immediately harness
the benefits of Comet's acceleration capabilities without disrupting your
Spark applications.
+The project strives to keep feature parity with Apache Spark; that is, users
should expect the same behavior (with respect to
+features, configurations, query results, etc.) with Comet turned on or turned
off in their Spark jobs. In addition,
+the Comet extension automatically detects unsupported features and falls back
to the Spark engine.
+
## Tight Integration with Apache DataFusion
Comet tightly integrates with the core Apache DataFusion project, leveraging
its powerful execution engine. With
seamless interoperability between Comet and DataFusion, you can achieve
optimal performance and efficiency in your
Spark workloads.
+## Architecture
+
+The following diagram provides an overview of Comet's architecture.
+
+
+
+The following diagram shows how Comet integrates with Apache Spark.
+
+
+
## Active Community
Comet boasts a vibrant and active community of developers, contributors, and
users dedicated to advancing the
@@ -86,8 +93,6 @@ To get started with Apache DataFusion Comet, follow the
[DataFusion Slack and Discord
channels](https://datafusion.apache.org/contributor-guide/communication.html)
to connect
with other users, ask questions, and share your experiences with Comet.
-Follow [Apache DataFusion Comet
Overview](https://datafusion.apache.org/comet/about/index.html) to get more
detailed information
-
## Contributing
We welcome contributions from the community to help improve and enhance Apache
DataFusion Comet. Whether it's fixing
@@ -100,8 +105,8 @@ shaping the future of Comet. Check out our
:caption: Index
:hidden:
-Comet Overview <about/index>
User Guide <user-guide/index>
Contributor Guide <contributor-guide/index>
+Comparison with Gluten <about/gluten_comparison>
ASF Links <asf/index>
```
diff --git a/docs/source/user-guide/index.md b/docs/source/user-guide/index.md
index 8fe9f9328..65736c1d4 100644
--- a/docs/source/user-guide/index.md
+++ b/docs/source/user-guide/index.md
@@ -19,9 +19,17 @@ under the License.
# Comet User Guide
+The Comet user guide covers installation, configuration, supported data
sources, supported operators
+and expressions, and tuning advice for running Apache Spark with Comet
acceleration.
+
+User guides are published for each release. The development snapshot tracks
the upcoming release and
+may include features and fixes that are not yet generally available. Select a
version from the
+navigation menu to view its guide.
+
```{toctree}
:maxdepth: 2
:caption: User Guides
+:hidden:
0.16.0-SNAPSHOT <latest/index>
0.15.x <0.15/index>
diff --git a/docs/source/user-guide/latest/compatibility/expressions/index.md
b/docs/source/user-guide/latest/compatibility/expressions/index.md
index 9fbec44f0..ba0c4b5d5 100644
--- a/docs/source/user-guide/latest/compatibility/expressions/index.md
+++ b/docs/source/user-guide/latest/compatibility/expressions/index.md
@@ -27,6 +27,7 @@ Compatibility notes are grouped by expression category:
```{toctree}
:maxdepth: 1
+:hidden:
aggregate
array
diff --git a/docs/source/user-guide/latest/compatibility/index.md
b/docs/source/user-guide/latest/compatibility/index.md
index 7bda0570d..1ba9d9e18 100644
--- a/docs/source/user-guide/latest/compatibility/index.md
+++ b/docs/source/user-guide/latest/compatibility/index.md
@@ -32,6 +32,7 @@ This guide documents areas where Comet's behavior is known to
differ from Spark.
```{toctree}
:maxdepth: 1
+:hidden:
scans
floating-point
diff --git a/docs/source/user-guide/latest/compatibility/scans.md
b/docs/source/user-guide/latest/compatibility/scans.md
index d68c59d56..d713dd4e8 100644
--- a/docs/source/user-guide/latest/compatibility/scans.md
+++ b/docs/source/user-guide/latest/compatibility/scans.md
@@ -48,7 +48,6 @@ The following features are not supported by either scan
implementation, and Come
- Spark's Datasource V2 API. When `spark.sql.sources.useV1SourceList` does not
include `parquet`, Spark uses the
V2 API for Parquet scans. The DataFusion-based implementations only support
the V1 API.
- Spark metadata columns (e.g., `_metadata.file_path`)
-- No support for AQE Dynamic Partition Pruning (DPP). Non-AQE DPP is supported.
The following shared limitation may produce incorrect results without falling
back to Spark:
@@ -81,6 +80,22 @@ requires `spark.comet.exec.enabled=true` because the scan
node must be wrapped b
are detected at read time and raise a `SparkRuntimeException` with error
class `_LEGACY_ERROR_TEMP_2093`,
matching Spark's behavior.
+The following `native_datafusion` limitations may produce incorrect results on
Spark versions prior to 4.0
+without falling back to Spark:
+
+- Reading `TimestampLTZ` as `TimestampNTZ`. On Spark 3.x, Spark raises an
error per
+ [SPARK-36182](https://issues.apache.org/jira/browse/SPARK-36182) because LTZ
encodes UTC-adjusted instants
+ that cannot be safely reinterpreted as timezone-free values. Comet does not
raise this error and instead
+ returns the raw UTC instant as a `TimestampNTZ` value. This applies to all
LTZ physical encodings (INT96,
+ TIMESTAMP_MICROS, TIMESTAMP_MILLIS). On Spark 4.0+, this read is permitted
+ ([SPARK-47447](https://issues.apache.org/jira/browse/SPARK-47447)) and Comet
matches Spark's behavior.
+ See [#4219](https://github.com/apache/datafusion-comet/issues/4219).
+
+- Unsupported Parquet type conversions. Spark raises schema incompatibility
errors for certain conversions
+ (e.g., reading INT32 as BIGINT, reading BINARY as TIMESTAMP, unsupported
decimal precision changes).
+ The `native_datafusion` scan may not detect these mismatches and could
return unexpected values instead
+ of raising an error. See
[#3720](https://github.com/apache/datafusion-comet/issues/3720).
+
## `native_iceberg_compat` Limitations
The `native_iceberg_compat` scan has the following additional limitation that
may produce incorrect results
diff --git a/docs/source/user-guide/latest/compatibility/spark-versions.md
b/docs/source/user-guide/latest/compatibility/spark-versions.md
index 115b1595b..663be37a2 100644
--- a/docs/source/user-guide/latest/compatibility/spark-versions.md
+++ b/docs/source/user-guide/latest/compatibility/spark-versions.md
@@ -28,10 +28,22 @@ compatibility guide.
Spark 3.4.3 is supported with Java 11/17 and Scala 2.12/2.13.
+### Known Limitations
+
+- **Reading `TimestampLTZ` as `TimestampNTZ`**: Spark 3.4 raises an error for
this operation
+ (SPARK-36182), but Comet's `native_datafusion` scan silently returns the raw
UTC value instead.
+ See [Parquet Compatibility](scans.md#native_datafusion-limitations) for
details.
+
## Spark 3.5
Spark 3.5.8 is supported with Java 11/17 and Scala 2.12/2.13.
+### Known Limitations
+
+- **Reading `TimestampLTZ` as `TimestampNTZ`**: Spark 3.5 raises an error for
this operation
+ (SPARK-36182), but Comet's `native_datafusion` scan silently returns the raw
UTC value instead.
+ See [Parquet Compatibility](scans.md#native_datafusion-limitations) for
details.
+
## Spark 4.0
Spark 4.0.2 is supported with Java 17 and Scala 2.13.
@@ -42,14 +54,9 @@ Spark 4.0.2 is supported with Java 17 and Scala 2.13.
[#4051](https://github.com/apache/datafusion-comet/issues/4051)): Spark 4.0
introduced collation
support. Non-default collated strings are not yet supported by Comet and
will fall back to Spark.
-## Spark 4.1 (Experimental)
-
-Spark 4.1.1 is provided as experimental support with Java 17 and Scala 2.13.
+## Spark 4.1
-```{warning}
-Spark 4.1 support is experimental and intended for development and testing
only. It should not be used
-in production.
-```
+Spark 4.1.1 is supported with Java 17/21 and Scala 2.13.
### Known Limitations
diff --git a/docs/source/user-guide/latest/datasources.md
b/docs/source/user-guide/latest/datasources.md
index bbf45dba9..065b719ba 100644
--- a/docs/source/user-guide/latest/datasources.md
+++ b/docs/source/user-guide/latest/datasources.md
@@ -23,10 +23,9 @@
### Parquet
-When `spark.comet.scan.enabled` is enabled, Parquet scans will be performed
natively by Comet if all data types
-in the schema are supported. When this option is not enabled, the scan will
fall back to Spark. In this case,
-enabling `spark.comet.convert.parquet.enabled` will immediately convert the
data into Arrow format, allowing native
-execution to happen after that, but the process may not be efficient.
+Parquet scans are performed natively by Comet if all data types in the schema
are supported. When the scan
+falls back to Spark, enabling `spark.comet.convert.parquet.enabled` will
immediately convert the data into
+Arrow format, allowing native execution to happen after that, but the process
may not be efficient.
### Apache Iceberg
@@ -182,7 +181,7 @@ the `object_store` crate's format.
This implementation maintains compatibility with existing Hadoop S3A
configurations, so existing code will
continue to work as long as the configurations are supported and can be
translated without loss of functionality.
-#### Root CA Certificates
+### Root CA Certificates
One major difference between Spark and Comet is the mechanism for discovering
Root
CA Certificates. Spark uses the JVM to read CA Certificates from the Java
Trust Store, but native Comet
@@ -190,7 +189,7 @@ scans use system Root CA Certificates (typically stored
in `/etc/ssl/certs` on Linux). These scans will not be able to interact with
S3 if the Root CA Certificates are not
installed.
-#### Supported Credential Providers
+### Supported Credential Providers
AWS credential providers can be configured using the
`fs.s3a.aws.credentials.provider` configuration. The following table shows the
supported credential providers and their configuration options:
@@ -209,7 +208,7 @@ AWS credential providers can be configured using the
`fs.s3a.aws.credentials.pro
Multiple credential providers can be specified in a comma-separated list using
the `fs.s3a.aws.credentials.provider` configuration, just as Hadoop AWS
supports. If `fs.s3a.aws.credentials.provider` is not configured, Hadoop S3A's
default credential provider chain will be used. All configuration options also
support bucket-specific overrides using the pattern
`fs.s3a.bucket.{bucket-name}.{option}`.
-#### Additional S3 Configuration Options
+### Additional S3 Configuration Options
Beyond credential providers, the `native_datafusion` and
`native_iceberg_compat` implementations support additional
S3 configuration options:
@@ -223,7 +222,7 @@ S3 configuration options:
All configuration options support bucket-specific overrides using the pattern
`fs.s3a.bucket.{bucket-name}.{option}`.
-#### Examples
+### Examples
The following examples demonstrate how to configure S3 access with the
`native_datafusion` and `native_iceberg_compat`
Parquet scan implementations using different authentication methods.
@@ -256,7 +255,7 @@ $SPARK_HOME/bin/spark-shell \
...
```
-#### Limitations
+### Limitations
The S3 support of `native_datafusion` and `native_iceberg_compat` has the
following limitations:
diff --git a/docs/source/user-guide/latest/iceberg.md
b/docs/source/user-guide/latest/iceberg.md
index cb6fdab2c..5c63ae9ad 100644
--- a/docs/source/user-guide/latest/iceberg.md
+++ b/docs/source/user-guide/latest/iceberg.md
@@ -25,13 +25,17 @@ Comet's native Iceberg reader relies on reflection to
extract `FileScanTask`s fr
then serialized to Comet's native execution engine (see
[PR #2528](https://github.com/apache/datafusion-comet/pull/2528)).
-The example below uses Spark's package downloader to retrieve Comet 0.14.0 and
Iceberg
+The example below uses Spark's package downloader to retrieve Comet
$COMET_VERSION and Iceberg
1.8.1, but Comet has been tested with Iceberg 1.5, 1.7, 1.8, 1.9, and 1.10.
The native Iceberg
reader is enabled by default. To disable it, set
`spark.comet.scan.icebergNative.enabled=false`.
+The example uses the Spark 3.5 / Scala 2.12 build of Comet; substitute the
Comet artifact
+matching your Spark and Scala versions (Comet also ships Spark 3.5 / Scala
2.13 and Spark
+4.0/4.1 / Scala 2.13 jars; see the [installation guide](installation.md) for
the full list).
+
```shell
$SPARK_HOME/bin/spark-shell \
- --packages
org.apache.datafusion:comet-spark-spark4.1_2.13:0.14.0,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1
\
+ --packages
org.apache.datafusion:comet-spark-spark3.5_2.12:$COMET_VERSION,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1
\
--repositories https://repo1.maven.org/maven2/ \
--conf
spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
\
--conf
spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkCatalog \
@@ -39,7 +43,6 @@ $SPARK_HOME/bin/spark-shell \
--conf spark.sql.catalog.spark_catalog.warehouse=/tmp/warehouse \
--conf spark.plugins=org.apache.spark.CometPlugin \
--conf
spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager
\
- --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
--conf spark.comet.explainFallback.enabled=true \
--conf spark.memory.offHeap.enabled=true \
--conf spark.memory.offHeap.size=2g
@@ -106,7 +109,7 @@ configure Spark to use a REST catalog with Comet's native
Iceberg scan:
```shell
$SPARK_HOME/bin/spark-shell \
- --packages
org.apache.datafusion:comet-spark-spark4.1_2.13:0.14.0,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1
\
+ --packages
org.apache.datafusion:comet-spark-spark3.5_2.12:$COMET_VERSION,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1
\
--repositories https://repo1.maven.org/maven2/ \
--conf
spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
\
--conf spark.sql.catalog.rest_cat=org.apache.iceberg.spark.SparkCatalog \
@@ -115,7 +118,6 @@ $SPARK_HOME/bin/spark-shell \
--conf spark.sql.catalog.rest_cat.warehouse=/tmp/warehouse \
--conf spark.plugins=org.apache.spark.CometPlugin \
--conf
spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager
\
- --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
--conf spark.comet.explainFallback.enabled=true \
--conf spark.memory.offHeap.enabled=true \
--conf spark.memory.offHeap.size=2g
@@ -141,6 +143,8 @@ The following scenarios will fall back to Spark's native
Iceberg reader:
- Scans with residual filters using `truncate`, `bucket`, `year`, `month`,
`day`, or `hour`
transform functions (partition pruning still works, but row-level filtering
of these
transforms falls back)
+- Dynamic Partition Pruning under Adaptive Query Execution (non-AQE DPP is
supported);
+ see [#3510](https://github.com/apache/datafusion-comet/issues/3510)
### Task input metrics
diff --git a/docs/source/user-guide/latest/index.rst
b/docs/source/user-guide/latest/index.rst
index 480ec4f70..314a0a51b 100644
--- a/docs/source/user-guide/latest/index.rst
+++ b/docs/source/user-guide/latest/index.rst
@@ -22,10 +22,20 @@
Comet $COMET_VERSION User Guide
================================
+This guide covers Comet $COMET_VERSION: how to install it, build it from
source, configure it for
+your Spark deployment, and get the best results from it. It also documents the
data sources, data
+types, operators, and expressions that Comet supports, along with a
compatibility guide describing
+known differences from Apache Spark.
+
+Operational topics include reading and understanding Comet query plans,
tuning, available metrics,
+and integration guides for Apache Iceberg and Kubernetes. Select a topic from
the navigation menu
+to read more.
+
.. _toc.user-guide-links-$COMET_VERSION:
.. toctree::
:maxdepth: 1
:caption: Comet $COMET_VERSION User Guide
+ :hidden:
Installing Comet <installation>
Building From Source <source>
diff --git a/docs/source/user-guide/latest/installation.md
b/docs/source/user-guide/latest/installation.md
index 3da84e210..4b7717b68 100644
--- a/docs/source/user-guide/latest/installation.md
+++ b/docs/source/user-guide/latest/installation.md
@@ -25,8 +25,14 @@ Make sure the following requirements are met and software
installed on your mach
### Supported Operating Systems
-- Linux
-- Apple macOS (Intel and Apple Silicon)
+The published Comet jar files in Maven Central bundle native libraries for
Linux only (amd64 and arm64). macOS
+users must [build from source](source.md).
+
+| Operating System | Published Maven Jars | Build from Source |
+| --------------------------- | -------------------- | ----------------- |
+| Linux (amd64) | Yes | Yes |
+| Linux (arm64) | Yes | Yes |
+| Apple macOS (Apple Silicon) | No | Yes |
### Supported Spark Versions
@@ -34,7 +40,7 @@ Comet $COMET_VERSION supports the following versions of
Apache Spark. Refer to t
in the [Compatibility Guide] for more information, such as known limitations
per Spark version.
[Spark Version Compatibility]: compatibility/spark-versions.md
-[Compatibility Guide]: compatibility
+[Compatibility Guide]: compatibility/index.md
We recommend only using Comet with Spark versions where we currently have both
Comet and Spark tests enabled in CI.
Other versions may work well enough for development and evaluation purposes.
@@ -44,6 +50,7 @@ Other versions may work well enough for development and
evaluation purposes.
| 3.4.3 | 11/17 | 2.12/2.13 | Yes | Yes
|
| 3.5.8 | 11/17 | 2.12/2.13 | Yes | Yes
|
| 4.0.2 | 17/21 | 2.13 | Yes | Yes
|
+| 4.1.1 | 17/21 | 2.13 | Yes | Yes
|
Note that we do not test the full matrix of supported Java and Scala versions
in CI for every Spark version.
@@ -52,7 +59,6 @@ use only and should not be used in production yet.
| Spark Version | Java Version | Scala Version | Comet Tests in CI | Spark
SQL Tests in CI |
| -------------- | ------------ | ------------- | ----------------- |
--------------------- |
-| 4.1.1 | 17 | 2.13 | Yes | No
|
| 4.2.0-preview4 | 17 | 2.13 | No | No
|
Note that Comet may not fully work with proprietary forks of Apache Spark such
as the Spark versions offered by
@@ -86,7 +92,6 @@ Here are the direct links for downloading the Comet
$COMET_VERSION jar file.
- [Comet plugin for Spark 3.5 / Scala
2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.13/$COMET_VERSION/comet-spark-spark3.5_2.13-$COMET_VERSION.jar)
- [Comet plugin for Spark 4.0 / Scala
2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark4.0_2.13/$COMET_VERSION/comet-spark-spark4.0_2.13-$COMET_VERSION.jar)
- [Comet plugin for Spark 4.1 / Scala
2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark4.1_2.13/$COMET_VERSION/comet-spark-spark4.1_2.13-$COMET_VERSION.jar)
-- [Comet plugin for Spark 4.2 / Scala 2.13
(Experimental)](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark4.2_2.13/$COMET_VERSION/comet-spark-spark4.2_2.13-$COMET_VERSION.jar)
<!-- ENDIF -->
## Building from source
@@ -115,7 +120,7 @@ $SPARK_HOME/bin/spark-shell \
--conf
spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager
\
--conf spark.comet.explainFallback.enabled=true \
--conf spark.memory.offHeap.enabled=true \
- --conf spark.memory.offHeap.size=16g
+ --conf spark.memory.offHeap.size=4g
```
### Verify Comet enabled for Spark SQL query
@@ -126,6 +131,16 @@ Create a test Parquet source
scala> (0 until 10).toDF("a").write.mode("overwrite").parquet("/tmp/test")
```
+Comet will log output similar to:
+
+```shell
+INFO core/src/lib.rs: Comet native library version $COMET_VERSION initialized
+WARN CometExecRule: Comet cannot execute some parts of this plan natively (set
spark.comet.explainFallback.enabled=false to disable this logging):
+ Execute InsertIntoHadoopFsRelationCommand [COMET: Native support for
operator DataWritingCommandExec is disabled. Set
spark.comet.parquet.write.enabled=true to enable it.]
++- WriteFiles
+ +- LocalTableScan [COMET: Native support for operator LocalTableScanExec
is disabled. Set spark.comet.exec.localTableScan.enabled=true to enable it.]
+```
+
Query the data from the test source and check:
- INFO message shows the native Comet library has been initialized.
@@ -134,24 +149,15 @@ Query the data from the test source and check:
```scala
scala> spark.read.parquet("/tmp/test").createOrReplaceTempView("t1")
scala> spark.sql("select * from t1 where a > 5").explain
-INFO src/lib.rs: Comet native library initialized
-== Physical Plan ==
- *(1) ColumnarToRow
- +- CometFilter [a#14], (isnotnull(a#14) AND (a#14 > 5))
- +- CometScan parquet [a#14] Batched: true, DataFilters:
[isnotnull(a#14), (a#14 > 5)],
- Format: CometParquet, Location: InMemoryFileIndex(1
paths)[file:/tmp/test], PartitionFilters: [],
- PushedFilters: [IsNotNull(a), GreaterThan(a,5)], ReadSchema:
struct<a:int>
```
-With the configuration `spark.comet.explainFallback.enabled=true`, Comet will
log any reasons that prevent a plan from
-being executed natively.
+Comet will log output similar to:
-```scala
-scala> Seq(1,2,3,4).toDF("a").write.parquet("/tmp/test.parquet")
-WARN CometSparkSessionExtensions$CometExecRule: Comet cannot execute some
parts of this plan natively because:
- - LocalTableScan is not supported
- - WriteFiles is not supported
- - Execute InsertIntoHadoopFsRelationCommand is not supported
+```shell
+== Physical Plan ==
+CometNativeColumnarToRow
++- CometFilter [a#6], (isnotnull(a#6) AND (a#6 > 5))
+ +- CometNativeScan parquet [a#6] Batched: true, DataFilters:
[isnotnull(a#6), (a#6 > 5)], Format: CometParquet, Location:
InMemoryFileIndex(1 paths)[file:/tmp/test], PartitionFilters: [],
PushedFilters: [IsNotNull(a), GreaterThan(a,5)], ReadSchema: struct<a:int>
```
## Additional Configuration
@@ -160,7 +166,7 @@ Depending on your deployment mode you may also need to set
the driver & executor
explicitly contain Comet otherwise Spark may use a different class-loader for
the Comet components than its internal
components which will then fail at runtime. For example:
-```
+```shell
--driver-class-path spark/target/comet-spark-spark4.1_2.13-$COMET_VERSION.jar
```
diff --git a/docs/source/user-guide/latest/kubernetes.md
b/docs/source/user-guide/latest/kubernetes.md
index fd84b7ad9..c06c469f3 100644
--- a/docs/source/user-guide/latest/kubernetes.md
+++ b/docs/source/user-guide/latest/kubernetes.md
@@ -69,13 +69,13 @@ metadata:
spec:
type: Scala
mode: cluster
- image: apache/datafusion-comet:0.7.0-spark4.1.1-scala2.13-java17
+ image: apache/datafusion-comet:$COMET_VERSION-spark3.5.5-scala2.12-java11
imagePullPolicy: IfNotPresent
mainClass: org.apache.spark.examples.SparkPi
mainApplicationFile:
local:///opt/spark/examples/jars/spark-examples_2.13-4.1.1.jar
sparkConf:
- "spark.executor.extraClassPath":
"/opt/spark/jars/comet-spark-spark4.1_2.13-0.7.0.jar"
- "spark.driver.extraClassPath":
"/opt/spark/jars/comet-spark-spark4.1_2.13-0.7.0.jar"
+ "spark.executor.extraClassPath":
"/opt/spark/jars/comet-spark-spark3.5_2.12-$COMET_VERSION.jar"
+ "spark.driver.extraClassPath":
"/opt/spark/jars/comet-spark-spark3.5_2.12-$COMET_VERSION.jar"
"spark.plugins": "org.apache.spark.CometPlugin"
"spark.comet.enabled": "true"
"spark.comet.exec.enabled": "true"
diff --git a/docs/source/user-guide/latest/source.md
b/docs/source/user-guide/latest/source.md
index 6ae43be56..841c4e208 100644
--- a/docs/source/user-guide/latest/source.md
+++ b/docs/source/user-guide/latest/source.md
@@ -23,6 +23,15 @@ It is sometimes preferable to build from source for a
specific platform.
## Using a Published Source Release
+<!-- IF_SNAPSHOT -->
+
+This documentation is for the current development version of Comet. Published
source releases are only available for released versions.
+To use this version of Comet, see the following section on building from the
GitHub repository.
+
+<!-- ENDIF -->
+
+<!-- IF_RELEASE -->
+
Official source releases can be downloaded from
https://dist.apache.org/repos/dist/release/datafusion/
```console
@@ -41,6 +50,8 @@ Build
make release-nogit PROFILES="-Pspark-4.1"
```
+<!-- ENDIF -->
+
## Building from the GitHub repository
Clone the repository:
diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala
b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala
index e35f90e5c..870fb5e47 100644
--- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala
+++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala
@@ -237,7 +237,7 @@ object GenerateDocs {
compat.nonEmpty || incompat.nonEmpty || unsupported.nonEmpty
}
for ((name, compat, incompat, unsupported) <- sorted) {
- w.write(s"\n### $name\n".getBytes)
+ w.write(s"\n## $name\n".getBytes)
if (compat.nonEmpty) {
w.write(
("\nThe following differences from Spark are always present and do
not require" +
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]