This is an automated email from the ASF dual-hosted git repository.

mbutrovich pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
     new 3654973d9  chore: Improve process for generating dynamic content into documentation (#2017)
3654973d9 is described below

commit 3654973d910c442d97791cc5e031524acf8a1cc0
Author: Andy Grove <agr...@apache.org>
AuthorDate: Fri Jul 11 13:11:12 2025 -0600

    chore: Improve process for generating dynamic content into documentation (#2017)
---
 docs/source/user-guide/compatibility.md            | 174 +++++++++++----------
 docs/source/user-guide/configs.md                  |   4 +
 docs/templates/compatibility-template.md           | 149 ------------------
 docs/templates/configs-template.md                 |  30 ----
 .../main/scala/org/apache/comet/GenerateDocs.scala |  54 +++++--
 5 files changed, 132 insertions(+), 279 deletions(-)

diff --git a/docs/source/user-guide/compatibility.md b/docs/source/user-guide/compatibility.md
index 84c4aab0e..ab911474e 100644
--- a/docs/source/user-guide/compatibility.md
+++ b/docs/source/user-guide/compatibility.md
@@ -131,94 +131,102 @@ Cast operations in Comet fall into three levels of support:
 
 The following cast operations are generally compatible with Spark except for the differences noted here.
 
-| From Type | To Type | Notes |
-| --------- | ------- | --------------------------------------------------------------------------------------------------------------- |
-| boolean | byte | |
-| boolean | short | |
-| boolean | integer | |
-| boolean | long | |
-| boolean | float | |
-| boolean | double | |
-| boolean | string | |
-| byte | boolean | |
-| byte | short | |
-| byte | integer | |
-| byte | long | |
-| byte | float | |
-| byte | double | |
-| byte | decimal | |
-| byte | string | |
-| short | boolean | |
-| short | byte | |
-| short | integer | |
-| short | long | |
-| short | float | |
-| short | double | |
-| short | decimal | |
-| short | string | |
-| integer | boolean | |
-| integer | byte | |
-| integer | short | |
-| integer | long | |
-| integer | float | |
-| integer | double | |
-| integer | string | |
-| long | boolean | |
-| long | byte | |
-| long | short | |
-| long | integer | |
-| long | float | |
-| long | double | |
-| long | string | |
-| float | boolean | |
-| float | byte | |
-| float | short | |
-| float | integer | |
-| float | long | |
-| float | double | |
-| float | string | There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 |
-| double | boolean | |
-| double | byte | |
-| double | short | |
-| double | integer | |
-| double | long | |
-| double | float | |
-| double | string | There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 |
-| decimal | byte | |
-| decimal | short | |
-| decimal | integer | |
-| decimal | long | |
-| decimal | float | |
-| decimal | double | |
-| decimal | decimal | |
-| decimal | string | There can be formatting differences in some case due to Spark using scientific notation where Comet does not |
-| string | boolean | |
-| string | byte | |
-| string | short | |
-| string | integer | |
-| string | long | |
-| string | binary | |
-| string | date | Only supports years between 262143 BC and 262142 AD |
-| date | string | |
-| timestamp | long | |
-| timestamp | string | |
-| timestamp | date | |
+<!-- WARNING! DO NOT MANUALLY MODIFY CONTENT BETWEEN THE BEGIN AND END TAGS -->
+
+<!--BEGIN:COMPAT_CAST_TABLE-->
+| From Type | To Type | Notes |
+|-|-|-|
+| boolean | byte | |
+| boolean | short | |
+| boolean | integer | |
+| boolean | long | |
+| boolean | float | |
+| boolean | double | |
+| boolean | string | |
+| byte | boolean | |
+| byte | short | |
+| byte | integer | |
+| byte | long | |
+| byte | float | |
+| byte | double | |
+| byte | decimal | |
+| byte | string | |
+| short | boolean | |
+| short | byte | |
+| short | integer | |
+| short | long | |
+| short | float | |
+| short | double | |
+| short | decimal | |
+| short | string | |
+| integer | boolean | |
+| integer | byte | |
+| integer | short | |
+| integer | long | |
+| integer | float | |
+| integer | double | |
+| integer | string | |
+| long | boolean | |
+| long | byte | |
+| long | short | |
+| long | integer | |
+| long | float | |
+| long | double | |
+| long | string | |
+| float | boolean | |
+| float | byte | |
+| float | short | |
+| float | integer | |
+| float | long | |
+| float | double | |
+| float | string | There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 |
+| double | boolean | |
+| double | byte | |
+| double | short | |
+| double | integer | |
+| double | long | |
+| double | float | |
+| double | string | There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 |
+| decimal | byte | |
+| decimal | short | |
+| decimal | integer | |
+| decimal | long | |
+| decimal | float | |
+| decimal | double | |
+| decimal | decimal | |
+| decimal | string | There can be formatting differences in some case due to Spark using scientific notation where Comet does not |
+| string | boolean | |
+| string | byte | |
+| string | short | |
+| string | integer | |
+| string | long | |
+| string | binary | |
+| string | date | Only supports years between 262143 BC and 262142 AD |
+| date | string | |
+| timestamp | long | |
+| timestamp | string | |
+| timestamp | date | |
+<!--END:COMPAT_CAST_TABLE-->
 
 ### Incompatible Casts
 
 The following cast operations are not compatible with Spark for all inputs and are disabled by default.
 
-| From Type | To Type | Notes |
-| --------- | --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| integer | decimal | No overflow check |
-| long | decimal | No overflow check |
-| float | decimal | There can be rounding differences |
-| double | decimal | There can be rounding differences |
-| string | float | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
-| string | double | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
-| string | decimal | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. Returns 0.0 instead of null if input contains no digits |
-| string | timestamp | Not all valid formats are supported |
-| binary | string | Only works for binary data representing valid UTF-8 strings |
+<!-- WARNING! DO NOT MANUALLY MODIFY CONTENT BETWEEN THE BEGIN AND END TAGS -->
+
+<!--BEGIN:INCOMPAT_CAST_TABLE-->
+| From Type | To Type | Notes |
+|-|-|-|
+| integer | decimal | No overflow check |
+| long | decimal | No overflow check |
+| float | decimal | There can be rounding differences |
+| double | decimal | There can be rounding differences |
+| string | float | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
+| string | double | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
+| string | decimal | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. Returns 0.0 instead of null if input contains no digits |
+| string | timestamp | Not all valid formats are supported |
+| binary | string | Only works for binary data representing valid UTF-8 strings |
+<!--END:INCOMPAT_CAST_TABLE-->
 
 ### Unsupported Casts
 
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index 6544909aa..00adc5173 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -27,6 +27,9 @@ TO MODIFY THIS CONTENT MAKE SURE THAT YOU MAKE YOUR CHANGES TO THE TEMPLATE FILE
 
 Comet provides the following configuration settings.
 
+<!-- WARNING! DO NOT MANUALLY MODIFY CONTENT BETWEEN THE BEGIN AND END TAGS -->
+
+<!--BEGIN:CONFIG_TABLE-->
 | Config | Description | Default Value |
 |--------|-------------|---------------|
 | spark.comet.batchSize | The columnar batch size, i.e., the maximum number of rows that a batch can contain. | 8192 |
@@ -93,3 +96,4 @@ Comet provides the following configuration settings.
 | spark.comet.shuffle.preferDictionary.ratio | The ratio of total values to distinct values in a string column to decide whether to prefer dictionary encoding when shuffling the column. If the ratio is higher than this config, dictionary encoding will be used on shuffling string column. This config is effective if it is higher than 1.0. Note that this config is only used when `spark.comet.exec.shuffle.mode` is `jvm`. | 10.0 |
 | spark.comet.shuffle.sizeInBytesMultiplier | Comet reports smaller sizes for shuffle due to using Arrow's columnar memory format and this can result in Spark choosing a different join strategy due to the estimated size of the exchange being smaller. Comet will multiple sizeInBytes by this amount to avoid regressions in join strategy. | 1.0 |
 | spark.comet.sparkToColumnar.supportedOperatorList | A comma-separated list of operators that will be converted to Arrow columnar format when 'spark.comet.sparkToColumnar.enabled' is true | Range,InMemoryTableScan |
+<!--END:CONFIG_TABLE-->
diff --git a/docs/templates/compatibility-template.md b/docs/templates/compatibility-template.md
deleted file mode 100644
index d26874820..000000000
--- a/docs/templates/compatibility-template.md
+++ /dev/null
@@ -1,149 +0,0 @@
-<!---
-  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements. See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership. The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License. You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing,
-  software distributed under the License is distributed on an
-  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-  KIND, either express or implied. See the License for the
-  specific language governing permissions and limitations
-  under the License.
--->
-
-<!--
-  TO MODIFY THIS CONTENT MAKE SURE THAT YOU MAKE YOUR CHANGES TO THE TEMPLATE FILE
-  (docs/templates/compatibility-template.md) AND NOT THE GENERATED FILE
-  (docs/source/user-guide/compatibility.md) OTHERWISE YOUR CHANGES MAY BE LOST
--->
-
-# Compatibility Guide
-
-Comet aims to provide consistent results with the version of Apache Spark that is being used.
-
-This guide offers information about areas of functionality where there are known differences.
-
-## Parquet
-
-### Data Type Support
-
-Comet does not support reading decimals encoded in binary format.
-
-### Parquet Scans
-
-Comet currently has three distinct implementations of the Parquet scan operator. The configuration property
-`spark.comet.scan.impl` is used to select an implementation.
-
-| Implementation | Description |
-| ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `native_comet` | This is the default implementation. It provides strong compatibility with Spark but does not support complex types. |
-| `native_datafusion` | This implementation delegates to DataFusion's `DataSourceExec`. |
-| `native_iceberg_compat` | This implementation also delegates to DataFusion's `DataSourceExec` but uses a hybrid approach of JVM and native code. This scan is designed to be integrated with Iceberg in the future. |
-
-The new (and currently experimental) `native_datafusion` and `native_iceberg_compat` scans provide the following benefits over the `native_comet`
-implementation:
-
-- Leverages the DataFusion community's ongoing improvements to `DataSourceExec`
-- Provides support for reading complex types (structs, arrays, and maps)
-- Removes the use of reusable mutable-buffers in Comet, which is complex to maintain
-- Improves performance
-
-The new scans currently have the following limitations:
-
-Issues common to both `native_datafusion` and `native_iceberg_compat`:
-
-- When reading Parquet files written by systems other than Spark that contain columns with the logical types `UINT_8`
-  or `UINT_16`, Comet will produce different results than Spark because Spark does not preserve or understand these
-  logical types. Arrow-based readers, such as DataFusion and Comet do respect these types and read the data as unsigned
-  rather than signed. By default, Comet will fall back to Spark when scanning Parquet files containing `byte` or `short`
-  types (regardless of the logical type). This behavior can be disabled by setting
-  `spark.comet.scan.allowIncompatible=true`.
-- There is a known performance issue when pushing filters down to Parquet. See the [Comet Tuning Guide] for more
-  information.
-- Reading maps containing complex types can result in errors or incorrect results [#1754]
-- `PARQUET_FIELD_ID_READ_ENABLED` is not respected [#1758]
-- There are failures in the Spark SQL test suite when enabling these new scans (tracking issues: [#1542] and [#1545]).
-- No support for default values that are nested types (e.g., maps, arrays, structs).
-  Literal default values are supported.
-- Setting Spark configs `ignoreMissingFiles` or `ignoreCorruptFiles` to `true` is not compatible with `native_datafusion` scan.
-
-Issues specific to `native_datafusion`:
-
-- Bucketed scans are not supported
-- No support for row indexes
-
-[#1545]: https://github.com/apache/datafusion-comet/issues/1545
-[#1542]: https://github.com/apache/datafusion-comet/issues/1542
-[#1754]: https://github.com/apache/datafusion-comet/issues/1754
-[#1758]: https://github.com/apache/datafusion-comet/issues/1758
-[Comet Tuning Guide]: tuning.md
-
-## ANSI mode
-
-Comet currently ignores ANSI mode in most cases, and therefore can produce different results than Spark. By default,
-Comet will fall back to Spark if ANSI mode is enabled. To enable Comet to accelerate queries when ANSI mode is enabled,
-specify `spark.comet.ansi.enabled=true` in the Spark configuration. Comet's ANSI support is experimental and should not
-be used in production.
-
-There is an [epic](https://github.com/apache/datafusion-comet/issues/313) where we are tracking the work to fully implement ANSI support.
-
-## Floating number comparison
-
-Spark normalizes NaN and zero for floating point numbers for several cases. See `NormalizeFloatingNumbers` optimization rule in Spark.
-However, one exception is comparison. Spark does not normalize NaN and zero when comparing values
-because they are handled well in Spark (e.g., `SQLOrderingUtil.compareFloats`). But the comparison
-functions of arrow-rs used by DataFusion do not normalize NaN and zero (e.g., [arrow::compute::kernels::cmp::eq](https://docs.rs/arrow/latest/arrow/compute/kernels/cmp/fn.eq.html#)).
-So Comet will add additional normalization expression of NaN and zero for comparison.
-
-There is a known bug with using count(distinct) within aggregate queries, where each NaN value will be counted
-separately [#1824](https://github.com/apache/datafusion-comet/issues/1824).
-
-## Incompatible Expressions
-
-Some Comet native expressions are not 100% compatible with Spark and are disabled by default. These expressions
-will fall back to Spark but can be enabled by setting `spark.comet.expression.allowIncompatible=true`.
-
-## Array Expressions
-
-Comet has experimental support for a number of array expressions. These are experimental and currently marked
-as incompatible and can be enabled by setting `spark.comet.expression.allowIncompatible=true`.
-
-## Regular Expressions
-
-Comet uses the Rust regexp crate for evaluating regular expressions, and this has different behavior from Java's
-regular expression engine. Comet will fall back to Spark for patterns that are known to produce different results, but
-this can be overridden by setting `spark.comet.regexp.allowIncompatible=true`.
-
-## Cast
-
-Cast operations in Comet fall into three levels of support:
-
-- **Compatible**: The results match Apache Spark
-- **Incompatible**: The results may match Apache Spark for some inputs, but there are known issues where some inputs
-  will result in incorrect results or exceptions. The query stage will fall back to Spark by default. Setting
-  `spark.comet.cast.allowIncompatible=true` will allow all incompatible casts to run natively in Comet, but this is not
-  recommended for production use.
-- **Unsupported**: Comet does not provide a native version of this cast expression and the query stage will fall back to
-  Spark.
-
-### Compatible Casts
-
-The following cast operations are generally compatible with Spark except for the differences noted here.
-
-<!--COMPAT_CAST_TABLE-->
-
-### Incompatible Casts
-
-The following cast operations are not compatible with Spark for all inputs and are disabled by default.
-
-<!--INCOMPAT_CAST_TABLE-->
-
-### Unsupported Casts
-
-Any cast not listed in the previous tables is currently unsupported. We are working on adding more. See the
-[tracking issue](https://github.com/apache/datafusion-comet/issues/286) for more details.
diff --git a/docs/templates/configs-template.md b/docs/templates/configs-template.md
deleted file mode 100644
index c6076afd7..000000000
--- a/docs/templates/configs-template.md
+++ /dev/null
@@ -1,30 +0,0 @@
-<!---
-  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements. See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership. The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License. You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing,
-  software distributed under the License is distributed on an
-  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-  KIND, either express or implied. See the License for the
-  specific language governing permissions and limitations
-  under the License.
--->
-
-<!--
-  TO MODIFY THIS CONTENT MAKE SURE THAT YOU MAKE YOUR CHANGES TO THE TEMPLATE FILE
-  (docs/templates/configs-template.md) AND NOT THE GENERATED FILE
-  (docs/source/user-guide/configs.md) OTHERWISE YOUR CHANGES MAY BE LOST
--->
-
-# Comet Configuration Settings
-
-Comet provides the following configuration settings.
-
-<!--CONFIG_TABLE-->
diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala
index d69eecfa0..d8cc62cf9 100644
--- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala
+++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala
@@ -19,9 +19,9 @@
 
 package org.apache.comet
 
-import java.io.{BufferedOutputStream, FileOutputStream}
+import java.io.{BufferedOutputStream, BufferedReader, FileOutputStream, FileReader}
 
-import scala.io.Source
+import scala.collection.mutable.ListBuffer
 
 import org.apache.spark.sql.catalyst.expressions.Cast
 
@@ -40,11 +40,12 @@ object GenerateDocs {
   }
 
   private def generateConfigReference(): Unit = {
-    val templateFilename = "docs/templates/configs-template.md"
-    val outputFilename = "docs/source/user-guide/configs.md"
-    val w = new BufferedOutputStream(new FileOutputStream(outputFilename))
-    for (line <- Source.fromFile(templateFilename).getLines()) {
-      if (line.trim == "<!--CONFIG_TABLE-->") {
+    val filename = "docs/source/user-guide/configs.md"
+    val lines = readFile(filename)
+    val w = new BufferedOutputStream(new FileOutputStream(filename))
+    for (line <- lines) {
+      w.write(s"${line.stripTrailing()}\n".getBytes)
+      if (line.trim == "<!--BEGIN:CONFIG_TABLE-->") {
         val publicConfigs = CometConf.allConfs.filter(_.isPublic)
         val confs = publicConfigs.sortBy(_.key)
         w.write("| Config | Description | Default Value |\n".getBytes)
@@ -56,19 +57,18 @@ object GenerateDocs {
           w.write(s"| ${conf.key} | ${conf.doc.trim} | ${conf.defaultValueString} |\n".getBytes)
         }
       }
-      } else {
-        w.write(s"${line.trim}\n".getBytes)
       }
     }
     w.close()
   }
 
   private def generateCompatibilityGuide(): Unit = {
-    val templateFilename = "docs/templates/compatibility-template.md"
-    val outputFilename = "docs/source/user-guide/compatibility.md"
-    val w = new BufferedOutputStream(new FileOutputStream(outputFilename))
-    for (line <- Source.fromFile(templateFilename).getLines()) {
-      if (line.trim == "<!--COMPAT_CAST_TABLE-->") {
+    val filename = "docs/source/user-guide/compatibility.md"
+    val lines = readFile(filename)
+    val w = new BufferedOutputStream(new FileOutputStream(filename))
+    for (line <- lines) {
+      w.write(s"${line.stripTrailing()}\n".getBytes)
+      if (line.trim == "<!--BEGIN:COMPAT_CAST_TABLE-->") {
         w.write("| From Type | To Type | Notes |\n".getBytes)
         w.write("|-|-|-|\n".getBytes)
         for (fromType <- CometCast.supportedTypes) {
@@ -86,7 +86,7 @@ object GenerateDocs {
           }
         }
       }
-      } else if (line.trim == "<!--INCOMPAT_CAST_TABLE-->") {
+      } else if (line.trim == "<!--BEGIN:INCOMPAT_CAST_TABLE-->") {
         w.write("| From Type | To Type | Notes |\n".getBytes)
         w.write("|-|-|-|\n".getBytes)
         for (fromType <- CometCast.supportedTypes) {
@@ -103,10 +103,30 @@ object GenerateDocs {
           }
         }
       }
-      } else {
-        w.write(s"${line.trim}\n".getBytes)
       }
     }
     w.close()
   }
+
+  /** Read file into memory */
+  private def readFile(filename: String): Seq[String] = {
+    val r = new BufferedReader(new FileReader(filename))
+    val buffer = new ListBuffer[String]()
+    var line = r.readLine()
+    var skipping = false
+    while (line != null) {
+      if (line.startsWith("<!--BEGIN:")) {
+        buffer += line
+        skipping = true
+      } else if (line.startsWith("<!--END:")) {
+        buffer += line
+        skipping = false
+      } else if (!skipping) {
+        buffer += line
+      }
+      line = r.readLine()
+    }
+    r.close()
+    buffer.toSeq
+  }
 }

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org