This is an automated email from the ASF dual-hosted git repository.
mbutrovich pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new b351e3391 docs: update docs and tuning guide related to native shuffle
(#2487)
b351e3391 is described below
commit b351e3391a2eaac6dbb5913417261f0addb3778e
Author: Matt Butrovich <[email protected]>
AuthorDate: Mon Sep 29 12:14:33 2025 -0400
docs: update docs and tuning guide related to native shuffle (#2487)
---
docs/source/user-guide/latest/tuning.md | 8 ++++----
.../main/scala/org/apache/comet/rules/CometExecRule.scala | 13 ++++++++++---
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/docs/source/user-guide/latest/tuning.md
b/docs/source/user-guide/latest/tuning.md
index efa6ebaae..a35e32857 100644
--- a/docs/source/user-guide/latest/tuning.md
+++ b/docs/source/user-guide/latest/tuning.md
@@ -208,14 +208,14 @@ back to Spark for shuffle operations.
#### Native Shuffle
-Comet provides a fully native shuffle implementation, which generally provides
the best performance. However,
-native shuffle currently only supports `HashPartitioning` and
`SinglePartitioning` and has some restrictions on
-supported data types.
+Comet provides a fully native shuffle implementation, which generally provides
the best performance. Native shuffle
+supports `HashPartitioning`, `RangePartitioning`, and `SinglePartitioning`, but
currently only supports primitive types as
+partitioning keys. Columns that are not partitioning keys may contain complex
types like maps, structs, and arrays.
#### Columnar (JVM) Shuffle
Comet Columnar shuffle is JVM-based and supports `HashPartitioning`,
`RoundRobinPartitioning`, `RangePartitioning`, and
-`SinglePartitioning`. This shuffle implementation supports more data types
than native shuffle.
+`SinglePartitioning`. This shuffle implementation supports complex data types
as partitioning keys.
### Shuffle Compression
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala
b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala
index 56299ad4e..f572417bd 100644
--- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala
@@ -769,9 +769,9 @@ case class CometExecRule(session: SparkSession) extends
Rule[SparkPlan] {
/**
* Determine which data types are supported as partition columns in native
shuffle.
*
- * For Hash Partition this defines the key that determines how data should
be collocated for
- * operations like `groupByKey`, `reduceByKey` or `join`. Native code does
not support hashing
- * complex types, see hash_funcs/utils.rs
+ * For HashPartitioning this defines the key that determines how data
should be collocated for
+ * operations like `groupByKey`, `reduceByKey`, or `join`. Native code
does not support
+ * hashing complex types, see hash_funcs/utils.rs
*/
def supportedHashPartitioningDataType(dt: DataType): Boolean = dt match {
case _: BooleanType | _: ByteType | _: ShortType | _: IntegerType | _:
LongType |
@@ -782,6 +782,13 @@ case class CometExecRule(session: SparkSession) extends
Rule[SparkPlan] {
false
}
+ /**
+ * Determine which data types are supported as partition columns in native
shuffle.
+ *
+ * For RangePartitioning this defines the key that determines how data
should be collocated
+ * for operations like `orderBy` or `repartitionByRange`. Native code does
not support sorting
+ * complex types.
+ */
def supportedRangePartitioningDataType(dt: DataType): Boolean = dt match {
case _: BooleanType | _: ByteType | _: ShortType | _: IntegerType | _:
LongType |
_: FloatType | _: DoubleType | _: StringType | _: BinaryType | _:
TimestampType |
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]