This is an automated email from the ASF dual-hosted git repository.
yuanzhou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 33732d9903 [VL] Fix Hudi scan fallback (#9419)
33732d9903 is described below
commit 33732d9903e630f5548238bb4b2feafe5f45744a
Author: Shiyan Xu <[email protected]>
AuthorDate: Tue Apr 29 11:25:08 2025 -0500
[VL] Fix Hudi scan fallback (#9419)
When the file format is NewHoodieParquetFileFormat, the old condition still
matched (its class name also ends with "HoodieParquetFileFormat"), so the plan
was passed to Velox for execution. But the result wasn't correct: merging
wasn't performed, and only the Parquet records were returned.
We should fall back to Spark for NewHoodieParquetFileFormat.
---
.../org/apache/gluten/execution/HudiScanTransformer.scala | 14 ++++++++++++++
.../org/apache/gluten/execution/OffloadHudiScan.scala | 4 ++--
2 files changed, 16 insertions(+), 2 deletions(-)
diff --git
a/gluten-hudi/src/main/scala/org/apache/gluten/execution/HudiScanTransformer.scala
b/gluten-hudi/src/main/scala/org/apache/gluten/execution/HudiScanTransformer.scala
index 76a818c96e..267580e9a3 100644
---
a/gluten-hudi/src/main/scala/org/apache/gluten/execution/HudiScanTransformer.scala
+++
b/gluten-hudi/src/main/scala/org/apache/gluten/execution/HudiScanTransformer.scala
@@ -90,4 +90,18 @@ object HudiScanTransformer {
scanExec.disableBucketedScan
)
}
+
+ /**
+ * Check if the Hudi file format is supported for native engine execution.
+ *
+ * If not, Gluten will fall back to Spark execution.
+ */
+ def isSupportedHudiFileFormat(fileFormatName: String): Boolean = {
+ // Support formats like:
+ //
"org.apache.spark.sql.execution.datasources.parquet.Spark35LegacyHoodieParquetFileFormat"
+ //
"org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat"
+ // But exclude "NewHoodieParquetFileFormat"
+ !fileFormatName.endsWith("NewHoodieParquetFileFormat") &&
+ fileFormatName.endsWith("HoodieParquetFileFormat")
+ }
}
diff --git
a/gluten-hudi/src/main/scala/org/apache/gluten/execution/OffloadHudiScan.scala
b/gluten-hudi/src/main/scala/org/apache/gluten/execution/OffloadHudiScan.scala
index f937dba28c..f2cc24ceaf 100644
---
a/gluten-hudi/src/main/scala/org/apache/gluten/execution/OffloadHudiScan.scala
+++
b/gluten-hudi/src/main/scala/org/apache/gluten/execution/OffloadHudiScan.scala
@@ -24,9 +24,9 @@ import org.apache.spark.sql.execution.SparkPlan
case class OffloadHudiScan() extends OffloadSingleNode {
override def offload(plan: SparkPlan): SparkPlan = {
plan match {
- // Hudi has multiple file format definitions whose names end with
"HoodieParquetFileFormat".
case scan: org.apache.spark.sql.execution.FileSourceScanExec
- if
scan.relation.fileFormat.getClass.getName.endsWith("HoodieParquetFileFormat") =>
+ if HudiScanTransformer.isSupportedHudiFileFormat(
+ scan.relation.fileFormat.getClass.getName) =>
HudiScanTransformer(scan)
case other => other
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]