This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
     new bab70d2a9 fix: fall back on nested types for default values (#1799)
bab70d2a9 is described below

commit bab70d2a9d2760fdf71a7749a568135d60272648
Author: Matt Butrovich <mbutrov...@users.noreply.github.com>
AuthorDate: Wed May 28 14:18:55 2025 -0400

    fix: fall back on nested types for default values (#1799)

    * Fall back on nested types for default values.

    * Update compatibility guide.

    * Address PR feedback.

    Co-authored-by: Andy Grove <agr...@apache.org>

    * Fix compilation after applying suggestion.

    * Refactor check for nested types in default values.

    * Add comment.

    ---------

    Co-authored-by: Andy Grove <agr...@apache.org>
---
 docs/source/user-guide/compatibility.md            |  1 +
 docs/templates/compatibility-template.md           |  1 +
 .../org/apache/comet/rules/CometScanRule.scala     | 22 ++++++++++++++++++----
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/docs/source/user-guide/compatibility.md b/docs/source/user-guide/compatibility.md
index 4316c1c8c..b76e2616c 100644
--- a/docs/source/user-guide/compatibility.md
+++ b/docs/source/user-guide/compatibility.md
@@ -65,6 +65,7 @@ types (regardless of the logical type). This behavior can be disabled by setting
 
 - There is a known performance issue when pushing filters down to Parquet. See the [Comet Tuning Guide] for more information.
 - There are failures in the Spark SQL test suite when enabling these new scans (tracking issues: [#1542] and [#1545]).
+- No support for default values that are nested types (e.g., maps, arrays, structs). Literal default values are supported.
 
 [#1545]: https://github.com/apache/datafusion-comet/issues/1545
 [#1542]: https://github.com/apache/datafusion-comet/issues/1542
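For context, a minimal sketch (not part of this commit) of a sequence that would now trigger the documented fallback. It assumes Spark 3.4+ (column DEFAULT support) with a running SparkSession `spark` and the Comet extension enabled; the table and column names are hypothetical:

    // Create a Parquet table and insert a row before the new column exists.
    spark.sql("CREATE TABLE t (id INT) USING parquet")
    spark.sql("INSERT INTO t VALUES (1)")
    // Adding a column with a nested default gives pre-existing rows an
    // "existence default". Spark stores that value in Java-native form
    // (here a GenericArrayData), which the CometScanRule change below detects.
    spark.sql("ALTER TABLE t ADD COLUMN arr ARRAY<INT> DEFAULT array(1, 2, 3)")
    // The scan of `t` now falls back to Spark rather than running natively
    // in Comet; a scalar default such as DEFAULT 42 would remain native.
    spark.sql("SELECT * FROM t").show()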
diff --git a/docs/templates/compatibility-template.md b/docs/templates/compatibility-template.md
index 191507385..9750c6dc8 100644
--- a/docs/templates/compatibility-template.md
+++ b/docs/templates/compatibility-template.md
@@ -65,6 +65,7 @@ The new scans currently have the following limitations:
 
 - There is a known performance issue when pushing filters down to Parquet. See the [Comet Tuning Guide] for more information.
 - There are failures in the Spark SQL test suite when enabling these new scans (tracking issues: [#1542] and [#1545]).
+- No support for default values that are nested types (e.g., maps, arrays, structs). Literal default values are supported.
 
 [#1545]: https://github.com/apache/datafusion-comet/issues/1545
 [#1542]: https://github.com/apache/datafusion-comet/issues/1542
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
index 677d190b0..b258ea10a 100644
--- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
@@ -22,16 +22,17 @@ package org.apache.comet.rules
 import scala.collection.mutable.ListBuffer
 
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PlanExpression}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, PlanExpression}
 import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.catalyst.util.MetadataColumnHelper
+import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData, MetadataColumnHelper}
+import org.apache.spark.sql.catalyst.util.ResolveDefaultColumns.getExistenceDefaultValues
 import org.apache.spark.sql.comet.{CometBatchScanExec, CometScanExec}
 import org.apache.spark.sql.execution.{FileSourceScanExec, SparkPlan}
 import org.apache.spark.sql.execution.datasources.HadoopFsRelation
 import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
 import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{ArrayType, ByteType, DataType, MapType, ShortType, StructType}
+import org.apache.spark.sql.types._
 
 import org.apache.comet.{CometConf, DataTypeSupport}
 import org.apache.comet.CometConf._
@@ -118,7 +119,20 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] {
       return withInfos(scanExec, fallbackReasons.toSet)
     }
 
-    val typeChecker = new CometScanTypeChecker(scanImpl)
+    val possibleDefaultValues = getExistenceDefaultValues(scanExec.requiredSchema)
+    if (possibleDefaultValues.exists(d => {
+        d != null && (d.isInstanceOf[ArrayBasedMapData] || d
+          .isInstanceOf[GenericInternalRow] || d.isInstanceOf[GenericArrayData])
+      })) {
+      // Spark already converted these to Java-native types, so we can't check SQL types.
+      // ArrayBasedMapData, GenericInternalRow, GenericArrayData correspond to maps, structs,
+      // and arrays respectively.
+      fallbackReasons +=
+        "Full native scan disabled because nested types for default values are not supported"
+      return withInfos(scanExec, fallbackReasons.toSet)
+    }
+
+    val typeChecker = CometScanTypeChecker(scanImpl)
     val schemaSupported =
       typeChecker.isSchemaSupported(scanExec.requiredSchema, fallbackReasons)
     val partitionSchemaSupported =

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org