szehon-ho commented on code in PR #56712:
URL: https://github.com/apache/spark/pull/56712#discussion_r3478274821


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala:
##########
@@ -337,23 +377,42 @@ object TableOutputResolver extends SQLConfHelper with 
Logging {
       addError: String => Unit,
       colPath: Seq[String] = Nil,
       defaultValueFillMode: DefaultValueFillMode.Value,
-      enforceFullOutput: Boolean = false): Seq[NamedExpression] = {
+      enforceFullOutput: Boolean = false,
+      autoFilledGenCols: mutable.Set[String] = mutable.Set.empty): 
Seq[NamedExpression] = {
     val matchedCols = mutable.HashSet.empty[String]
     val reordered = expectedCols.flatMap { expectedCol =>
       val matched = inputCols.filter(col => conf.resolver(col.name, 
expectedCol.name))
       val newColPath = colPath :+ expectedCol.name
       if (matched.isEmpty) {
-        val defaultExpr = if (Set(FILL, 
RECURSE).contains(defaultValueFillMode)) {
-          getDefaultValueExprOrNullLit(expectedCol, 
conf.useNullsForMissingDefaultColumnValues)
-        } else {
-          None
-        }
-        if (defaultExpr.isEmpty) {
-          throw 
QueryCompilationErrors.incompatibleDataToTableCannotFindDataError(
-            tableName, newColPath.quoted
-          )
+        // Check for generated column expression first, before falling back to 
defaults,
+        // since getDefaultValueExprOrNullLit may return null for nullable 
columns.
+        GeneratedColumn.getGenerationExpression(expectedCol.metadata) match {
+          case Some(genExprSql) =>
+            autoFilledGenCols += expectedCol.name
+            // The parsed references resolve against the query output (the 
user-provided
+            // columns) later in analysis. References to other generated 
columns are already
+            // rejected at table definition time (see 
GeneratedColumnExpression.validate).
+            val genExpr = CatalystSqlParser.parseExpression(genExprSql)
+            Some(applyColumnMetadata(genExpr, expectedCol))

Review Comment:
   Good catch -- by-name was binding the generation expression to the pre-cast 
`query.output`, so a value-changing store-assignment cast diverged from the 
by-position path. Fixed: by-name now defers resolution until after the 
non-generated columns are resolved/cast, then resolves the generation 
expression against the post-cast stored columns, mirroring by-position via the 
same `resolveGenerationExprReferences` helper. Added two tests (`INSERT by 
name/by position computes generated column from the stored (post-cast) value`) 
using `id INT, doubled INT GENERATED ALWAYS AS (id * 2)` with `VALUES (2.9)`, 
which now correctly yields `(2, 4)` instead of `(2, 5)`.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableConstraints.scala:
##########
@@ -72,6 +79,57 @@ class ResolveTableConstraints(val catalogManager: 
CatalogManager) extends Rule[L
       
.getOrElse(catalogManager.v1SessionCatalog.parser.parseExpression(c.predicateSql()))
   }
 
+  /**
+   * For each user-provided generated column, add a CheckInvariant that 
validates the column
+   * value matches the generation expression. Auto-filled generated columns 
are excluded:
+   * ResolveOutputRelation strips their GENERATION_EXPRESSION metadata from 
the table output,
+   * so they won't appear here.
+   */
+  private def buildGeneratedColumnConstraints(
+      r: DataSourceV2Relation,
+      v2Write: V2WriteCommand): Seq[Expression] = {
+    if (!GeneratedColumn.supportsGeneratedColumnsOnWrite(r.catalog, 
r.table.columns())) {
+      return Seq.empty
+    }
+
+    // Use V2 columns from the table to access both V2 expressions and SQL 
strings.
+    // Only add constraints for generated columns whose GENERATION_EXPRESSION 
metadata
+    // is still present in the table output -- ResolveOutputRelation strips 
the metadata
+    // from auto-filled columns so they are excluded here.
+    val v2Columns = r.table.columns()
+    val resolver = catalogManager.v1SessionCatalog.conf.resolver
+    val userProvidedGenCols = r.output

Review Comment:
   Made the contract explicit: added cross-linking comments at both sites -- 
`ResolveOutputRelation` (where `GENERATION_EXPRESSION` is stripped from 
auto-filled columns) and `ResolveTableConstraints` (the read-back) -- noting 
the ordering is guaranteed by `outputResolved`, and that a future write path 
building the projection without the strip must adjust both sites together (e.g. 
by carrying the auto-filled set on the write command). Kept the metadata-strip 
approach since it's localized; happy to switch to carrying the set on the 
command if you'd prefer.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala:
##########
@@ -337,23 +377,42 @@ object TableOutputResolver extends SQLConfHelper with 
Logging {
       addError: String => Unit,
       colPath: Seq[String] = Nil,
       defaultValueFillMode: DefaultValueFillMode.Value,
-      enforceFullOutput: Boolean = false): Seq[NamedExpression] = {
+      enforceFullOutput: Boolean = false,
+      autoFilledGenCols: mutable.Set[String] = mutable.Set.empty): 
Seq[NamedExpression] = {
     val matchedCols = mutable.HashSet.empty[String]
     val reordered = expectedCols.flatMap { expectedCol =>
       val matched = inputCols.filter(col => conf.resolver(col.name, 
expectedCol.name))
       val newColPath = colPath :+ expectedCol.name
       if (matched.isEmpty) {
-        val defaultExpr = if (Set(FILL, 
RECURSE).contains(defaultValueFillMode)) {
-          getDefaultValueExprOrNullLit(expectedCol, 
conf.useNullsForMissingDefaultColumnValues)
-        } else {
-          None
-        }
-        if (defaultExpr.isEmpty) {
-          throw 
QueryCompilationErrors.incompatibleDataToTableCannotFindDataError(
-            tableName, newColPath.quoted
-          )
+        // Check for generated column expression first, before falling back to 
defaults,
+        // since getDefaultValueExprOrNullLit may return null for nullable 
columns.
+        GeneratedColumn.getGenerationExpression(expectedCol.metadata) match {
+          case Some(genExprSql) =>
+            autoFilledGenCols += expectedCol.name
+            // The parsed references resolve against the query output (the 
user-provided
+            // columns) later in analysis. References to other generated 
columns are already
+            // rejected at table definition time (see 
GeneratedColumnExpression.validate).
+            val genExpr = CatalystSqlParser.parseExpression(genExprSql)
+            Some(applyColumnMetadata(genExpr, expectedCol))
+          case None =>
+            // When a table has generated columns and the config is off, do not
+            // fill missing nullable non-generated columns with null.
+            val useNullAsDefault = conf.useNullsForMissingDefaultColumnValues 
&&

Review Comment:
   Done: extracted `useNullAsDefaultForMissingColumn(expectedCols, conf)` and 
call it from both the by-name and by-position paths so they can't drift.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to