szehon-ho commented on code in PR #56712:
URL: https://github.com/apache/spark/pull/56712#discussion_r3478274821
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala:
##########
@@ -337,23 +377,42 @@ object TableOutputResolver extends SQLConfHelper with
Logging {
addError: String => Unit,
colPath: Seq[String] = Nil,
defaultValueFillMode: DefaultValueFillMode.Value,
- enforceFullOutput: Boolean = false): Seq[NamedExpression] = {
+ enforceFullOutput: Boolean = false,
+ autoFilledGenCols: mutable.Set[String] = mutable.Set.empty):
Seq[NamedExpression] = {
val matchedCols = mutable.HashSet.empty[String]
val reordered = expectedCols.flatMap { expectedCol =>
val matched = inputCols.filter(col => conf.resolver(col.name,
expectedCol.name))
val newColPath = colPath :+ expectedCol.name
if (matched.isEmpty) {
- val defaultExpr = if (Set(FILL,
RECURSE).contains(defaultValueFillMode)) {
- getDefaultValueExprOrNullLit(expectedCol,
conf.useNullsForMissingDefaultColumnValues)
- } else {
- None
- }
- if (defaultExpr.isEmpty) {
- throw
QueryCompilationErrors.incompatibleDataToTableCannotFindDataError(
- tableName, newColPath.quoted
- )
+ // Check for generated column expression first, before falling back to
defaults,
+ // since getDefaultValueExprOrNullLit may return null for nullable
columns.
+ GeneratedColumn.getGenerationExpression(expectedCol.metadata) match {
+ case Some(genExprSql) =>
+ autoFilledGenCols += expectedCol.name
+ // The parsed references resolve against the query output (the
user-provided
+ // columns) later in analysis. References to other generated
columns are already
+ // rejected at table definition time (see
GeneratedColumnExpression.validate).
+ val genExpr = CatalystSqlParser.parseExpression(genExprSql)
+ Some(applyColumnMetadata(genExpr, expectedCol))
Review Comment:
Good catch -- by-name was binding the generation expression to the pre-cast
`query.output`, so a value-changing store-assignment cast diverged from the
by-position path. Fixed: by-name now defers resolution until after the
non-generated columns are resolved/cast, then resolves the generation
expression against the post-cast stored columns, mirroring by-position via the
same `resolveGenerationExprReferences` helper. Added two tests (`INSERT by
name/by position computes generated column from the stored (post-cast) value`)
using `id INT, doubled INT GENERATED ALWAYS AS (id * 2)` with `VALUES (2.9)`,
which now correctly yields `(2, 4)` instead of `(2, 5)`.
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableConstraints.scala:
##########
@@ -72,6 +79,57 @@ class ResolveTableConstraints(val catalogManager:
CatalogManager) extends Rule[L
.getOrElse(catalogManager.v1SessionCatalog.parser.parseExpression(c.predicateSql()))
}
+ /**
+ * For each user-provided generated column, add a CheckInvariant that
validates the column
+ * value matches the generation expression. Auto-filled generated columns
are excluded:
+ * ResolveOutputRelation strips their GENERATION_EXPRESSION metadata from
the table output,
+ * so they won't appear here.
+ */
+ private def buildGeneratedColumnConstraints(
+ r: DataSourceV2Relation,
+ v2Write: V2WriteCommand): Seq[Expression] = {
+ if (!GeneratedColumn.supportsGeneratedColumnsOnWrite(r.catalog,
r.table.columns())) {
+ return Seq.empty
+ }
+
+ // Use V2 columns from the table to access both V2 expressions and SQL
strings.
+ // Only add constraints for generated columns whose GENERATION_EXPRESSION
metadata
+ // is still present in the table output -- ResolveOutputRelation strips
the metadata
+ // from auto-filled columns so they are excluded here.
+ val v2Columns = r.table.columns()
+ val resolver = catalogManager.v1SessionCatalog.conf.resolver
+ val userProvidedGenCols = r.output
Review Comment:
Made the contract explicit: added cross-linking comments at both sites --
`ResolveOutputRelation` (where `GENERATION_EXPRESSION` is stripped from
auto-filled columns) and `ResolveTableConstraints` (the read-back) -- noting
the ordering is guaranteed by `outputResolved`, and that a future write path
building the projection without the strip must adjust both sites together (e.g.
by carrying the auto-filled set on the write command). Kept the metadata-strip
approach since it's localized; happy to switch to carrying the set on the
command if you'd prefer.
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala:
##########
@@ -337,23 +377,42 @@ object TableOutputResolver extends SQLConfHelper with
Logging {
addError: String => Unit,
colPath: Seq[String] = Nil,
defaultValueFillMode: DefaultValueFillMode.Value,
- enforceFullOutput: Boolean = false): Seq[NamedExpression] = {
+ enforceFullOutput: Boolean = false,
+ autoFilledGenCols: mutable.Set[String] = mutable.Set.empty):
Seq[NamedExpression] = {
val matchedCols = mutable.HashSet.empty[String]
val reordered = expectedCols.flatMap { expectedCol =>
val matched = inputCols.filter(col => conf.resolver(col.name,
expectedCol.name))
val newColPath = colPath :+ expectedCol.name
if (matched.isEmpty) {
- val defaultExpr = if (Set(FILL,
RECURSE).contains(defaultValueFillMode)) {
- getDefaultValueExprOrNullLit(expectedCol,
conf.useNullsForMissingDefaultColumnValues)
- } else {
- None
- }
- if (defaultExpr.isEmpty) {
- throw
QueryCompilationErrors.incompatibleDataToTableCannotFindDataError(
- tableName, newColPath.quoted
- )
+ // Check for generated column expression first, before falling back to
defaults,
+ // since getDefaultValueExprOrNullLit may return null for nullable
columns.
+ GeneratedColumn.getGenerationExpression(expectedCol.metadata) match {
+ case Some(genExprSql) =>
+ autoFilledGenCols += expectedCol.name
+ // The parsed references resolve against the query output (the
user-provided
+ // columns) later in analysis. References to other generated
columns are already
+ // rejected at table definition time (see
GeneratedColumnExpression.validate).
+ val genExpr = CatalystSqlParser.parseExpression(genExprSql)
+ Some(applyColumnMetadata(genExpr, expectedCol))
+ case None =>
+ // When a table has generated columns and the config is off, do not
+ // fill missing nullable non-generated columns with null.
+ val useNullAsDefault = conf.useNullsForMissingDefaultColumnValues
&&
Review Comment:
Done: extracted `useNullAsDefaultForMissingColumn(expectedCols, conf)` and
call it from both the by-name and by-position paths so they can't drift.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]