gengliangwang commented on a change in pull request #35855: URL: https://github.com/apache/spark/pull/35855#discussion_r832321011
########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultColumns.scala ########## @@ -0,0 +1,354 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.ResolveDefaultColumns._ +import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.optimizer.ConstantFolding +import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException} +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.trees.AlwaysProcess +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ + +/** + * This is a rule to process DEFAULT columns in statements such as CREATE/REPLACE TABLE. + * + * Background: CREATE TABLE and ALTER TABLE invocations support setting column default values for + * later operations. Following INSERT, and INSERT MERGE commands may then reference the value + * using the DEFAULT keyword as needed. + * + * Example: + * CREATE TABLE T(a INT DEFAULT 4, b INT NOT NULL DEFAULT 5); + * INSERT INTO T VALUES (1, 2); + * INSERT INTO T VALUES (1, DEFAULT); + * INSERT INTO T VALUES (DEFAULT, 6); + * SELECT * FROM T; + * (1, 2) + * (1, 5) + * (4, 6) + * + * @param catalog the catalog to use for looking up the schema of INSERT INTO table objects. + * @param insert the enclosing INSERT statement for which this rule is processing the query, if any. + */ +case class ResolveDefaultColumns( + catalog: SessionCatalog, insert: Option[InsertIntoStatement] = None) extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsWithPruning( + AlwaysProcess.fn, ruleId) { + + case i@InsertIntoStatement(_, _, _, _, _, _) + if SQLConf.get.enableDefaultColumns && + i.query.collectFirst { case u: UnresolvedInlineTable => u }.isDefined => + val newQuery = ResolveDefaultColumns(catalog, Some(i)).apply(i.query) + i.copy(query = newQuery) + + case table: UnresolvedInlineTable + if SQLConf.get.enableDefaultColumns && insert.isDefined && + table.rows.nonEmpty && table.rows.forall(_.size == table.rows(0).size) => + val expanded: UnresolvedInlineTable = addMissingDefaultColumnValues(table).getOrElse(table) + replaceExplicitDefaultColumnValues(expanded).getOrElse(table) + + case i@InsertIntoStatement(_, _, _, project: Project, _, _) + if SQLConf.get.enableDefaultColumns && !project.resolved => + val helper = ResolveDefaultColumns(catalog, Some(i)) + val replaced: Option[Project] = helper.replaceExplicitDefaultColumnValues(project) + if (replaced.isDefined) i.copy(query = replaced.get) else i + + case i@InsertIntoStatement(_, _, _, project: Project, _, _) + if SQLConf.get.enableDefaultColumns && project.resolved => + val helper = ResolveDefaultColumns(catalog, Some(i)) + val expanded: Project = helper.addMissingDefaultColumnValues(project).getOrElse(project) + val replaced: Option[Project] = helper.replaceExplicitDefaultColumnValues(expanded) + if (replaced.isDefined) i.copy(query = replaced.get) else i + } + + // Helper method to check if an expression is an explicit DEFAULT column reference. + private def isExplicitDefaultColumn(expr: Expression): Boolean = expr match { + case u: UnresolvedAttribute if u.name.equalsIgnoreCase(CURRENT_DEFAULT_COLUMN_NAME) => true + case _ => false + } + + // Helper method to check that an inline table has any explicit DEFAULT column references. + private def hasExplicitDefaultReferences(table: UnresolvedInlineTable): Boolean = + table.rows.nonEmpty && table.rows.forall(_.size == table.rows(0).size) && + table.rows.exists{_.exists{isExplicitDefaultColumn(_)}} + + // Each of the following methods adds a projection over the input plan to generate missing default + // column values. + private def addMissingDefaultColumnValues( + table: UnresolvedInlineTable): Option[UnresolvedInlineTable] = { + assert(insert.isDefined) + val numQueryOutputs: Int = table.rows(0).size + val schema: StructType = getInsertTableSchema.getOrElse(return None) + val schemaWithoutPartitionCols = + StructType(schema.fields.dropRight(insert.get.partitionSpec.size)) + val newDefaultExprs: Seq[Expression] = + getDefaultExprs(numQueryOutputs, schema, schemaWithoutPartitionCols) + val newNames: Seq[String] = + schemaWithoutPartitionCols.fields.drop(numQueryOutputs).map { _.name } + if (newDefaultExprs.isEmpty) return None + Some(table.copy(names = table.names ++ newNames, + rows = table.rows.map { row => row ++ newDefaultExprs })) + } + + private def addMissingDefaultColumnValues(project: Project): Option[Project] = { + val numQueryOutputs: Int = project.projectList.size + val schema: StructType = getInsertTableSchema.getOrElse(return None) + val schemaWithoutPartitionCols = + StructType(schema.fields.dropRight(insert.get.partitionSpec.size)) + val newDefaultExprs: Seq[Expression] = + getDefaultExprs(numQueryOutputs, schema, schemaWithoutPartitionCols) + val newAliases: Seq[NamedExpression] = + newDefaultExprs.zip(schemaWithoutPartitionCols.fields).map { + case (expr, field) => Alias(expr, field.name)() + } + if (newDefaultExprs.isEmpty) return None + Some(project.copy(projectList = project.projectList ++ newAliases)) + } + + // This is a helper for the addMissingDefaultColumnValues methods above. + private def getDefaultExprs(numQueryOutputs: Int, schema: StructType, + schemaWithoutPartitionCols: StructType): Seq[Expression] = { + val remainingFields: Seq[StructField] = schemaWithoutPartitionCols.fields.drop(numQueryOutputs) + val numDefaultExprsToAdd: Int = { + if (SQLConf.get.useNullsForMissingDefaultColumnValues) { + remainingFields.size + } else { + remainingFields.takeWhile(_.metadata.contains(CURRENT_DEFAULT_COLUMN_METADATA_KEY)).size + } + } + Seq.fill(numDefaultExprsToAdd)(UnresolvedAttribute(CURRENT_DEFAULT_COLUMN_NAME)) + } + + // Each of the following methods replaces unresolved "DEFAULT" column references with matching + // default column values. + private def replaceExplicitDefaultColumnValues( + table: UnresolvedInlineTable): Option[UnresolvedInlineTable] = { + assert(insert.isDefined) + val schema: StructType = getInsertTableSchema.getOrElse(return None) + val schemaWithoutPartitionCols = + StructType(schema.fields.dropRight(insert.get.partitionSpec.size)) + val defaultExprs: Seq[Expression] = schemaWithoutPartitionCols.fields.map { + case f if f.metadata.contains(CURRENT_DEFAULT_COLUMN_METADATA_KEY) => analyze(f, "INSERT") + case _ => Literal(null) + } + var replaced = false + val newRows: Seq[Seq[Expression]] = { + table.rows.map { row: Seq[Expression] => + for { + i <- 0 until row.size + expr = row(i) + defaultExpr = if (i < defaultExprs.size) defaultExprs(i) else Literal(null) + } yield expr match { + case u: UnresolvedAttribute if isExplicitDefaultColumn(u) => + replaced = true + defaultExpr + case expr@_ if expr.find{isExplicitDefaultColumn}.isDefined => + // Return a more descriptive error message if the user tries to nest the DEFAULT column + // reference inside some other expression, such as DEFAULT + 1 (this is not allowed). + throw new AnalysisException(DEFAULTS_IN_EXPRESSIONS_ERROR) + case _ => expr + } + } + } + if (!replaced) return None + Some(table.copy(rows = newRows)) + } + + private def replaceExplicitDefaultColumnValues(project: Project): Option[Project] = { + assert(insert.isDefined) + val schema: StructType = getInsertTableSchema.getOrElse(return None) + val schemaWithoutPartitionCols = + StructType(schema.fields.dropRight(insert.get.partitionSpec.size)) + val colNames: Seq[String] = schemaWithoutPartitionCols.fields.map { _.name } + val defaultExprs: Seq[Expression] = schemaWithoutPartitionCols.fields.map { + case f if f.metadata.contains(CURRENT_DEFAULT_COLUMN_METADATA_KEY) => analyze(f, "INSERT") + case _ => Literal(null) + } + var replaced = false + val updated: Seq[NamedExpression] = { + for { + i <- 0 until project.projectList.size + projectExpr = project.projectList(i) + defaultExpr = if (i < defaultExprs.size) defaultExprs(i) else Literal(null) + colName = if (i < colNames.size) colNames(i) else "" + } yield projectExpr match { + case Alias(u: UnresolvedAttribute, _) if isExplicitDefaultColumn(u) => + replaced = true + Alias(defaultExpr, colName)() + case u: UnresolvedAttribute if isExplicitDefaultColumn(u) => + replaced = true + Alias(defaultExpr, colName)() + case expr@_ if expr.find{isExplicitDefaultColumn}.isDefined => + // Return a more descriptive error message if the user tries to nest the DEFAULT column + // reference inside some other expression, such as DEFAULT + 1 (this is not allowed). + throw new AnalysisException(DEFAULTS_IN_EXPRESSIONS_ERROR) + case _ => projectExpr + } + } + if (!replaced) return None + Some(project.copy(projectList = updated)) + } + + /** + * Looks up the schema for the table object of an INSERT INTO statement from the catalog. + * + * @return the schema of the indicated table, or None if the lookup found nothing. + */ + private def getInsertTableSchema: Option[StructType] = { + assert(insert.isDefined) + val lookup = try { + val tableName = insert.get.table match { + case r: UnresolvedRelation => TableIdentifier(r.name) + case r: UnresolvedCatalogRelation => r.tableMeta.identifier + case _ => return None + } + catalog.lookupRelation(tableName) + } catch { + case _: NoSuchTableException => return None + } + lookup match { + case SubqueryAlias(_, r: UnresolvedCatalogRelation) => Some(r.tableMeta.schema) + case _ => None + } + } +} + +/** + * This object contains fields to help process DEFAULT columns with the below rule of the same name. + */ +object ResolveDefaultColumns { + // This column metadata indicates the default value associated with a particular table column that + // is in effect at any given time. Its value begins at the time of the initial CREATE/REPLACE + // TABLE statement with DEFAULT column definition(s), if any. It then changes whenever an ALTER + // TABLE statement SETs the DEFAULT. The intent is for this "current default" to be used by + // UPDATE, INSERT and MERGE, which evaluate each default expression for each row. + val CURRENT_DEFAULT_COLUMN_METADATA_KEY = "CURRENT_DEFAULT" + // This column metadata represents the default value for all existing rows in a table after a + // column has been added. This value is determined at time of CREATE TABLE, REPLACE TABLE, or + // ALTER TABLE ADD COLUMN, and never changes thereafter. The intent is for this "exist default" + // to be used by any scan when the columns in the source row are incomplete. Review comment: I don't quite understand this. What do you mean by "columns in the source row are incomplete"? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
