aokolnychyi commented on a change in pull request #3764: URL: https://github.com/apache/iceberg/pull/3764#discussion_r772557747
########## File path: spark/v3.2/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteUpdateTable.scala ########## @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.expressions.Alias +import org.apache.spark.sql.catalyst.expressions.EqualNullSafe +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.expressions.If +import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.catalyst.expressions.Not +import org.apache.spark.sql.catalyst.expressions.SubqueryExpression +import org.apache.spark.sql.catalyst.plans.logical.Assignment +import org.apache.spark.sql.catalyst.plans.logical.Filter +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.Project +import org.apache.spark.sql.catalyst.plans.logical.ReplaceData +import org.apache.spark.sql.catalyst.plans.logical.Union +import org.apache.spark.sql.catalyst.plans.logical.UpdateIcebergTable +import org.apache.spark.sql.connector.iceberg.catalog.SupportsRowLevelOperations +import org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.UPDATE +import org.apache.spark.sql.connector.iceberg.write.SupportsDelta +import org.apache.spark.sql.connector.write.RowLevelOperationTable +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation + +/** + * Assigns a rewrite plan for v2 tables that support rewriting data to handle UPDATE statements. + * + * This rule assumes the commands have been fully resolved and all assignments have been aligned. + * That's why it must be run after AlignRowLevelCommandAssignments. + * + * This rule also must be run in the same batch with DeduplicateRelations in Spark. + */ +object RewriteUpdateTable extends RewriteRowLevelCommand { + + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case u @ UpdateIcebergTable(aliasedTable, assignments, cond, None) if u.resolved => + EliminateSubqueryAliases(aliasedTable) match { + case r @ DataSourceV2Relation(tbl: SupportsRowLevelOperations, _, _, _, _) => + val operation = buildRowLevelOperation(tbl, UPDATE) + val table = RowLevelOperationTable(tbl, operation) + val rewritePlan = operation match { + case _: SupportsDelta => + throw new AnalysisException("Delta updates are currently not supported") + case _ if cond.exists(SubqueryExpression.hasSubquery) => + buildReplaceDataWithUnionPlan(r, table, assignments, cond) + case _ => + buildReplaceDataPlan(r, table, assignments, cond) + } + UpdateIcebergTable(r, assignments, cond, Some(rewritePlan)) + + case p => + throw new AnalysisException(s"$p is not an Iceberg table") + } + } + + // build a rewrite plan for sources that support replacing groups of data (e.g. files, partitions) + // if the condition does NOT contain a subquery + private def buildReplaceDataPlan( + relation: DataSourceV2Relation, + table: RowLevelOperationTable, + assignments: Seq[Assignment], + cond: Option[Expression]): ReplaceData = { + + // resolve all needed attrs (e.g. metadata attrs for grouping data on write) + val metadataAttrs = resolveRequiredMetadataAttrs(relation, table.operation) + + // construct a read relation and include all required metadata columns + val readAttrs = dedupAttrs(relation.output ++ metadataAttrs) + val readRelation = relation.copy(table = table, output = readAttrs) + + // build a plan with updated rows + val updateCond = cond.getOrElse(Literal.TrueLiteral) + val allRowsPlan = buildUpdateProjection(readRelation, assignments, updateCond) + + // build a plan to replace read groups in the table + val writeRelation = relation.copy(table = table) + ReplaceData(writeRelation, allRowsPlan, relation) + } + + // build a rewrite plan for sources that support replacing groups of data (e.g. files, partitions) + // if the condition contains a subquery + private def buildReplaceDataWithUnionPlan( + relation: DataSourceV2Relation, + table: RowLevelOperationTable, + assignments: Seq[Assignment], + cond: Option[Expression]): ReplaceData = { + + // resolve all needed attrs (e.g. metadata attrs for grouping data on write) + val metadataAttrs = resolveRequiredMetadataAttrs(relation, table.operation) + + // construct a read relation and include all required metadata columns + // the same read relation will be used to read records that must be updated and be copied over + // DeduplicateRelations will take care of duplicated attr IDs + val readAttrs = dedupAttrs(relation.output ++ metadataAttrs) + val readRelation = relation.copy(table = table, output = readAttrs) + + // build a plan for records that match the cond and should be updated + val updateCond = cond.getOrElse(Literal.TrueLiteral) + val matchedRowsPlan = Filter(updateCond, readRelation) + val updatedRowsPlan = buildUpdateProjection(matchedRowsPlan, assignments) + + // build a plan for records that did not match the cond but had to be copied over + val remainingRowFilter = Not(EqualNullSafe(updateCond, Literal.TrueLiteral)) + val remainingRowsPlan = Filter(remainingRowFilter, readRelation) + + // new state is a union of updated and copied over records + val allRowsPlan = Union(updatedRowsPlan, remainingRowsPlan) Review comment: Renamed it to `updatedAndRemainingRowsPlan`. Still not entirely happy. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
