[GitHub] [iceberg] aokolnychyi commented on a change in pull request #3661: Spark: Implement copy-on-write DELETE

GitBox Tue, 07 Dec 2021 12:54:34 -0800


aokolnychyi commented on a change in pull request #3661:
URL: https://github.com/apache/iceberg/pull/3661#discussion_r764354089




##########
File path: 
spark/v3.2/spark-extensions/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/RowLevelCommandPruning.scala
##########
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.execution.dynamicpruning
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.expressions.And
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.expressions.AttributeMap
+import org.apache.spark.sql.catalyst.expressions.AttributeReference
+import org.apache.spark.sql.catalyst.expressions.DynamicPruningSubquery
+import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.expressions.ExtendedV2ExpressionUtils
+import org.apache.spark.sql.catalyst.expressions.Literal
+import org.apache.spark.sql.catalyst.expressions.PredicateHelper
+import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
+import org.apache.spark.sql.catalyst.planning.RewrittenRowLevelCommand
+import org.apache.spark.sql.catalyst.plans.LeftSemi
+import org.apache.spark.sql.catalyst.plans.logical.Filter
+import org.apache.spark.sql.catalyst.plans.logical.Join
+import org.apache.spark.sql.catalyst.plans.logical.JoinHint
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.MergeIntoTable
+import org.apache.spark.sql.catalyst.plans.logical.Project
+import org.apache.spark.sql.catalyst.plans.logical.ReplaceData
+import org.apache.spark.sql.catalyst.plans.logical.RowLevelCommand
+import org.apache.spark.sql.catalyst.plans.logical.Sort
+import org.apache.spark.sql.catalyst.plans.logical.Subquery
+import org.apache.spark.sql.catalyst.plans.logical.UpdateTable
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.trees.TreePattern.PLAN_EXPRESSION
+import org.apache.spark.sql.catalyst.trees.TreePattern.SORT
+import org.apache.spark.sql.connector.read.SupportsRuntimeFiltering
+import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation
+
+/**
+ * A rule that adds a runtime filter for row-level commands.
+ *
+ * Note that only group-based rewrite plans (i.e. ReplaceData) are taken into account.
+ * Row-based rewrite plans are subject to usual runtime filtering.
+ */
+case class RowLevelCommandPruning(spark: SparkSession) extends 
Rule[LogicalPlan] with PredicateHelper {
+
+  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
+    // apply special dynamic filtering only for plans that don't support deltas
+    case RewrittenRowLevelCommand(
+        command: RowLevelCommand,
+        DataSourceV2ScanRelation(_, scan: SupportsRuntimeFiltering, _),
+        rewritePlan: ReplaceData) if conf.dynamicPartitionPruningEnabled =>
+
+      // use reference equality to find exactly the required scan relations
+      val newRewritePlan = rewritePlan transformUp {
+        case r: DataSourceV2ScanRelation if r.scan eq scan =>
+          val pruningKeys = 
ExtendedV2ExpressionUtils.resolveRefs[Attribute](scan.filterAttributes, r)
+          val dynamicPruningCond = buildDynamicPruningCondition(r, command, 
pruningKeys)
+          val filter = Filter(dynamicPruningCond, r)
+          // always optimize dynamic filtering subqueries for row-level commands as it is important
+          // to rewrite introduced predicates as joins because Spark recently stopped optimizing
+          // dynamic subqueries to facilitate broadcast reuse
+          optimizeSubquery(filter)
+      }
+      command.withNewRewritePlan(newRewritePlan)
+  }
+
+  private def buildDynamicPruningCondition(
+      relation: DataSourceV2ScanRelation,
+      command: RowLevelCommand,
+      pruningKeys: Seq[Attribute]): Expression = {
+
+    // construct a filtering plan with the original scan relation
+    val cond = command.condition.getOrElse(Literal.TrueLiteral)
+    val matchingRowsPlan = command match {
+      case m: MergeIntoTable =>

Review comment:
       Removed.

##########
File path: 
spark/v3.2/spark-extensions/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/RowLevelCommandPruning.scala
##########
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.execution.dynamicpruning
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.expressions.And
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.expressions.AttributeMap
+import org.apache.spark.sql.catalyst.expressions.AttributeReference
+import org.apache.spark.sql.catalyst.expressions.DynamicPruningSubquery
+import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.expressions.ExtendedV2ExpressionUtils
+import org.apache.spark.sql.catalyst.expressions.Literal
+import org.apache.spark.sql.catalyst.expressions.PredicateHelper
+import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
+import org.apache.spark.sql.catalyst.planning.RewrittenRowLevelCommand
+import org.apache.spark.sql.catalyst.plans.LeftSemi
+import org.apache.spark.sql.catalyst.plans.logical.Filter
+import org.apache.spark.sql.catalyst.plans.logical.Join
+import org.apache.spark.sql.catalyst.plans.logical.JoinHint
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.MergeIntoTable
+import org.apache.spark.sql.catalyst.plans.logical.Project
+import org.apache.spark.sql.catalyst.plans.logical.ReplaceData
+import org.apache.spark.sql.catalyst.plans.logical.RowLevelCommand
+import org.apache.spark.sql.catalyst.plans.logical.Sort
+import org.apache.spark.sql.catalyst.plans.logical.Subquery
+import org.apache.spark.sql.catalyst.plans.logical.UpdateTable
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.trees.TreePattern.PLAN_EXPRESSION
+import org.apache.spark.sql.catalyst.trees.TreePattern.SORT
+import org.apache.spark.sql.connector.read.SupportsRuntimeFiltering
+import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation
+
+/**
+ * A rule that adds a runtime filter for row-level commands.
+ *
+ * Note that only group-based rewrite plans (i.e. ReplaceData) are taken into account.
+ * Row-based rewrite plans are subject to usual runtime filtering.
+ */
+case class RowLevelCommandPruning(spark: SparkSession) extends 
Rule[LogicalPlan] with PredicateHelper {
+
+  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
+    // apply special dynamic filtering only for plans that don't support deltas
+    case RewrittenRowLevelCommand(
+        command: RowLevelCommand,
+        DataSourceV2ScanRelation(_, scan: SupportsRuntimeFiltering, _),
+        rewritePlan: ReplaceData) if conf.dynamicPartitionPruningEnabled =>
+
+      // use reference equality to find exactly the required scan relations
+      val newRewritePlan = rewritePlan transformUp {
+        case r: DataSourceV2ScanRelation if r.scan eq scan =>
+          val pruningKeys = 
ExtendedV2ExpressionUtils.resolveRefs[Attribute](scan.filterAttributes, r)
+          val dynamicPruningCond = buildDynamicPruningCondition(r, command, 
pruningKeys)
+          val filter = Filter(dynamicPruningCond, r)
+          // always optimize dynamic filtering subqueries for row-level commands as it is important
+          // to rewrite introduced predicates as joins because Spark recently stopped optimizing
+          // dynamic subqueries to facilitate broadcast reuse
+          optimizeSubquery(filter)
+      }
+      command.withNewRewritePlan(newRewritePlan)
+  }
+
+  private def buildDynamicPruningCondition(
+      relation: DataSourceV2ScanRelation,
+      command: RowLevelCommand,
+      pruningKeys: Seq[Attribute]): Expression = {
+
+    // construct a filtering plan with the original scan relation
+    val cond = command.condition.getOrElse(Literal.TrueLiteral)
+    val matchingRowsPlan = command match {
+      case m: MergeIntoTable =>
+        Join(relation, m.sourceTable, LeftSemi, Some(cond), JoinHint.NONE)
+
+      case u: UpdateTable =>
+        // UPDATEs with subqueries may be rewritten using a UNION with two identical scan relations
+        // each scan relation will get its own dynamic filter that will be shared during execution
+        // the analyzer will assign different expr IDs for each scan relation output attributes
+        // that's why the condition may refer to invalid attr expr IDs and must be transformed
+        val attrMap = AttributeMap(u.table.output.zip(relation.output))
+        val transformedCond = cond transform {
+          case attr: AttributeReference if attrMap.contains(attr) => 
attrMap(attr)
+        }
+        Filter(transformedCond, relation)
+
+      case _ =>
+        Filter(cond, relation)

Review comment:
       Fixed.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [iceberg] aokolnychyi commented on a change in pull request #3661: Spark: Implement copy-on-write DELETE

Reply via email to