This is an automated email from the ASF dual-hosted git repository.
yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f659f8d1b019 [SPARK-47319][SQL] Improve missingInput calculation
f659f8d1b019 is described below
commit f659f8d1b019385ad95673205386b6cbe8f89a49
Author: Peter Toth <[email protected]>
AuthorDate: Fri Mar 8 13:15:35 2024 +0800
[SPARK-47319][SQL] Improve missingInput calculation
### What changes were proposed in this pull request?
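This PR improves the `QueryPlan.missingInput` calculation: it returns
`AttributeSet.empty` immediately when `references` is empty, and
`AttributeSet.--` now short-circuits when either operand is empty instead of
always cloning the underlying set.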
### Why are the changes needed?
The unoptimized `missingInput` calculation seems to be the root cause of
`DeduplicateRelations` slowness in some cases.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Existing UTs.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #45424 from peter-toth/fix-missinginput.
Authored-by: Peter Toth <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
---
.../sql/catalyst/expressions/AttributeSet.scala | 20 +++++++++++++-------
.../apache/spark/sql/catalyst/plans/QueryPlan.scala | 8 +++++++-
2 files changed, 20 insertions(+), 8 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala
index 2628afd8923c..236380b2c030 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala
@@ -104,13 +104,19 @@ class AttributeSet private (private val baseSet:
mutable.LinkedHashSet[Attribute
* in `other`.
*/
def --(other: Iterable[NamedExpression]): AttributeSet = {
- other match {
- // SPARK-32755: `--` method behave differently under scala 2.12 and 2.13,
- // use a Scala 2.12 based code to maintains the insertion order in Scala
2.13
- case otherSet: AttributeSet =>
- new AttributeSet(baseSet.clone() --= otherSet.baseSet)
- case _ =>
- new AttributeSet(baseSet.clone() --= other.map(a => new
AttributeEquals(a.toAttribute)))
+ if (isEmpty) {
+ AttributeSet.empty
+ } else if (other.isEmpty) {
+ this
+ } else {
+ other match {
+ // SPARK-32755: `--` method behave differently under scala 2.12 and
2.13,
+ // use a Scala 2.12 based code to maintains the insertion order in
Scala 2.13
+ case otherSet: AttributeSet =>
+ new AttributeSet(baseSet.clone() --= otherSet.baseSet)
+ case _ =>
+ new AttributeSet(baseSet.clone() --= other.map(a => new
AttributeEquals(a.toAttribute)))
+ }
}
}
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index 2a62ea1feb03..0f049103542e 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -102,7 +102,13 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]]
/**
* Attributes that are referenced by expressions but not provided by this
node's children.
*/
- final def missingInput: AttributeSet = references -- inputSet
+ final def missingInput: AttributeSet = {
+ if (references.isEmpty) {
+ AttributeSet.empty
+ } else {
+ references -- inputSet
+ }
+ }
/**
* Runs [[transformExpressionsDown]] with `rule` on all expressions present
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]