Github user gatorsmile commented on a diff in the pull request:
https://github.com/apache/spark/pull/19531#discussion_r147529228
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala
---
@@ -28,45 +28,43 @@ import
org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Join, Statistics
import
org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._
-object JoinEstimation extends Logging {
+case class JoinEstimation(join: Join) extends Logging {
+
+ private val leftStats = join.left.stats
+ private val rightStats = join.right.stats
+ private val keyStatsAfterJoin = new mutable.HashMap[Attribute,
ColumnStat]()
+
/**
* Estimate statistics after join. Return `None` if the join type is not
supported, or we don't
* have enough statistics for estimation.
*/
- def estimate(join: Join): Option[Statistics] = {
+ def estimate: Option[Statistics] = {
join.joinType match {
case Inner | Cross | LeftOuter | RightOuter | FullOuter =>
- InnerOuterEstimation(join).doEstimate()
+ estimateInnerOuterJoin()
case LeftSemi | LeftAnti =>
- LeftSemiAntiEstimation(join).doEstimate()
+ estimateLeftSemiAntiJoin()
case _ =>
logDebug(s"[CBO] Unsupported join type: ${join.joinType}")
None
}
}
-}
-
-case class InnerOuterEstimation(join: Join) extends Logging {
-
- private val leftStats = join.left.stats
- private val rightStats = join.right.stats
/**
* Estimate output size and number of rows after a join operator, and
update output column stats.
*/
- def doEstimate(): Option[Statistics] = join match {
+ private def estimateInnerOuterJoin(): Option[Statistics] = join match {
case _ if !rowCountsExist(join.left, join.right) =>
None
case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, _, _, _) =>
// 1. Compute join selectivity
val joinKeyPairs = extractJoinKeysWithColStats(leftKeys, rightKeys)
- val selectivity = joinSelectivity(joinKeyPairs)
+ val innerJoinedRows = joinCardinality(joinKeyPairs)
--- End diff --
Should this also return `keyStatsAfterJoin`? If we make it global, we might
introduce bugs if we use `keyStatsAfterJoin` before calling `joinCardinality`.
BTW, the name `joinCardinality` is confusing: a reader might not realize that
`keyStatsAfterJoin` is updated inside `joinCardinality`.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]