Github user gatorsmile commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19531#discussion_r147529228
  
    --- Diff: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala
 ---
    @@ -28,45 +28,43 @@ import 
org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Join, Statistics
     import 
org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._
     
     
    -object JoinEstimation extends Logging {
    +case class JoinEstimation(join: Join) extends Logging {
    +
    +  private val leftStats = join.left.stats
    +  private val rightStats = join.right.stats
    +  private val keyStatsAfterJoin = new mutable.HashMap[Attribute, 
ColumnStat]()
    +
       /**
        * Estimate statistics after join. Return `None` if the join type is not 
supported, or we don't
        * have enough statistics for estimation.
        */
    -  def estimate(join: Join): Option[Statistics] = {
    +  def estimate: Option[Statistics] = {
         join.joinType match {
           case Inner | Cross | LeftOuter | RightOuter | FullOuter =>
    -        InnerOuterEstimation(join).doEstimate()
    +        estimateInnerOuterJoin()
           case LeftSemi | LeftAnti =>
    -        LeftSemiAntiEstimation(join).doEstimate()
    +        estimateLeftSemiAntiJoin()
           case _ =>
             logDebug(s"[CBO] Unsupported join type: ${join.joinType}")
             None
         }
       }
    -}
    -
    -case class InnerOuterEstimation(join: Join) extends Logging {
    -
    -  private val leftStats = join.left.stats
    -  private val rightStats = join.right.stats
     
       /**
        * Estimate output size and number of rows after a join operator, and 
update output column stats.
        */
    -  def doEstimate(): Option[Statistics] = join match {
    +  private def estimateInnerOuterJoin(): Option[Statistics] = join match {
         case _ if !rowCountsExist(join.left, join.right) =>
           None
     
         case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, _, _, _) =>
           // 1. Compute join selectivity
           val joinKeyPairs = extractJoinKeysWithColStats(leftKeys, rightKeys)
    -      val selectivity = joinSelectivity(joinKeyPairs)
    +      val innerJoinedRows = joinCardinality(joinKeyPairs)
    --- End diff --
    
    Could this also return `keyStatsAfterJoin`? If we make it global, we might introduce 
bugs if we use `keyStatsAfterJoin` before calling `joinCardinality`. BTW, the name 
`joinCardinality` is confusing: we might not realize that `keyStatsAfterJoin` is 
updated inside `joinCardinality`.


---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to