Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/17918#discussion_r115478033
  
    --- Diff: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
 ---
    @@ -750,24 +739,59 @@ case class FilterEstimation(plan: Filter, 
catalystConf: SQLConf) extends Logging
           }
         }
     
    -    Some(percent.toDouble)
    +    Some(percent)
       }
     
     }
     
    -class ColumnStatsMap {
    -  private val baseMap: mutable.Map[ExprId, (Attribute, ColumnStat)] = 
mutable.HashMap.empty
    +/**
    + * This class contains the original column stats from child, and maintains 
the updated column stats.
    + * We will update the corresponding ColumnStats for a column after we 
apply a predicate condition.
    + * For example, column c has [min, max] value as [0, 100].  In a range 
condition such as
    + * (c > 40 AND c <= 50), we need to set the column's [min, max] value to 
[40, 100] after we
    + * evaluate the first condition c > 40. We also need to set the column's 
[min, max] value to
    + * [40, 50] after we evaluate the second condition c <= 50.
    + *
    + * @param originalMap Original column stats from child.
    + */
    +case class ColumnStatsMap(originalMap: AttributeMap[ColumnStat]) {
     
    -  def setInitValues(colStats: AttributeMap[ColumnStat]): Unit = {
    -    baseMap.clear()
    -    baseMap ++= colStats.baseMap
    -  }
    +  /** This map maintains the latest column stats. */
    +  private val updatedMap: mutable.Map[ExprId, (Attribute, ColumnStat)] = 
mutable.HashMap.empty
     
    -  def contains(a: Attribute): Boolean = baseMap.contains(a.exprId)
    +  def contains(a: Attribute): Boolean = updatedMap.contains(a.exprId) || 
originalMap.contains(a)
     
    -  def apply(a: Attribute): ColumnStat = baseMap(a.exprId)._2
    +  /**
    +   * Gets column stat for the given attribute. Prefer the column stat in 
updatedMap than that in
    +   * originalMap, because updatedMap has the latest (updated) column stats.
    +   */
    +  def apply(a: Attribute): ColumnStat = {
    +    if (updatedMap.contains(a.exprId)) {
    +      updatedMap(a.exprId)._2
    +    } else {
    +      originalMap(a)
    +    }
    +  }
     
    -  def update(a: Attribute, stats: ColumnStat): Unit = 
baseMap.update(a.exprId, a -> stats)
    +  /** Updates column stats in updatedMap. */
    +  def update(a: Attribute, stats: ColumnStat): Unit = 
updatedMap.update(a.exprId, a -> stats)
     
    -  def toColumnStats: AttributeMap[ColumnStat] = 
AttributeMap(baseMap.values.toSeq)
    +  /**
    +   * Collects updated column stats, and scales down ndv for other column 
stats if the number of rows
    +   * decreases after this Filter operator.
    +   */
    +  def outputColumnStats(rowsBeforeFilter: BigInt, rowsAfterFilter: BigInt)
    +    : AttributeMap[ColumnStat] = {
    +    val newColumnStats = originalMap.map { case (attr, oriColStat) =>
    +      // Update ndv based on the overall filter selectivity: scale down 
ndv if the number of rows
    +      // decreases; otherwise keep it unchanged.
    +      val newNdv = EstimationUtils.updateNdv(oldNumRows = rowsBeforeFilter,
    +        newNumRows = rowsAfterFilter, oldNdv = oriColStat.distinctCount)
    +      val colStat = if (updatedMap.contains(attr.exprId)) 
updatedMap(attr.exprId)._2 else oriColStat
    --- End diff --
    
    ```
    val colStat = updatedMap.get(attr.exprId).map(_._2).getOrElse(oriColStat)
    attr -> colStat.copy(distinctCount = newNdv)
    ```


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled but would like it to be, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to