Github user wzhfy commented on a diff in the pull request:
https://github.com/apache/spark/pull/19783#discussion_r153975698
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
---
@@ -784,11 +879,16 @@ case class ColumnStatsMap(originalMap:
AttributeMap[ColumnStat]) {
def outputColumnStats(rowsBeforeFilter: BigInt, rowsAfterFilter: BigInt)
: AttributeMap[ColumnStat] = {
val newColumnStats = originalMap.map { case (attr, oriColStat) =>
- // Update ndv based on the overall filter selectivity: scale down
ndv if the number of rows
- // decreases; otherwise keep it unchanged.
- val newNdv = EstimationUtils.updateNdv(oldNumRows = rowsBeforeFilter,
- newNumRows = rowsAfterFilter, oldNdv = oriColStat.distinctCount)
val colStat =
updatedMap.get(attr.exprId).map(_._2).getOrElse(oriColStat)
+ val newNdv = if (colStat.distinctCount > 1) {
--- End diff --
There is no need to add an extra check here; `EstimationUtils.updateNdv` already
handles this case. We can just revert the change here.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]