Github user wzhfy commented on a diff in the pull request:
https://github.com/apache/spark/pull/19952#discussion_r156552208
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
---
@@ -574,51 +539,90 @@ case class FilterEstimation(plan: Filter) extends
Logging {
}
/**
- * Returns the selectivity percentage for binary condition in the
column's
- * current valid range [min, max]
- *
- * @param op a binary comparison operator
- * @param histogram a numeric equi-height histogram
- * @param max the upper bound of the current valid range for a given
column
- * @param min the lower bound of the current valid range for a given
column
- * @param datumNumber the numeric value of a literal
- * @return the selectivity percentage for a condition in the current
range.
+ * Computes the possibility of an equality predicate using the histogram.
*/
+ private def computeEqualityPossibilityByHistogram(
+ literal: Literal, colStat: ColumnStat): Double = {
+ val datum = EstimationUtils.toDecimal(literal.value,
literal.dataType).toDouble
+ val histogram = colStat.histogram.get
- def computePercentByEquiHeightHgm(
- op: BinaryComparison,
- histogram: Histogram,
- max: Double,
- min: Double,
- datumNumber: Double): Double = {
// find bins where column's current min and max locate. Note that a
column's [min, max]
// range may change due to another condition applied earlier.
- val minBinId = EstimationUtils.findFirstBinForValue(min,
histogram.bins)
- val maxBinId = EstimationUtils.findLastBinForValue(max, histogram.bins)
+ val min = EstimationUtils.toDecimal(colStat.min.get,
literal.dataType).toDouble
+ val max = EstimationUtils.toDecimal(colStat.max.get,
literal.dataType).toDouble
// compute how many bins the column's current valid range [min, max]
occupies.
- // Note that a column's [min, max] range may vary after we apply some
filter conditions.
- val minToMaxLength = EstimationUtils.getOccupationBins(maxBinId,
minBinId, max, min, histogram)
-
- val datumInBinId = op match {
- case LessThan(_, _) | GreaterThanOrEqual(_, _) =>
- EstimationUtils.findFirstBinForValue(datumNumber, histogram.bins)
- case LessThanOrEqual(_, _) | GreaterThan(_, _) =>
- EstimationUtils.findLastBinForValue(datumNumber, histogram.bins)
- }
+ val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
+ upperBound = max,
+ upperBoundInclusive = true,
+ lowerBound = min,
+ lowerBoundInclusive = true,
+ histogram)
+
+ val numBinsHoldingDatum = EstimationUtils.numBinsHoldingRange(
+ upperBound = datum,
+ upperBoundInclusive = true,
+ lowerBound = datum,
+ lowerBoundInclusive = true,
+ histogram)
+
+ numBinsHoldingDatum / numBinsHoldingEntireRange
+ }
- op match {
- // LessThan and LessThanOrEqual share the same logic,
- // but their datumInBinId may be different
- case LessThan(_, _) | LessThanOrEqual(_, _) =>
- EstimationUtils.getOccupationBins(datumInBinId, minBinId,
datumNumber, min,
- histogram) / minToMaxLength
- // GreaterThan and GreaterThanOrEqual share the same logic,
- // but their datumInBinId may be different
- case GreaterThan(_, _) | GreaterThanOrEqual(_, _) =>
- EstimationUtils.getOccupationBins(maxBinId, datumInBinId, max,
datumNumber,
- histogram) / minToMaxLength
+ /**
+ * Computes the possibility of a comparison predicate using histogram.
+ */
+ private def computeComparisonPossibilityByHistogram(
+ op: BinaryComparison, literal: Literal, colStat: ColumnStat): Double
= {
+ val datum = EstimationUtils.toDecimal(literal.value,
literal.dataType).toDouble
+ val histogram = colStat.histogram.get
+
+ // find bins where column's current min and max locate. Note that a
column's [min, max]
+ // range may change due to another condition applied earlier.
+ val min = EstimationUtils.toDecimal(colStat.min.get,
literal.dataType).toDouble
+ val max = EstimationUtils.toDecimal(colStat.max.get,
literal.dataType).toDouble
+
+ // compute how many bins the column's current valid range [min, max]
occupies.
+ val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
+ max, upperBoundInclusive = true, min, lowerBoundInclusive = true,
histogram)
+
+ val numBinsHoldingDatum = op match {
--- End diff --
`numBinsHoldingRange`
---
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org