Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/19479#discussion_r147669959
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala
---
@@ -216,65 +218,61 @@ object ColumnStat extends Logging {
}
}
- /**
- * Constructs an expression to compute column statistics for a given
column.
- *
- * The expression should create a single struct column with the
following schema:
- * distinctCount: Long, min: T, max: T, nullCount: Long, avgLen: Long,
maxLen: Long
- *
- * Together with [[rowToColumnStat]], this function is used to create
[[ColumnStat]] and
- * as a result should stay in sync with it.
- */
- def statExprs(col: Attribute, relativeSD: Double): CreateNamedStruct = {
- def struct(exprs: Expression*): CreateNamedStruct =
CreateStruct(exprs.map { expr =>
- expr.transformUp { case af: AggregateFunction =>
af.toAggregateExpression() }
- })
- val one = Literal(1, LongType)
+ private def convertToHistogram(s: String): EquiHeightHistogram = {
+ val idx = s.indexOf(",")
+ if (idx <= 0) {
+ throw new AnalysisException("Failed to parse histogram.")
+ }
+ val height = s.substring(0, idx).toDouble
+ val pattern = "Bucket\\(([^,]+), ([^,]+), ([^\\)]+)\\)".r
+ val buckets = pattern.findAllMatchIn(s).map { m =>
+ EquiHeightBucket(m.group(1).toDouble, m.group(2).toDouble,
m.group(3).toLong)
+ }.toSeq
+ EquiHeightHistogram(height, buckets)
+ }
- // the approximate ndv (num distinct value) should never be larger
than the number of rows
- val numNonNulls = if (col.nullable) Count(col) else Count(one)
- val ndv = Least(Seq(HyperLogLogPlusPlus(col, relativeSD), numNonNulls))
- val numNulls = Subtract(Count(one), numNonNulls)
- val defaultSize = Literal(col.dataType.defaultSize, LongType)
+}
- def fixedLenTypeStruct(castType: DataType) = {
- // For fixed width types, avg size should be the same as max size.
- struct(ndv, Cast(Min(col), castType), Cast(Max(col), castType),
numNulls, defaultSize,
- defaultSize)
- }
+/**
+ * There are a few types of histograms in state-of-the-art estimation
methods. E.g. equi-width
+ * histogram, equi-height histogram, frequency histogram (value-frequency
pairs) and hybrid
+ * histogram, etc.
+ * Currently in Spark, we support equi-height histogram since it is good
at handling skewed
+ * distributions, and also provides reasonable accuracy in other cases.
+ * We can add other histograms in the future, which will make estimation
logic more complicated.
--- End diff --
are we really gonna do that?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]