Github user ron8hu commented on a diff in the pull request:
https://github.com/apache/spark/pull/19479#discussion_r148724807
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala
---
@@ -275,6 +327,64 @@ object ColumnStat extends Logging {
avgLen = row.getLong(4),
maxLen = row.getLong(5)
)
+ if (row.isNullAt(6)) {
+ cs
+ } else {
+ val ndvs = row.getArray(6).toLongArray()
+ assert(percentiles.get.length == ndvs.length + 1)
+ val endpoints = percentiles.get.map(_.toString.toDouble)
+ // Construct equi-height histogram
+ val buckets = ndvs.zipWithIndex.map { case (ndv, i) =>
+ EquiHeightBucket(endpoints(i), endpoints(i + 1), ndv)
+ }
+ val nonNullRows = rowCount - cs.nullCount
+ val ehHistogram = EquiHeightHistogram(nonNullRows.toDouble /
ndvs.length, buckets)
+ cs.copy(histogram = Some(ehHistogram))
+ }
}
}
+
+/**
+ * There are a few types of histograms in state-of-the-art estimation
methods. E.g. equi-width
+ * histogram, equi-height histogram, frequency histogram (value-frequency
pairs) and hybrid
+ * histogram, etc.
+ * Currently in Spark, we support equi-height histogram since it is good
at handling skew
+ * distribution, and also provides reasonable accuracy in other cases.
+ * We can add other histograms in the future, which will make estimation
logic more complicated.
+ * This is because we will have to deal with computation between different
types of histograms in
+ * some cases, e.g. for join columns.
+ */
+trait Histogram
+
+/**
+ * Equi-height histogram represents column value distribution by a
sequence of buckets. Each bucket
+ * has a value range and contains approximately the same number of rows.
+ * @param height number of rows in each bucket
+ * @param ehBuckets equi-height histogram buckets
+ */
+case class EquiHeightHistogram(height: Double, ehBuckets:
Seq[EquiHeightBucket]) extends Histogram {
--- End diff --
Please declare ehBuckets: Array[EquiHeightBucket]) instead of ehBuckets:
Seq[EquiHeightBucket]). This is because we need to access a bucket directly
and randomly. For random access, Scala Array can provide better performance as
it has index to access an array element quickly.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]