Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/19479#discussion_r149504772
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala
---
@@ -220,29 +239,46 @@ object ColumnStat extends Logging {
* Constructs an expression to compute column statistics for a given
column.
*
* The expression should create a single struct column with the
following schema:
- * distinctCount: Long, min: T, max: T, nullCount: Long, avgLen: Long,
maxLen: Long
+ * distinctCount: Long, min: T, max: T, nullCount: Long, avgLen: Long,
maxLen: Long,
+ * distinctCountsForIntervals: Array[Long]
*
* Together with [[rowToColumnStat]], this function is used to create
[[ColumnStat]] and
* as a result should stay in sync with it.
*/
- def statExprs(col: Attribute, relativeSD: Double): CreateNamedStruct = {
+ def statExprs(
+ col: Attribute,
+ conf: SQLConf,
+ colPercentiles: AttributeMap[Array[Any]]): CreateNamedStruct = {
def struct(exprs: Expression*): CreateNamedStruct =
CreateStruct(exprs.map { expr =>
expr.transformUp { case af: AggregateFunction =>
af.toAggregateExpression() }
})
val one = Literal(1, LongType)
// the approximate ndv (num distinct value) should never be larger
than the number of rows
val numNonNulls = if (col.nullable) Count(col) else Count(one)
- val ndv = Least(Seq(HyperLogLogPlusPlus(col, relativeSD), numNonNulls))
+ val ndv = Least(Seq(HyperLogLogPlusPlus(col, conf.ndvMaxError),
numNonNulls))
val numNulls = Subtract(Count(one), numNonNulls)
val defaultSize = Literal(col.dataType.defaultSize, LongType)
+ val nullArray = Literal(null, ArrayType(LongType))
- def fixedLenTypeStruct(castType: DataType) = {
+ def fixedLenTypeExprs(castType: DataType) = {
// For fixed width types, avg size should be the same as max size.
- struct(ndv, Cast(Min(col), castType), Cast(Max(col), castType),
numNulls, defaultSize,
+ Seq(ndv, Cast(Min(col), castType), Cast(Max(col), castType),
numNulls, defaultSize,
defaultSize)
}
+ def fixedLenTypeStruct(dataType: DataType) = {
+ val genHistogram =
+ ColumnStat.supportsHistogram(dataType) &&
colPercentiles.contains(col)
+ val intervalNdvsExpr = if (genHistogram) {
+ ApproxCountDistinctForIntervals(col,
+ CreateArray(colPercentiles(col).map(Literal(_))),
conf.ndvMaxError)
--- End diff --
Since we need a catalyst array here, why not have the first job return
`ArrayData` directly? That is, do not call `toArray` in
https://github.com/apache/spark/pull/19479/files#diff-027d6bd7c8cf4f64f99acc058389d859R145,
but just collect the `ArrayData`.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]