Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/19479#discussion_r149504772
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala
---
@@ -220,29 +239,46 @@ object ColumnStat extends Logging {
* Constructs an expression to compute column statistics for a given
column.
*
* The expression should create a single struct column with the
following schema:
- * distinctCount: Long, min: T, max: T, nullCount: Long, avgLen: Long,
maxLen: Long
+ * distinctCount: Long, min: T, max: T, nullCount: Long, avgLen: Long,
maxLen: Long,
+ * distinctCountsForIntervals: Array[Long]
*
* Together with [[rowToColumnStat]], this function is used to create
[[ColumnStat]] and
* as a result should stay in sync with it.
*/
- def statExprs(col: Attribute, relativeSD: Double): CreateNamedStruct = {
+ def statExprs(
+ col: Attribute,
+ conf: SQLConf,
+ colPercentiles: AttributeMap[Array[Any]]): CreateNamedStruct = {
def struct(exprs: Expression*): CreateNamedStruct =
CreateStruct(exprs.map { expr =>
expr.transformUp { case af: AggregateFunction =>
af.toAggregateExpression() }
})
val one = Literal(1, LongType)
// the approximate ndv (num distinct value) should never be larger
than the number of rows
val numNonNulls = if (col.nullable) Count(col) else Count(one)
- val ndv = Least(Seq(HyperLogLogPlusPlus(col, relativeSD), numNonNulls))
+ val ndv = Least(Seq(HyperLogLogPlusPlus(col, conf.ndvMaxError),
numNonNulls))
val numNulls = Subtract(Count(one), numNonNulls)
val defaultSize = Literal(col.dataType.defaultSize, LongType)
+ val nullArray = Literal(null, ArrayType(LongType))
- def fixedLenTypeStruct(castType: DataType) = {
+ def fixedLenTypeExprs(castType: DataType) = {
// For fixed width types, avg size should be the same as max size.
- struct(ndv, Cast(Min(col), castType), Cast(Max(col), castType),
numNulls, defaultSize,
+ Seq(ndv, Cast(Min(col), castType), Cast(Max(col), castType),
numNulls, defaultSize,
defaultSize)
}
+ def fixedLenTypeStruct(dataType: DataType) = {
+ val genHistogram =
+ ColumnStat.supportsHistogram(dataType) &&
colPercentiles.contains(col)
+ val intervalNdvsExpr = if (genHistogram) {
+ ApproxCountDistinctForIntervals(col,
+ CreateArray(colPercentiles(col).map(Literal(_))),
conf.ndvMaxError)
--- End diff --
Since we need a catalyst array here, why not have the first job return
`ArrayData` directly? That is, do not call `toArray` in
https://github.com/apache/spark/pull/19479/files#diff-027d6bd7c8cf4f64f99acc058389d859R145,
but just collect the `ArrayData`.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]