Github user rxin commented on a diff in the pull request:

    https://github.com/apache/spark/pull/15959#discussion_r88843445
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala ---
    @@ -100,99 +100,30 @@ object AnalyzeColumnCommand extends Logging {
           exprOption.getOrElse(throw new AnalysisException(s"Invalid column 
name: $col."))
         }).toSeq
     
    +    // Make sure the column types are supported for stats gathering.
    +    attributesToAnalyze.foreach { attr =>
    +      if (!ColumnStat.supportsType(attr.dataType)) {
    +        throw new AnalysisException(
    +          s"Data type ${attr.dataType.simpleString} for column 
${attr.name} is not supported " +
    +          "in column statistics gathering.")
    +      }
    +    }
    +
         // Collect statistics per column.
         // The first element in the result will be the overall row count, the 
following elements
         // will be structs containing all column stats.
         // The layout of each struct follows the layout of the ColumnStats.
         val ndvMaxErr = sparkSession.sessionState.conf.ndvMaxError
         val expressions = Count(Literal(1)).toAggregateExpression() +:
    -      
attributesToAnalyze.map(AnalyzeColumnCommand.createColumnStatStruct(_, 
ndvMaxErr))
    +        attributesToAnalyze.map(ColumnStat.statExprs(_, ndvMaxErr))
    +
         val namedExpressions = expressions.map(e => Alias(e, e.toString)())
    -    val statsRow = Dataset.ofRows(sparkSession, Aggregate(Nil, 
namedExpressions, relation))
    -      .queryExecution.toRdd.collect().head
    +    val statsRow = Dataset.ofRows(sparkSession, Aggregate(Nil, 
namedExpressions, relation)).head()
     
    -    // unwrap the result
    -    // TODO: Get rid of numFields by using the public Dataset API.
         val rowCount = statsRow.getLong(0)
         val columnStats = attributesToAnalyze.zipWithIndex.map { case (expr, 
i) =>
    -      val numFields = AnalyzeColumnCommand.numStatFields(expr.dataType)
    -      (expr.name, ColumnStat(statsRow.getStruct(i + 1, numFields)))
    +      (expr.name, ColumnStat.rowToColumnStat(statsRow.getStruct(i + 1)))
         }.toMap
         (rowCount, columnStats)
       }
    -
    -  private val zero = Literal(0, LongType)
    --- End diff --
    
    All of these are now defined in a single function: ColumnStat.statExprs.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org

Reply via email to