dongjoon-hyun commented on a change in pull request #24047: [SPARK-25196][SQL] Extends Analyze commands for cached tables URL: https://github.com/apache/spark/pull/24047#discussion_r264524078
##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
##########
@@ -89,6 +89,37 @@ case class AnalyzeColumnCommand(
     columnsToAnalyze
   }
+  private def analyzeColumnInCatalog(sparkSession: SparkSession): Unit = {
+    val sessionState = sparkSession.sessionState
+    val db = tableIdent.database.getOrElse(sessionState.catalog.getCurrentDatabase)
+    val tableIdentWithDB = TableIdentifier(tableIdent.table, Some(db))
+    val tableMeta = sessionState.catalog.getTableMetadata(tableIdentWithDB)
+    if (tableMeta.tableType == CatalogTableType.VIEW) {
+      throw new AnalysisException("ANALYZE TABLE is not supported on views.")
+    }
+    val sizeInBytes = CommandUtils.calculateTotalSize(sparkSession, tableMeta)
+    val relation = sparkSession.table(tableIdent).logicalPlan
+    val columnsToAnalyze = getColumnsToAnalyze(tableIdent, relation, columnNames, allColumns)
+
+    // Compute stats for the computed list of columns.
+    val (rowCount, newColStats) =
+      CommandUtils.computeColumnStats(sparkSession, relation, columnsToAnalyze)
+
+    val newColCatalogStats = newColStats.map {
+      case (attr, columnStat) =>
+        attr.name -> columnStat.toCatalogColumnStat(attr.name, attr.dataType)
+    }
+
+    // We also update table-level stats in order to keep them consistent with column-level stats.
+    val statistics = CatalogStatistics(
+      sizeInBytes = sizeInBytes,
+      rowCount = Some(rowCount),
+      // Newly computed column stats should override the existing ones.
+      colStats = tableMeta.stats.map(_.colStats).getOrElse(Map.empty) ++ newColCatalogStats)
+
+    sessionState.catalog.alterTableStats(tableIdentWithDB, Some(statistics))

Review comment:
ditto.
```scala
-    sessionState.catalog.alterTableStats(tableIdentWithDB, Some(statistics))
+    sessionState.catalog.alterTableStats(tableIdent, Some(statistics))
```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org