okumin commented on code in PR #5106: URL: https://github.com/apache/hive/pull/5106#discussion_r1517452432
########## ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java: ########## @@ -1600,26 +1600,35 @@ Table materializeCTE(String cteName, CTEClause cte) throws HiveException { LOG.info("{} will be materialized into {}", cteName, location); cte.source = analyzer; - ctx.addMaterializedTable(cteName, table, getMaterializedTableStats(analyzer.getSinkOp(), table)); + ctx.addMaterializedTable(cteName, table, getMaterializedTableStats(analyzer.getSinkOp())); return table; } - static Statistics getMaterializedTableStats(Operator<?> sinkOp, Table table) { + protected Statistics getMaterializedTableStats(Operator<?> sinkOp) { final Statistics tableStats = sinkOp.getStatistics().clone(); - final List<ColStatistics> sourceColStatsList = tableStats.getColumnStats(); - final List<String> colNames = table.getCols().stream().map(FieldSchema::getName).collect(Collectors.toList()); - if (sourceColStatsList.size() != colNames.size()) { - throw new IllegalStateException(String.format( - "The size of col stats must be equal to that of schema. Stats = %s, Schema = %s", - sourceColStatsList, colNames)); - } - final List<ColStatistics> colStatsList = new ArrayList<>(sourceColStatsList.size()); - for (int i = 0; i < sourceColStatsList.size(); i++) { - final ColStatistics colStats = sourceColStatsList.get(i); - // FileSinkOperator stores column stats with internal names such as "_col1" - colStats.setColumnName(colNames.get(i)); - colStatsList.add(colStats); + if (tableStats.getColumnStatsState() == Statistics.State.NONE || sinkOp.getNumParent() == 0) { + return tableStats; + } + + final List<String> parentColumnNames = sinkOp.getParentOperators().get(0).getSchema().getColumnNames(); + final List<String> childColumnNames = sinkOp.getSchema().getColumnNames(); + if (parentColumnNames.size() != childColumnNames.size()) { + LOG.warn("The number of columns of FileSinkOperator is inconsistent. Parent = {}, Child = {}", + parentColumnNames, childColumnNames); + tableStats.setColumnStatsState(Statistics.State.NONE); + return tableStats; + } + final Map<String, String> mapping = new HashMap<>(parentColumnNames.size()); + for (int i = 0; i < parentColumnNames.size(); i++) { + mapping.put(parentColumnNames.get(i), childColumnNames.get(i)); + } + + final List<ColStatistics> colStatsList = tableStats.getColumnStats(); + for (ColStatistics colStats : colStatsList) { + if (mapping.containsKey(colStats.getColumnName())) { Review Comment: As far as I know, no, it isn't. The current implementation is a little defensive because there are countless methods which copy or mutate Statistics and it is unrealistic to verify all of them are correctly implemented. I will add logging when we find unexpected states. Just a moment -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For additional commands, e-mail: gitbox-h...@hive.apache.org