kasakrisz commented on code in PR #5648: URL: https://github.com/apache/hive/pull/5648#discussion_r2001008226
########## iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java: ########## @@ -491,23 +481,33 @@ public void appendFiles(org.apache.hadoop.hive.metastore.api.Table table, URI fr @Override public Map<String, String> getBasicStatistics(Partish partish) { + return getBasicStatistics(partish, false); + } + + @SuppressWarnings("checkstyle:CyclomaticComplexity") + private Map<String, String> getBasicStatistics(Partish partish, boolean quickStats) { Review Comment: What is `quickStats`? Do we gain any performance improvement if it is `true`? Based on the code, I found that only some lookups in a hash table are skipped. ########## iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java: ########## @@ -491,23 +481,33 @@ public void appendFiles(org.apache.hadoop.hive.metastore.api.Table table, URI fr @Override public Map<String, String> getBasicStatistics(Partish partish) { + return getBasicStatistics(partish, false); + } + + @SuppressWarnings("checkstyle:CyclomaticComplexity") + private Map<String, String> getBasicStatistics(Partish partish, boolean quickStats) { Map<String, String> stats = Maps.newHashMap(); - if (!getStatsSource().equals(HiveMetaHook.ICEBERG)) { - return partish.getPartParameters(); - } org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable(); // For write queries where rows got modified, don't fetch from cache as values could have changed. Table table = getTable(hmsTable); - Snapshot snapshot = IcebergTableUtil.getTableSnapshot(table, hmsTable); - if (snapshot != null) { + + if (snapshot == null) { + stats.put(StatsSetupConst.NUM_FILES, "0"); + stats.put(StatsSetupConst.ROW_COUNT, "0"); + stats.put(StatsSetupConst.TOTAL_SIZE, "0"); + + } else if (!getStatsSource().equals(HiveMetaHook.ICEBERG) && !quickStats) { Review Comment: How about `HiveMetaHook.ICEBERG.equals(getStatsSource())`? 
It would be null-safe ########## iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java: ########## @@ -612,40 +615,42 @@ private Table getTable(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { @Override public boolean canSetColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { - Table table = IcebergTableUtil.getTable(conf, hmsTable.getTTable()); - return table.currentSnapshot() != null && getStatsSource().equals(HiveMetaHook.ICEBERG); + return getStatsSource().equals(HiveMetaHook.ICEBERG); Review Comment: How about `HiveMetaHook.ICEBERG.equals(getStatsSource())`? It would be null-safe ########## iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java: ########## @@ -695,25 +700,64 @@ public List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.meta ColumnStatistics emptyStats = new ColumnStatistics(); if (snapshot != null) { return IcebergTableUtil.getColStatsPath(table, snapshot.snapshotId()) - .map(statsPath -> readColStats(table, statsPath)) + .map(statsPath -> readColStats(table, statsPath, null).get(0)) .orElse(emptyStats).getStatsObj(); } return emptyStats.getStatsObj(); } - private ColumnStatistics readColStats(Table table, Path statsPath) { + @Override + public AggrStats getAggrColStatsFor(org.apache.hadoop.hive.ql.metadata.Table hmsTable, List<String> colNames, + List<String> partNames) throws MetaException { + Table table = IcebergTableUtil.getTable(conf, hmsTable.getTTable()); + Snapshot snapshot = IcebergTableUtil.getTableSnapshot(table, hmsTable); + + AggrStats emptyStats = new AggrStats(Collections.emptyList(), 0); + if (snapshot != null) { Review Comment: How about ``` if (snapshot == null) { return new AggrStats(Collections.emptyList(), 0); } boolean useDensityFunctionForNDVEstimation = MetastoreConf.getBoolVar(getConf(), MetastoreConf.ConfVars.STATS_NDV_DENSITY_FUNCTION); ... 
return new AggrStats(colStatsList, partStats.size()); or just move the new instance creation to the end of the method ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For additional commands, e-mail: gitbox-h...@hive.apache.org