This is an automated email from the ASF dual-hosted git repository.
kasakrisz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new d4a2f169a10 HIVE-29617: Error while loading column statistics of
Iceberg table after upgrading Hive (#6496)
d4a2f169a10 is described below
commit d4a2f169a10586d5442383429b40ff58095bd2ac
Author: Krisztian Kasa <[email protected]>
AuthorDate: Tue Jun 2 16:54:38 2026 +0200
HIVE-29617: Error while loading column statistics of Iceberg table after
upgrading Hive (#6496)
---
.../apache/hadoop/hive/ql/stats/StatsUtils.java | 35 +++++++----
.../hadoop/hive/ql/stats/TestStatsUtils.java | 67 ++++++++++++++++++++++
2 files changed, 90 insertions(+), 12 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 55f9d0c1e15..e539f0ab9c4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -239,22 +239,33 @@ public static long getNumRows(HiveConf conf,
List<ColumnInfo> schema, Table tabl
return aggregateStat.getNumRows();
}
- private static void estimateStatsForMissingCols(List<String> neededColumns,
List<ColStatistics> columnStats,
- HiveConf conf, long nr, List<ColumnInfo> schema) {
+ /**
+ * Estimates column statistics for columns specified in {@code neededColumnNames}
+ * that do not already have statistics in the {@code existingColStats} list.
+ *
+ * @return A {@link List} of {@link ColStatistics} objects containing
+ * both the provided existing statistics and the newly estimated ones.
+ */
+ static List<ColStatistics> estimateStatsForMissingCols(
+ List<String> neededColumnNames, List<ColStatistics> existingColStats,
HiveConf conf, long nr,
+ List<ColumnInfo> schema) {
- Set<String> neededCols = new HashSet<>(neededColumns);
- Set<String> colsWithStats = new HashSet<>();
+ Set<String> neededCols = new HashSet<>(neededColumnNames);
+ Set<String> columnNamesWithStats =
HashSet.newHashSet(existingColStats.size());
- for (ColStatistics cstats : columnStats) {
- colsWithStats.add(cstats.getColumnName());
+ for (ColStatistics cstats : existingColStats) {
+ columnNamesWithStats.add(cstats.getColumnName());
}
- List<String> missingColStats = new ArrayList<>(Sets.difference(neededCols,
colsWithStats));
+ List<String> missingColumnNames = new
ArrayList<>(Sets.difference(neededCols, columnNamesWithStats));
+ ArrayList<ColStatistics> combined = new
ArrayList<>(existingColStats.size() + missingColumnNames.size());
+ combined.addAll(existingColStats);
- if (!missingColStats.isEmpty()) {
- columnStats.addAll(
- estimateStats(schema, missingColStats, conf, nr));
+ if (!missingColumnNames.isEmpty()) {
+ combined.addAll(estimateStats(schema, missingColumnNames, conf, nr));
}
+
+ return combined;
}
public static Statistics collectStatistics(HiveConf conf,
PrunedPartitionList partList,
@@ -300,7 +311,7 @@ private static Statistics collectStatistics(HiveConf conf,
PrunedPartitionList p
if (needColStats && !metaTable) {
colStats = getTableColumnStats(table, neededColumns, colStatsCache,
fetchColStats);
if (estimateStats) {
- estimateStatsForMissingCols(neededColumns, colStats, conf, nr,
schema);
+ colStats = estimateStatsForMissingCols(neededColumns, colStats,
conf, nr, schema);
}
// we should have stats for all columns (estimated or actual)
if (neededColumns.size() == colStats.size()) {
@@ -386,7 +397,7 @@ private static Statistics collectStatistics(HiveConf conf,
PrunedPartitionList p
boolean statsRetrieved = aggrStats != null &&
aggrStats.getColStats() != null && aggrStats.getColStatsSize() !=
0;
if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() &&
!statsRetrieved)) {
- estimateStatsForMissingCols(neededColsToRetrieve, columnStats, conf,
nr, schema);
+ columnStats = estimateStatsForMissingCols(neededColsToRetrieve,
columnStats, conf, nr, schema);
// There are some partitions with no state (or we didn't fetch any state).
// Update the stats with empty list to reflect that in the
// state/initialize structures.
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java
b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java
index c009472fed0..2faefafdee3 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java
@@ -18,6 +18,8 @@
package org.apache.hadoop.hive.ql.stats;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
@@ -41,10 +43,12 @@
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
import org.apache.hadoop.hive.metastore.api.Timestamp;
import org.apache.hadoop.hive.metastore.api.TimestampColumnStatsData;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
import org.apache.hadoop.hive.ql.plan.Statistics;
import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
@@ -565,4 +569,67 @@ void testGetColStatisticsTimestampType() {
assertEquals(1700000000L, range.maxValue.longValue(), "maxValue mismatch
for TIMESTAMP");
}
+ @Test
+ void testEstimateStatsForMissingColsHandlesEmptyList() {
+ HiveConf conf = new HiveConf();
+
+ ColumnInfo columnInfoA = new ColumnInfo("a", TypeInfoFactory.intTypeInfo,
"t", false);
+
+ List<ColStatistics> allColumnStats =
StatsUtils.estimateStatsForMissingCols(
+ List.of("a"), Collections.emptyList(), conf, 0, List.of(columnInfoA));
+
+ assertEquals(1, allColumnStats.size());
+ }
+
+ @Test
+ void testEstimateStatsForMissingColsCombinesExistingStatsAndEstimations() {
+ HiveConf conf = new HiveConf();
+
+ ColumnInfo colNeededButNotExists = new ColumnInfo("neededButNotExists",
TypeInfoFactory.intTypeInfo, "t", false);
+ ColumnInfo colNeededAndExists = new ColumnInfo("neededAndExists",
TypeInfoFactory.intTypeInfo, "t", false);
+ ColumnInfo colNotNeededButExists = new ColumnInfo("notNeededButExists",
TypeInfoFactory.intTypeInfo, "t", false);
+ ColumnInfo colNotNeededNotExists = new ColumnInfo("notNeededNotExists",
TypeInfoFactory.intTypeInfo, "t", false);
+
+ ColStatistics colStatNeededAndExists = new ColStatistics();
+ colStatNeededAndExists.setColumnName(colNeededAndExists.getInternalName());
+ ColStatistics colStatNotNeededButExists = new ColStatistics();
+
colStatNotNeededButExists.setColumnName(colNotNeededButExists.getInternalName());
+
+ List<ColStatistics> allColumnStats =
StatsUtils.estimateStatsForMissingCols(
+ List.of(colNeededAndExists.getInternalName(),
colNeededButNotExists.getInternalName()),
+ List.of(colStatNeededAndExists, colStatNotNeededButExists),
+ conf,
+ 0,
+ List.of(colNeededButNotExists, colNeededAndExists,
colNotNeededButExists, colNotNeededNotExists));
+
+ assertEquals(3, allColumnStats.size());
+ assertEquals(colStatNeededAndExists, allColumnStats.get(0));
+ assertFalse(allColumnStats.get(0).isEstimated());
+ assertEquals(colStatNotNeededButExists, allColumnStats.get(1));
+ assertFalse(allColumnStats.get(1).isEstimated());
+ assertEquals(colNeededButNotExists.getInternalName(),
allColumnStats.get(2).getColumnName());
+ assertTrue(allColumnStats.get(2).isEstimated());
+ }
+
+ @Test
+ void
testEstimateStatsForMissingColsReturnOnlyColumnsWithExistingStatsWhenNoNeededColumn()
{
+ HiveConf conf = new HiveConf();
+
+ ColumnInfo colNotNeededButExists = new ColumnInfo("notNeededButExists",
TypeInfoFactory.intTypeInfo, "t", false);
+ ColumnInfo colNotNeededNotExists = new ColumnInfo("notNeededNotExists",
TypeInfoFactory.intTypeInfo, "t", false);
+
+ ColStatistics colStatNotNeededButExists = new ColStatistics();
+
colStatNotNeededButExists.setColumnName(colNotNeededButExists.getInternalName());
+
+ List<ColStatistics> allColumnStats =
StatsUtils.estimateStatsForMissingCols(
+ Collections.emptyList(),
+ List.of(colStatNotNeededButExists),
+ conf,
+ 0,
+ List.of(colNotNeededButExists, colNotNeededNotExists));
+
+ assertEquals(1, allColumnStats.size());
+ assertEquals(allColumnStats.getFirst(), colStatNotNeededButExists);
+ }
+
}