This is an automated email from the ASF dual-hosted git repository.

kasakrisz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new d4a2f169a10 HIVE-29617: Error while loading column statistics of Iceberg table after upgrading Hive (#6496)
d4a2f169a10 is described below

commit d4a2f169a10586d5442383429b40ff58095bd2ac
Author: Krisztian Kasa <[email protected]>
AuthorDate: Tue Jun 2 16:54:38 2026 +0200

    HIVE-29617: Error while loading column statistics of Iceberg table after upgrading Hive (#6496)
---
 .../apache/hadoop/hive/ql/stats/StatsUtils.java    | 35 +++++++----
 .../hadoop/hive/ql/stats/TestStatsUtils.java       | 67 ++++++++++++++++++++++
 2 files changed, 90 insertions(+), 12 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 55f9d0c1e15..e539f0ab9c4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -239,22 +239,33 @@ public static long getNumRows(HiveConf conf, List<ColumnInfo> schema, Table tabl
     return aggregateStat.getNumRows();
   }
 
-  private static void estimateStatsForMissingCols(List<String> neededColumns, 
List<ColStatistics> columnStats,
-      HiveConf conf, long nr, List<ColumnInfo> schema) {
+  /**
+   * Estimates column statistics for columns specified in {@code 
neededColumnNames}
+   * that do not already have statistics in the {@code existingColStats} list.
+   *
+   * @return A {@link List} of {@link ColStatistics} objects containing
+   * both the provided existing statistics and the newly estimated ones.
+   */
+  static List<ColStatistics> estimateStatsForMissingCols(
+      List<String> neededColumnNames, List<ColStatistics> existingColStats, 
HiveConf conf, long nr,
+      List<ColumnInfo> schema) {
 
-    Set<String> neededCols = new HashSet<>(neededColumns);
-    Set<String> colsWithStats = new HashSet<>();
+    Set<String> neededCols = new HashSet<>(neededColumnNames);
+    Set<String> columnNamesWithStats = 
HashSet.newHashSet(existingColStats.size());
 
-    for (ColStatistics cstats : columnStats) {
-      colsWithStats.add(cstats.getColumnName());
+    for (ColStatistics cstats : existingColStats) {
+      columnNamesWithStats.add(cstats.getColumnName());
     }
 
-    List<String> missingColStats = new ArrayList<>(Sets.difference(neededCols, 
colsWithStats));
+    List<String> missingColumnNames = new 
ArrayList<>(Sets.difference(neededCols, columnNamesWithStats));
+    ArrayList<ColStatistics> combined = new 
ArrayList<>(existingColStats.size() + missingColumnNames.size());
+    combined.addAll(existingColStats);
 
-    if (!missingColStats.isEmpty()) {
-      columnStats.addAll(
-          estimateStats(schema, missingColStats, conf, nr));
+    if (!missingColumnNames.isEmpty()) {
+      combined.addAll(estimateStats(schema, missingColumnNames, conf, nr));
     }
+
+    return combined;
   }
 
   public static Statistics collectStatistics(HiveConf conf, 
PrunedPartitionList partList,
@@ -300,7 +311,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p
       if (needColStats && !metaTable) {
         colStats = getTableColumnStats(table, neededColumns, colStatsCache, 
fetchColStats);
         if (estimateStats) {
-          estimateStatsForMissingCols(neededColumns, colStats, conf, nr, 
schema);
+          colStats = estimateStatsForMissingCols(neededColumns, colStats, 
conf, nr, schema);
         }
         // we should have stats for all columns (estimated or actual)
         if (neededColumns.size() == colStats.size()) {
@@ -386,7 +397,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p
         boolean statsRetrieved = aggrStats != null &&
             aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 
0;
         if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && 
!statsRetrieved)) {
-          estimateStatsForMissingCols(neededColsToRetrieve, columnStats, conf, 
nr, schema);
+          columnStats = estimateStatsForMissingCols(neededColsToRetrieve, 
columnStats, conf, nr, schema);
           // There are some partitions with no state (or we didn't fetch any 
state).
           // Update the stats with empty list to reflect that in the
           // state/initialize structures.
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java
index c009472fed0..2faefafdee3 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java
@@ -18,6 +18,8 @@
 
 package org.apache.hadoop.hive.ql.stats;
 
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
@@ -41,10 +43,12 @@
 import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
 import org.apache.hadoop.hive.metastore.api.Timestamp;
 import org.apache.hadoop.hive.metastore.api.TimestampColumnStatsData;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
 import org.apache.hadoop.hive.ql.plan.ColStatistics;
 import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
 import org.apache.hadoop.hive.ql.plan.Statistics;
 import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.Arguments;
@@ -565,4 +569,67 @@ void testGetColStatisticsTimestampType() {
     assertEquals(1700000000L, range.maxValue.longValue(), "maxValue mismatch 
for TIMESTAMP");
   }
 
+  @Test
+  void testEstimateStatsForMissingColsHandlesEmptyList() {
+    HiveConf conf = new HiveConf();
+
+    ColumnInfo columnInfoA = new ColumnInfo("a", TypeInfoFactory.intTypeInfo, 
"t", false);
+
+    List<ColStatistics> allColumnStats = 
StatsUtils.estimateStatsForMissingCols(
+        List.of("a"), Collections.emptyList(), conf, 0, List.of(columnInfoA));
+
+    assertEquals(1, allColumnStats.size());
+  }
+
+  @Test
+  void testEstimateStatsForMissingColsCombinesExistingStatsAndEstimations() {
+    HiveConf conf = new HiveConf();
+
+    ColumnInfo colNeededButNotExists = new ColumnInfo("neededButNotExists", 
TypeInfoFactory.intTypeInfo, "t", false);
+    ColumnInfo colNeededAndExists = new ColumnInfo("neededAndExists", 
TypeInfoFactory.intTypeInfo, "t", false);
+    ColumnInfo colNotNeededButExists = new ColumnInfo("notNeededButExists", 
TypeInfoFactory.intTypeInfo, "t", false);
+    ColumnInfo colNotNeededNotExists = new ColumnInfo("notNeededNotExists", 
TypeInfoFactory.intTypeInfo, "t", false);
+
+    ColStatistics colStatNeededAndExists = new ColStatistics();
+    colStatNeededAndExists.setColumnName(colNeededAndExists.getInternalName());
+    ColStatistics colStatNotNeededButExists = new ColStatistics();
+    
colStatNotNeededButExists.setColumnName(colNotNeededButExists.getInternalName());
+
+    List<ColStatistics> allColumnStats = 
StatsUtils.estimateStatsForMissingCols(
+        List.of(colNeededAndExists.getInternalName(), 
colNeededButNotExists.getInternalName()),
+        List.of(colStatNeededAndExists, colStatNotNeededButExists),
+        conf,
+        0,
+        List.of(colNeededButNotExists, colNeededAndExists, 
colNotNeededButExists, colNotNeededNotExists));
+
+    assertEquals(3, allColumnStats.size());
+    assertEquals(colStatNeededAndExists, allColumnStats.get(0));
+    assertFalse(allColumnStats.get(0).isEstimated());
+    assertEquals(colStatNotNeededButExists, allColumnStats.get(1));
+    assertFalse(allColumnStats.get(1).isEstimated());
+    assertEquals(colNeededButNotExists.getInternalName(), 
allColumnStats.get(2).getColumnName());
+    assertTrue(allColumnStats.get(2).isEstimated());
+  }
+
+  @Test
+  void 
testEstimateStatsForMissingColsReturnOnlyColumnsWithExistingStatsWhenNoNeededColumn()
 {
+    HiveConf conf = new HiveConf();
+
+    ColumnInfo colNotNeededButExists = new ColumnInfo("notNeededButExists", 
TypeInfoFactory.intTypeInfo, "t", false);
+    ColumnInfo colNotNeededNotExists = new ColumnInfo("notNeededNotExists", 
TypeInfoFactory.intTypeInfo, "t", false);
+
+    ColStatistics colStatNotNeededButExists = new ColStatistics();
+    
colStatNotNeededButExists.setColumnName(colNotNeededButExists.getInternalName());
+
+    List<ColStatistics> allColumnStats = 
StatsUtils.estimateStatsForMissingCols(
+        Collections.emptyList(),
+        List.of(colStatNotNeededButExists),
+        conf,
+        0,
+        List.of(colNotNeededButExists, colNotNeededNotExists));
+
+    assertEquals(1, allColumnStats.size());
+    assertEquals(allColumnStats.getFirst(), colStatNotNeededButExists);
+  }
+
 }

Reply via email to