This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 31769a7fb IMPALA-13122: Add detailed file metadata statistics to table 
loading logs
31769a7fb is described below

commit 31769a7fb50ae1d6b6d69d366a776df441e00e3a
Author: Arnab Karmakar <[email protected]>
AuthorDate: Mon Jan 26 11:04:23 2026 -0800

    IMPALA-13122: Add detailed file metadata statistics to table loading logs
    
    This patch enhances table loading logs to include comprehensive file
    metadata statistics, making it easier to identify small-file issues
    and diagnose slow storage performance.
    
    The following statistics are now logged when loading file metadata:
    - Number of files and blocks
    - File sizes (min/avg/max)
    - Total file size
    - Modification times (min/max)
    - Access times (min/max)
    - Number of host:disk pairs (HDFS/Ozone only)
    
    Example log output:
      Loaded file and block metadata for functional.alltypes partitions:
      year=2009/month=1, year=2009/month=10, year=2009/month=11, and 21
      others. Time taken: 13.474ms. Files: 24, Blocks: 24, Total size:
      478.45KB, File sizes (min/avg/max): 18.12KB/19.93KB/20.36KB,
      Modification times (min/max): 2026-02-17 01:28:17/2026-02-17 01:28:21,
      Access times (min/max): 2026-02-24 00:58:39/2026-02-24 00:58:39,
      Hosts: 3, Host:Disk pairs: 3
    
    Testing:
    - Added JUnit tests to verify statistics collection accuracy
    - Added new Python end-to-end tests covering various cases
    
    Change-Id: I6f4592f173c047e5064058402f83be6d1f5c9a79
    Reviewed-on: http://gerrit.cloudera.org:8080/23906
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../java/org/apache/impala/catalog/FeFsTable.java  | 151 +++++++++++++++++++++
 .../apache/impala/catalog/FileMetadataLoader.java  |  16 +++
 .../org/apache/impala/catalog/HdfsPartition.java   |   4 +
 .../java/org/apache/impala/catalog/HdfsTable.java  |  18 ++-
 .../impala/catalog/FileMetadataLoaderTest.java     |  50 +++++++
 tests/custom_cluster/test_file_metadata_stats.py   | 136 +++++++++++++++++++
 6 files changed, 373 insertions(+), 2 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java 
b/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java
index bee2ce130..0681cd396 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java
@@ -22,6 +22,9 @@ import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 
 import java.io.IOException;
+import java.time.format.DateTimeFormatter;
+import java.time.Instant;
+import java.time.ZoneId;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -44,6 +47,7 @@ import org.apache.impala.analysis.LiteralExpr;
 import org.apache.impala.analysis.PartitionKeyValue;
 import org.apache.impala.common.AnalysisException;
 import org.apache.impala.common.FileSystemUtil;
+import org.apache.impala.common.Pair;
 import org.apache.impala.common.PrintUtils;
 import org.apache.impala.fb.FbFileBlock;
 import org.apache.impala.service.BackendConfig;
@@ -109,6 +113,22 @@ public interface FeFsTable extends FeTable {
     public long numBlocks = 0;
     // Total size (in bytes) of all files in a table/partition.
     public long totalFileBytes = 0;
+    // Min file size in bytes
+    public long minFileBytes = Long.MAX_VALUE;
+    // Max file size in bytes
+    public long maxFileBytes = 0;
+    // Min modification time
+    public long minModificationTime = Long.MAX_VALUE;
+    // Max modification time
+    public long maxModificationTime = 0;
+    // Min access time
+    public long minAccessTime = Long.MAX_VALUE;
+    // Max access time
+    public long maxAccessTime = 0;
+    // Set of unique host:disk pairs (for HDFS/Ozone only)
+    // Stores pairs as Pair<hostIndex, diskId> for efficient tracking
+    // Disk IDs are 0-based per host, so pairs must be tracked together
+    public Set<Pair<Integer, Short>> uniqueHostDiskPairs = new HashSet<>();
 
     public FileMetadataStats() {}
 
@@ -127,24 +147,48 @@ public interface FeFsTable extends FeTable {
       numFiles = 0;
       numBlocks = 0;
       totalFileBytes = 0;
+      minFileBytes = Long.MAX_VALUE;
+      maxFileBytes = 0;
+      minModificationTime = Long.MAX_VALUE;
+      maxModificationTime = 0;
+      minAccessTime = Long.MAX_VALUE;
+      maxAccessTime = 0;
+      uniqueHostDiskPairs.clear();
     }
 
     public void set(FileMetadataStats stats) {
       numFiles = stats.numFiles;
       numBlocks = stats.numBlocks;
       totalFileBytes = stats.totalFileBytes;
+      minFileBytes = stats.minFileBytes;
+      maxFileBytes = stats.maxFileBytes;
+      minModificationTime = stats.minModificationTime;
+      maxModificationTime = stats.maxModificationTime;
+      minAccessTime = stats.minAccessTime;
+      maxAccessTime = stats.maxAccessTime;
+      uniqueHostDiskPairs = new HashSet<>(stats.uniqueHostDiskPairs);
     }
 
     public void merge(FileMetadataStats other) {
       numFiles += other.numFiles;
       numBlocks += other.numBlocks;
       totalFileBytes += other.totalFileBytes;
+      minFileBytes = Math.min(minFileBytes, other.minFileBytes);
+      maxFileBytes = Math.max(maxFileBytes, other.maxFileBytes);
+      minModificationTime = Math.min(minModificationTime, 
other.minModificationTime);
+      maxModificationTime = Math.max(maxModificationTime, 
other.maxModificationTime);
+      minAccessTime = Math.min(minAccessTime, other.minAccessTime);
+      maxAccessTime = Math.max(maxAccessTime, other.maxAccessTime);
+      uniqueHostDiskPairs.addAll(other.uniqueHostDiskPairs);
     }
 
     public void remove(FileMetadataStats other) {
       numFiles -= other.numFiles;
       numBlocks -= other.numBlocks;
       totalFileBytes -= other.totalFileBytes;
+      // Note: We cannot accurately update min/max values or host:disk pairs 
when
+      // removing stats. These fields may be stale after dropPartition() calls.
+      // They are refreshed on the next full table load via HdfsTable.load().
     }
 
     // Accumulate the statistics of the fd into this FileMetadataStats.
@@ -152,6 +196,113 @@ public interface FeFsTable extends FeTable {
       numBlocks += fd.getNumFileBlocks();
       totalFileBytes += fd.getFileLength();
       ++numFiles;
+
+      // Track min/max file sizes
+      long fileLen = fd.getFileLength();
+      minFileBytes = Math.min(minFileBytes, fileLen);
+      maxFileBytes = Math.max(maxFileBytes, fileLen);
+
+      // Track min/max modification times
+      long modTime = fd.getModificationTime();
+      minModificationTime = Math.min(minModificationTime, modTime);
+      maxModificationTime = Math.max(maxModificationTime, modTime);
+
+      // Track unique host:disk pairs from file blocks
+      for (int i = 0; i < fd.getNumFileBlocks(); ++i) {
+        FbFileBlock block = fd.getFbFileBlock(i);
+        int numReplicas = block.replicaHostIdxsLength();
+        int numDiskIds = block.diskIdsLength();
+        // Pair up host indices with disk IDs
+        for (int j = 0; j < numReplicas; ++j) {
+          int hostIdx = FileBlock.getReplicaHostIdx(block, j);
+          short diskId = (j < numDiskIds) ? block.diskIds(j) : -1;
+          if (diskId >= 0) {  // Only track valid disk IDs
+            uniqueHostDiskPairs.add(Pair.create(hostIdx, diskId));
+          }
+        }
+      }
+    }
+
+    public long getAvgFileBytes() {
+      return numFiles > 0 ? totalFileBytes / numFiles : 0;
+    }
+
+    public int getNumUniqueHosts() {
+      Set<Integer> uniqueHosts = new HashSet<>();
+      for (Pair<Integer, Short> pair : uniqueHostDiskPairs) {
+        uniqueHosts.add(pair.first);
+      }
+      return uniqueHosts.size();
+    }
+
+    public int getNumUniqueHostDiskPairs() {
+      return uniqueHostDiskPairs.size();
+    }
+
+    /**
+     * Builds a detailed log string with all file metadata statistics.
+     * @param tableName The full table name for the log message
+     * @param partNames Comma-separated partition names
+     * @param durationNs Time taken to load metadata in nanoseconds
+     * @return Formatted log string with all statistics
+     */
+    public String toLogString(String tableName, String partNames, long 
durationNs) {
+      DateTimeFormatter dateFormatter = 
DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
+          .withZone(ZoneId.systemDefault());
+
+      StringBuilder statsLog = new StringBuilder()
+          .append("Loaded file and block metadata for ").append(tableName);
+      if (!partNames.isEmpty()) {
+        statsLog.append(" partitions: ").append(partNames);
+      }
+      statsLog.append(". ")
+          .append("Time taken: ").append(PrintUtils.printTimeNs(durationNs))
+          .append(". ");
+
+      if (numFiles > 0) {
+        statsLog.append("Files: ").append(numFiles)
+            .append(", Blocks: ").append(numBlocks)
+            .append(", Total size: ")
+            .append(PrintUtils.printBytes(totalFileBytes))
+            .append(", File sizes (min/avg/max): ")
+            .append(PrintUtils.printBytes(minFileBytes))
+            .append("/")
+            .append(PrintUtils.printBytes(getAvgFileBytes()))
+            .append("/")
+            .append(PrintUtils.printBytes(maxFileBytes));
+
+        // Modification time statistics (formatted as dates)
+        if (minModificationTime != Long.MAX_VALUE && maxModificationTime > 0) {
+          statsLog.append(", Modification times (min/max): ")
+              .append(dateFormatter.format(Instant.ofEpochMilli(
+                  minModificationTime)))
+              .append("/")
+              .append(dateFormatter.format(Instant.ofEpochMilli(
+                  maxModificationTime)));
+        }
+
+        // Access time statistics (formatted as dates)
+        // Note: Access time may be 0 or not updated if disabled in HDFS for 
performance
+        if (minAccessTime != Long.MAX_VALUE && maxAccessTime > 0) {
+          statsLog.append(", Access times (min/max): ")
+              
.append(dateFormatter.format(Instant.ofEpochMilli(minAccessTime)))
+              .append("/")
+              .append(dateFormatter.format(Instant.ofEpochMilli(
+                  maxAccessTime)));
+        }
+
+        // HDFS/Ozone specific stats
+        int numUniqueHosts = getNumUniqueHosts();
+        int numUniqueHostDiskPairs = getNumUniqueHostDiskPairs();
+        if (numUniqueHosts > 0) {
+          statsLog.append(", Hosts: ").append(numUniqueHosts);
+        }
+        if (numUniqueHostDiskPairs > 0) {
+          statsLog.append(", Host:Disk pairs: 
").append(numUniqueHostDiskPairs);
+        }
+      }
+
+      return statsLog.toString();
     }
   }
 
diff --git a/fe/src/main/java/org/apache/impala/catalog/FileMetadataLoader.java 
b/fe/src/main/java/org/apache/impala/catalog/FileMetadataLoader.java
index 8969aa321..827ca6fd7 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FileMetadataLoader.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FileMetadataLoader.java
@@ -207,6 +207,8 @@ public class FileMetadataLoader {
       if (fileStatuses == null) return;
 
       AtomicLong numUnknownDiskIds = new AtomicLong(0);
+      long minAccessTime = Long.MAX_VALUE;
+      long maxAccessTime = 0;
 
       if (writeIds_ != null) {
         fileStatuses = AcidUtils.filterFilesForAcidState(fileStatuses, 
partPath,
@@ -231,6 +233,13 @@ public class FileMetadataLoader {
             fileStatus, partPath);
         loadedFds_.add(Preconditions.checkNotNull(fd));
         fileMetadataStats_.accumulate(fd);
+
+        // Track access time stats
+        long accessTime = fileStatus.getAccessTime();
+        if (accessTime > 0) {  // Access time can be 0 if not 
supported/disabled
+          minAccessTime = Math.min(minAccessTime, accessTime);
+          maxAccessTime = Math.max(maxAccessTime, accessTime);
+        }
       }
       if (writeIds_ != null) {
         loadedInsertDeltaFds_ = new ArrayList<>();
@@ -244,6 +253,13 @@ public class FileMetadataLoader {
         }
       }
       loadStats_.unknownDiskIds += numUnknownDiskIds.get();
+
+      // Update access time stats in fileMetadataStats_
+      if (maxAccessTime > 0) {
+        fileMetadataStats_.minAccessTime = minAccessTime;
+        fileMetadataStats_.maxAccessTime = maxAccessTime;
+      }
+
       if (LOG.isTraceEnabled()) {
         LOG.trace(loadStats_.debugString());
       }
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsPartition.java 
b/fe/src/main/java/org/apache/impala/catalog/HdfsPartition.java
index c50a94fbf..72b1aac77 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsPartition.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsPartition.java
@@ -1220,6 +1220,10 @@ public class HdfsPartition extends CatalogObjectImpl 
implements FeFsPartition {
       fileMetadataStats_ = fileMetadataStats;
     }
 
+    public FileMetadataStats getFileMetadataStats() {
+      return fileMetadataStats_;
+    }
+
     public HdfsFileFormat getFileFormat() {
       return fileFormatDescriptor_.getFileFormat();
     }
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java 
b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
index 3d4ef5f61..36ebbaaee 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
@@ -344,6 +344,8 @@ public class HdfsTable extends Table implements FeFsTable {
   //   - Used for reporting through catalog web UI.
   //   - Stats are reset whenever the table is loaded (due to a metadata 
operation) and
   //   are set when the table is serialized to Thrift.
+  //   - Note: Min/max timestamps and host:disk pairs may become stale after
+  //   dropPartition() calls, but are refreshed on the next full table load.
   private final FileMetadataStats fileMetadataStats_ = new FileMetadataStats();
 
   private final static Logger LOG = LoggerFactory.getLogger(HdfsTable.class);
@@ -766,6 +768,15 @@ public class HdfsTable extends Table implements FeFsTable {
           getHostIndex(), debugActions, logPrefix).load();
     }
 
+    // Aggregate file metadata stats from all partitions
+    FileMetadataStats aggregatedStats = new FileMetadataStats();
+    for (HdfsPartition.Builder partBuilder : partBuilders) {
+      FileMetadataStats partStats = partBuilder.getFileMetadataStats();
+      if (partStats != null) {
+        aggregatedStats.merge(partStats);
+      }
+    }
+
     // TODO(todd): would be good to log a summary of the loading process:
     // - how many block locations did we reuse/load individually/load via batch
     // - how many partitions did we read metadata for
@@ -782,8 +793,11 @@ public class HdfsTable extends Table implements FeFsTable {
     catalogTimeline.markEvent(String.format("Loaded file metadata for %d 
partitions",
         partBuilders.size()));
     long duration = clock.getTick() - startTime;
-    LOG.info("Loaded file and block metadata for {} partitions: {}. Time 
taken: {}",
-        getFullName(), partNames, PrintUtils.printTimeNs(duration));
+
+    // Log detailed file metadata statistics
+    if (LOG.isInfoEnabled()) {
+      LOG.info(aggregatedStats.toLogString(getFullName(), partNames, 
duration));
+    }
     return duration;
   }
 
diff --git 
a/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java 
b/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java
index 340f46f67..cdb218c24 100644
--- a/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java
+++ b/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java
@@ -84,6 +84,56 @@ public class FileMetadataLoaderTest {
     assertEquals(1, refreshFml.getStats().loadedFiles);
   }
 
+  @Test
+  public void testFileMetadataStats() throws IOException, CatalogException {
+    //TODO(IMPALA-9042): Remove "throws CatalogException"
+    ListMap<TNetworkAddress> hostIndex = new ListMap<>();
+    String tablePath = "hdfs://localhost:20500/test-warehouse/alltypes/";
+    FileMetadataLoader fml = new FileMetadataLoader(tablePath, /* 
recursive=*/true,
+        /* oldFds = */Collections.emptyList(), hostIndex, null, null);
+    fml.load();
+
+    // Test FileMetadataStats
+    FeFsTable.FileMetadataStats stats = fml.getFileMetadataStats();
+    assertNotNull(stats);
+
+    // Basic stats
+    assertEquals(24, stats.numFiles);
+    assertTrue(stats.totalFileBytes > 0);
+    assertTrue(stats.numBlocks >= 0);
+
+    // Min/max/avg file sizes
+    assertTrue(stats.minFileBytes > 0);
+    assertTrue(stats.maxFileBytes >= stats.minFileBytes);
+    assertTrue(stats.getAvgFileBytes() > 0);
+    assertEquals(stats.totalFileBytes / stats.numFiles, 
stats.getAvgFileBytes());
+
+    // Verify min/max file sizes are consistent with actual files
+    long actualMin = Long.MAX_VALUE;
+    long actualMax = 0;
+    long actualTotal = 0;
+    for (FileDescriptor fd : fml.getLoadedFds()) {
+      long len = fd.getFileLength();
+      if (len < actualMin) actualMin = len;
+      if (len > actualMax) actualMax = len;
+      actualTotal += len;
+    }
+    assertEquals(actualMin, stats.minFileBytes);
+    assertEquals(actualMax, stats.maxFileBytes);
+    assertEquals(actualTotal, stats.totalFileBytes);
+
+    // Modification time stats
+    assertTrue(stats.minModificationTime > 0);
+    assertTrue(stats.maxModificationTime >= stats.minModificationTime);
+
+    // Host and host:disk pair stats (for HDFS tables these should be 
populated)
+    assertTrue(stats.getNumUniqueHosts() >= 0);
+    assertTrue(stats.getNumUniqueHostDiskPairs() >= 0);
+    // Number of hosts should be <= number of host:disk pairs
+    // (hosts are derived from pairs)
+    assertTrue(stats.getNumUniqueHosts() <= stats.getNumUniqueHostDiskPairs());
+  }
+
   @Test
   public void testRecursiveLoadingWithoutBlockLocations()
       throws IOException, CatalogException {
diff --git a/tests/custom_cluster/test_file_metadata_stats.py 
b/tests/custom_cluster/test_file_metadata_stats.py
new file mode 100644
index 000000000..4c8e57ca1
--- /dev/null
+++ b/tests/custom_cluster/test_file_metadata_stats.py
@@ -0,0 +1,136 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import absolute_import, division, print_function
+from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
+from tests.common.skip import SkipIf
+import pytest
+
+
[email protected]_serially
[email protected]_args(catalogd_args='--logbuflevel=-1')
[email protected]_dfs
+class TestFileMetadataStats(CustomClusterTestSuite):
+  """
+  Test enhanced file metadata statistics logging in catalogd (IMPALA-13122).
+  This test verifies that the catalogd logs detailed file statistics including:
+  - Number of files and blocks
+  - File size statistics (min/avg/max)
+  - Total file size
+  - Modification and access times (min/max)
+  - Number of unique host:disk pairs (HDFS/Ozone only)
+  """
+
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  def test_file_metadata_stats_partitioned_table(self, unique_database):
+    """Test that file metadata statistics are logged when loading a 
partitioned table."""
+
+    # Create a partitioned table
+    tbl_name = "{}.test_file_stats_partitioned".format(unique_database)
+    self.execute_query("create table {} (id int, name string) "
+                       "partitioned by (year int, month int)".format(tbl_name))
+
+    # Insert some data to create files
+    self.execute_query("insert into {} partition(year=2023, month=1) "
+                       "values (1, 'test1'), (2, 'test2')".format(tbl_name))
+    self.execute_query("insert into {} partition(year=2023, month=2) "
+                       "values (3, 'test3'), (4, 'test4')".format(tbl_name))
+    self.execute_query("insert into {} partition(year=2024, month=1) "
+                       "values (5, 'test5'), (6, 'test6')".format(tbl_name))
+
+    # Force a refresh to trigger file metadata loading and logging
+    self.execute_query("refresh {}".format(tbl_name))
+
+    # Verify the log contains expected file metadata statistics
+    # The log should contain information about files, blocks, sizes, and times
+    log_regex = r"Loaded file and block metadata for.*{}.*Files: \d+".format(
+        tbl_name.replace(".", r"\."))
+    self.assert_catalogd_log_contains("INFO", log_regex, expected_count=-1, 
timeout_s=10)
+
+    # Verify file size statistics are logged (min/avg/max pattern)
+    size_regex = r"File sizes \(min/avg/max\):"
+    self.assert_catalogd_log_contains("INFO", size_regex, expected_count=-1, 
timeout_s=10)
+
+    # Verify modification times are logged
+    modtime_regex = r"Modification times \(min/max\):"
+    self.assert_catalogd_log_contains("INFO", modtime_regex, expected_count=-1,
+        timeout_s=10)
+
+  def test_file_metadata_stats_unpartitioned_table(self, unique_database):
+    """Test that file metadata statistics are logged for unpartitioned 
tables."""
+    # Create an unpartitioned table
+    tbl_name = "{}.test_file_stats_unpartitioned".format(unique_database)
+    self.execute_query("create table {} (id int, name string, value double)"
+                       .format(tbl_name))
+
+    # Insert data to create files
+    self.execute_query("insert into {} values "
+                       "(1, 'a', 1.1), (2, 'b', 2.2), (3, 'c', 
3.3)".format(tbl_name))
+    self.execute_query("insert into {} values "
+                       "(4, 'd', 4.4), (5, 'e', 5.5)".format(tbl_name))
+
+    # Refresh to trigger metadata loading
+    self.execute_query("refresh {}".format(tbl_name))
+
+    # Verify comprehensive statistics are logged
+    log_regex = r"Loaded file and block metadata for.*{}.*Files: \d+.*Blocks: 
\d+".\
+        format(tbl_name.replace(".", r"\."))
+    self.assert_catalogd_log_contains("INFO", log_regex, expected_count=-1, 
timeout_s=10)
+
+    # Verify total size is logged
+    total_size_regex = r"Total size:.*B"
+    self.assert_catalogd_log_contains("INFO", total_size_regex, 
expected_count=-1,
+        timeout_s=10)
+
+  def test_file_metadata_stats_external_table(self):
+    """Test file metadata statistics for external tables."""
+    # Use existing test data from functional database
+    tbl_name = "functional.alltypes"
+
+    # Invalidate metadata to force a fresh load
+    self.execute_query("invalidate metadata {}".format(tbl_name))
+
+    # Execute a query to trigger table loading
+    self.execute_query("select count(*) from {}".format(tbl_name))
+
+    # Verify detailed file statistics are logged
+    # alltypes is partitioned with 24 partitions (2 years * 12 months)
+    log_regex = (r"Loaded file and block metadata for.*functional\.alltypes.*"
+                 r"Files: 24.*File sizes \(min/avg/max\):")
+    self.assert_catalogd_log_contains("INFO", log_regex, expected_count=-1, 
timeout_s=15)
+
+  def test_file_metadata_stats_host_disk_pairs(self):
+    """Test that host and host:disk pair statistics are logged for HDFS 
tables."""
+    # Use a table that has data on HDFS
+    tbl_name = "functional.alltypessmall"
+
+    # Invalidate to trigger fresh load
+    self.execute_query("invalidate metadata {}".format(tbl_name))
+    self.execute_query("select count(*) from {}".format(tbl_name))
+
+    # For HDFS tables, we should see host statistics logged
+    hosts_regex = r"Hosts: \d+"
+    self.assert_catalogd_log_contains("INFO", hosts_regex, expected_count=-1,
+        timeout_s=15)
+
+    # For HDFS tables, we should see host:disk pair statistics logged
+    host_disk_regex = r"Host:Disk pairs: \d+"
+    self.assert_catalogd_log_contains("INFO", host_disk_regex, 
expected_count=-1,
+        timeout_s=15)

Reply via email to