This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 4a675b72949c859967cc9389bef55402d16c3efa
Author: Arnab Karmakar <[email protected]>
AuthorDate: Wed Mar 4 11:22:26 2026 -0800

    IMPALA-13122 addendum: Fix host statistics logging for erasure coded files
    
    When erasure coding is enabled, disk IDs are unavailable for
    EC blocks. The previous implementation only tracked hosts via host:disk
    pairs, requiring valid disk IDs. This caused host statistics to be missing
    from logs in EC environments.
    
    Fixed by tracking host indices separately from host:disk pairs:
    - Added uniqueHostIndices set to FileMetadataStats
    - Track all host indices regardless of disk ID availability
    - Host:disk pairs still tracked only when disk IDs are valid (>= 0)
    - Updated getNumUniqueHosts() to use uniqueHostIndices directly
    
    With this fix:
    - Traditional replication: Both hosts and host:disk pairs are logged
    - Erasure coding: Hosts are logged, host:disk pairs may be 0 or omitted
    
    Testing:
    - All tests pass with and without erasure coding
    
    Change-Id: Ie6f5b70fa9c46dd3f34287f030553360da6b20c6
    Reviewed-on: http://gerrit.cloudera.org:8080/24068
    Reviewed-by: Michael Smith <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../main/java/org/apache/impala/catalog/FeFsTable.java | 18 +++++++++++-------
 .../apache/impala/catalog/FileMetadataLoaderTest.java  |  3 ---
 tests/common/environ.py                                |  4 ++++
 tests/custom_cluster/test_file_metadata_stats.py       | 12 ++++++++----
 4 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java 
b/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java
index 0681cd396..f26ea698a 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java
@@ -125,9 +125,13 @@ public interface FeFsTable extends FeTable {
     public long minAccessTime = Long.MAX_VALUE;
     // Max access time
     public long maxAccessTime = 0;
+    // Set of unique host indices (for HDFS/Ozone only)
+    public Set<Integer> uniqueHostIndices = new HashSet<>();
     // Set of unique host:disk pairs (for HDFS/Ozone only)
     // Stores pairs as Pair<hostIndex, diskId> for efficient tracking
     // Disk IDs are 0-based per host, so pairs must be tracked together
+    // Note: With erasure coding, disk IDs may not be available, so this may 
be empty
+    // even when uniqueHostIndices is populated
     public Set<Pair<Integer, Short>> uniqueHostDiskPairs = new HashSet<>();
 
     public FileMetadataStats() {}
@@ -153,6 +157,7 @@ public interface FeFsTable extends FeTable {
       maxModificationTime = 0;
       minAccessTime = Long.MAX_VALUE;
       maxAccessTime = 0;
+      uniqueHostIndices.clear();
       uniqueHostDiskPairs.clear();
     }
 
@@ -166,6 +171,7 @@ public interface FeFsTable extends FeTable {
       maxModificationTime = stats.maxModificationTime;
       minAccessTime = stats.minAccessTime;
       maxAccessTime = stats.maxAccessTime;
+      uniqueHostIndices = new HashSet<>(stats.uniqueHostIndices);
       uniqueHostDiskPairs = new HashSet<>(stats.uniqueHostDiskPairs);
     }
 
@@ -179,6 +185,7 @@ public interface FeFsTable extends FeTable {
       maxModificationTime = Math.max(maxModificationTime, 
other.maxModificationTime);
       minAccessTime = Math.min(minAccessTime, other.minAccessTime);
       maxAccessTime = Math.max(maxAccessTime, other.maxAccessTime);
+      uniqueHostIndices.addAll(other.uniqueHostIndices);
       uniqueHostDiskPairs.addAll(other.uniqueHostDiskPairs);
     }
 
@@ -207,14 +214,15 @@ public interface FeFsTable extends FeTable {
       minModificationTime = Math.min(minModificationTime, modTime);
       maxModificationTime = Math.max(maxModificationTime, modTime);
 
-      // Track unique host:disk pairs from file blocks
+      // Track unique hosts and host:disk pairs from file blocks
       for (int i = 0; i < fd.getNumFileBlocks(); ++i) {
         FbFileBlock block = fd.getFbFileBlock(i);
         int numReplicas = block.replicaHostIdxsLength();
         int numDiskIds = block.diskIdsLength();
-        // Pair up host indices with disk IDs
+        // Track host indices and pair them with disk IDs when available
         for (int j = 0; j < numReplicas; ++j) {
           int hostIdx = FileBlock.getReplicaHostIdx(block, j);
+          uniqueHostIndices.add(hostIdx);
           short diskId = (j < numDiskIds) ? block.diskIds(j) : -1;
           if (diskId >= 0) {  // Only track valid disk IDs
             uniqueHostDiskPairs.add(Pair.create(hostIdx, diskId));
@@ -228,11 +236,7 @@ public interface FeFsTable extends FeTable {
     }
 
     public int getNumUniqueHosts() {
-      Set<Integer> uniqueHosts = new HashSet<>();
-      for (Pair<Integer, Short> pair : uniqueHostDiskPairs) {
-        uniqueHosts.add(pair.first);
-      }
-      return uniqueHosts.size();
+      return uniqueHostIndices.size();
     }
 
     public int getNumUniqueHostDiskPairs() {
diff --git 
a/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java 
b/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java
index cdb218c24..dfb85f120 100644
--- a/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java
+++ b/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java
@@ -129,9 +129,6 @@ public class FileMetadataLoaderTest {
     // Host and host:disk pair stats (for HDFS tables these should be 
populated)
     assertTrue(stats.getNumUniqueHosts() >= 0);
     assertTrue(stats.getNumUniqueHostDiskPairs() >= 0);
-    // Number of hosts should be <= number of host:disk pairs
-    // (hosts are derived from pairs)
-    assertTrue(stats.getNumUniqueHosts() <= stats.getNumUniqueHostDiskPairs());
   }
 
   @Test
diff --git a/tests/common/environ.py b/tests/common/environ.py
index 53701d990..4dbeffe33 100644
--- a/tests/common/environ.py
+++ b/tests/common/environ.py
@@ -452,6 +452,10 @@ class ImpalaTestClusterProperties(object):
         return False
       raise
 
+  def is_erasure_coding_enabled(self):
+    """Return whether the test cluster was built with erasure coding 
enabled."""
+    return os.getenv("ERASURE_CODING") == "true"
+
 
 def build_flavor_timeout(default_timeout, slow_build_timeout=None,
         asan_build_timeout=None, code_coverage_build_timeout=None):
diff --git a/tests/custom_cluster/test_file_metadata_stats.py 
b/tests/custom_cluster/test_file_metadata_stats.py
index 4c8e57ca1..749d9e08c 100644
--- a/tests/custom_cluster/test_file_metadata_stats.py
+++ b/tests/custom_cluster/test_file_metadata_stats.py
@@ -17,6 +17,7 @@
 
 from __future__ import absolute_import, division, print_function
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
+from tests.common.environ import ImpalaTestClusterProperties
 from tests.common.skip import SkipIf
 import pytest
 
@@ -130,7 +131,10 @@ class TestFileMetadataStats(CustomClusterTestSuite):
     self.assert_catalogd_log_contains("INFO", hosts_regex, expected_count=-1,
         timeout_s=15)
 
-    # For HDFS tables, we should see host:disk pair statistics logged
-    host_disk_regex = r"Host:Disk pairs: \d+"
-    self.assert_catalogd_log_contains("INFO", host_disk_regex, 
expected_count=-1,
-        timeout_s=15)
+    # For HDFS tables with disk IDs available (non-EC), host:disk pair stats 
are logged
+    # With erasure coding, disk IDs may not be available, so skip this check
+    cluster_properties = ImpalaTestClusterProperties.get_instance()
+    if not cluster_properties.is_erasure_coding_enabled():
+      host_disk_regex = r"Host:Disk pairs: \d+"
+      self.assert_catalogd_log_contains("INFO", host_disk_regex, 
expected_count=-1,
+          timeout_s=15)

Reply via email to