This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 4a675b72949c859967cc9389bef55402d16c3efa Author: Arnab Karmakar <[email protected]> AuthorDate: Wed Mar 4 11:22:26 2026 -0800 IMPALA-13122 addendum: Fix host statistics logging for erasure coded files When erasure coding is enabled, disk IDs are unavailable for EC blocks. The previous implementation only tracked hosts via host:disk pairs, requiring valid disk IDs. This caused host statistics to be missing from logs in EC environments. Fixed by tracking host indices separately from host:disk pairs: - Added uniqueHostIndices set to FileMetadataStats - Track all host indices regardless of disk ID availability - Host:disk pairs still tracked only when disk IDs are valid (>= 0) - Updated getNumUniqueHosts() to use uniqueHostIndices directly With this fix: - Traditional replication: Both hosts and host:disk pairs are logged - Erasure coding: Hosts are logged, host:disk pairs may be 0 or omitted Testing: - All tests pass with and without erasure coding Change-Id: Ie6f5b70fa9c46dd3f34287f030553360da6b20c6 Reviewed-on: http://gerrit.cloudera.org:8080/24068 Reviewed-by: Michael Smith <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- .../main/java/org/apache/impala/catalog/FeFsTable.java | 18 +++++++++++------- .../apache/impala/catalog/FileMetadataLoaderTest.java | 3 --- tests/common/environ.py | 4 ++++ tests/custom_cluster/test_file_metadata_stats.py | 12 ++++++++---- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java b/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java index 0681cd396..f26ea698a 100644 --- a/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java @@ -125,9 +125,13 @@ public interface FeFsTable extends FeTable { public long minAccessTime = Long.MAX_VALUE; // Max access time public long maxAccessTime = 0; + // Set of unique host indices (for HDFS/Ozone only) + public Set<Integer> uniqueHostIndices = new HashSet<>(); // Set of unique host:disk pairs (for HDFS/Ozone only) // Stores pairs as Pair<hostIndex, diskId> for efficient tracking // Disk IDs are 0-based per host, so pairs must be tracked together + // Note: With erasure coding, disk IDs may not be available, so this may be empty + // even when uniqueHostIndices is populated public Set<Pair<Integer, Short>> uniqueHostDiskPairs = new HashSet<>(); public FileMetadataStats() {} @@ -153,6 +157,7 @@ public interface FeFsTable extends FeTable { maxModificationTime = 0; minAccessTime = Long.MAX_VALUE; maxAccessTime = 0; + uniqueHostIndices.clear(); uniqueHostDiskPairs.clear(); } @@ -166,6 +171,7 @@ public interface FeFsTable extends FeTable { maxModificationTime = stats.maxModificationTime; minAccessTime = stats.minAccessTime; maxAccessTime = stats.maxAccessTime; + uniqueHostIndices = new HashSet<>(stats.uniqueHostIndices); uniqueHostDiskPairs = new HashSet<>(stats.uniqueHostDiskPairs); } @@ -179,6 +185,7 @@ public interface FeFsTable extends FeTable { maxModificationTime = Math.max(maxModificationTime, other.maxModificationTime); minAccessTime = Math.min(minAccessTime, other.minAccessTime); maxAccessTime = Math.max(maxAccessTime, other.maxAccessTime); + uniqueHostIndices.addAll(other.uniqueHostIndices); uniqueHostDiskPairs.addAll(other.uniqueHostDiskPairs); } @@ -207,14 +214,15 @@ public interface FeFsTable extends FeTable { minModificationTime = Math.min(minModificationTime, modTime); maxModificationTime = Math.max(maxModificationTime, modTime); - // Track unique host:disk pairs from file blocks + // Track unique hosts and host:disk pairs from file blocks for (int i = 0; i < fd.getNumFileBlocks(); ++i) { FbFileBlock block = fd.getFbFileBlock(i); int numReplicas = block.replicaHostIdxsLength(); int numDiskIds = block.diskIdsLength(); - // Pair up host indices with disk IDs + // Track host indices and pair them with disk IDs when available for (int j = 0; j < numReplicas; ++j) { int hostIdx = FileBlock.getReplicaHostIdx(block, j); + uniqueHostIndices.add(hostIdx); short diskId = (j < numDiskIds) ? block.diskIds(j) : -1; if (diskId >= 0) { // Only track valid disk IDs uniqueHostDiskPairs.add(Pair.create(hostIdx, diskId)); @@ -228,11 +236,7 @@ public interface FeFsTable extends FeTable { } public int getNumUniqueHosts() { - Set<Integer> uniqueHosts = new HashSet<>(); - for (Pair<Integer, Short> pair : uniqueHostDiskPairs) { - uniqueHosts.add(pair.first); - } - return uniqueHosts.size(); + return uniqueHostIndices.size(); } public int getNumUniqueHostDiskPairs() { diff --git a/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java b/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java index cdb218c24..dfb85f120 100644 --- a/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java +++ b/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java @@ -129,9 +129,6 @@ public class FileMetadataLoaderTest { // Host and host:disk pair stats (for HDFS tables these should be populated) assertTrue(stats.getNumUniqueHosts() >= 0); assertTrue(stats.getNumUniqueHostDiskPairs() >= 0); - // Number of hosts should be <= number of host:disk pairs - // (hosts are derived from pairs) - assertTrue(stats.getNumUniqueHosts() <= stats.getNumUniqueHostDiskPairs()); } @Test diff --git a/tests/common/environ.py b/tests/common/environ.py index 53701d990..4dbeffe33 100644 --- a/tests/common/environ.py +++ b/tests/common/environ.py @@ -452,6 +452,10 @@ class ImpalaTestClusterProperties(object): return False raise + def is_erasure_coding_enabled(self): + """Return whether the test cluster was built with erasure coding enabled.""" + return os.getenv("ERASURE_CODING") == "true" + def build_flavor_timeout(default_timeout, slow_build_timeout=None, asan_build_timeout=None, code_coverage_build_timeout=None): diff --git a/tests/custom_cluster/test_file_metadata_stats.py b/tests/custom_cluster/test_file_metadata_stats.py index 4c8e57ca1..749d9e08c 100644 --- a/tests/custom_cluster/test_file_metadata_stats.py +++ b/tests/custom_cluster/test_file_metadata_stats.py @@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function from tests.common.custom_cluster_test_suite import CustomClusterTestSuite +from tests.common.environ import ImpalaTestClusterProperties from tests.common.skip import SkipIf import pytest @@ -130,7 +131,10 @@ class TestFileMetadataStats(CustomClusterTestSuite): self.assert_catalogd_log_contains("INFO", hosts_regex, expected_count=-1, timeout_s=15) - # For HDFS tables, we should see host:disk pair statistics logged - host_disk_regex = r"Host:Disk pairs: \d+" - self.assert_catalogd_log_contains("INFO", host_disk_regex, expected_count=-1, - timeout_s=15) + # For HDFS tables with disk IDs available (non-EC), host:disk pair stats are logged + # With erasure coding, disk IDs may not be available, so skip this check + cluster_properties = ImpalaTestClusterProperties.get_instance() + if not cluster_properties.is_erasure_coding_enabled(): + host_disk_regex = r"Host:Disk pairs: \d+" + self.assert_catalogd_log_contains("INFO", host_disk_regex, expected_count=-1, + timeout_s=15)
