This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 49196804f8e74db346001d951b44777d6e0b2bfe Author: Arnab Karmakar <[email protected]> AuthorDate: Fri Feb 6 02:13:49 2026 -0800 IMPALA-14623: Optimize memory usage for Iceberg file path hashes Use THash128 Thrift struct (16 bytes) instead of String (64 bytes) for storing 128-bit XXH128 hashes of Iceberg file paths, achieving 4x memory reduction. Key Changes: - Added THash128 Thrift struct with two i64 fields (high/low) - Updated TIcebergContentFileStore to use THash128 as map keys - Created Hash128 Java class with Thrift serialization support - Migrated from Murmur3 to XXH128 for better performance - Added C++ comparison operators for THash128 Testing: - Added comprehensive JUnit tests for Hash128 class Change-Id: Ie0de793de2434dae3b60c3aa4f87dba203eee3c1 Reviewed-on: http://gerrit.cloudera.org:8080/23946 Tested-by: Impala Public Jenkins <[email protected]> Reviewed-by: Zoltan Borok-Nagy <[email protected]> --- be/src/catalog/catalog-server.cc | 1 + be/src/catalog/catalog.cc | 1 + be/src/rpc/hs2-http-test.cc | 1 + be/src/runtime/descriptors.cc | 1 + be/src/service/frontend.cc | 1 + be/src/util/container-util.h | 1 + be/src/util/thash128-util.h | 38 +++++ common/thrift/CatalogObjects.thrift | 17 +- fe/pom.xml | 6 + .../org/apache/impala/catalog/FeIcebergTable.java | 3 +- .../impala/catalog/IcebergContentFileStore.java | 36 ++-- .../apache/impala/planner/IcebergScanPlanner.java | 3 +- .../main/java/org/apache/impala/util/Hash128.java | 83 +++++++++ .../java/org/apache/impala/util/IcebergUtil.java | 13 +- .../impala/catalog/local/LocalCatalogTest.java | 3 +- .../java/org/apache/impala/util/Hash128Test.java | 185 +++++++++++++++++++++ .../org/apache/impala/util/IcebergUtilTest.java | 4 +- 17 files changed, 364 insertions(+), 33 deletions(-) diff --git a/be/src/catalog/catalog-server.cc b/be/src/catalog/catalog-server.cc index 88a0eb571..077ec22cb 100644 --- a/be/src/catalog/catalog-server.cc +++ b/be/src/catalog/catalog-server.cc @@ -28,6 +28,7 @@ #include "gen-cpp/CatalogService_types.h" #include "statestore/statestore-subscriber.h" #include "util/collection-metrics.h" +#include "util/container-util.h" #include "util/debug-util.h" #include "util/event-metrics.h" #include "util/json-util.h" diff --git a/be/src/catalog/catalog.cc b/be/src/catalog/catalog.cc index 08d8fad6a..4f88d6767 100644 --- a/be/src/catalog/catalog.cc +++ b/be/src/catalog/catalog.cc @@ -24,6 +24,7 @@ #include "common/thread-debug-info.h" #include "rpc/jni-thrift-util.h" #include "util/backend-gflag-util.h" +#include "util/container-util.h" #include "common/names.h" diff --git a/be/src/rpc/hs2-http-test.cc b/be/src/rpc/hs2-http-test.cc index b3db0b0ba..35ffbef30 100644 --- a/be/src/rpc/hs2-http-test.cc +++ b/be/src/rpc/hs2-http-test.cc @@ -21,6 +21,7 @@ #include "gen-cpp/ImpalaHiveServer2Service.h" #include "rpc/authentication.h" +#include "util/container-util.h" #include "util/kudu-status-util.h" #include "util/network-util.h" #include "util/os-util.h" diff --git a/be/src/runtime/descriptors.cc b/be/src/runtime/descriptors.cc index d7de726d1..e018f7a9d 100644 --- a/be/src/runtime/descriptors.cc +++ b/be/src/runtime/descriptors.cc @@ -35,6 +35,7 @@ #include "gen-cpp/PlanNodes_types.h" #include "rpc/thrift-util.h" #include "runtime/runtime-state.h" +#include "util/container-util.h" #include "util/hash-util.h" #include "common/names.h" diff --git a/be/src/service/frontend.cc b/be/src/service/frontend.cc index 6255f8fc8..94aa8f140 100644 --- a/be/src/service/frontend.cc +++ b/be/src/service/frontend.cc @@ -24,6 +24,7 @@ #include "common/logging.h" #include "rpc/jni-thrift-util.h" #include "util/backend-gflag-util.h" +#include "util/container-util.h" #include "util/jni-util.h" #include "util/test-info.h" #include "util/time.h" diff --git a/be/src/util/container-util.h b/be/src/util/container-util.h index 01f586b0f..f472ba48e 100644 --- a/be/src/util/container-util.h +++ b/be/src/util/container-util.h @@ -32,6 +32,7 @@ #include "gen-cpp/Types_types.h" #include "gen-cpp/StatestoreService_types.h" #include "gen-cpp/common.pb.h" +#include "util/thash128-util.h" /// Comparators for types that we commonly use in containers. namespace impala { diff --git a/be/src/util/thash128-util.h b/be/src/util/thash128-util.h new file mode 100644 index 000000000..e92af5898 --- /dev/null +++ b/be/src/util/thash128-util.h @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "gen-cpp/CatalogObjects_types.h" + +namespace impala { + +/// Comparison operators for THash128 (used as key in std::map) +inline bool operator==(const THash128& lhs, const THash128& rhs) { + return lhs.high == rhs.high && lhs.low == rhs.low; +} + +inline bool operator!=(const THash128& lhs, const THash128& rhs) { + return !(lhs == rhs); +} + +/// Required for using THash128 as std::map key +inline bool operator<(const THash128& lhs, const THash128& rhs) { + return std::tie(lhs.high, lhs.low) < std::tie(rhs.high, rhs.low); +} + +} // namespace impala diff --git a/common/thrift/CatalogObjects.thrift b/common/thrift/CatalogObjects.thrift index 7b0fd2f88..60009e523 100644 --- a/common/thrift/CatalogObjects.thrift +++ b/common/thrift/CatalogObjects.thrift @@ -657,12 +657,19 @@ struct TIcebergPartitionStats { 3: required i64 file_size_in_bytes; } -// Contains maps from 128-bit Murmur3 hash of file path to its file descriptor +// Represents a 128-bit hash value (XXH128 hash of file path) +// Stored as two 64-bit longs for memory efficiency +struct THash128 { + 1: required i64 high + 2: required i64 low +} + +// Contains maps from 128-bit XXH128 hash of file path to its file descriptor struct TIcebergContentFileStore { - 1: optional map<string, THdfsFileDesc> path_hash_to_data_file_without_deletes - 2: optional map<string, THdfsFileDesc> path_hash_to_data_file_with_deletes - 3: optional map<string, THdfsFileDesc> path_hash_to_position_delete_file - 4: optional map<string, THdfsFileDesc> path_hash_to_equality_delete_file + 1: optional map<THash128, THdfsFileDesc> path_hash_to_data_file_without_deletes + 2: optional map<THash128, THdfsFileDesc> path_hash_to_data_file_with_deletes + 3: optional map<THash128, THdfsFileDesc> path_hash_to_position_delete_file + 4: optional map<THash128, THdfsFileDesc> path_hash_to_equality_delete_file 5: optional bool has_avro 6: optional bool has_orc 7: optional bool has_parquet diff --git a/fe/pom.xml b/fe/pom.xml index b280ca87a..25aa141fb 100644 --- a/fe/pom.xml +++ b/fe/pom.xml @@ -585,6 +585,12 @@ under the License. </exclusion> </exclusions> </dependency> + + <dependency> + <groupId>net.openhft</groupId> + <artifactId>zero-allocation-hashing</artifactId> + <version>0.16</version> + </dependency> </dependencies> <reporting> diff --git a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java index 637613f74..dc2a2b988 100644 --- a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java @@ -84,6 +84,7 @@ import org.apache.impala.thrift.TIcebergTable; import org.apache.impala.thrift.TNetworkAddress; import org.apache.impala.thrift.TResultSet; import org.apache.impala.thrift.TResultSetMetadata; +import org.apache.impala.util.Hash128; import org.apache.impala.util.HdfsCachingUtil; import org.apache.impala.util.IcebergSchemaConverter; import org.apache.impala.util.IcebergUtil; @@ -496,7 +497,7 @@ public interface FeIcebergTable extends FeFsTable { rowBuilder.add(filePath); // Try to get actual file size from content file store - String pathHash = IcebergUtil.getFilePathHash(filePath); + Hash128 pathHash = IcebergUtil.getFilePathHash(filePath); FileDescriptor fd = contentFileStore.getDataFileDescriptor(pathHash); if (fd == null) { fd = contentFileStore.getDeleteFileDescriptor(pathHash); diff --git a/fe/src/main/java/org/apache/impala/catalog/IcebergContentFileStore.java b/fe/src/main/java/org/apache/impala/catalog/IcebergContentFileStore.java index 9baf60ece..95434cea2 100644 --- a/fe/src/main/java/org/apache/impala/catalog/IcebergContentFileStore.java +++ b/fe/src/main/java/org/apache/impala/catalog/IcebergContentFileStore.java @@ -44,10 +44,12 @@ import org.apache.impala.fb.FbFileDesc; import org.apache.impala.fb.FbFileMetadata; import org.apache.impala.fb.FbIcebergDataFileFormat; import org.apache.impala.fb.FbIcebergMetadata; +import org.apache.impala.thrift.THash128; import org.apache.impala.thrift.THdfsFileDesc; import org.apache.impala.thrift.TIcebergContentFileStore; import org.apache.impala.thrift.TIcebergPartition; import org.apache.impala.thrift.TNetworkAddress; +import org.apache.impala.util.Hash128; import org.apache.impala.util.IcebergUtil; import org.apache.impala.util.ListMap; @@ -97,12 +99,12 @@ public class IcebergContentFileStore { // Auxiliary class for holding file descriptors in both a map and a list. private static class MapListContainer { // Key is the ContentFile path hash, value is FileDescriptor transformed from DataFile - private final Map<String, EncodedFileDescriptor> fileDescMap_ = new HashMap<>(); + private final Map<Hash128, EncodedFileDescriptor> fileDescMap_ = new HashMap<>(); private final List<EncodedFileDescriptor> fileDescList_ = new ArrayList<>(); // Adds a file to the map. If this is a new entry, then add it to the list as well. // Return true if 'desc' was a new entry. - public boolean add(String pathHash, EncodedFileDescriptor desc) { + public boolean add(Hash128 pathHash, EncodedFileDescriptor desc) { if (fileDescMap_.put(pathHash, desc) == null) { fileDescList_.add(desc); return true; @@ -110,7 +112,7 @@ public class IcebergContentFileStore { return false; } - public IcebergFileDescriptor get(String pathHash) { + public IcebergFileDescriptor get(Hash128 pathHash) { if (!fileDescMap_.containsKey(pathHash)) return null; return decode(fileDescMap_.get(pathHash)); } @@ -124,27 +126,27 @@ public class IcebergContentFileStore { } // It's enough to only convert the map part to thrift. - Map<String, THdfsFileDesc> toThrift() { - Map<String, THdfsFileDesc> ret = new HashMap<>(); - for (Map.Entry<String, EncodedFileDescriptor> entry : fileDescMap_.entrySet()) { + Map<THash128, THdfsFileDesc> toThrift() { + Map<THash128, THdfsFileDesc> ret = new HashMap<>(); + for (Map.Entry<Hash128, EncodedFileDescriptor> entry : fileDescMap_.entrySet()) { ret.put( - entry.getKey(), + entry.getKey().toThrift(), decode(entry.getValue()).toThrift()); } return ret; } - static MapListContainer fromThrift(Map<String, THdfsFileDesc> thriftMap, + static MapListContainer fromThrift(Map<THash128, THdfsFileDesc> thriftMap, List<TNetworkAddress> networkAddresses, ListMap<TNetworkAddress> hostIndex) { MapListContainer ret = new MapListContainer(); - for (Map.Entry<String, THdfsFileDesc> entry : thriftMap.entrySet()) { + for (Map.Entry<THash128, THdfsFileDesc> entry : thriftMap.entrySet()) { IcebergFileDescriptor fd = IcebergFileDescriptor.fromThrift(entry.getValue()); Preconditions.checkNotNull(fd); if (networkAddresses != null) { Preconditions.checkNotNull(hostIndex); fd = fd.cloneWithNewHostIndex(networkAddresses, hostIndex); } - ret.add(entry.getKey(), encode(fd)); + ret.add(Hash128.fromThrift(entry.getKey()), encode(fd)); } return ret; } @@ -161,7 +163,7 @@ public class IcebergContentFileStore { private Map<TIcebergPartition, Integer> partitions_; // Caches file descriptors loaded during time-travel queries. - private final ConcurrentMap<String, EncodedFileDescriptor> oldFileDescMap_ = + private final ConcurrentMap<Hash128, EncodedFileDescriptor> oldFileDescMap_ = new ConcurrentHashMap<>(); // Caches the partitions of file descriptors loaded during time-travel queries. private final ConcurrentMap<TIcebergPartition, Integer> oldPartitionMap_ = @@ -206,7 +208,7 @@ public class IcebergContentFileStore { private void storeFile(ContentFile<?> contentFile, Map<String, IcebergFileDescriptor> fileDescMap, MapListContainer container) { - Pair<String, EncodedFileDescriptor> pathHashAndFd = + Pair<Hash128, EncodedFileDescriptor> pathHashAndFd = getPathHashAndFd(contentFile, fileDescMap); if (pathHashAndFd.second != null) { container.add(pathHashAndFd.first, pathHashAndFd.second); @@ -217,7 +219,7 @@ public class IcebergContentFileStore { // This is only invoked during time travel, when we are querying a snapshot that has // data files which have been removed since. - public void addOldFileDescriptor(String pathHash, IcebergFileDescriptor desc) { + public void addOldFileDescriptor(Hash128 pathHash, IcebergFileDescriptor desc) { oldFileDescMap_.put(pathHash, encode(desc)); } @@ -227,19 +229,19 @@ public class IcebergContentFileStore { oldPartitionMap_.put(partition, id); } - public IcebergFileDescriptor getDataFileDescriptor(String pathHash) { + public IcebergFileDescriptor getDataFileDescriptor(Hash128 pathHash) { IcebergFileDescriptor desc = dataFilesWithoutDeletes_.get(pathHash); if (desc != null) return desc; return dataFilesWithDeletes_.get(pathHash); } - public IcebergFileDescriptor getDeleteFileDescriptor(String pathHash) { + public IcebergFileDescriptor getDeleteFileDescriptor(Hash128 pathHash) { IcebergFileDescriptor ret = positionDeleteFiles_.get(pathHash); if (ret != null) return ret; return equalityDeleteFiles_.get(pathHash); } - public IcebergFileDescriptor getOldFileDescriptor(String pathHash) { + public IcebergFileDescriptor getOldFileDescriptor(Hash128 pathHash) { if (!oldFileDescMap_.containsKey(pathHash)) return null; return decode(oldFileDescMap_.get(pathHash)); } @@ -332,7 +334,7 @@ public class IcebergContentFileStore { } } - private Pair<String, EncodedFileDescriptor> getPathHashAndFd( + private Pair<Hash128, EncodedFileDescriptor> getPathHashAndFd( ContentFile<?> contentFile, Map<String, IcebergFileDescriptor> fileDescMap) { return new Pair<>( IcebergUtil.getFilePathHash(contentFile), diff --git a/fe/src/main/java/org/apache/impala/planner/IcebergScanPlanner.java b/fe/src/main/java/org/apache/impala/planner/IcebergScanPlanner.java index 76b16f166..2d0bbd87f 100644 --- a/fe/src/main/java/org/apache/impala/planner/IcebergScanPlanner.java +++ b/fe/src/main/java/org/apache/impala/planner/IcebergScanPlanner.java @@ -88,6 +88,7 @@ import org.apache.impala.thrift.TIcebergPartition; import org.apache.impala.thrift.TIcebergPartitionTransformType; import org.apache.impala.thrift.TQueryOptions; import org.apache.impala.thrift.TVirtualColumnType; +import org.apache.impala.util.Hash128; import org.apache.impala.util.IcebergUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -801,7 +802,7 @@ public class IcebergScanPlanner { private Pair<IcebergFileDescriptor, Boolean> getFileDescriptor(ContentFile<?> cf, IcebergContentFileStore fileStore) throws ImpalaRuntimeException { - String pathHash = IcebergUtil.getFilePathHash(cf); + Hash128 pathHash = IcebergUtil.getFilePathHash(cf); IcebergFileDescriptor iceFileDesc = cf.content() == FileContent.DATA ? fileStore.getDataFileDescriptor(pathHash) : diff --git a/fe/src/main/java/org/apache/impala/util/Hash128.java b/fe/src/main/java/org/apache/impala/util/Hash128.java new file mode 100644 index 000000000..718281f00 --- /dev/null +++ b/fe/src/main/java/org/apache/impala/util/Hash128.java @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.impala.util; + +import com.google.common.base.Preconditions; +import java.util.Objects; +import org.apache.impala.thrift.THash128; + +/** + * Represents a 128-bit hash value using two longs (high and low 64 bits). + * + * Usage: + * - Create: Hash128 hash = IcebergUtil.getFilePathHash(filePath); + * - As map key: map.put(hash, value); + * - Thrift: THash128 t = hash.toThrift(); Hash128 h = Hash128.fromThrift(t); + * - Logging: LOG.debug("Hash: {}", hash.toString()); + */ +public class Hash128 { + private final long high_; + private final long low_; + + public Hash128(long high, long low) { + high_ = high; + low_ = low; + } + + public long getHigh() { + return high_; + } + + public long getLow() { + return low_; + } + + /** + * Converts this hash to a Thrift THash128 struct for serialization. + */ + public THash128 toThrift() { + return new THash128(high_, low_); + } + + /** + * Creates a Hash128 from a Thrift THash128 struct. + */ + public static Hash128 fromThrift(THash128 thrift) { + Preconditions.checkNotNull(thrift); + return new Hash128(thrift.getHigh(), thrift.getLow()); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Hash128 hash128 = (Hash128) o; + return high_ == hash128.high_ && low_ == hash128.low_; + } + + @Override + public int hashCode() { + return Objects.hash(high_, low_); + } + + @Override + public String toString() { + // Return hexadecimal representation for debugging + return String.format("%016x%016x", high_, low_); + } +} diff --git a/fe/src/main/java/org/apache/impala/util/IcebergUtil.java b/fe/src/main/java/org/apache/impala/util/IcebergUtil.java index 8c8736cbd..38f10a20d 100644 --- a/fe/src/main/java/org/apache/impala/util/IcebergUtil.java +++ b/fe/src/main/java/org/apache/impala/util/IcebergUtil.java @@ -734,16 +734,17 @@ public class IcebergUtil { } /** - * Use ContentFile path to generate 128-bit Murmur3 hash as map key, cached in memory + * Use ContentFile path to generate 128-bit XXH128 hash as map key, cached in memory */ - public static String getFilePathHash(ContentFile contentFile) { + public static Hash128 getFilePathHash(ContentFile contentFile) { return getFilePathHash(contentFile.path().toString()); } - public static String getFilePathHash(String path) { - Hasher hasher = Hashing.murmur3_128().newHasher(); - hasher.putUnencodedChars(path); - return hasher.hash().toString(); + public static Hash128 getFilePathHash(String path) { + net.openhft.hashing.LongTupleHashFunction hasher = + net.openhft.hashing.LongTupleHashFunction.xx128(); + long[] result = hasher.hashChars(path); + return new Hash128(result[0], result[1]); } /** diff --git a/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java b/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java index deb8bd647..a213336c3 100644 --- a/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java +++ b/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java @@ -52,6 +52,7 @@ import org.apache.impala.thrift.TMetadataOpcode; import org.apache.impala.thrift.TNetworkAddress; import org.apache.impala.thrift.TPartialTableInfo; import org.apache.impala.thrift.TResultSet; +import org.apache.impala.util.Hash128; import org.apache.impala.util.IcebergUtil; import org.apache.impala.util.ListMap; import org.apache.impala.util.MetaStoreUtil; @@ -292,7 +293,7 @@ public class LocalCatalogTest { // For this test table the manifest files contain data paths without FS-scheme, so // they are loaded to the file content store without them. path = path.substring(path.indexOf("/test-warehouse")); - String pathHash = IcebergUtil.getFilePathHash(path); + Hash128 pathHash = IcebergUtil.getFilePathHash(path); FileDescriptor catalogFd = catalogFileStore.getDataFileDescriptor(pathHash); assertEquals(localFd.getNumFileBlocks(), 1); FbFileBlock localBlock = localFd.getFbFileBlock(0); diff --git a/fe/src/test/java/org/apache/impala/util/Hash128Test.java b/fe/src/test/java/org/apache/impala/util/Hash128Test.java new file mode 100644 index 000000000..a70a3f6ad --- /dev/null +++ b/fe/src/test/java/org/apache/impala/util/Hash128Test.java @@ -0,0 +1,185 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.impala.util; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNull; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.impala.thrift.THash128; +import org.junit.Test; + +/** + * Unit tests for the Hash128 class which stores 128-bit hash values + * as two longs for memory efficiency. + */ +public class Hash128Test { + + @Test + public void testBasicConstruction() { + Hash128 hash = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + assertEquals(0x1234567890ABCDEFL, hash.getHigh()); + assertEquals(0xFEDCBA0987654321L, hash.getLow()); + } + + @Test + public void testEquality() { + Hash128 hash1 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + Hash128 hash2 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + Hash128 hash3 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654320L); + Hash128 hash4 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + + // Test equals + assertEquals(hash1, hash2); + assertEquals(hash1, hash4); + assertNotEquals(hash1, hash3); + assertNotEquals(hash1, null); + assertNotEquals(hash1, "not a hash"); + + // Test hashCode consistency + assertEquals(hash1.hashCode(), hash2.hashCode()); + assertEquals(hash1.hashCode(), hash4.hashCode()); + } + + @Test + public void testHashMapUsage() { + // Hash128 should work correctly as a HashMap key + Map<Hash128, String> map = new HashMap<>(); + Hash128 key1 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + Hash128 key2 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + Hash128 key3 = new Hash128(0xAAAAAAAAAAAAAAAAL, 0xBBBBBBBBBBBBBBBBL); + + map.put(key1, "value1"); + map.put(key3, "value3"); + + // key2 should retrieve the same value as key1 since they're equal + assertEquals("value1", map.get(key2)); + assertEquals("value3", map.get(key3)); + assertNull(map.get(new Hash128(0x0L, 0x0L))); + } + + @Test + public void testToString() { + Hash128 hash = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + String str = hash.toString(); + + // Should be 32 hex characters (16 for each long) + assertEquals(32, str.length()); + assertEquals("1234567890abcdeffedcba0987654321", str); + + // Test with zero values + Hash128 zeroHash = new Hash128(0x0L, 0x0L); + assertEquals("00000000000000000000000000000000", zeroHash.toString()); + + // Test with max values + Hash128 maxHash = new Hash128(0xFFFFFFFFFFFFFFFFL, 0xFFFFFFFFFFFFFFFFL); + assertEquals("ffffffffffffffffffffffffffffffff", maxHash.toString()); + } + + @Test + public void testThriftRoundTrip() { + Hash128 original = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + + // Convert to Thrift and back + THash128 thrift = original.toThrift(); + Hash128 restored = Hash128.fromThrift(thrift); + + assertEquals(original, restored); + assertEquals(original.getHigh(), restored.getHigh()); + assertEquals(original.getLow(), restored.getLow()); + } + + @Test + public void testThriftStructFields() { + Hash128 hash = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + THash128 thrift = hash.toThrift(); + + // Verify Thrift struct has correct values + assertEquals(0x1234567890ABCDEFL, thrift.getHigh()); + assertEquals(0xFEDCBA0987654321L, thrift.getLow()); + } + + @Test + public void testThriftWithEdgeCases() { + // Test with zero + Hash128 zero = new Hash128(0x0L, 0x0L); + THash128 thriftZero = zero.toThrift(); + Hash128 restoredZero = Hash128.fromThrift(thriftZero); + assertEquals(zero, restoredZero); + + // Test with all ones + Hash128 ones = new Hash128(0xFFFFFFFFFFFFFFFFL, 0xFFFFFFFFFFFFFFFFL); + THash128 thriftOnes = ones.toThrift(); + Hash128 restoredOnes = Hash128.fromThrift(thriftOnes); + assertEquals(ones, restoredOnes); + + // Test with high bit patterns + Hash128 pattern = new Hash128(0x8000000000000000L, 0x8000000000000001L); + THash128 thriftPattern = pattern.toThrift(); + Hash128 restoredPattern = Hash128.fromThrift(thriftPattern); + assertEquals(pattern, restoredPattern); + } + + @Test(expected = NullPointerException.class) + public void testNullThriftInput() { + Hash128.fromThrift(null); + } + + @Test + public void testConsistentHashing() { + // Multiple Hash128 objects with same values should have consistent hashCode + Hash128 hash1 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + Hash128 hash2 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + + // Create and recreate through serialization + Hash128 hash3 = Hash128.fromThrift(hash1.toThrift()); + + assertEquals(hash1.hashCode(), hash2.hashCode()); + assertEquals(hash1.hashCode(), hash3.hashCode()); + } + + @Test + public void testDifferentHashesDifferentHashCodes() { + // Different Hash128 objects should (likely) have different hashCodes + Hash128 hash1 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + Hash128 hash2 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654320L); + Hash128 hash3 = new Hash128(0x1234567890ABCDEEL, 0xFEDCBA0987654321L); + + // Note: hashCode collision is possible but unlikely + assertNotEquals(hash1.hashCode(), hash2.hashCode()); + assertNotEquals(hash1.hashCode(), hash3.hashCode()); + } + + @Test + public void testThriftAsMapKey() { + // Verify that Hash128 can be used as a map key through Thrift round-trip + Map<Hash128, String> map = new HashMap<>(); + Hash128 key1 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L); + map.put(key1, "value1"); + + // Serialize and deserialize + THash128 thriftKey = key1.toThrift(); + Hash128 key2 = Hash128.fromThrift(thriftKey); + + // Should retrieve the same value + assertEquals("value1", map.get(key2)); + } +} diff --git a/fe/src/test/java/org/apache/impala/util/IcebergUtilTest.java b/fe/src/test/java/org/apache/impala/util/IcebergUtilTest.java index 15e8c9879..d00a127ae 100644 --- a/fe/src/test/java/org/apache/impala/util/IcebergUtilTest.java +++ b/fe/src/test/java/org/apache/impala/util/IcebergUtilTest.java @@ -259,9 +259,9 @@ public class IcebergUtilTest { */ @Test public void testGetDataFilePathHash() { - String hash = getFilePathHash(FILE_A); + Hash128 hash = getFilePathHash(FILE_A); assertNotNull(hash); - String hash2 = getFilePathHash(FILE_A); + Hash128 hash2 = getFilePathHash(FILE_A); assertEquals(hash, hash2); }
