This is an automated email from the ASF dual-hosted git repository.
panxiaolei pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new f41ac10d177 branch-3.0: [Bug](hash) fix wrong HashLen16 implement and
add cityhash64 unit test #46928 (#46948)
f41ac10d177 is described below
commit f41ac10d17703730c4ff29e17647cf4fd5b65341
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Jan 15 00:07:23 2025 +0800
branch-3.0: [Bug](hash) fix wrong HashLen16 implement and add cityhash64
unit test #46928 (#46948)
Cherry-picked from #46928
Co-authored-by: Pxl <[email protected]>
---
be/src/gutil/hash/city.cc | 10 +-
be/src/gutil/hash/city.h | 9 +-
.../olap/rowset/segment_v2/ngram_bloom_filter.cpp | 2 -
be/src/olap/rowset/segment_v2/ngram_bloom_filter.h | 2 +
be/src/vec/common/string_ref.h | 10 -
be/test/util/cityhash_test.cpp | 232 +++++++++++++++++++++
6 files changed, 243 insertions(+), 22 deletions(-)
diff --git a/be/src/gutil/hash/city.cc b/be/src/gutil/hash/city.cc
index d4d53ab2f6b..4e34959ea08 100644
--- a/be/src/gutil/hash/city.cc
+++ b/be/src/gutil/hash/city.cc
@@ -73,7 +73,7 @@ uint64 HashLen16(uint64 u, uint64 v) {
const uint64 kMul = 0xc6a4a7935bd1e995ULL;
uint64 a = (u ^ v) * kMul;
a ^= (a >> 47);
- uint64 b = (v ^ a) * kMul;
+ uint64 b = (u ^ a) * kMul;
b ^= (b >> 47);
b *= kMul;
return b;
@@ -199,11 +199,11 @@ uint64 CityHash64(const char* s, size_t len) {
HashLen16(v.second, w.second) + x);
}
-uint64 CityHash64WithSeed(const char* s, size_t len, uint64 seed) {
- return CityHash64WithSeeds(s, len, k2, seed);
-}
-
uint64 CityHash64WithSeeds(const char* s, size_t len, uint64 seed0, uint64
seed1) {
return HashLen16(CityHash64(s, len) - seed0, seed1);
}
+
+uint64 CityHash64WithSeed(const char* s, size_t len, uint64 seed) {
+ return CityHash64WithSeeds(s, len, k2, seed);
+}
} // namespace util_hash
diff --git a/be/src/gutil/hash/city.h b/be/src/gutil/hash/city.h
index 8e6042b61a8..0617bc442a1 100644
--- a/be/src/gutil/hash/city.h
+++ b/be/src/gutil/hash/city.h
@@ -24,6 +24,10 @@
#include "gutil/integral_types.h"
+// WARNING
+// The implementation of cityhash in this file is somewhat different from
Google's original version.
+// For the same input, there will be different output results.
+// Therefore, we should do not to use this special cityhash as possible as we
can.
namespace util_hash {
uint64 HashLen16(uint64 u, uint64 v);
@@ -35,9 +39,4 @@ uint64 CityHash64(const char* buf, size_t len);
// Hash function for a byte array. For convenience, a 64-bit seed is also
// hashed into the result. The mapping may change from time to time.
uint64 CityHash64WithSeed(const char* buf, size_t len, uint64 seed);
-
-// Hash function for a byte array. For convenience, two seeds are also
-// hashed into the result. The mapping may change from time to time.
-uint64 CityHash64WithSeeds(const char* buf, size_t len, uint64 seed0, uint64
seed1);
-
} // namespace util_hash
diff --git a/be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp
b/be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp
index a3227f8e75f..f4d0953a8b9 100644
--- a/be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp
+++ b/be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp
@@ -26,8 +26,6 @@
namespace doris {
namespace segment_v2 {
-static constexpr uint64_t SEED_GEN = 217728422;
-
NGramBloomFilter::NGramBloomFilter(size_t size)
: _size(size),
words((size + sizeof(UnderType) - 1) / sizeof(UnderType)),
diff --git a/be/src/olap/rowset/segment_v2/ngram_bloom_filter.h
b/be/src/olap/rowset/segment_v2/ngram_bloom_filter.h
index 527f8a6e99e..6418ab30c98 100644
--- a/be/src/olap/rowset/segment_v2/ngram_bloom_filter.h
+++ b/be/src/olap/rowset/segment_v2/ngram_bloom_filter.h
@@ -26,6 +26,8 @@
#include "olap/rowset/segment_v2/bloom_filter.h"
namespace doris {
+static constexpr uint64_t SEED_GEN = 217728422;
+
namespace segment_v2 {
enum HashStrategyPB : int;
diff --git a/be/src/vec/common/string_ref.h b/be/src/vec/common/string_ref.h
index e2094ca8f39..bdb9b5f8b99 100644
--- a/be/src/vec/common/string_ref.h
+++ b/be/src/vec/common/string_ref.h
@@ -343,16 +343,6 @@ inline size_t hash_less_than8(const char* data, size_t
size) {
return k2;
}
-inline size_t hash_less_than16(const char* data, size_t size) {
- if (size > 8) {
- auto a = unaligned_load<doris::vectorized::UInt64>(data);
- auto b = unaligned_load<doris::vectorized::UInt64>(data + size - 8);
- return hash_len16(a, rotate_by_at_least1(b + size, size)) ^ b;
- }
-
- return hash_less_than8(data, size);
-}
-
inline size_t crc32_hash(const char* pos, size_t size) {
if (size == 0) {
return 0;
diff --git a/be/test/util/cityhash_test.cpp b/be/test/util/cityhash_test.cpp
new file mode 100644
index 00000000000..399c7954fa0
--- /dev/null
+++ b/be/test/util/cityhash_test.cpp
@@ -0,0 +1,232 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest-message.h>
+#include <gtest/gtest-test-part.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <iostream>
+
+#include "gtest/gtest_pred_impl.h"
+#include "gutil/hash/city.h"
+#include "olap/rowset/segment_v2/ngram_bloom_filter.h"
+#include "testutil/any_type.h"
+
+namespace doris {
+
+void test(uint64_t result, uint64_t result_with_seed, const std::string& str) {
+ EXPECT_EQ(result, util_hash::CityHash64(str.data(), str.size()));
+ EXPECT_EQ(result_with_seed, util_hash::CityHash64WithSeed(str.data(),
str.size(), SEED_GEN));
+}
+
+TEST(city_hash, simple) {
+ test(200641142423602673ULL, 14284967837249502203ULL, "Testing 1, 2, 3");
+ test(15001808493323962774ULL, 11424039407453860401ULL, "Moscow");
+ test(8034095133228533394ULL, 18132648334508667393ULL, "Moscow is the
capital of Russia");
+ test(13956251726120965457ULL, 1381326629274895150ULL,
+ "Moscow is the capital of Russia, and the largest city in the
country");
+ test(2261632006729419242ULL, 17084064757690202946ULL,
+ "Moscow is the capital of Russia, and the largest city in the
country, with a population "
+ "of over 12 million people");
+ test(6469445502724539237ULL, 11343158389890087495ULL,
+ "Moscow is the capital of Russia, and the largest city in the
country, with a population "
+ "of over 12 million people, and the largest city in Europe");
+ test(4443578057564282258ULL, 8388333080872437400ULL, "one");
+}
+
+TEST(city_hash, long_text) {
+ test(2603920979253371939ULL, 17286059257277065140ULL,
+ "Testing 1, 2, 3Testing 1, 2, 3Testing 1, 2, 3Testing 1, 2, 3Testing
1, 2, 3Testing 1, 2, "
+ "3Testing 1, 2, 3Testing 1, 2, 3Testing 1, 2, 3Testing 1, 2, 3Testing
1, 2, 3Testing 1, "
+ "2, 3");
+ test(3019762420123308560ULL, 17640672461655093518ULL,
+
"MoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMosco"
+
"wMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMosc"
+
"owMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMos"
+
"cowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMo"
+
"scowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowM"
+
"oscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscow"
+
"MoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMosco"
+ "wMoscowMoscowMoscowMoscowMoscowMoscowMoscow");
+ test(15852398506477355940ULL, 7555841253624036420ULL,
+ "Moscow is the capital of RussiaMoscow is the capital of RussiaMoscow
is the capital of "
+ "RussiaMoscow is the capital of RussiaMoscow is the capital of
RussiaMoscow is the "
+ "capital of RussiaMoscow is the capital of RussiaMoscow is the
capital of RussiaMoscow is "
+ "the capital of RussiaMoscow is the capital of RussiaMoscow is the
capital of "
+ "RussiaMoscow is the capital of RussiaMoscow is the capital of
RussiaMoscow is the "
+ "capital of RussiaMoscow is the capital of RussiaMoscow is the
capital of RussiaMoscow is "
+ "the capital of RussiaMoscow is the capital of RussiaMoscow is the
capital of "
+ "RussiaMoscow is the capital of RussiaMoscow is the capital of
RussiaMoscow is the "
+ "capital of RussiaMoscow is the capital of RussiaMoscow is the
capital of RussiaMoscow is "
+ "the capital of RussiaMoscow is the capital of RussiaMoscow is the
capital of "
+ "RussiaMoscow is the capital of RussiaMoscow is the capital of
Russia");
+ test(1035060799968064901ULL, 16530320359523728196ULL,
+ "Moscow is the capital of Russia, and the largest city in the
countryMoscow is the "
+ "capital of Russia, and the largest city in the countryMoscow is the
capital of Russia, "
+ "and the largest city in the countryMoscow is the capital of Russia,
and the largest city "
+ "in the countryMoscow is the capital of Russia, and the largest city
in the countryMoscow "
+ "is the capital of Russia, and the largest city in the countryMoscow
is the capital of "
+ "Russia, and the largest city in the countryMoscow is the capital of
Russia, and the "
+ "largest city in the countryMoscow is the capital of Russia, and the
largest city in the "
+ "countryMoscow is the capital of Russia, and the largest city in the
countryMoscow is the "
+ "capital of Russia, and the largest city in the countryMoscow is the
capital of Russia, "
+ "and the largest city in the countryMoscow is the capital of Russia,
and the largest city "
+ "in the countryMoscow is the capital of Russia, and the largest city
in the countryMoscow "
+ "is the capital of Russia, and the largest city in the countryMoscow
is the capital of "
+ "Russia, and the largest city in the countryMoscow is the capital of
Russia, and the "
+ "largest city in the countryMoscow is the capital of Russia, and the
largest city in the "
+ "countryMoscow is the capital of Russia, and the largest city in the
countryMoscow is the "
+ "capital of Russia, and the largest city in the countryMoscow is the
capital of Russia, "
+ "and the largest city in the countryMoscow is the capital of Russia,
and the largest city "
+ "in the countryMoscow is the capital of Russia, and the largest city
in the countryMoscow "
+ "is the capital of Russia, and the largest city in the countryMoscow
is the capital of "
+ "Russia, and the largest city in the countryMoscow is the capital of
Russia, and the "
+ "largest city in the countryMoscow is the capital of Russia, and the
largest city in the "
+ "countryMoscow is the capital of Russia, and the largest city in the
country");
+ test(6905655253535205836ULL, 5560247534938243804ULL,
+ "Moscow is the capital of Russia, and the largest city in the
country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million peopleMoscow is the capital of Russia, and the
largest city in the "
+ "country, with a population "
+ "of over 12 million people");
+ test(17180533981300533381ULL, 985389469184223326ULL,
+ "Moscow is the capital of Russia, and the largest city in the
country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in EuropeMoscow is
the capital of "
+ "Russia, and the largest city in the country, with a population "
+ "of over 12 million people, and the largest city in Europe");
+ test(15470213656373493785ULL, 18337102184905563192ULL,
+
"ooneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneo"
+
"neoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneone"
+
"oneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneon"
+ "eoneoneonene");
+}
+
+TEST(city_hash, short_text8) {
+ test(2548571168015014692ULL, 284905140919628274ULL, "a");
+ test(2070361765919599426ULL, 9025766345180298939ULL, "bb");
+ test(14141338468832955152ULL, 10552519853845467035ULL, "ccc");
+ test(18138810160329292616ULL, 5334357042905201868ULL, "dddd");
+ test(425873312578527799ULL, 17471752968193695125ULL, "aaaaa");
+ test(16631617225600905903ULL, 8316778257275536170ULL, "aaaccc");
+ test(12719358095561328225ULL, 17872985103380866073ULL, "abcdefh");
+ test(17174388733642667936ULL, 11830283950305971712ULL, "abcdeffh");
+}
+
+TEST(city_hash, short_text16) {
+ test(5762505351139159907ULL, 5576230494162395002ULL, "a32146547");
+ test(5384428512347131045ULL, 15753090810004664388ULL, "b32146547b");
+ test(12829545899344666879ULL, 3848967114221885261ULL, "c32146547cc");
+ test(16893894660113381599ULL, 5947978601334952997ULL, "dd32146547dd");
+ test(17429098434560484596ULL, 6398107568024805321ULL, "aaaa32146547a");
+ test(10726875353743957364ULL, 2240715260928806674ULL, "aaa32146547ccc");
+ test(6679348617805124815ULL, 9286469086836368089ULL, "abc32146547defh");
+ test(6691049816525186727ULL, 7961087717381234556ULL, "abcd32146547effh");
+}
+
+} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]