This is an automated email from the ASF dual-hosted git repository.
panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new ca983a83cab [Improvement] add zlib_crc32_fixed (#60735)
ca983a83cab is described below
commit ca983a83cabfeaf063a55396ecabf9f804559d34
Author: Pxl <[email protected]>
AuthorDate: Wed Feb 25 14:36:57 2026 +0800
[Improvement] add zlib_crc32_fixed (#60735)
zlib_crc32: SplitBlockHashComputeTime: 269.15ms
crc32c_fixed: SplitBlockHashComputeTime: 33.258ms
zlib_crc32_fixed: SplitBlockHashComputeTime: 61.941ms
---
be/src/util/hash_util.hpp | 75 ++++++-
be/src/vec/columns/column_decimal.cpp | 11 +-
be/src/vec/columns/column_vector.cpp | 46 ++--
be/src/vec/columns/column_vector.h | 1 +
be/test/util/crc32c_test.cpp | 401 ++++++++++++++++++++++++++++++++++
5 files changed, 500 insertions(+), 34 deletions(-)
diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp
index 9c5d4ef3aca..9371f8867ca 100644
--- a/be/src/util/hash_util.hpp
+++ b/be/src/util/hash_util.hpp
@@ -34,20 +34,93 @@
#include "util/hash/city.h"
#include "util/murmur_hash3.h"
#include "util/sse_util.hpp"
+#include "vec/common/endian.h"
namespace doris {
#include "common/compile_check_begin.h"
+namespace detail {
+// Slicing-by-4 table: t[0] is the standard byte-at-a-time table,
+// t[1..3] are extended tables for parallel 4-byte processing.
+struct CRC32SliceBy4Table {
+ uint32_t t[4][256] {};
+ constexpr CRC32SliceBy4Table() {
+ // t[0]: standard CRC32 lookup table
+ for (uint32_t i = 0; i < 256; i++) {
+ uint32_t c = i;
+ for (int j = 0; j < 8; j++) {
+ c = (c & 1) ? ((c >> 1) ^ 0xEDB88320U) : (c >> 1);
+ }
+ t[0][i] = c;
+ }
+ // t[1..3]: each entry is one additional CRC byte-step applied to t[k-1]
+ for (uint32_t i = 0; i < 256; i++) {
+ uint32_t c = t[0][i];
+ for (int k = 1; k < 4; k++) {
+ c = t[0][c & 0xFF] ^ (c >> 8);
+ t[k][i] = c;
+ }
+ }
+ }
+};
+} // namespace detail
+
// Utility class to compute hash values.
class HashUtil {
+private:
+ static inline constexpr detail::CRC32SliceBy4Table CRC32_TABLE {};
+
public:
static uint32_t zlib_crc_hash(const void* data, uint32_t bytes, uint32_t
hash) {
return (uint32_t)crc32(hash, (const unsigned char*)data, bytes);
}
+ // Inline CRC32 (zlib-compatible, standard CRC32 polynomial) for fixed-size types.
+ // Uses Slicing-by-4 technique for 4/8-byte types: processes 4 bytes at a time using
+ // 4 precomputed lookup tables, reducing serial table lookups from 4 to 1 per 4-byte chunk.
+ // Polynomial: 0xEDB88320 (reflected form of 0x04C11DB7).
+ // Endian note: CRC32 reflected algorithm processes bytes in address order (byte[0] first).
+ // Slicing-by-4 requires byte[0] at LSB of the loaded uint32_t, which is little-endian layout.
+ // LittleEndian::Load32 provides this on ALL platforms: noop on LE, bswap on BE.
+ template <typename T>
+ static uint32_t zlib_crc32_fixed(const T& value, uint32_t hash) {
+ const auto* p = reinterpret_cast<const uint8_t*>(&value);
+ // zlib convention: pre/post XOR with 0xFFFFFFFF
+ uint32_t crc = hash ^ 0xFFFFFFFFU;
+
+ if constexpr (sizeof(T) == 1) {
+ // 1 byte: single table lookup
+ crc = CRC32_TABLE.t[0][(crc ^ p[0]) & 0xFF] ^ (crc >> 8);
+ } else if constexpr (sizeof(T) == 2) {
+ // 2 bytes: two sequential table lookups (slicing doesn't help below 4 bytes)
+ crc = CRC32_TABLE.t[0][(crc ^ p[0]) & 0xFF] ^ (crc >> 8);
+ crc = CRC32_TABLE.t[0][(crc ^ p[1]) & 0xFF] ^ (crc >> 8);
+ } else if constexpr (sizeof(T) == 4) {
+ // 4 bytes: one Slicing-by-4 step — 4 independent lookups in parallel
+ // LittleEndian::Load32 handles unaligned load + byte-swap on big-endian,
+ // ensuring byte[0] is always at LSB for correct CRC byte processing order.
+ uint32_t word = LittleEndian::Load32(p) ^ crc;
+ crc = CRC32_TABLE.t[3][(word)&0xFF] ^ CRC32_TABLE.t[2][(word >> 8)
& 0xFF] ^
+ CRC32_TABLE.t[1][(word >> 16) & 0xFF] ^
CRC32_TABLE.t[0][(word >> 24) & 0xFF];
+ } else if constexpr (sizeof(T) == 8) {
+ // 8 bytes: two Slicing-by-4 steps
+ uint32_t word = LittleEndian::Load32(p) ^ crc;
+ crc = CRC32_TABLE.t[3][(word)&0xFF] ^ CRC32_TABLE.t[2][(word >> 8)
& 0xFF] ^
+ CRC32_TABLE.t[1][(word >> 16) & 0xFF] ^
CRC32_TABLE.t[0][(word >> 24) & 0xFF];
+
+ word = LittleEndian::Load32(p + 4) ^ crc;
+ crc = CRC32_TABLE.t[3][(word)&0xFF] ^ CRC32_TABLE.t[2][(word >> 8)
& 0xFF] ^
+ CRC32_TABLE.t[1][(word >> 16) & 0xFF] ^
CRC32_TABLE.t[0][(word >> 24) & 0xFF];
+ } else {
+ // Fallback to zlib for larger/unusual types
+ return (uint32_t)crc32(hash, (const unsigned char*)&value,
sizeof(T));
+ }
+ return crc ^ 0xFFFFFFFFU;
+ }
+
static uint32_t zlib_crc_hash_null(uint32_t hash) {
// null is treat as 0 when hash
static const int INT_VALUE = 0;
- return (uint32_t)crc32(hash, (const unsigned char*)(&INT_VALUE), 4);
+ return zlib_crc32_fixed(INT_VALUE, hash);
}
template <typename T>
diff --git a/be/src/vec/columns/column_decimal.cpp
b/be/src/vec/columns/column_decimal.cpp
index 94afcc6aa38..90d126f4911 100644
--- a/be/src/vec/columns/column_decimal.cpp
+++ b/be/src/vec/columns/column_decimal.cpp
@@ -170,7 +170,7 @@ void ColumnDecimal<T>::update_crc_with_value(size_t start,
size_t end, uint32_t&
if (null_data == nullptr) {
for (size_t i = start; i < end; i++) {
if constexpr (T != TYPE_DECIMALV2) {
- hash = HashUtil::zlib_crc_hash(&data[i], sizeof(value_type),
hash);
+ hash = HashUtil::zlib_crc32_fixed(data[i], hash);
} else {
decimalv2_do_crc(i, hash);
}
@@ -179,7 +179,7 @@ void ColumnDecimal<T>::update_crc_with_value(size_t start,
size_t end, uint32_t&
for (size_t i = start; i < end; i++) {
if (null_data[i] == 0) {
if constexpr (T != TYPE_DECIMALV2) {
- hash = HashUtil::zlib_crc_hash(&data[i],
sizeof(value_type), hash);
+ hash = HashUtil::zlib_crc32_fixed(data[i], hash);
} else {
decimalv2_do_crc(i, hash);
}
@@ -198,12 +198,13 @@ void ColumnDecimal<T>::update_crcs_with_value(uint32_t*
__restrict hashes, Primi
if constexpr (T != TYPE_DECIMALV2) {
if (null_data == nullptr) {
for (size_t i = 0; i < s; i++) {
- hashes[i] = HashUtil::zlib_crc_hash(&data[i],
sizeof(value_type), hashes[i]);
+ hashes[i] = HashUtil::zlib_crc32_fixed(data[i], hashes[i]);
}
} else {
for (size_t i = 0; i < s; i++) {
- if (null_data[i] == 0)
- hashes[i] = HashUtil::zlib_crc_hash(&data[i],
sizeof(value_type), hashes[i]);
+ if (null_data[i] == 0) {
+ hashes[i] = HashUtil::zlib_crc32_fixed(data[i], hashes[i]);
+ }
}
}
} else {
diff --git a/be/src/vec/columns/column_vector.cpp
b/be/src/vec/columns/column_vector.cpp
index 21d7889e825..a34390836cb 100644
--- a/be/src/vec/columns/column_vector.cpp
+++ b/be/src/vec/columns/column_vector.cpp
@@ -201,41 +201,31 @@ void ColumnVector<T>::update_crcs_with_value(uint32_t*
__restrict hashes, Primit
auto s = rows;
DCHECK(s == size());
- if constexpr (is_date_or_datetime(T)) {
- char buf[64];
- auto date_convert_do_crc = [&](size_t i) {
- const auto& date_val = (const VecDateTimeValue&)data[i];
- auto len = date_val.to_buffer(buf);
- hashes[i] = HashUtil::zlib_crc_hash(buf, len, hashes[i]);
- };
-
- if (null_data == nullptr) {
- for (size_t i = 0; i < s; i++) {
- date_convert_do_crc(i);
- }
- } else {
- for (size_t i = 0; i < s; i++) {
- if (null_data[i] == 0) {
- date_convert_do_crc(i);
- }
- }
+ if (null_data == nullptr) {
+ for (size_t i = 0; i < s; i++) {
+ hashes[i] = _zlib_crc32_hash(hashes[i], i);
}
} else {
- if (null_data == nullptr) {
- for (size_t i = 0; i < s; i++) {
- hashes[i] = HashUtil::zlib_crc_hash(
- &data[i], sizeof(typename
PrimitiveTypeTraits<T>::CppType), hashes[i]);
- }
- } else {
- for (size_t i = 0; i < s; i++) {
- if (null_data[i] == 0)
- hashes[i] = HashUtil::zlib_crc_hash(
- &data[i], sizeof(typename
PrimitiveTypeTraits<T>::CppType), hashes[i]);
+ for (size_t i = 0; i < s; i++) {
+ if (null_data[i] == 0) {
+ hashes[i] = _zlib_crc32_hash(hashes[i], i);
}
}
}
}
+template <PrimitiveType T>
+uint32_t ColumnVector<T>::_zlib_crc32_hash(uint32_t hash, size_t idx) const {
+ if constexpr (is_date_or_datetime(T)) {
+ char buf[64];
+ const auto& date_val = (const VecDateTimeValue&)data[idx];
+ auto len = date_val.to_buffer(buf);
+ return HashUtil::zlib_crc_hash(buf, len, hash);
+ } else {
+ return HashUtil::zlib_crc32_fixed(data[idx], hash);
+ }
+}
+
template <PrimitiveType T>
uint32_t ColumnVector<T>::_crc32c_hash(uint32_t hash, size_t idx) const {
if constexpr (is_date_or_datetime(T)) {
diff --git a/be/src/vec/columns/column_vector.h
b/be/src/vec/columns/column_vector.h
index fb3141d6485..9e97eeae6b5 100644
--- a/be/src/vec/columns/column_vector.h
+++ b/be/src/vec/columns/column_vector.h
@@ -405,6 +405,7 @@ public:
}
protected:
+ uint32_t _zlib_crc32_hash(uint32_t hash, size_t idx) const;
uint32_t _crc32c_hash(uint32_t hash, size_t idx) const;
Container data;
};
diff --git a/be/test/util/crc32c_test.cpp b/be/test/util/crc32c_test.cpp
index 5a6a7faa3a5..e9795882b36 100644
--- a/be/test/util/crc32c_test.cpp
+++ b/be/test/util/crc32c_test.cpp
@@ -22,10 +22,14 @@
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <string.h>
+#include <zlib.h>
+#include <cstdint>
+#include <limits>
#include <vector>
#include "gtest/gtest_pred_impl.h"
+#include "util/hash_util.hpp"
#include "util/slice.h"
namespace doris {
@@ -75,3 +79,400 @@ TEST(CRC, Extend) {
}
} // namespace doris
+
+namespace doris {
+
+// Helper: compute crc32c via crc32c::Extend for a value of type T
+template <typename T>
+uint32_t crc32c_reference(const T& value, uint32_t seed) {
+ return crc32c::Extend(seed, reinterpret_cast<const uint8_t*>(&value),
sizeof(T));
+}
+
+// Helper: compute zlib crc32 for a value of type T
+template <typename T>
+uint32_t zlib_crc32_reference(const T& value, uint32_t seed) {
+ return HashUtil::zlib_crc_hash(&value, sizeof(T), seed);
+}
+
+/*
+TODO: fix those cases when we have a new release version that does not need to consider the compatibility issue;
+use the following code to replace the old crc32c_fixed function in hash_util.hpp.
+template <typename T>
+static uint32_t crc32c_fixed(const T& value, uint32_t hash) {
+ uint32_t crc = hash ^ 0xFFFFFFFFU;
+ if constexpr (sizeof(T) == 1) {
+ crc = _mm_crc32_u8(crc, *reinterpret_cast<const uint8_t*>(&value));
+ } else if constexpr (sizeof(T) == 2) {
+ crc = _mm_crc32_u16(crc, *reinterpret_cast<const uint16_t*>(&value));
+ } else if constexpr (sizeof(T) == 4) {
+ crc = _mm_crc32_u32(crc, *reinterpret_cast<const uint32_t*>(&value));
+ } else if constexpr (sizeof(T) == 8) {
+ crc = (uint32_t)_mm_crc32_u64(crc, *reinterpret_cast<const
uint64_t*>(&value));
+ } else {
+ return crc32c_extend(hash, (const uint8_t*)&value, sizeof(T));
+ }
+ return crc ^ 0xFFFFFFFFU;
+}
+// ==================== crc32c_fixed tests ====================
+TEST(CRC32CFixed, Uint8Values) {
+ uint8_t values[] = {0, 1, 127, 128, 255};
+ for (uint32_t seed : {0U, 1U, 0xFFFFFFFFU, 0xDEADBEEFU}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v,
seed))
+ << "uint8_t v=" << (int)v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(CRC32CFixed, Uint16Values) {
+ uint16_t values[] = {0, 1, 255, 256, 1000, 32767, 65535};
+ for (uint32_t seed : {0U, 1U, 0xFFFFFFFFU, 0x12345678U}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v,
seed))
+ << "uint16_t v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(CRC32CFixed, Int32Values) {
+ int32_t values[] = {0,
+ 1,
+ -1,
+ 42,
+ -42,
+ 1000000,
+ -1000000,
+ std::numeric_limits<int32_t>::min(),
+ std::numeric_limits<int32_t>::max()};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xCAFEBABEU}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v,
seed))
+ << "int32_t v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(CRC32CFixed, Uint32Values) {
+ uint32_t values[] = {0, 1, 0xFF, 0xFFFF, 0xFFFFFFFF, 0xDEADBEEF,
0x12345678};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xABCD1234U}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v,
seed))
+ << "uint32_t v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(CRC32CFixed, Int64Values) {
+ int64_t values[] = {0,
+ 1,
+ -1,
+ 1000000000LL,
+ -1000000000LL,
+ std::numeric_limits<int64_t>::min(),
+ std::numeric_limits<int64_t>::max(),
+ 0x0102030405060708LL,
+ -0x0102030405060708LL};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU, 0x87654321U}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v,
seed))
+ << "int64_t v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(CRC32CFixed, Uint64Values) {
+ uint64_t values[] = {0,
+ 1,
+ 0xFFFFFFFFFFFFFFFFULL,
+ 0xDEADBEEFCAFEBABEULL,
+ 0x0123456789ABCDEFULL,
+ 0xFF00FF00FF00FF00ULL};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU, 0x11111111U}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v,
seed))
+ << "uint64_t v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(CRC32CFixed, FloatValues) {
+ float values[] = {0.0f,
+ -0.0f,
+ 1.0f,
+ -1.0f,
+ 3.14f,
+ std::numeric_limits<float>::min(),
+ std::numeric_limits<float>::max(),
+ std::numeric_limits<float>::infinity()};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v,
seed))
+ << "float v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(CRC32CFixed, DoubleValues) {
+ double values[] = {0.0,
+ -0.0,
+ 1.0,
+ -1.0,
+ 3.141592653589793,
+ 1e100,
+ -1e100,
+ std::numeric_limits<double>::infinity()};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v,
seed))
+ << "double v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(CRC32CFixed, NullHash) {
+ // crc32c_null should match crc32c_fixed with int(0)
+ for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xDEADBEEFU}) {
+ int zero = 0;
+ EXPECT_EQ(HashUtil::crc32c_null(seed), HashUtil::crc32c_fixed(zero,
seed));
+ EXPECT_EQ(HashUtil::crc32c_null(seed), crc32c_reference(zero, seed));
+ }
+}
+*/
+// ==================== zlib_crc32_fixed tests ====================
+
+TEST(ZlibCRC32Fixed, Uint8Values) {
+ uint8_t values[] = {0, 1, 42, 127, 128, 255};
+ for (uint32_t seed : {0U, 1U, 0xFFFFFFFFU, 0xDEADBEEFU}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed),
zlib_crc32_reference(v, seed))
+ << "uint8_t v=" << (int)v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(ZlibCRC32Fixed, Int16Values) {
+ int16_t values[] = {0, 1, -1, 256, -256, 32767, -32768};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU, 0x12345678U}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed),
zlib_crc32_reference(v, seed))
+ << "int16_t v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(ZlibCRC32Fixed, Uint16Values) {
+ uint16_t values[] = {0, 1, 255, 256, 1000, 32767, 65535};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xABCDEF00U}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed),
zlib_crc32_reference(v, seed))
+ << "uint16_t v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(ZlibCRC32Fixed, Int32Values) {
+ int32_t values[] = {0,
+ 1,
+ -1,
+ 42,
+ -42,
+ 1000000,
+ -1000000,
+ std::numeric_limits<int32_t>::min(),
+ std::numeric_limits<int32_t>::max()};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xCAFEBABEU}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed),
zlib_crc32_reference(v, seed))
+ << "int32_t v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(ZlibCRC32Fixed, Uint32Values) {
+ uint32_t values[] = {0, 1, 0xFF, 0xFFFF, 0xFFFFFFFF, 0xDEADBEEF,
0x12345678};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xABCD1234U}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed),
zlib_crc32_reference(v, seed))
+ << "uint32_t v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(ZlibCRC32Fixed, Int64Values) {
+ int64_t values[] = {0,
+ 1,
+ -1,
+ 1000000000LL,
+ -1000000000LL,
+ std::numeric_limits<int64_t>::min(),
+ std::numeric_limits<int64_t>::max(),
+ 0x0102030405060708LL,
+ -0x0102030405060708LL};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU, 0x87654321U}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed),
zlib_crc32_reference(v, seed))
+ << "int64_t v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(ZlibCRC32Fixed, Uint64Values) {
+ uint64_t values[] = {0,
+ 1,
+ 0xFFFFFFFFFFFFFFFFULL,
+ 0xDEADBEEFCAFEBABEULL,
+ 0x0123456789ABCDEFULL,
+ 0xFF00FF00FF00FF00ULL};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU, 0x11111111U}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed),
zlib_crc32_reference(v, seed))
+ << "uint64_t v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(ZlibCRC32Fixed, FloatValues) {
+ float values[] = {0.0f,
+ -0.0f,
+ 1.0f,
+ -1.0f,
+ 3.14f,
+ 1e10f,
+ -1e10f,
+ std::numeric_limits<float>::min(),
+ std::numeric_limits<float>::max(),
+ std::numeric_limits<float>::infinity()};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed),
zlib_crc32_reference(v, seed))
+ << "float v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(ZlibCRC32Fixed, DoubleValues) {
+ double values[] = {0.0,
+ -0.0,
+ 1.0,
+ -1.0,
+ 3.141592653589793,
+ 1e100,
+ -1e100,
+ 1e-300,
+ std::numeric_limits<double>::infinity()};
+ for (uint32_t seed : {0U, 0xFFFFFFFFU}) {
+ for (auto v : values) {
+ EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed),
zlib_crc32_reference(v, seed))
+ << "double v=" << v << " seed=" << seed;
+ }
+ }
+}
+
+TEST(ZlibCRC32Fixed, NullHash) {
+ // zlib_crc_hash_null should match zlib_crc32_fixed with int(0)
+ for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xDEADBEEFU}) {
+ int zero = 0;
+ EXPECT_EQ(HashUtil::zlib_crc_hash_null(seed),
HashUtil::zlib_crc32_fixed(zero, seed));
+ EXPECT_EQ(HashUtil::zlib_crc_hash_null(seed),
zlib_crc32_reference(zero, seed));
+ }
+}
+
+// ==================== Cross-validation: CRC32C vs zlib CRC32 should differ ====================
+
+TEST(CRC32Fixed, CRC32CVsZlibDiffer) {
+ // CRC32C and standard CRC32 use different polynomials, so results should
differ
+ // (except possibly by coincidence on some values, but not systematically)
+ int32_t v = 12345678;
+ uint32_t seed = 0;
+ uint32_t crc32c_result = HashUtil::crc32c_fixed(v, seed);
+ uint32_t zlib_result = HashUtil::zlib_crc32_fixed(v, seed);
+ EXPECT_NE(crc32c_result, zlib_result)
+ << "CRC32C and zlib CRC32 should produce different results for
non-trivial input";
+}
+
+// ==================== Chaining: verify incremental hashing ====================
+/*
+TEST(CRC32CFixed, IncrementalChaining) {
+ // Hash two int32 values incrementally and compare with hashing 8 bytes at
once
+ int32_t a = 0x11223344;
+ int32_t b = 0x55667788;
+ uint32_t seed = 0;
+
+ uint32_t chained = HashUtil::crc32c_fixed(a, seed);
+ chained = HashUtil::crc32c_fixed(b, chained);
+
+ // Reference: hash the 8 bytes sequentially via crc32c::Extend
+ uint8_t buf[8];
+ memcpy(buf, &a, 4);
+ memcpy(buf + 4, &b, 4);
+ uint32_t reference = crc32c::Extend(seed, buf, 8);
+
+ EXPECT_EQ(chained, reference);
+}
+*/
+TEST(ZlibCRC32Fixed, IncrementalChaining) {
+ // Hash two int32 values incrementally and compare with hashing 8 bytes at
once
+ int32_t a = 0x11223344;
+ int32_t b = 0x55667788;
+ uint32_t seed = 0;
+
+ uint32_t chained = HashUtil::zlib_crc32_fixed(a, seed);
+ chained = HashUtil::zlib_crc32_fixed(b, chained);
+
+ // Reference: hash the 8 bytes sequentially via zlib crc32
+ uint8_t buf[8];
+ memcpy(buf, &a, 4);
+ memcpy(buf + 4, &b, 4);
+ uint32_t reference = (uint32_t)crc32(seed, buf, 8);
+
+ EXPECT_EQ(chained, reference);
+}
+/*
+// ==================== Exhaustive 1-byte test ====================
+
+TEST(CRC32CFixed, AllByteValues) {
+ for (int i = 0; i <= 255; i++) {
+ uint8_t v = static_cast<uint8_t>(i);
+ uint32_t seed = 0x12345678U;
+ EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v, seed))
<< "byte=" << i;
+ }
+}
+
+TEST(ZlibCRC32Fixed, AllByteValues) {
+ for (int i = 0; i <= 255; i++) {
+ uint8_t v = static_cast<uint8_t>(i);
+ uint32_t seed = 0x12345678U;
+ EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed), zlib_crc32_reference(v,
seed))
+ << "byte=" << i;
+ }
+}
+
+// ==================== Sequential pattern ====================
+
+TEST(CRC32CFixed, SequentialInt32) {
+ // Hash a sequence of increasing int32 values, verify each against
reference
+ uint32_t seed = 0;
+ for (int32_t i = -500; i <= 500; i++) {
+ EXPECT_EQ(HashUtil::crc32c_fixed(i, seed), crc32c_reference(i, seed))
<< "i=" << i;
+ }
+}
+
+TEST(ZlibCRC32Fixed, SequentialInt32) {
+ uint32_t seed = 0;
+ for (int32_t i = -500; i <= 500; i++) {
+ EXPECT_EQ(HashUtil::zlib_crc32_fixed(i, seed), zlib_crc32_reference(i,
seed)) << "i=" << i;
+ }
+}
+*/
+// ==================== Large 16-byte type fallback test ====================
+
+TEST(ZlibCRC32Fixed, LargeTypeFallback) {
+ // __int128 is 16 bytes, should hit the fallback path to zlib crc32()
+ __int128 value = static_cast<__int128>(0x0102030405060708ULL) << 64 |
0x090A0B0C0D0E0F10ULL;
+ uint32_t seed = 0;
+ uint32_t fixed_result = HashUtil::zlib_crc32_fixed(value, seed);
+ uint32_t ref_result = HashUtil::zlib_crc_hash(&value, sizeof(value), seed);
+ EXPECT_EQ(fixed_result, ref_result);
+}
+
+} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]