This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-1.2-lts by this push:
new 5b5967a0a1 [Improve](array)improve array hash function (#21779)
5b5967a0a1 is described below
commit 5b5967a0a1930d40b9bec10a6ac5aa94cb2696ed
Author: amory <[email protected]>
AuthorDate: Thu Jul 13 11:53:54 2023 +0800
[Improve](array)improve array hash function (#21779)
cherry-pick from #21699
---
be/src/util/timezone_utils.cpp | 2 +
be/src/vec/columns/column.h | 8 +++-
be/src/vec/columns/column_array.cpp | 73 ++++++++++++++++++++++++++--------
be/src/vec/columns/column_array.h | 6 ++-
be/src/vec/columns/column_const.h | 8 ++--
be/src/vec/columns/column_decimal.cpp | 50 +++++++++++++++++------
be/src/vec/columns/column_decimal.h | 14 ++++++-
be/src/vec/columns/column_nullable.cpp | 34 +++++++++++-----
be/src/vec/columns/column_nullable.h | 6 ++-
be/src/vec/columns/column_string.h | 42 +++++++++++++++----
be/src/vec/columns/column_vector.h | 42 +++++++++++++++----
11 files changed, 219 insertions(+), 66 deletions(-)
diff --git a/be/src/util/timezone_utils.cpp b/be/src/util/timezone_utils.cpp
index c22c12b560..b746634e03 100644
--- a/be/src/util/timezone_utils.cpp
+++ b/be/src/util/timezone_utils.cpp
@@ -17,7 +17,9 @@
//
#include "util/timezone_utils.h"
+
#include <cctz/time_zone.h>
+
#include <boost/algorithm/string.hpp>
#include <filesystem>
#include <string>
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index 89dd4fbef4..c82944b44d 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -360,7 +360,9 @@ public:
LOG(FATAL) << get_name() << "update_hashes_with_value xxhash not
supported";
};
- virtual void update_xxHash_with_value(size_t n, uint64_t& hash) const {
+ // use range for one hash value to avoid virtual function call in loop
+ virtual void update_xxHash_with_value(size_t start, size_t end, uint64_t&
hash,
+ const uint8_t* __restrict null_data)
const {
LOG(FATAL) << get_name() << " update_hash_with_value xxhash not
supported";
}
@@ -372,7 +374,9 @@ public:
LOG(FATAL) << get_name() << "update_crcs_with_value not supported";
};
- virtual void update_crc_with_value(size_t n, uint64_t& hash) const {
+ // use range for one hash value to avoid virtual function call in loop
+ virtual void update_crc_with_value(size_t start, size_t end, uint64_t&
hash,
+ const uint8_t* __restrict null_data)
const {
LOG(FATAL) << get_name() << " update_crc_with_value not supported";
}
diff --git a/be/src/vec/columns/column_array.cpp
b/be/src/vec/columns/column_array.cpp
index 999991fde5..6a2f40792b 100644
--- a/be/src/vec/columns/column_array.cpp
+++ b/be/src/vec/columns/column_array.cpp
@@ -229,25 +229,64 @@ void
ColumnArray::update_hashes_with_value(std::vector<SipHash>& hashes,
}
// for every array row calculate xxHash
-void ColumnArray::update_xxHash_with_value(size_t n, uint64_t& hash) const {
- size_t elem_size = size_at(n);
- size_t offset = offset_at(n);
- hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const
char*>(&elem_size), sizeof(elem_size),
- hash);
- for (auto i = 0; i < elem_size; ++i) {
- get_data().update_xxHash_with_value(offset + i, hash);
+void ColumnArray::update_xxHash_with_value(size_t start, size_t end, uint64_t&
hash,
+ const uint8_t* __restrict
null_data) const {
+ auto& offsets_column = get_offsets();
+ if (null_data) {
+ for (size_t i = start; i < end; ++i) {
+ if (null_data[i] == 0) {
+ size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+ if (elem_size == 0) {
+ hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const
char*>(&elem_size),
+ sizeof(elem_size), hash);
+ } else {
+ get_data().update_crc_with_value(offsets_column[i - 1],
offsets_column[i], hash,
+ nullptr);
+ }
+ }
+ }
+ } else {
+ for (size_t i = start; i < end; ++i) {
+ size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+ if (elem_size == 0) {
+ hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const
char*>(&elem_size),
+ sizeof(elem_size), hash);
+ } else {
+ get_data().update_crc_with_value(offsets_column[i - 1],
offsets_column[i], hash,
+ nullptr);
+ }
+ }
}
}
// for every array row calculate crcHash
-void ColumnArray::update_crc_with_value(size_t n, uint64_t& crc) const {
- size_t elem_size = size_at(n);
- size_t offset = offset_at(n);
-
- crc = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&elem_size),
sizeof(elem_size),
- crc);
- for (auto i = 0; i < elem_size; ++i) {
- get_data().update_crc_with_value(offset + i, crc);
+void ColumnArray::update_crc_with_value(size_t start, size_t end, uint64_t&
hash,
+ const uint8_t* __restrict null_data)
const {
+ auto& offsets_column = get_offsets();
+ if (null_data) {
+ for (size_t i = start; i < end; ++i) {
+ if (null_data[i] == 0) {
+ size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+ if (elem_size == 0) {
+ hash = HashUtil::zlib_crc_hash(reinterpret_cast<const
char*>(&elem_size),
+ sizeof(elem_size), hash);
+ } else {
+ get_data().update_crc_with_value(offsets_column[i - 1],
offsets_column[i], hash,
+ nullptr);
+ }
+ }
+ }
+ } else {
+ for (size_t i = start; i < end; ++i) {
+ size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+ if (elem_size == 0) {
+ hash = HashUtil::zlib_crc_hash(reinterpret_cast<const
char*>(&elem_size),
+ sizeof(elem_size), hash);
+ } else {
+ get_data().update_crc_with_value(offsets_column[i - 1],
offsets_column[i], hash,
+ nullptr);
+ }
+ }
}
}
@@ -257,12 +296,12 @@ void ColumnArray::update_hashes_with_value(uint64_t*
__restrict hashes,
if (null_data) {
for (size_t i = 0; i < s; ++i) {
if (null_data[i] == 0) {
- update_xxHash_with_value(i, hashes[i]);
+ update_xxHash_with_value(i, i + 1, hashes[i], nullptr);
}
}
} else {
for (size_t i = 0; i < s; ++i) {
- update_xxHash_with_value(i, hashes[i]);
+ update_xxHash_with_value(i, i + 1, hashes[i], nullptr);
}
}
}
diff --git a/be/src/vec/columns/column_array.h
b/be/src/vec/columns/column_array.h
index 7239de3347..2725814fad 100644
--- a/be/src/vec/columns/column_array.h
+++ b/be/src/vec/columns/column_array.h
@@ -97,8 +97,10 @@ public:
void insert_data(const char* pos, size_t length) override;
StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*&
begin) const override;
const char* deserialize_and_insert_from_arena(const char* pos) override;
- void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
- void update_crc_with_value(size_t n, uint64_t& crc) const override;
+ void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+ const uint8_t* __restrict null_data) const
override;
+ void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+ const uint8_t* __restrict null_data) const
override;
void update_hashes_with_value(std::vector<SipHash>& hashes,
const uint8_t* __restrict null_data) const
override;
diff --git a/be/src/vec/columns/column_const.h
b/be/src/vec/columns/column_const.h
index 13640a0a9a..05ffe9ac6f 100644
--- a/be/src/vec/columns/column_const.h
+++ b/be/src/vec/columns/column_const.h
@@ -135,7 +135,8 @@ public:
void update_hashes_with_value(uint64_t* __restrict hashes,
const uint8_t* __restrict null_data) const
override;
- void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
+ void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+ const uint8_t* __restrict null_data) const
override {
auto real_data = data->get_data_at(0);
if (real_data.data == nullptr) {
hash = HashUtil::xxHash64NullWithSeed(hash);
@@ -144,8 +145,9 @@ public:
}
}
- void update_crc_with_value(size_t n, uint64_t& crc) const override {
- get_data_column_ptr()->update_crc_with_value(n, crc);
+ void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+ const uint8_t* __restrict null_data) const
override {
+ get_data_column_ptr()->update_crc_with_value(start, end, hash,
nullptr);
}
ColumnPtr filter(const Filter& filt, ssize_t result_size_hint) const
override;
diff --git a/be/src/vec/columns/column_decimal.cpp
b/be/src/vec/columns/column_decimal.cpp
index 78e7d055cb..4faa3fa490 100644
--- a/be/src/vec/columns/column_decimal.cpp
+++ b/be/src/vec/columns/column_decimal.cpp
@@ -128,16 +128,27 @@ void
ColumnDecimal<T>::update_hashes_with_value(std::vector<SipHash>& hashes,
}
template <typename T>
-void ColumnDecimal<T>::update_crc_with_value(size_t n, uint64_t& crc) const {
- if constexpr (!IsDecimalV2<T>) {
- crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc);
+void ColumnDecimal<T>::update_crc_with_value(size_t start, size_t end,
uint64_t& hash,
+ const uint8_t* __restrict
null_data) const {
+ if (null_data == nullptr) {
+ for (size_t i = start; i < end; i++) {
+ if constexpr (!IsDecimalV2<T>) {
+ hash = HashUtil::zlib_crc_hash(&data[i], sizeof(T), hash);
+ } else {
+ decimalv2_do_crc(i, hash);
+ }
+ }
} else {
- const DecimalV2Value& dec_val = (const DecimalV2Value&)data[n];
- int64_t int_val = dec_val.int_value();
- int32_t frac_val = dec_val.frac_value();
- crc = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), crc);
- crc = HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), crc);
- };
+ for (size_t i = start; i < end; i++) {
+ if (null_data[i] == 0) {
+ if constexpr (!IsDecimalV2<T>) {
+ hash = HashUtil::zlib_crc_hash(&data[i], sizeof(T), hash);
+ } else {
+ decimalv2_do_crc(i, hash);
+ }
+ }
+ }
+ }
}
template <typename T>
@@ -150,19 +161,32 @@ void
ColumnDecimal<T>::update_crcs_with_value(std::vector<uint64_t>& hashes, Pri
} else {
if (null_data == nullptr) {
for (size_t i = 0; i < s; i++) {
- update_crc_with_value(i, hashes[i]);
+ decimalv2_do_crc(i, hashes[i]);
}
} else {
for (size_t i = 0; i < s; i++) {
- if (null_data[i] == 0) update_crc_with_value(i, hashes[i]);
+ if (null_data[i] == 0) decimalv2_do_crc(i, hashes[i]);
}
}
}
}
template <typename T>
-void ColumnDecimal<T>::update_xxHash_with_value(size_t n, uint64_t& hash)
const {
- hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[n]),
sizeof(T), hash);
+void ColumnDecimal<T>::update_xxHash_with_value(size_t start, size_t end,
uint64_t& hash,
+ const uint8_t* __restrict
null_data) const {
+ if (null_data) {
+ for (size_t i = start; i < end; i++) {
+ if (null_data[i] == 0) {
+ hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const
char*>(&data[i]),
+ sizeof(T), hash);
+ }
+ }
+ } else {
+ for (size_t i = start; i < end; i++) {
+ hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const
char*>(&data[i]), sizeof(T),
+ hash);
+ }
+ }
}
template <typename T>
diff --git a/be/src/vec/columns/column_decimal.h
b/be/src/vec/columns/column_decimal.h
index 8968ff6017..ac3bd7456f 100644
--- a/be/src/vec/columns/column_decimal.h
+++ b/be/src/vec/columns/column_decimal.h
@@ -159,8 +159,10 @@ public:
const uint8_t* __restrict null_data) const
override;
void update_crcs_with_value(std::vector<uint64_t>& hashes, PrimitiveType
type,
const uint8_t* __restrict null_data) const
override;
- void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
- void update_crc_with_value(size_t n, uint64_t& crc) const override;
+ void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+ const uint8_t* __restrict null_data) const
override;
+ void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+ const uint8_t* __restrict null_data) const
override;
int compare_at(size_t n, size_t m, const IColumn& rhs_, int
nan_direction_hint) const override;
void get_permutation(bool reverse, size_t limit, int nan_direction_hint,
@@ -265,6 +267,14 @@ protected:
std::partial_sort(res.begin(), sort_end, res.end(),
[this](size_t a, size_t b) { return data[a] <
data[b]; });
}
+
+ void ALWAYS_INLINE decimalv2_do_crc(size_t i, uint64_t& hash) const {
+ const DecimalV2Value& dec_val = (const DecimalV2Value&)data[i];
+ int64_t int_val = dec_val.int_value();
+ int32_t frac_val = dec_val.frac_value();
+ hash = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), hash);
+ hash = HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), hash);
+ };
};
template <typename>
diff --git a/be/src/vec/columns/column_nullable.cpp
b/be/src/vec/columns/column_nullable.cpp
index 0e90251343..2b8632f8ec 100644
--- a/be/src/vec/columns/column_nullable.cpp
+++ b/be/src/vec/columns/column_nullable.cpp
@@ -114,21 +114,35 @@ void ColumnNullable::update_hashes_with_value(uint64_t*
__restrict hashes,
}
}
-void ColumnNullable::update_xxHash_with_value(size_t n, uint64_t& hash) const {
- auto* __restrict real_null_data = assert_cast<const
ColumnUInt8&>(*null_map).get_data().data();
- if (real_null_data[n] != 0) {
- hash = HashUtil::xxHash64NullWithSeed(hash);
+void ColumnNullable::update_xxHash_with_value(size_t start, size_t end,
uint64_t& hash,
+ const uint8_t* __restrict
null_data) const {
+ if (!has_null()) {
+ nested_column->update_xxHash_with_value(start, end, hash, nullptr);
} else {
- nested_column->update_xxHash_with_value(n, hash);
+ auto* __restrict real_null_data =
+ assert_cast<const ColumnUInt8&>(*null_map).get_data().data();
+ for (int i = start; i < end; ++i) {
+ if (real_null_data[i] != 0) {
+ hash = HashUtil::xxHash64NullWithSeed(hash);
+ }
+ }
+ nested_column->update_xxHash_with_value(start, end, hash,
real_null_data);
}
}
-void ColumnNullable::update_crc_with_value(size_t n, uint64_t& crc) const {
- auto* __restrict real_null_data = assert_cast<const
ColumnUInt8&>(*null_map).get_data().data();
- if (real_null_data[n] != 0) {
- crc = HashUtil::zlib_crc_hash_null(crc);
+void ColumnNullable::update_crc_with_value(size_t start, size_t end, uint64_t&
hash,
+ const uint8_t* __restrict
null_data) const {
+ if (!has_null()) {
+ nested_column->update_crc_with_value(start, end, hash, nullptr);
} else {
- nested_column->update_xxHash_with_value(n, crc);
+ auto* __restrict real_null_data =
+ assert_cast<const ColumnUInt8&>(*null_map).get_data().data();
+ for (int i = start; i < end; ++i) {
+ if (real_null_data[i] != 0) {
+ hash = HashUtil::zlib_crc_hash_null(hash);
+ }
+ }
+ nested_column->update_crc_with_value(start, end, hash, real_null_data);
}
}
diff --git a/be/src/vec/columns/column_nullable.h
b/be/src/vec/columns/column_nullable.h
index 9f29965a32..1285f0c5e8 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -186,8 +186,10 @@ public:
const uint8_t* __restrict null_data) const
override;
void update_hashes_with_value(uint64_t* __restrict hashes,
const uint8_t* __restrict null_data) const
override;
- void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
- void update_crc_with_value(size_t n, uint64_t& crc) const override;
+ void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+ const uint8_t* __restrict null_data) const
override;
+ void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+ const uint8_t* __restrict null_data) const
override;
void get_extremes(Field& min, Field& max) const override;
MutableColumns scatter(ColumnIndex num_columns, const Selector& selector)
const override {
diff --git a/be/src/vec/columns/column_string.h
b/be/src/vec/columns/column_string.h
index b4caa7b985..10b3c1a36a 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -342,16 +342,42 @@ public:
SIP_HASHES_FUNCTION_COLUMN_IMPL();
}
- void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
- size_t string_size = size_at(n);
- size_t offset = offset_at(n);
- hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const
char*>(&chars[offset]),
- string_size, hash);
+ void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+ const uint8_t* __restrict null_data) const
override {
+ if (null_data) {
+ for (size_t i = start; i < end; ++i) {
+ if (null_data[i] == 0) {
+ size_t string_size = size_at(i);
+ size_t offset = offset_at(i);
+ hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const
char*>(&chars[offset]),
+ string_size, hash);
+ }
+ }
+ } else {
+ for (size_t i = start; i < end; ++i) {
+ size_t string_size = size_at(i);
+ size_t offset = offset_at(i);
+ hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const
char*>(&chars[offset]),
+ string_size, hash);
+ }
+ }
}
- void update_crc_with_value(size_t n, uint64_t& crc) const override {
- auto data_ref = get_data_at(n);
- crc = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, crc);
+ void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+ const uint8_t* __restrict null_data) const
override {
+ if (null_data) {
+ for (size_t i = start; i < end; ++i) {
+ if (null_data[i] == 0) {
+ auto data_ref = get_data_at(i);
+ hash = HashUtil::zlib_crc_hash(data_ref.data,
data_ref.size, hash);
+ }
+ }
+ } else {
+ for (size_t i = start; i < end; ++i) {
+ auto data_ref = get_data_at(i);
+ hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size,
hash);
+ }
+ }
}
void update_crcs_with_value(std::vector<uint64_t>& hashes, PrimitiveType
type,
diff --git a/be/src/vec/columns/column_vector.h
b/be/src/vec/columns/column_vector.h
index 1c18e810b0..899be2b2fc 100644
--- a/be/src/vec/columns/column_vector.h
+++ b/be/src/vec/columns/column_vector.h
@@ -243,21 +243,49 @@ public:
const uint8_t* null_map,
size_t max_row_byte_size) const override;
- void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
- hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const
char*>(&data[n]), sizeof(T), hash);
+ void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+ const uint8_t* __restrict null_data) const
override {
+ if (null_data) {
+ for (size_t i = start; i < end; i++) {
+ if (null_data[i] == 0) {
+ hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const
char*>(&data[i]),
+ sizeof(T), hash);
+ }
+ }
+ } else {
+ for (size_t i = start; i < end; i++) {
+ hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const
char*>(&data[i]),
+ sizeof(T), hash);
+ }
+ }
}
- void update_crc_with_value(size_t n, uint64_t& crc) const override {
+ void ALWAYS_INLINE update_crc_with_value_without_null(size_t idx,
uint64_t& hash) const {
if constexpr (!std::is_same_v<T, Int64>) {
- crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc);
+ hash = HashUtil::zlib_crc_hash(&data[idx], sizeof(T), hash);
} else {
if (this->is_date_type() || this->is_datetime_type()) {
char buf[64];
- const VecDateTimeValue& date_val = (const
VecDateTimeValue&)data[n];
+ const VecDateTimeValue& date_val = (const
VecDateTimeValue&)data[idx];
auto len = date_val.to_buffer(buf);
- crc = HashUtil::zlib_crc_hash(buf, len, crc);
+ hash = HashUtil::zlib_crc_hash(buf, len, hash);
} else {
- crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc);
+ hash = HashUtil::zlib_crc_hash(&data[idx], sizeof(T), hash);
+ }
+ }
+ }
+
+ void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+ const uint8_t* __restrict null_data) const
override {
+ if (null_data) {
+ for (size_t i = start; i < end; i++) {
+ if (null_data[i] == 0) {
+ update_crc_with_value_without_null(i, hash);
+ }
+ }
+ } else {
+ for (size_t i = start; i < end; i++) {
+ update_crc_with_value_without_null(i, hash);
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]