This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new bb2f70b855d [branch-4.0][fix](inverted index) fix StringRef to
std::string reinterpret_cast overflow (#61150)
bb2f70b855d is described below
commit bb2f70b855df3e892734a12de01e5a1f6941829f
Author: Jack <[email protected]>
AuthorDate: Tue Mar 10 11:55:03 2026 +0800
[branch-4.0][fix](inverted index) fix StringRef to std::string
reinterpret_cast overflow (#61150)
## Proposed changes
Cherry-pick of #61120 to branch-4.0.
On ARM64, std::string is 24 bytes but StringRef is only 16 bytes. Several
places pass a StringRef* through void* and then reinterpret_cast it to
std::string*, so reading it as a std::string accesses 8 bytes past the end
of the StringRef object.
1. **function_multi_match.cpp**: Convert the StringRef to a std::string before
passing it as query_value. Downstream, FullTextIndexReader::query()
reinterpret_casts query_value to std::string* (24 bytes on ARM64), but a
StringRef is only 16 bytes, which causes a stack-buffer-overflow.
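2. **in_list_predicate.h**: Fix 3 sites where the HybridSet iterator returns a
StringRef* via get_value() but the code treats it as a std::string*. Add an `if
constexpr (is_string_type(Type))` guard that reads the value as a StringRef:
two sites construct a std::string from the StringRef data/size before use, and
the third passes ref->data/ref->size to bf->test_bytes() directly.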
Cherry-pick applied cleanly with no conflicts.
Co-authored-by: Claude Opus 4.6 <[email protected]>
---
be/src/olap/in_list_predicate.h | 84 +++++++++++++++------------
be/src/vec/functions/function_multi_match.cpp | 5 +-
2 files changed, 52 insertions(+), 37 deletions(-)
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index b271fa85d1c..80b00b9af3b 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -115,8 +115,15 @@ public:
}
HybridSetBase::IteratorBase* iter = _values->begin();
while (iter->has_next()) {
- const T* value = (const T*)(iter->get_value());
- _update_min_max(*value);
+ if constexpr (is_string_type(Type)) {
+ // get_value() returns StringRef*, not std::string*
+ const auto* ref = (const StringRef*)(iter->get_value());
+ T str(ref->data, ref->size);
+ _update_min_max(str);
+ } else {
+ const T* value = (const T*)(iter->get_value());
+ _update_min_max(*value);
+ }
iter->next();
}
}
@@ -169,12 +176,18 @@ public:
roaring::Roaring indices;
HybridSetBase::IteratorBase* iter = _values->begin();
while (iter->has_next()) {
- const void* ptr = iter->get_value();
- // auto&& value =
PrimitiveTypeConvertor<Type>::to_storage_field_type(
- // *reinterpret_cast<const T*>(ptr));
std::unique_ptr<InvertedIndexQueryParamFactory> query_param =
nullptr;
-
RETURN_IF_ERROR(InvertedIndexQueryParamFactory::create_query_value<Type>((const
T*)ptr,
-
query_param));
+ if constexpr (is_string_type(Type)) {
+ // get_value() returns StringRef*, not std::string*
+ const auto* ref = (const StringRef*)(iter->get_value());
+ T str(ref->data, ref->size);
+
RETURN_IF_ERROR(InvertedIndexQueryParamFactory::create_query_value<Type>(
+ &str, query_param));
+ } else {
+ const T* value = (const T*)(iter->get_value());
+
RETURN_IF_ERROR(InvertedIndexQueryParamFactory::create_query_value<Type>(
+ value, query_param));
+ }
InvertedIndexQueryType query_type =
InvertedIndexQueryType::EQUAL_QUERY;
InvertedIndexParam param;
param.column_name = name_with_type.first;
@@ -417,44 +430,43 @@ public:
if constexpr (PT == PredicateType::IN_LIST) {
HybridSetBase::IteratorBase* iter = _values->begin();
while (iter->has_next()) {
- const T* value = (const T*)(iter->get_value());
-
auto test_bytes = [&]<typename V>(const V& val) {
return
bf->test_bytes(const_cast<char*>(reinterpret_cast<const char*>(&val)),
sizeof(V));
};
- // Small integers (TINYINT, SMALLINT, INTEGER) -> hash as int32
- if constexpr (Type == PrimitiveType::TYPE_TINYINT ||
- Type == PrimitiveType::TYPE_SMALLINT ||
- Type == PrimitiveType::TYPE_INT) {
- int32_t int32_value = static_cast<int32_t>(*value);
- if (test_bytes(int32_value)) {
- return true;
- }
- } else if constexpr (Type == PrimitiveType::TYPE_BIGINT) {
- // BIGINT -> hash as int64
- if (test_bytes(*value)) {
- return true;
- }
- } else if constexpr (Type == PrimitiveType::TYPE_DOUBLE) {
- // DOUBLE -> hash as double
- if (test_bytes(*value)) {
- return true;
- }
- } else if constexpr (Type == PrimitiveType::TYPE_FLOAT) {
- // FLOAT -> hash as float
- if (test_bytes(*value)) {
+ if constexpr (is_string_type(Type)) {
+ // get_value() returns StringRef*, not std::string*
+ const auto* ref = (const StringRef*)(iter->get_value());
+ if (bf->test_bytes(ref->data, ref->size)) {
return true;
}
- } else if constexpr (is_string_type(Type)) {
- // VARCHAR/STRING -> hash bytes
- if (bf->test_bytes(value->data(), value->size())) {
+ } else {
+ const T* value = (const T*)(iter->get_value());
+ // Small integers (TINYINT, SMALLINT, INTEGER) -> hash as
int32
+ if constexpr (Type == PrimitiveType::TYPE_TINYINT ||
+ Type == PrimitiveType::TYPE_SMALLINT ||
+ Type == PrimitiveType::TYPE_INT) {
+ int32_t int32_value = static_cast<int32_t>(*value);
+ if (test_bytes(int32_value)) {
+ return true;
+ }
+ } else if constexpr (Type == PrimitiveType::TYPE_BIGINT) {
+ if (test_bytes(*value)) {
+ return true;
+ }
+ } else if constexpr (Type == PrimitiveType::TYPE_DOUBLE) {
+ if (test_bytes(*value)) {
+ return true;
+ }
+ } else if constexpr (Type == PrimitiveType::TYPE_FLOAT) {
+ if (test_bytes(*value)) {
+ return true;
+ }
+ } else {
+ // Unsupported types: return true (accept)
return true;
}
- } else {
- // Unsupported types: return true (accept)
- return true;
}
iter->next();
}
diff --git a/be/src/vec/functions/function_multi_match.cpp
b/be/src/vec/functions/function_multi_match.cpp
index 676ab446635..e9d66d74abf 100644
--- a/be/src/vec/functions/function_multi_match.cpp
+++ b/be/src/vec/functions/function_multi_match.cpp
@@ -73,12 +73,15 @@ Status FunctionMultiMatch::evaluate_inverted_index(
}
// query
- auto query_str = arguments[1].column->get_data_at(0);
+ auto query_str_ref = arguments[1].column->get_data_at(0);
auto param_type = arguments[1].type->get_primitive_type();
if (!is_string_type(param_type)) {
return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
"arguments for multi_match must be string");
}
+ // Must convert StringRef to std::string because downstream readers
+ // (e.g. FullTextIndexReader::query) reinterpret_cast query_value as
std::string*.
+ std::string query_str(query_str_ref.data, query_str_ref.size);
// search
InvertedIndexParam param;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]