This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 919c2ef7d45 branch-4.0: [feature](search) add phrase/wildcard/regex
support for search and refact some code #57372 (#57469)
919c2ef7d45 is described below
commit 919c2ef7d45ac79659ccc411567084f6169130c6
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri Oct 31 09:41:58 2025 +0800
branch-4.0: [feature](search) add phrase/wildcard/regex support for search
and refact some code #57372 (#57469)
Cherry-picked from #57372
Co-authored-by: Jack <[email protected]>
---
.../const_score_query/const_score_scorer.h | 10 +-
.../inverted_index/query_v2/null_bitmap_fetcher.h | 82 ++++++++
.../query_v2/phrase_query/phrase_query.h | 2 +-
.../query_v2/phrase_query/phrase_weight.h | 32 ++-
.../query_v2/regexp_query/regexp_query.h | 2 +-
.../query_v2/regexp_query/regexp_weight.cpp | 15 +-
.../query_v2/term_query/term_query.h | 11 +-
.../query_v2/term_query/term_scorer.h | 25 +--
.../query_v2/term_query/term_weight.h | 11 +-
.../segment_v2/inverted_index/query_v2/weight.h | 31 +++
.../query_v2/wildcard_query/wildcard_query.h | 2 +-
.../query_v2/wildcard_query/wildcard_weight.h | 5 +-
be/src/vec/functions/function_search.cpp | 90 +++++++-
be/src/vec/functions/function_search.h | 4 +
.../search/test_search_default_field_operator.out | 79 +++++++
.../data/search/test_search_dsl_syntax.out | 6 +-
.../data/search/test_search_function.out | 3 +
.../data/search/test_search_null_semantics.out | 10 +-
.../test_search_default_field_operator.groovy | 230 +++++++++++++++++++++
.../search/test_search_null_semantics.groovy | 7 +
20 files changed, 597 insertions(+), 60 deletions(-)
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h
index 1ebf47f1d5d..6f313068d4a 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h
@@ -34,10 +34,18 @@ public:
float score() override { return _score; }
+ bool has_null_bitmap(const NullBitmapResolver* resolver = nullptr)
override {
+ return _scorer && _scorer->has_null_bitmap(resolver);
+ }
+
+ const roaring::Roaring* get_null_bitmap(const NullBitmapResolver* resolver
= nullptr) override {
+ return _scorer ? _scorer->get_null_bitmap(resolver) : nullptr;
+ }
+
private:
ScorerPtrT _scorer;
float _score = 1.0F;
};
-} // namespace doris::segment_v2::inverted_index::query_v2
\ No newline at end of file
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h
new file mode 100644
index 00000000000..204533d2ec5
--- /dev/null
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <glog/logging.h>
+
+#include <memory>
+#include <roaring/roaring.hh>
+#include <string>
+
+#include "olap/rowset/segment_v2/index_iterator.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/weight.h"
+#include "olap/rowset/segment_v2/inverted_index_cache.h"
+
+namespace doris::segment_v2::inverted_index::query_v2 {
+
+// Small helper that centralizes "field NULL bitmap" lookups so weights/scorers
+// don't have to duplicate resolver plumbing.
+class FieldNullBitmapFetcher {
+public:
+ FieldNullBitmapFetcher() = delete;
+
+ static std::shared_ptr<roaring::Roaring> fetch(const
QueryExecutionContext& context,
+ const std::string&
logical_field,
+ const Scorer* scorer =
nullptr) {
+ return fetch(context.null_resolver, logical_field, scorer);
+ }
+
+ static std::shared_ptr<roaring::Roaring> fetch(const NullBitmapResolver*
resolver,
+ const std::string&
logical_field,
+ const Scorer* scorer =
nullptr) {
+ if (resolver == nullptr || logical_field.empty()) {
+ return nullptr;
+ }
+
+ EmptyScorer fallback_scorer;
+ const Scorer* resolver_scorer = scorer != nullptr ? scorer :
&fallback_scorer;
+
+ auto iterator = resolver->iterator_for(*resolver_scorer,
logical_field);
+ if (iterator == nullptr) {
+ return nullptr;
+ }
+
+ auto has_null = iterator->has_null();
+ if (!has_null.has_value() || !has_null.value()) {
+ return nullptr;
+ }
+
+ segment_v2::InvertedIndexQueryCacheHandle cache_handle;
+ auto status = iterator->read_null_bitmap(&cache_handle);
+ if (!status.ok()) {
+ LOG(WARNING) << "Failed to read null bitmap for field '" <<
logical_field
+ << "': " << status.to_string();
+ return nullptr;
+ }
+
+ auto bitmap_ptr = cache_handle.get_bitmap();
+ if (bitmap_ptr == nullptr) {
+ return nullptr;
+ }
+
+ return std::make_shared<roaring::Roaring>(*bitmap_ptr);
+ }
+};
+
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h
index 133cf71afe4..f46ac1793b7 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h
@@ -51,4 +51,4 @@ private:
std::vector<std::wstring> _terms;
};
-} // namespace doris::segment_v2::inverted_index::query_v2
\ No newline at end of file
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h
index 7fe68e33995..339b4fdc99f 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_weight.h
@@ -18,6 +18,9 @@
#pragma once
#include "olap/rowset/segment_v2/index_query_context.h"
+#include
"olap/rowset/segment_v2/inverted_index/query_v2/bit_set_query/bit_set_scorer.h"
+#include
"olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h"
#include
"olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_scorer.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/weight.h"
@@ -37,12 +40,29 @@ public:
~PhraseWeight() override = default;
ScorerPtr scorer(const QueryExecutionContext& ctx, const std::string&
binding_key) override {
- auto scorer = phrase_scorer(ctx, binding_key);
- if (scorer) {
- return scorer;
- } else {
- return std::make_shared<EmptyScorer>();
+ auto phrase = phrase_scorer(ctx, binding_key);
+ auto logical_field = logical_field_or_fallback(ctx, binding_key,
_field);
+ auto null_bitmap = FieldNullBitmapFetcher::fetch(ctx, logical_field);
+
+ auto doc_bitset = std::make_shared<roaring::Roaring>();
+ if (phrase) {
+ uint32_t doc = phrase->doc();
+ if (doc == TERMINATED) {
+ doc = phrase->advance();
+ }
+ while (doc != TERMINATED) {
+ doc_bitset->add(doc);
+ doc = phrase->advance();
+ }
+ }
+
+ auto bit_set =
+ std::make_shared<BitSetScorer>(std::move(doc_bitset),
std::move(null_bitmap));
+ if (!phrase) {
+ return bit_set;
}
+ // Wrap with const score for consistency with other non-scoring paths
+ return
std::make_shared<ConstScoreScorer<BitSetScorerPtr>>(std::move(bit_set));
}
private:
@@ -78,4 +98,4 @@ private:
bool _enable_scoring = false;
};
-} // namespace doris::segment_v2::inverted_index::query_v2
\ No newline at end of file
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h
index 9f1e7491b50..cce83b6e1e7 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h
@@ -43,4 +43,4 @@ private:
std::string _pattern;
};
-} // namespace doris::segment_v2::inverted_index::query_v2
\ No newline at end of file
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.cpp
index 5404abaddb0..f70b5be77c4 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.cpp
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.cpp
@@ -26,8 +26,10 @@
#include
"olap/rowset/segment_v2/inverted_index/query_v2/bit_set_query/bit_set_scorer.h"
#include
"olap/rowset/segment_v2/inverted_index/query_v2/const_score_query/const_score_scorer.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h"
#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h"
+#include "olap/rowset/segment_v2/inverted_index_iterator.h"
CL_NS_USE(index)
@@ -44,6 +46,9 @@ RegexpWeight::RegexpWeight(IndexQueryContextPtr context,
std::wstring field, std
ScorerPtr RegexpWeight::scorer(const QueryExecutionContext& context,
const std::string& binding_key) {
+ auto logical_field = logical_field_or_fallback(context, binding_key,
_field);
+ VLOG_DEBUG << "RegexpWeight::scorer() called - pattern=" << _pattern << ",
logical_field='"
+ << logical_field << "'";
auto prefix = get_regex_prefix(_pattern);
hs_database_t* database = nullptr;
@@ -76,7 +81,10 @@ ScorerPtr RegexpWeight::scorer(const QueryExecutionContext&
context,
hs_free_database(database);
if (matching_terms.empty()) {
- return std::make_shared<EmptyScorer>();
+ // Even when there are no matching terms, we must honor NULL semantics
for the field.
+ auto empty_true = std::make_shared<roaring::Roaring>();
+ auto null_bitmap = FieldNullBitmapFetcher::fetch(context,
logical_field);
+ return std::make_shared<BitSetScorer>(std::move(empty_true),
std::move(null_bitmap));
}
auto doc_bitset = std::make_shared<roaring::Roaring>();
@@ -93,7 +101,8 @@ ScorerPtr RegexpWeight::scorer(const QueryExecutionContext&
context,
}
}
- auto bit_set = std::make_shared<BitSetScorer>(doc_bitset);
+ auto null_bitmap = FieldNullBitmapFetcher::fetch(context, logical_field);
+ auto bit_set = std::make_shared<BitSetScorer>(doc_bitset, null_bitmap);
auto const_score =
std::make_shared<ConstScoreScorer<BitSetScorerPtr>>(std::move(bit_set));
return const_score;
}
@@ -220,4 +229,4 @@ void RegexpWeight::collect_matching_terms(const
QueryExecutionContext& context,
}
}
-} // namespace doris::segment_v2::inverted_index::query_v2
\ No newline at end of file
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_query.h
index e4370a6a14d..7b8fa2eeace 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_query.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_query.h
@@ -25,12 +25,8 @@ namespace doris::segment_v2::inverted_index::query_v2 {
class TermQuery : public Query {
public:
- TermQuery(IndexQueryContextPtr context, std::wstring field, std::wstring
term,
- std::string logical_field = {})
- : _context(std::move(context)),
- _field(std::move(field)),
- _term(std::move(term)),
- _logical_field(std::move(logical_field)) {}
+ TermQuery(IndexQueryContextPtr context, std::wstring field, std::wstring
term)
+ : _context(std::move(context)), _field(std::move(field)),
_term(std::move(term)) {}
~TermQuery() override = default;
WeightPtr weight(bool enable_scoring) override {
@@ -43,7 +39,7 @@ public:
}
return std::make_shared<TermWeight>(std::move(_context),
std::move(_field),
std::move(_term),
std::move(bm25_similarity),
- enable_scoring, _logical_field);
+ enable_scoring);
}
private:
@@ -51,7 +47,6 @@ private:
std::wstring _field;
std::wstring _term;
- std::string _logical_field;
};
} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_scorer.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_scorer.h
index 77bbc922b1b..9099a71877d 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_scorer.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_scorer.h
@@ -22,6 +22,7 @@
#include <optional>
#include <roaring/roaring.hh>
+#include "olap/rowset/segment_v2/inverted_index/query_v2/null_bitmap_fetcher.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/segment_postings.h"
#include "olap/rowset/segment_v2/inverted_index/similarity/similarity.h"
@@ -70,27 +71,9 @@ private:
_null_bitmap_checked = true;
- auto iterator = resolver->iterator_for(*this, _logical_field);
- if (iterator == nullptr) {
- return;
- }
-
- auto has_null_result = iterator->has_null();
- if (!has_null_result.has_value() || !has_null_result.value()) {
- return;
- }
-
- segment_v2::InvertedIndexQueryCacheHandle cache_handle;
- auto status = iterator->read_null_bitmap(&cache_handle);
- if (!status.ok()) {
- LOG(WARNING) << "TermScorer failed to read null bitmap for field
'" << _logical_field
- << "': " << status.to_string();
- return;
- }
-
- auto bitmap_ptr = cache_handle.get_bitmap();
- if (bitmap_ptr != nullptr) {
- _null_bitmap = *bitmap_ptr;
+ auto bitmap = FieldNullBitmapFetcher::fetch(resolver, _logical_field,
this);
+ if (bitmap != nullptr) {
+ _null_bitmap = *bitmap;
}
}
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_weight.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_weight.h
index 7f0e329d88e..d532e9664cb 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_weight.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_weight.h
@@ -27,23 +27,21 @@ namespace doris::segment_v2::inverted_index::query_v2 {
class TermWeight : public Weight {
public:
TermWeight(IndexQueryContextPtr context, std::wstring field, std::wstring
term,
- SimilarityPtr similarity, bool enable_scoring, std::string
logical_field = {})
+ SimilarityPtr similarity, bool enable_scoring)
: _context(std::move(context)),
_field(std::move(field)),
_term(std::move(term)),
_similarity(std::move(similarity)),
- _enable_scoring(enable_scoring),
- _logical_field(std::move(logical_field)) {}
+ _enable_scoring(enable_scoring) {}
~TermWeight() override = default;
ScorerPtr scorer(const QueryExecutionContext& ctx, const std::string&
binding_key) override {
auto reader = lookup_reader(_field, ctx, binding_key);
- auto field_name =
- _logical_field.empty() ? std::string(_field.begin(),
_field.end()) : _logical_field;
+ auto logical_field = logical_field_or_fallback(ctx, binding_key,
_field);
auto make_scorer = [&](auto segment_postings) -> ScorerPtr {
using PostingsT = decltype(segment_postings);
return
std::make_shared<TermScorer<PostingsT>>(std::move(segment_postings),
_similarity,
- field_name);
+ logical_field);
};
if (!reader) {
@@ -76,7 +74,6 @@ private:
std::wstring _term;
SimilarityPtr _similarity;
bool _enable_scoring = false;
- std::string _logical_field;
};
} // namespace doris::segment_v2::inverted_index::query_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h
index c3483128912..17d8d11cbc7 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h
@@ -30,12 +30,19 @@ class IndexReader;
namespace doris::segment_v2::inverted_index::query_v2 {
+struct FieldBindingContext {
+ std::string logical_field_name;
+ std::string stored_field_name;
+ std::wstring stored_field_wstr;
+};
+
struct QueryExecutionContext {
uint32_t segment_num_rows = 0;
std::vector<std::shared_ptr<lucene::index::IndexReader>> readers;
std::unordered_map<std::string,
std::shared_ptr<lucene::index::IndexReader>> reader_bindings;
std::unordered_map<std::wstring,
std::shared_ptr<lucene::index::IndexReader>>
field_reader_bindings;
+ std::unordered_map<std::string, FieldBindingContext> binding_fields;
const NullBitmapResolver* null_resolver = nullptr;
};
@@ -52,6 +59,30 @@ public:
}
protected:
+ const FieldBindingContext* get_field_binding(const QueryExecutionContext&
ctx,
+ const std::string&
binding_key) const {
+ auto it = ctx.binding_fields.find(binding_key);
+ if (it != ctx.binding_fields.end()) {
+ return &it->second;
+ }
+ return nullptr;
+ }
+
+ std::string logical_field_or_fallback(const QueryExecutionContext& ctx,
+ const std::string& binding_key,
+ const std::wstring& fallback) const {
+ const auto* binding = get_field_binding(ctx, binding_key);
+ if (binding != nullptr) {
+ if (!binding->logical_field_name.empty()) {
+ return binding->logical_field_name;
+ }
+ if (!binding->stored_field_name.empty()) {
+ return binding->stored_field_name;
+ }
+ }
+ return std::string(fallback.begin(), fallback.end());
+ }
+
std::shared_ptr<lucene::index::IndexReader> lookup_reader(
const std::wstring& field, const QueryExecutionContext& ctx,
const std::string& binding_key) const {
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h
index 8cd92418a00..8b71ab9c0d4 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h
@@ -45,4 +45,4 @@ private:
std::string _pattern;
};
-} // namespace doris::segment_v2::inverted_index::query_v2
\ No newline at end of file
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_weight.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_weight.h
index da2de84eae3..b906605db29 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_weight.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_weight.h
@@ -47,7 +47,10 @@ public:
private:
std::string wildcard_to_regex(const std::string& pattern) {
std::string escaped = RE2::QuoteMeta(pattern);
+ // Replace wildcard characters with regex equivalents
+ // * -> .* (zero or more of any character)
escaped = std::regex_replace(escaped, std::regex(R"(\\\*)"), ".*");
+ // ? -> . (exactly one of any character)
escaped = std::regex_replace(escaped, std::regex(R"(\\\?)"), ".");
return "^" + escaped + "$";
}
@@ -59,4 +62,4 @@ private:
bool _enable_scoring = false;
};
-} // namespace doris::segment_v2::inverted_index::query_v2
\ No newline at end of file
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git a/be/src/vec/functions/function_search.cpp
b/be/src/vec/functions/function_search.cpp
index 19ec3a33612..95e0f868a48 100644
--- a/be/src/vec/functions/function_search.cpp
+++ b/be/src/vec/functions/function_search.cpp
@@ -38,7 +38,10 @@
#include
"olap/rowset/segment_v2/inverted_index/query_v2/bit_set_query/bit_set_query.h"
#include
"olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_query.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/operator.h"
+#include
"olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h"
+#include
"olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h"
#include
"olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_query.h"
+#include
"olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h"
#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h"
#include "olap/rowset/segment_v2/inverted_index_iterator.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
@@ -225,6 +228,16 @@ Status
FunctionSearch::evaluate_inverted_index_with_search_param(
exec_ctx.readers = resolver.readers();
exec_ctx.reader_bindings = resolver.reader_bindings();
exec_ctx.field_reader_bindings = resolver.field_readers();
+ for (const auto& [binding_key, binding] : resolver.binding_cache()) {
+ if (binding_key.empty()) {
+ continue;
+ }
+ query_v2::FieldBindingContext binding_ctx;
+ binding_ctx.logical_field_name = binding.logical_field_name;
+ binding_ctx.stored_field_name = binding.stored_field_name;
+ binding_ctx.stored_field_wstr = binding.stored_field_wstr;
+ exec_ctx.binding_fields.emplace(binding_key, std::move(binding_ctx));
+ }
class ResolverAdapter final : public query_v2::NullBitmapResolver {
public:
@@ -253,7 +266,7 @@ Status
FunctionSearch::evaluate_inverted_index_with_search_param(
return Status::OK();
}
- auto scorer = weight->scorer(exec_ctx);
+ auto scorer = weight->scorer(exec_ctx, root_binding_key);
if (!scorer) {
LOG(WARNING) << "search: Failed to build scorer";
bitmap_result =
InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
@@ -478,7 +491,7 @@ Status FunctionSearch::build_leaf_query(const
TSearchClause& clause,
std::wstring value_wstr = StringHelper::to_wstring(value);
auto make_term_query = [&](const std::wstring& term) -> query_v2::QueryPtr
{
- return std::make_shared<query_v2::TermQuery>(context, field_wstr,
term, field_name);
+ return std::make_shared<query_v2::TermQuery>(context, field_wstr,
term);
};
if (clause_type == "TERM") {
@@ -497,7 +510,9 @@ Status FunctionSearch::build_leaf_query(const
TSearchClause& clause,
value, binding.index_properties);
if (term_infos.empty()) {
LOG(WARNING) << "search: No terms found after tokenization for
TERM query, field="
- << field_name << ", value='" << value << "'";
+ << field_name << ", value='" << value
+ << "', returning empty BitSetQuery";
+ *out =
std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
return Status::OK();
}
@@ -523,8 +538,47 @@ Status FunctionSearch::build_leaf_query(const
TSearchClause& clause,
if (category == FunctionSearch::ClauseTypeCategory::TOKENIZED) {
if (clause_type == "PHRASE") {
- VLOG_DEBUG << "search: PHRASE clause not implemented, fallback to
TERM";
- *out = make_term_query(value_wstr);
+ bool should_analyze =
inverted_index::InvertedIndexAnalyzer::should_analyzer(
+ binding.index_properties);
+ if (!should_analyze) {
+ VLOG_DEBUG << "search: PHRASE on non-tokenized field '" <<
field_name
+ << "', falling back to TERM";
+ *out = make_term_query(value_wstr);
+ return Status::OK();
+ }
+
+ if (binding.index_properties.empty()) {
+ LOG(WARNING) << "search: analyzer required but index
properties empty for PHRASE "
+ "query on field '"
+ << field_name << "'";
+ *out = make_term_query(value_wstr);
+ return Status::OK();
+ }
+
+ std::vector<TermInfo> term_infos =
+ inverted_index::InvertedIndexAnalyzer::get_analyse_result(
+ value, binding.index_properties);
+ if (term_infos.empty()) {
+ LOG(WARNING) << "search: No terms found after tokenization for
PHRASE query, field="
+ << field_name << ", value='" << value
+ << "', returning empty BitSetQuery";
+ *out =
std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
+ return Status::OK();
+ }
+
+ if (term_infos.size() == 1) {
+ std::wstring term_wstr =
StringHelper::to_wstring(term_infos[0].get_single_term());
+ *out = make_term_query(term_wstr);
+ return Status::OK();
+ }
+
+ std::vector<std::wstring> terms;
+ for (const auto& term_info : term_infos) {
+
terms.push_back(StringHelper::to_wstring(term_info.get_single_term()));
+ }
+ *out = std::make_shared<query_v2::PhraseQuery>(context,
field_wstr, terms);
+ VLOG_DEBUG << "search: Built PhraseQuery for field=" << field_name
<< " with "
+ << terms.size() << " terms";
return Status::OK();
}
if (clause_type == "MATCH") {
@@ -553,7 +607,8 @@ Status FunctionSearch::build_leaf_query(const
TSearchClause& clause,
value, binding.index_properties);
if (term_infos.empty()) {
LOG(WARNING) << "search: tokenization yielded no terms for
clause '" << clause_type
- << "', field=" << field_name;
+ << "', field=" << field_name << ", returning
empty BitSetQuery";
+ *out =
std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
return Status::OK();
}
@@ -593,9 +648,28 @@ Status FunctionSearch::build_leaf_query(const
TSearchClause& clause,
<< value << "'";
return Status::OK();
}
+ if (clause_type == "PREFIX") {
+ *out = std::make_shared<query_v2::WildcardQuery>(context,
field_wstr, value);
+ VLOG_DEBUG << "search: PREFIX clause processed, field=" <<
field_name << ", pattern='"
+ << value << "'";
+ return Status::OK();
+ }
+
+ if (clause_type == "WILDCARD") {
+ *out = std::make_shared<query_v2::WildcardQuery>(context,
field_wstr, value);
+ VLOG_DEBUG << "search: WILDCARD clause processed, field=" <<
field_name << ", pattern='"
+ << value << "'";
+ return Status::OK();
+ }
+
+ if (clause_type == "REGEXP") {
+ *out = std::make_shared<query_v2::RegexpQuery>(context,
field_wstr, value);
+ VLOG_DEBUG << "search: REGEXP clause processed, field=" <<
field_name << ", pattern='"
+ << value << "'";
+ return Status::OK();
+ }
- if (clause_type == "PREFIX" || clause_type == "WILDCARD" ||
clause_type == "REGEXP" ||
- clause_type == "RANGE" || clause_type == "LIST") {
+ if (clause_type == "RANGE" || clause_type == "LIST") {
VLOG_DEBUG << "search: clause type '" << clause_type
<< "' not implemented, fallback to TERM";
}
diff --git a/be/src/vec/functions/function_search.h
b/be/src/vec/functions/function_search.h
index 910a8e25936..96e93220f44 100644
--- a/be/src/vec/functions/function_search.h
+++ b/be/src/vec/functions/function_search.h
@@ -93,6 +93,10 @@ public:
return _field_readers;
}
+ const std::unordered_map<std::string, FieldReaderBinding>& binding_cache()
const {
+ return _cache;
+ }
+
IndexIterator* get_iterator(const std::string& field_name) const {
auto it = _iterators.find(field_name);
return (it != _iterators.end()) ? it->second : nullptr;
diff --git a/regression-test/data/search/test_search_default_field_operator.out
b/regression-test/data/search/test_search_default_field_operator.out
new file mode 100644
index 00000000000..c418209e4ea
--- /dev/null
+++ b/regression-test/data/search/test_search_default_field_operator.out
@@ -0,0 +1,79 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !wildcard_prefix --
+1 Chris
+2 Christopher
+
+-- !multi_term_and --
+1 foo bar
+3 bar foo
+
+-- !multi_term_or --
+1 foo bar
+3 bar foo
+4 foolish bark
+
+-- !wildcard_multi_and --
+1 foo bar
+3 bar foo
+4 foolish bark
+
+-- !explicit_or_override --
+1 foo bar
+3 bar foo
+4 foolish bark
+
+-- !exact_function --
+1 foo bar
+
+-- !traditional_syntax --
+1 Chris
+2 Christopher
+
+-- !single_term --
+1 foo bar
+3 bar foo
+
+-- !wildcard_middle --
+1 Chris
+2 Christopher
+
+-- !case_sensitive --
+
+-- !default_or --
+1 foo bar
+3 bar foo
+4 foolish bark
+
+-- !any_function --
+1 foo bar
+3 bar foo
+4 foolish bark
+
+-- !all_function --
+1 foo bar
+3 bar foo
+
+-- !complex_wildcard --
+3 Kevin
+4 kevin
+
+-- !explicit_and --
+1 foo bar
+3 bar foo
+
+-- !multiple_fields --
+1 Chris foo bar
+2 Christopher foobar
+4 kevin foolish bark
+
+-- !not_operator --
+1 foo bar
+3 bar foo
+4 foolish bark
+
+-- !param_count_mix --
+1
+2
+3
+4
+
diff --git a/regression-test/data/search/test_search_dsl_syntax.out
b/regression-test/data/search/test_search_dsl_syntax.out
index b1f1ba93d4e..e8f6c627121 100644
--- a/regression-test/data/search/test_search_dsl_syntax.out
+++ b/regression-test/data/search/test_search_dsl_syntax.out
@@ -4,10 +4,15 @@
-- !sql --
-- !sql --
+2 Advanced Deep Learning
-- !sql --
+4 Data Science with R
+6 Database Design Patterns
-- !sql --
+4 Data Science with R
+6 Database Design Patterns
-- !sql --
1 Machine Learning Introduction
@@ -175,7 +180,6 @@
14 Test with null tags
15 Test with null author
16 Test with null status
-17 Message about success
18 Error message details
19 Warning message content
20 Regular article without msg
diff --git a/regression-test/data/search/test_search_function.out
b/regression-test/data/search/test_search_function.out
index f418a8c71eb..b24b8676831 100644
--- a/regression-test/data/search/test_search_function.out
+++ b/regression-test/data/search/test_search_function.out
@@ -22,6 +22,8 @@
-- !sql --
-- !sql --
+4 Data Science Methods
+9 Database Systems
-- !sql --
2 Deep Learning Tutorial
@@ -33,6 +35,7 @@
-- !sql --
-- !sql --
+1 Machine Learning Basics
-- !sql --
0
diff --git a/regression-test/data/search/test_search_null_semantics.out
b/regression-test/data/search/test_search_null_semantics.out
index 27eddd437be..237bd14bc1b 100644
--- a/regression-test/data/search/test_search_null_semantics.out
+++ b/regression-test/data/search/test_search_null_semantics.out
@@ -11,6 +11,15 @@
-- !test_case_2_external_not --
4
+-- !test_case_2_phrase_not --
+1
+2
+3
+5
+7
+9
+10
+
-- !test_case_3_or_with_null --
1 Ronald Reagan President of the United States
3 \N Biography of Ronald McDonald
@@ -148,4 +157,3 @@
-- !ternary_7_all_null --
0
-
diff --git
a/regression-test/suites/search/test_search_default_field_operator.groovy
b/regression-test/suites/search/test_search_default_field_operator.groovy
new file mode 100644
index 00000000000..fd5c7ce6198
--- /dev/null
+++ b/regression-test/suites/search/test_search_default_field_operator.groovy
@@ -0,0 +1,230 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_search_default_field_operator") {
+ def tableName = "search_enhanced_test"
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ // Create table with inverted indexes
+ // firstname: lower_case = true, intended for case-insensitive wildcard search (no parser is set; see the case-sensitivity notes on Test 1 and Test 10)
+ // tags: with parser for tokenized search
+ // tags_exact: without parser specification (default behavior) for exact
matching
+ sql """
+ CREATE TABLE ${tableName} (
+ id INT,
+ firstname VARCHAR(100),
+ tags VARCHAR(200),
+ tags_exact VARCHAR(200),
+ INDEX idx_firstname(firstname) USING INVERTED
PROPERTIES("lower_case" = "true"),
+ INDEX idx_tags(tags) USING INVERTED PROPERTIES("parser" =
"english"),
+ INDEX idx_tags_exact(tags_exact) USING INVERTED
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_allocation" = "tag.location.default: 1")
+ """
+
+ // Insert test data covering the wildcard, multi-term and exact-match cases exercised below
+ sql """INSERT INTO ${tableName} VALUES
+ (1, 'Chris', 'foo bar', 'foo bar'),
+ (2, 'Christopher', 'foobar', 'foobar'),
+ (3, 'Kevin', 'bar foo', 'bar foo'),
+ (4, 'kevin', 'foolish bark', 'foolish bark')
+ """
+
+ // Wait for index building
+ Thread.sleep(3000)
+
+ // ============ Test 1: Wildcard Prefix with Default Field ============
+ // Requirement: firstname EQ Chris*
+ // SQL: search('Chris*', 'firstname')
+ // Expected: Chris (1), Christopher (2)
+ // Note: Without parser, inverted index is case-sensitive
+ qt_wildcard_prefix """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
+ FROM ${tableName}
+ WHERE search('Chris*', 'firstname')
+ ORDER BY id
+ """
+
+ // ============ Test 2: Multi-term AND with Default Operator ============
+ // Requirement: tags EQ foo bar (with AND semantics)
+ // SQL: search('foo bar', 'tags', 'and')
+ // Expected: 'foo bar' (1), 'bar foo' (3)
+ qt_multi_term_and """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
+ FROM ${tableName}
+ WHERE search('foo bar', 'tags', 'and')
+ ORDER BY id
+ """
+
+ // ============ Test 3: Multi-term OR with Default Operator ============
+ // Requirement: tags EQ foo OR bark (with OR semantics)
+ // SQL: search('foo bark', 'tags', 'or')
+ // Expected: 'foo bar' (1), 'bar foo' (3), 'foolish bark' (4)
+ qt_multi_term_or """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
+ FROM ${tableName}
+ WHERE search('foo bark', 'tags', 'or')
+ ORDER BY id
+ """
+
+ // ============ Test 4: Multi-wildcard AND ============
+ // Requirement: tags EQ foo* bar* (with AND semantics)
+ // SQL: search('foo* bar*', 'tags', 'and')
+ // Expands to: tags:foo* AND tags:bar*
+ // Expected: rows with tokens matching foo* AND tokens matching bar*
+ // - 'foo bar' (1): tokens=['foo','bar'] - matches foo* ✓ and bar* ✓
+ // - 'foobar' (2): tokens=['foobar'] - matches foo* ✓ but NOT bar* ✗
(excluded)
+ // - 'bar foo' (3): tokens=['bar','foo'] - matches foo* ✓ and bar* ✓
+ // - 'foolish bark' (4): tokens=['foolish','bark'] - matches foo* ✓ and
bar* ✓
+ qt_wildcard_multi_and """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
+ FROM ${tableName}
+ WHERE search('foo* bar*', 'tags', 'and')
+ ORDER BY id
+ """
+
+ // ============ Test 5: Explicit OR operator overrides default ============
+ // SQL: search('foo OR bark', 'tags', 'and')
+ // The explicit OR in DSL should override the default 'and' operator
+ // Expected: 'foo bar' (1), 'bar foo' (3), 'foolish bark' (4)
+ qt_explicit_or_override """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
+ FROM ${tableName}
+ WHERE search('foo OR bark', 'tags', 'and')
+ ORDER BY id
+ """
+
+ // ============ Test 6: EXACT function with default field ============
+ // Requirement: EXACT(foo bar) on tags_exact field (no tokenization)
+ // SQL: search('EXACT(foo bar)', 'tags_exact')
+ // Expected: 'foo bar' (1) only - exact string match
+ qt_exact_function """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags_exact
+ FROM ${tableName}
+ WHERE search('EXACT(foo bar)', 'tags_exact')
+ ORDER BY id
+ """
+
+ // ============ Test 7: Traditional syntax still works ============
+ // Ensure backward compatibility - original syntax unchanged
+ qt_traditional_syntax """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
+ FROM ${tableName}
+ WHERE search('firstname:Chris*')
+ ORDER BY id
+ """
+
+ // ============ Test 8: Single term with default field ============
+ qt_single_term """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
+ FROM ${tableName}
+ WHERE search('bar', 'tags')
+ ORDER BY id
+ """
+
+ // ============ Test 9: Wildcard in middle ============
+ qt_wildcard_middle """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
+ FROM ${tableName}
+ WHERE search('*ris*', 'firstname')
+ ORDER BY id
+ """
+
+ // ============ Test 10: Case sensitivity for wildcard ============
+ // Without parser, wildcard queries are case-sensitive (matches Lucene
behavior)
+ // CHRIS* won't match Chris/Christopher
+ qt_case_sensitive """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
+ FROM ${tableName}
+ WHERE search('CHRIS*', 'firstname')
+ ORDER BY id
+ """
+
+ // ============ Test 11: Default operator is OR when not specified
============
+ qt_default_or """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
+ FROM ${tableName}
+ WHERE search('foo bark', 'tags')
+ ORDER BY id
+ """
+
+ // ============ Test 12: ANY function with default field ============
+ qt_any_function """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
+ FROM ${tableName}
+ WHERE search('ANY(foo bark)', 'tags')
+ ORDER BY id
+ """
+
+ // ============ Test 13: ALL function with default field ============
+ qt_all_function """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
+ FROM ${tableName}
+ WHERE search('ALL(foo bar)', 'tags')
+ ORDER BY id
+ """
+
+ // ============ Test 14: Complex wildcard pattern ============
+ qt_complex_wildcard """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname
+ FROM ${tableName}
+ WHERE search('?evin', 'firstname')
+ ORDER BY id
+ """
+
+ // ============ Test 15: Default field with explicit AND ============
+ qt_explicit_and """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
+ FROM ${tableName}
+ WHERE search('foo AND bar', 'tags')
+ ORDER BY id
+ """
+
+ // ============ Test 16: Multiple fields still work ============
+ qt_multiple_fields """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname,
tags
+ FROM ${tableName}
+ WHERE search('firstname:Chris* OR tags:bark')
+ ORDER BY id
+ """
+
+ // ============ Test 17: NOT operator with default field ============
+ qt_not_operator """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags
+ FROM ${tableName}
+ WHERE search('NOT foobar', 'tags')
+ ORDER BY id
+ """
+
+ // ============ Test 18: Combining different parameter counts ============
+ // Tests mixing 1-param, 2-param, and 3-param search() calls in same query
+ // - search('firstname:Chris*'): 1-param, traditional syntax → matches id
1,2
+ // - search('foo*', 'tags', 'or'): 3-param with wildcard → matches id 1,2,3,4 (row 2 'foobar' also matches foo*)
+ // - OR combination → matches id 1,2,3,4 (all rows)
+ qt_param_count_mix """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id
+ FROM ${tableName}
+ WHERE search('firstname:Chris*') OR search('foo*', 'tags', 'or')
+ ORDER BY id
+ """
+
+ // Cleanup
+ sql "DROP TABLE IF EXISTS ${tableName}"
+}
diff --git a/regression-test/suites/search/test_search_null_semantics.groovy
b/regression-test/suites/search/test_search_null_semantics.groovy
index 269a27056cf..c7d97c18bdc 100644
--- a/regression-test/suites/search/test_search_null_semantics.groovy
+++ b/regression-test/suites/search/test_search_null_semantics.groovy
@@ -81,6 +81,13 @@ suite("test_search_null_semantics") {
WHERE not search('content:Round')
"""
+ // Test Case 2b: Phrase NOT queries must treat NULL rows as UNKNOWN
+ qt_test_case_2_phrase_not """
+ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id FROM
${tableName}
+ WHERE NOT search('content:"Selma Blair"')
+ ORDER BY id
+ """
+
// Test Case 3: NULL handling in OR queries
// Verify that NULL OR TRUE = TRUE logic works
qt_test_case_3_or_with_null """
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]