This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 1f0b95f4076 [fix](inverted index) fix multi-position phrase query handling in MultiPhraseQuery (#57993)
1f0b95f4076 is described below
commit 1f0b95f4076058820c436fbeeb8e1963b24979ca
Author: zzzxl <[email protected]>
AuthorDate: Tue Nov 25 13:42:01 2025 +0800
[fix](inverted index) fix multi-position phrase query handling in MultiPhraseQuery (#57993)
https://github.com/apache/doris-website/pull/3114
---
.../inverted_index/query/query_helper.cpp | 29 ++++++++
.../segment_v2/inverted_index/query/query_helper.h | 3 +
.../rowset/segment_v2/inverted_index_common.cpp | 42 +++++++++++
.../olap/rowset/segment_v2/inverted_index_common.h | 62 +++++++---------
.../rowset/segment_v2/inverted_index_common_impl.h | 61 ++++++++++++++++
be/src/vec/functions/function_search.cpp | 30 ++++----
.../rowset/segment_v2/index_reader_helper_test.cpp | 2 +-
be/test/vec/function/function_search_test.cpp | 85 +++++++++++++++++++++-
8 files changed, 258 insertions(+), 56 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.cpp
index a1ac87797de..d38d38a57a5 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.cpp
@@ -74,5 +74,34 @@ bool QueryHelper::is_simple_phrase(const std::vector<TermInfo>& term_infos) {
[](const auto& term_info) { return
term_info.is_single_term(); });
}
+std::vector<TermInfo> QueryHelper::build_phrase_term_infos(const
std::vector<TermInfo>& src) {
+ std::vector<TermInfo> dst;
+ dst.reserve(src.size());
+ size_t idx = 0;
+ while (idx < src.size()) {
+ int32_t pos = src[idx].position;
+ std::vector<std::string> group_terms;
+ while (idx < src.size() && src[idx].position == pos) {
+ const auto& info = src[idx];
+ if (info.is_single_term()) {
+ group_terms.emplace_back(info.get_single_term());
+ } else {
+ const auto& terms = info.get_multi_terms();
+ group_terms.insert(group_terms.end(), terms.begin(),
terms.end());
+ }
+ ++idx;
+ }
+ TermInfo t;
+ t.position = pos;
+ if (group_terms.size() == 1) {
+ t.term = std::move(group_terms[0]);
+ } else {
+ t.term = std::move(group_terms);
+ }
+ dst.emplace_back(std::move(t));
+ }
+ return dst;
+}
+
#include "common/compile_check_end.h"
} // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.h b/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.h
index 319554ec46a..faa2c67120a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.h
@@ -18,8 +18,10 @@
#pragma once
#include <memory>
+#include <vector>
#include "olap/rowset/segment_v2/inverted_index/query/query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/query_info.h"
namespace doris::segment_v2 {
#include "common/compile_check_begin.h"
@@ -39,6 +41,7 @@ public:
const DocRange& doc_range);
static bool is_simple_phrase(const std::vector<TermInfo>& term_infos);
+ static std::vector<TermInfo> build_phrase_term_infos(const
std::vector<TermInfo>& src);
};
#include "common/compile_check_end.h"
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_common.cpp b/be/src/olap/rowset/segment_v2/inverted_index_common.cpp
new file mode 100644
index 00000000000..7c5ac5aa231
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index_common.cpp
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/inverted_index_common.h"
+
+#include <CLucene.h>
+
+namespace doris::segment_v2 {
+
+void DirectoryDeleter::operator()(lucene::store::Directory* p) const {
+ if (p != nullptr) {
+ _CLDECDELETE(p);
+ }
+}
+
+void TermDeleter::operator()(lucene::index::Term* p) const {
+ if (p != nullptr) {
+ _CLDECDELETE(p);
+ }
+}
+
+void CLuceneDeleter::operator()(lucene::index::TermDocs* p) const {
+ if (p != nullptr) {
+ _CLLDELETE(p);
+ }
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_common.h b/be/src/olap/rowset/segment_v2/inverted_index_common.h
index 9d5b232a100..207ea473f0a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_common.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_common.h
@@ -17,47 +17,50 @@
#pragma once
-#include <CLucene.h> // IWYU pragma: keep
-
+#include <exception>
#include <memory>
+#include <string>
+
+namespace lucene {
+namespace store {
+class Directory;
+} // namespace store
+
+namespace index {
+class Term;
+class TermDocs;
+class TermPositions;
+class IndexReader;
+} // namespace index
+} // namespace lucene
-#include "common/logging.h"
+class CLuceneError;
namespace doris::segment_v2 {
struct DirectoryDeleter {
- void operator()(lucene::store::Directory* ptr) const { _CLDECDELETE(ptr); }
+ void operator()(lucene::store::Directory* p) const;
};
struct TermDeleter {
- void operator()(lucene::index::Term* p) const { _CLDECDELETE(p); }
+ void operator()(lucene::index::Term* p) const;
};
using TermPtr = std::unique_ptr<lucene::index::Term, TermDeleter>;
template <typename... Args>
-TermPtr make_term_ptr(Args&&... args) {
- return TermPtr(new lucene::index::Term(std::forward<Args>(args)...));
-}
+TermPtr make_term_ptr(Args&&... args);
struct CLuceneDeleter {
- void operator()(lucene::index::TermDocs* p) const {
- if (p) {
- _CLDELETE(p);
- }
- }
+ void operator()(lucene::index::TermDocs* p) const;
};
using TermDocsPtr = std::unique_ptr<lucene::index::TermDocs, CLuceneDeleter>;
using TermPositionsPtr = std::unique_ptr<lucene::index::TermPositions,
CLuceneDeleter>;
template <typename... Args>
-TermDocsPtr make_term_doc_ptr(lucene::index::IndexReader* reader, Args&&...
args) {
- return TermDocsPtr(reader->termDocs(std::forward<Args>(args)...));
-}
+TermDocsPtr make_term_doc_ptr(lucene::index::IndexReader* reader, Args&&...
args);
template <typename... Args>
-TermPositionsPtr make_term_positions_ptr(lucene::index::IndexReader* reader,
Args&&... args) {
- return
TermPositionsPtr(reader->termPositions(std::forward<Args>(args)...));
-}
+TermPositionsPtr make_term_positions_ptr(lucene::index::IndexReader* reader,
Args&&... args);
struct ErrorContext {
std::string err_msg;
@@ -71,22 +74,7 @@ concept HasClose = requires(T t) {
template <typename PtrType>
requires HasClose<PtrType>
-void finally_close(PtrType& resource, ErrorContext& error_context) {
- if (resource) {
- try {
- resource->close();
- } catch (CLuceneError& err) {
- error_context.eptr = std::current_exception();
- error_context.err_msg.append("Error occurred while closing resource: ");
- error_context.err_msg.append(err.what());
- LOG(ERROR) << error_context.err_msg;
- } catch (...) {
- error_context.eptr = std::current_exception();
- error_context.err_msg.append("Error occurred while closing resource");
- LOG(ERROR) << error_context.err_msg;
- }
- }
-}
+void finally_close(PtrType& resource, ErrorContext& error_context);
#if defined(__clang__)
#pragma clang diagnostic push
@@ -126,4 +114,6 @@ void finally_close(PtrType& resource, ErrorContext& error_context) {
#pragma clang diagnostic pop
#endif
-} // namespace doris::segment_v2
\ No newline at end of file
+} // namespace doris::segment_v2
+
+#include "inverted_index_common_impl.h"
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_common_impl.h b/be/src/olap/rowset/segment_v2/inverted_index_common_impl.h
new file mode 100644
index 00000000000..6d89326cc1e
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index_common_impl.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <CLucene.h>
+
+#include "common/logging.h"
+#include "olap/rowset/segment_v2/inverted_index_common.h"
+
+namespace doris::segment_v2 {
+
+template <typename... Args>
+TermPtr make_term_ptr(Args&&... args) {
+ return TermPtr(new lucene::index::Term(std::forward<Args>(args)...));
+}
+
+template <typename... Args>
+TermDocsPtr make_term_doc_ptr(lucene::index::IndexReader* reader, Args&&...
args) {
+ return TermDocsPtr(reader->termDocs(std::forward<Args>(args)...));
+}
+
+template <typename... Args>
+TermPositionsPtr make_term_positions_ptr(lucene::index::IndexReader* reader,
Args&&... args) {
+ return
TermPositionsPtr(reader->termPositions(std::forward<Args>(args)...));
+}
+
+template <typename PtrType>
+ requires HasClose<PtrType>
+void finally_close(PtrType& resource, ErrorContext& error_context) {
+ if (resource) {
+ try {
+ resource->close();
+ } catch (CLuceneError& err) {
+ error_context.eptr = std::current_exception();
+ error_context.err_msg.append("Error occurred while closing resource: ");
+ error_context.err_msg.append(err.what());
+ LOG(ERROR) << error_context.err_msg;
+ } catch (...) {
+ error_context.eptr = std::current_exception();
+ error_context.err_msg.append("Error occurred while closing resource");
+ LOG(ERROR) << error_context.err_msg;
+ }
+ }
+}
+
+} // namespace doris::segment_v2
diff --git a/be/src/vec/functions/function_search.cpp b/be/src/vec/functions/function_search.cpp
index 0c5a3dab1a1..f9c0b5b010a 100644
--- a/be/src/vec/functions/function_search.cpp
+++ b/be/src/vec/functions/function_search.cpp
@@ -566,22 +566,22 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause,
<< "', returning empty BitSetQuery";
*out =
std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
return Status::OK();
- } else if (term_infos.size() == 1) {
- if (term_infos.size() == 1) {
- const auto& term_info = term_infos[0];
- if (term_info.is_single_term()) {
- std::wstring term_wstr =
-
StringHelper::to_wstring(term_info.get_single_term());
- *out = std::make_shared<query_v2::TermQuery>(context,
field_wstr,
-
term_wstr);
- } else {
- query_v2::BooleanQuery::Builder
builder(query_v2::OperatorType::OP_OR);
- for (const auto& term : term_info.get_multi_terms()) {
- std::wstring term_wstr =
StringHelper::to_wstring(term);
- builder.add(make_term_query(term_wstr),
binding.binding_key);
- }
- *out = builder.build();
+ }
+
+ std::vector<TermInfo> phrase_term_infos =
+ QueryHelper::build_phrase_term_infos(term_infos);
+ if (phrase_term_infos.size() == 1) {
+ const auto& term_info = term_infos[0];
+ if (term_info.is_single_term()) {
+ std::wstring term_wstr =
StringHelper::to_wstring(term_info.get_single_term());
+ *out = std::make_shared<query_v2::TermQuery>(context,
field_wstr, term_wstr);
+ } else {
+ query_v2::BooleanQuery::Builder
builder(query_v2::OperatorType::OP_OR);
+ for (const auto& term : term_info.get_multi_terms()) {
+ std::wstring term_wstr =
StringHelper::to_wstring(term);
+ builder.add(make_term_query(term_wstr),
binding.binding_key);
}
+ *out = builder.build();
}
} else {
if (QueryHelper::is_simple_phrase(term_infos)) {
diff --git a/be/test/olap/rowset/segment_v2/index_reader_helper_test.cpp b/be/test/olap/rowset/segment_v2/index_reader_helper_test.cpp
index 8292d3d0d14..6d0b6c2cb02 100644
--- a/be/test/olap/rowset/segment_v2/index_reader_helper_test.cpp
+++ b/be/test/olap/rowset/segment_v2/index_reader_helper_test.cpp
@@ -33,7 +33,7 @@
using namespace doris;
using namespace doris::segment_v2;
-class MockIndexReader : public IndexReader {
+class MockIndexReader : public segment_v2::IndexReader {
public:
MockIndexReader(IndexType type, uint64_t id) : _type(type), _id(id) {}
diff --git a/be/test/vec/function/function_search_test.cpp b/be/test/vec/function/function_search_test.cpp
index 4d89ad023e5..4b6b27ed861 100644
--- a/be/test/vec/function/function_search_test.cpp
+++ b/be/test/vec/function/function_search_test.cpp
@@ -26,6 +26,9 @@
#include "gen_cpp/Exprs_types.h"
#include "olap/rowset/segment_v2/index_iterator.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_weight.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h"
#include "vec/core/block.h"
namespace doris::vectorized {
@@ -1678,9 +1681,83 @@ TEST_F(FunctionSearchTest, TestOrWithNotSameFieldMatchesMatchAllRows) {
EXPECT_TRUE(result_diff.isEmpty());
}
-// Note: Full testing of evaluate_inverted_index_with_search_param with real InvertedIndexIterator
-// and actual file operations would require complex setup with real index files
-// and is better suited for integration tests. The tests above cover the main
-// execution paths and error handling logic in the function.
+TEST_F(FunctionSearchTest, TestBuildLeafQueryPhrase) {
+ TSearchClause clause;
+ clause.clause_type = "PHRASE";
+ clause.field_name = "content";
+ clause.value = "hello world";
+ clause.__isset.field_name = true;
+ clause.__isset.value = true;
+
+ auto context = std::make_shared<IndexQueryContext>();
+
+ std::unordered_map<std::string, vectorized::IndexFieldNameAndTypePair>
data_type_with_names;
+ data_type_with_names.emplace("content",
+ vectorized::IndexFieldNameAndTypePair
{"content", nullptr});
+
+ std::unordered_map<std::string, IndexIterator*> iterators;
+ FieldReaderResolver resolver(data_type_with_names, iterators, context);
+
+ FieldReaderBinding binding;
+ binding.logical_field_name = "content";
+ binding.stored_field_name = "content";
+ binding.stored_field_wstr = L"content";
+ binding.index_properties["parser"] = "unicode";
+ binding.query_type = InvertedIndexQueryType::MATCH_PHRASE_QUERY;
+
+ auto* dummy_reader = reinterpret_cast<lucene::index::IndexReader*>(0x1);
+ binding.lucene_reader = std::shared_ptr<lucene::index::IndexReader>(
+ dummy_reader, [](lucene::index::IndexReader* /*ptr*/) {});
+
+ std::string key =
+ resolver.binding_key_for("content",
InvertedIndexQueryType::MATCH_PHRASE_QUERY);
+ binding.binding_key = key;
+ resolver._cache[key] = binding;
+
+ inverted_index::query_v2::QueryPtr out;
+ std::string out_binding_key;
+ Status st =
+ function_search->build_leaf_query(clause, context, resolver, &out,
&out_binding_key);
+ EXPECT_TRUE(st.ok());
+
+ auto phrase_query =
std::dynamic_pointer_cast<inverted_index::query_v2::PhraseQuery>(out);
+ EXPECT_NE(phrase_query, nullptr);
+}
+
+TEST_F(FunctionSearchTest, TestMultiPhraseQueryCase) {
+ using doris::segment_v2::InvertedIndexQueryInfo;
+ using doris::segment_v2::TermInfo;
+ using doris::CollectionStatistics;
+ using doris::CollectionStatisticsPtr;
+
+ auto context = std::make_shared<IndexQueryContext>();
+ context->collection_statistics = std::make_shared<CollectionStatistics>();
+ context->collection_similarity = std::make_shared<CollectionSimilarity>();
+
+ std::wstring field =
doris::segment_v2::inverted_index::StringHelper::to_wstring("content");
+
+ std::vector<TermInfo> term_infos;
+
+ TermInfo t1;
+ t1.term = std::vector<std::string> {"quick", "fast", "speedy"};
+ t1.position = 0;
+ term_infos.push_back(t1);
+
+ TermInfo t2;
+ t2.term = std::string("brown");
+ t2.position = 1;
+ term_infos.push_back(t2);
+
+ auto query =
std::make_shared<doris::segment_v2::inverted_index::query_v2::MultiPhraseQuery>(
+ context, field, term_infos);
+ ASSERT_NE(query, nullptr);
+
+ auto weight = query->weight(false);
+ ASSERT_NE(weight, nullptr);
+
+ auto multi_phrase_weight = std::dynamic_pointer_cast<
+
doris::segment_v2::inverted_index::query_v2::MultiPhraseWeight>(weight);
+ ASSERT_NE(multi_phrase_weight, nullptr);
+}
} // namespace doris::vectorized
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]