This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 1f0b95f4076 [fix](inverted index) fix multi-position phrase query 
handling in MultiPhraseQuery (#57993)
1f0b95f4076 is described below

commit 1f0b95f4076058820c436fbeeb8e1963b24979ca
Author: zzzxl <[email protected]>
AuthorDate: Tue Nov 25 13:42:01 2025 +0800

    [fix](inverted index) fix multi-position phrase query handling in 
MultiPhraseQuery (#57993)
    
    https://github.com/apache/doris-website/pull/3114
---
 .../inverted_index/query/query_helper.cpp          | 29 ++++++++
 .../segment_v2/inverted_index/query/query_helper.h |  3 +
 .../rowset/segment_v2/inverted_index_common.cpp    | 42 +++++++++++
 .../olap/rowset/segment_v2/inverted_index_common.h | 62 +++++++---------
 .../rowset/segment_v2/inverted_index_common_impl.h | 61 ++++++++++++++++
 be/src/vec/functions/function_search.cpp           | 30 ++++----
 .../rowset/segment_v2/index_reader_helper_test.cpp |  2 +-
 be/test/vec/function/function_search_test.cpp      | 85 +++++++++++++++++++++-
 8 files changed, 258 insertions(+), 56 deletions(-)

diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.cpp
index a1ac87797de..d38d38a57a5 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.cpp
@@ -74,5 +74,34 @@ bool QueryHelper::is_simple_phrase(const 
std::vector<TermInfo>& term_infos) {
                                [](const auto& term_info) { return 
term_info.is_single_term(); });
 }
 
+std::vector<TermInfo> QueryHelper::build_phrase_term_infos(const 
std::vector<TermInfo>& src) { // Collapses each run of adjacent entries that share a position into one TermInfo.
+    std::vector<TermInfo> dst;
+    dst.reserve(src.size()); // upper bound: never more output entries than input entries
+    size_t idx = 0;
+    while (idx < src.size()) {
+        int32_t pos = src[idx].position; // position that defines the current group
+        std::vector<std::string> group_terms;
+        while (idx < src.size() && src[idx].position == pos) { // only adjacent equal positions are merged
+            const auto& info = src[idx];
+            if (info.is_single_term()) {
+                group_terms.emplace_back(info.get_single_term());
+            } else {
+                const auto& terms = info.get_multi_terms();
+                group_terms.insert(group_terms.end(), terms.begin(), 
terms.end());
+            }
+            ++idx;
+        }
+        TermInfo t;
+        t.position = pos;
+        if (group_terms.size() == 1) {
+            t.term = std::move(group_terms[0]); // exactly one term: store it as a single string
+        } else {
+            t.term = std::move(group_terms); // otherwise store the whole list of terms for this position
+        }
+        dst.emplace_back(std::move(t));
+    }
+    return dst;
+}
+
 #include "common/compile_check_end.h"
 } // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.h
index 319554ec46a..faa2c67120a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query_helper.h
@@ -18,8 +18,10 @@
 #pragma once
 
 #include <memory>
+#include <vector>
 
 #include "olap/rowset/segment_v2/inverted_index/query/query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/query_info.h"
 
 namespace doris::segment_v2 {
 #include "common/compile_check_begin.h"
@@ -39,6 +41,7 @@ public:
                               const DocRange& doc_range);
 
     static bool is_simple_phrase(const std::vector<TermInfo>& term_infos);
+    static std::vector<TermInfo> build_phrase_term_infos(const 
std::vector<TermInfo>& src);
 };
 
 #include "common/compile_check_end.h"
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_common.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_common.cpp
new file mode 100644
index 00000000000..7c5ac5aa231
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index_common.cpp
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/inverted_index_common.h"
+
+#include <CLucene.h>
+
+namespace doris::segment_v2 {
+
+void DirectoryDeleter::operator()(lucene::store::Directory* p) const {
+    if (p != nullptr) {
+        _CLDECDELETE(p);
+    }
+}
+
+void TermDeleter::operator()(lucene::index::Term* p) const {
+    if (p != nullptr) {
+        _CLDECDELETE(p);
+    }
+}
+
+void CLuceneDeleter::operator()(lucene::index::TermDocs* p) const {
+    if (p != nullptr) {
+        _CLLDELETE(p);
+    }
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_common.h 
b/be/src/olap/rowset/segment_v2/inverted_index_common.h
index 9d5b232a100..207ea473f0a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_common.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_common.h
@@ -17,47 +17,50 @@
 
 #pragma once
 
-#include <CLucene.h> // IWYU pragma: keep
-
+#include <exception>
 #include <memory>
+#include <string>
+
+namespace lucene {
+namespace store {
+class Directory;
+} // namespace store
+
+namespace index {
+class Term;
+class TermDocs;
+class TermPositions;
+class IndexReader;
+} // namespace index
+} // namespace lucene
 
-#include "common/logging.h"
+class CLuceneError;
 
 namespace doris::segment_v2 {
 
 struct DirectoryDeleter {
-    void operator()(lucene::store::Directory* ptr) const { _CLDECDELETE(ptr); }
+    void operator()(lucene::store::Directory* p) const;
 };
 
 struct TermDeleter {
-    void operator()(lucene::index::Term* p) const { _CLDECDELETE(p); }
+    void operator()(lucene::index::Term* p) const;
 };
 using TermPtr = std::unique_ptr<lucene::index::Term, TermDeleter>;
 
 template <typename... Args>
-TermPtr make_term_ptr(Args&&... args) {
-    return TermPtr(new lucene::index::Term(std::forward<Args>(args)...));
-}
+TermPtr make_term_ptr(Args&&... args);
 
 struct CLuceneDeleter {
-    void operator()(lucene::index::TermDocs* p) const {
-        if (p) {
-            _CLDELETE(p);
-        }
-    }
+    void operator()(lucene::index::TermDocs* p) const;
 };
 using TermDocsPtr = std::unique_ptr<lucene::index::TermDocs, CLuceneDeleter>;
 using TermPositionsPtr = std::unique_ptr<lucene::index::TermPositions, 
CLuceneDeleter>;
 
 template <typename... Args>
-TermDocsPtr make_term_doc_ptr(lucene::index::IndexReader* reader, Args&&... 
args) {
-    return TermDocsPtr(reader->termDocs(std::forward<Args>(args)...));
-}
+TermDocsPtr make_term_doc_ptr(lucene::index::IndexReader* reader, Args&&... 
args);
 
 template <typename... Args>
-TermPositionsPtr make_term_positions_ptr(lucene::index::IndexReader* reader, 
Args&&... args) {
-    return 
TermPositionsPtr(reader->termPositions(std::forward<Args>(args)...));
-}
+TermPositionsPtr make_term_positions_ptr(lucene::index::IndexReader* reader, 
Args&&... args);
 
 struct ErrorContext {
     std::string err_msg;
@@ -71,22 +74,7 @@ concept HasClose = requires(T t) {
 
 template <typename PtrType>
     requires HasClose<PtrType>
-void finally_close(PtrType& resource, ErrorContext& error_context) {
-    if (resource) {
-        try {
-            resource->close();
-        } catch (CLuceneError& err) {
-            error_context.eptr = std::current_exception();
-            error_context.err_msg.append("Error occurred while closing 
resource: ");
-            error_context.err_msg.append(err.what());
-            LOG(ERROR) << error_context.err_msg;
-        } catch (...) {
-            error_context.eptr = std::current_exception();
-            error_context.err_msg.append("Error occurred while closing 
resource");
-            LOG(ERROR) << error_context.err_msg;
-        }
-    }
-}
+void finally_close(PtrType& resource, ErrorContext& error_context);
 
 #if defined(__clang__)
 #pragma clang diagnostic push
@@ -126,4 +114,6 @@ void finally_close(PtrType& resource, ErrorContext& 
error_context) {
 #pragma clang diagnostic pop
 #endif
 
-} // namespace doris::segment_v2
\ No newline at end of file
+} // namespace doris::segment_v2
+
+#include "inverted_index_common_impl.h"
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_common_impl.h 
b/be/src/olap/rowset/segment_v2/inverted_index_common_impl.h
new file mode 100644
index 00000000000..6d89326cc1e
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index_common_impl.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <CLucene.h>
+
+#include "common/logging.h"
+#include "olap/rowset/segment_v2/inverted_index_common.h"
+
+namespace doris::segment_v2 {
+
+template <typename... Args>
+TermPtr make_term_ptr(Args&&... args) {
+    return TermPtr(new lucene::index::Term(std::forward<Args>(args)...));
+}
+
+template <typename... Args>
+TermDocsPtr make_term_doc_ptr(lucene::index::IndexReader* reader, Args&&... 
args) {
+    return TermDocsPtr(reader->termDocs(std::forward<Args>(args)...));
+}
+
+template <typename... Args>
+TermPositionsPtr make_term_positions_ptr(lucene::index::IndexReader* reader, 
Args&&... args) {
+    return 
TermPositionsPtr(reader->termPositions(std::forward<Args>(args)...));
+}
+
+template <typename PtrType>
+    requires HasClose<PtrType>
+void finally_close(PtrType& resource, ErrorContext& error_context) { // Closes resource if set; close() failures are recorded in error_context and logged, not rethrown here.
+    if (resource) { // nothing to close when the pointer tests false
+        try {
+            resource->close();
+        } catch (CLuceneError& err) {
+            error_context.eptr = std::current_exception(); // remember the in-flight exception in the context
+            error_context.err_msg.append("Error occurred while closing 
resource: ");
+            error_context.err_msg.append(err.what());
+            LOG(ERROR) << error_context.err_msg;
+        } catch (...) { // any other exception type: same handling, but no what() text is appended
+            error_context.eptr = std::current_exception();
+            error_context.err_msg.append("Error occurred while closing 
resource");
+            LOG(ERROR) << error_context.err_msg;
+        }
+    }
+}
+
+} // namespace doris::segment_v2
diff --git a/be/src/vec/functions/function_search.cpp 
b/be/src/vec/functions/function_search.cpp
index 0c5a3dab1a1..f9c0b5b010a 100644
--- a/be/src/vec/functions/function_search.cpp
+++ b/be/src/vec/functions/function_search.cpp
@@ -566,22 +566,22 @@ Status FunctionSearch::build_leaf_query(const 
TSearchClause& clause,
                              << "', returning empty BitSetQuery";
                 *out = 
std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
                 return Status::OK();
-            } else if (term_infos.size() == 1) {
-                if (term_infos.size() == 1) {
-                    const auto& term_info = term_infos[0];
-                    if (term_info.is_single_term()) {
-                        std::wstring term_wstr =
-                                
StringHelper::to_wstring(term_info.get_single_term());
-                        *out = std::make_shared<query_v2::TermQuery>(context, 
field_wstr,
-                                                                     
term_wstr);
-                    } else {
-                        query_v2::BooleanQuery::Builder 
builder(query_v2::OperatorType::OP_OR);
-                        for (const auto& term : term_info.get_multi_terms()) {
-                            std::wstring term_wstr = 
StringHelper::to_wstring(term);
-                            builder.add(make_term_query(term_wstr), 
binding.binding_key);
-                        }
-                        *out = builder.build();
+            }
+
+            std::vector<TermInfo> phrase_term_infos =
+                    QueryHelper::build_phrase_term_infos(term_infos);
+            if (phrase_term_infos.size() == 1) {
+                const auto& term_info = phrase_term_infos[0]; // use the position-merged entry; term_infos[0] would drop same-position alternatives
+                if (term_info.is_single_term()) {
+                    std::wstring term_wstr = 
StringHelper::to_wstring(term_info.get_single_term());
+                    *out = std::make_shared<query_v2::TermQuery>(context, 
field_wstr, term_wstr);
+                } else {
+                    query_v2::BooleanQuery::Builder 
builder(query_v2::OperatorType::OP_OR);
+                    for (const auto& term : term_info.get_multi_terms()) {
+                        std::wstring term_wstr = 
StringHelper::to_wstring(term);
+                        builder.add(make_term_query(term_wstr), 
binding.binding_key);
                     }
+                    *out = builder.build();
                 }
             } else {
                 if (QueryHelper::is_simple_phrase(term_infos)) {
diff --git a/be/test/olap/rowset/segment_v2/index_reader_helper_test.cpp 
b/be/test/olap/rowset/segment_v2/index_reader_helper_test.cpp
index 8292d3d0d14..6d0b6c2cb02 100644
--- a/be/test/olap/rowset/segment_v2/index_reader_helper_test.cpp
+++ b/be/test/olap/rowset/segment_v2/index_reader_helper_test.cpp
@@ -33,7 +33,7 @@
 using namespace doris;
 using namespace doris::segment_v2;
 
-class MockIndexReader : public IndexReader {
+class MockIndexReader : public segment_v2::IndexReader {
 public:
     MockIndexReader(IndexType type, uint64_t id) : _type(type), _id(id) {}
 
diff --git a/be/test/vec/function/function_search_test.cpp 
b/be/test/vec/function/function_search_test.cpp
index 4d89ad023e5..4b6b27ed861 100644
--- a/be/test/vec/function/function_search_test.cpp
+++ b/be/test/vec/function/function_search_test.cpp
@@ -26,6 +26,9 @@
 
 #include "gen_cpp/Exprs_types.h"
 #include "olap/rowset/segment_v2/index_iterator.h"
+#include 
"olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_query.h"
+#include 
"olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/multi_phrase_weight.h"
+#include 
"olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h"
 #include "vec/core/block.h"
 
 namespace doris::vectorized {
@@ -1678,9 +1681,83 @@ TEST_F(FunctionSearchTest, 
TestOrWithNotSameFieldMatchesMatchAllRows) {
     EXPECT_TRUE(result_diff.isEmpty());
 }
 
-// Note: Full testing of evaluate_inverted_index_with_search_param with real 
InvertedIndexIterator
-// and actual file operations would require complex setup with real index files
-// and is better suited for integration tests. The tests above cover the main
-// execution paths and error handling logic in the function.
+TEST_F(FunctionSearchTest, TestBuildLeafQueryPhrase) {
+    TSearchClause clause;
+    clause.clause_type = "PHRASE";
+    clause.field_name = "content";
+    clause.value = "hello world";
+    clause.__isset.field_name = true;
+    clause.__isset.value = true;
+
+    auto context = std::make_shared<IndexQueryContext>();
+
+    std::unordered_map<std::string, vectorized::IndexFieldNameAndTypePair> 
data_type_with_names;
+    data_type_with_names.emplace("content",
+                                 vectorized::IndexFieldNameAndTypePair 
{"content", nullptr});
+
+    std::unordered_map<std::string, IndexIterator*> iterators;
+    FieldReaderResolver resolver(data_type_with_names, iterators, context);
+
+    FieldReaderBinding binding;
+    binding.logical_field_name = "content";
+    binding.stored_field_name = "content";
+    binding.stored_field_wstr = L"content";
+    binding.index_properties["parser"] = "unicode";
+    binding.query_type = InvertedIndexQueryType::MATCH_PHRASE_QUERY;
+
+    auto* dummy_reader = reinterpret_cast<lucene::index::IndexReader*>(0x1);
+    binding.lucene_reader = std::shared_ptr<lucene::index::IndexReader>(
+            dummy_reader, [](lucene::index::IndexReader* /*ptr*/) {});
+
+    std::string key =
+            resolver.binding_key_for("content", 
InvertedIndexQueryType::MATCH_PHRASE_QUERY);
+    binding.binding_key = key;
+    resolver._cache[key] = binding;
+
+    inverted_index::query_v2::QueryPtr out;
+    std::string out_binding_key;
+    Status st =
+            function_search->build_leaf_query(clause, context, resolver, &out, 
&out_binding_key);
+    EXPECT_TRUE(st.ok());
+
+    auto phrase_query = 
std::dynamic_pointer_cast<inverted_index::query_v2::PhraseQuery>(out);
+    EXPECT_NE(phrase_query, nullptr);
+}
+
+TEST_F(FunctionSearchTest, TestMultiPhraseQueryCase) {
+    using doris::segment_v2::InvertedIndexQueryInfo;
+    using doris::segment_v2::TermInfo;
+    using doris::CollectionStatistics;
+    using doris::CollectionStatisticsPtr;
+
+    auto context = std::make_shared<IndexQueryContext>();
+    context->collection_statistics = std::make_shared<CollectionStatistics>();
+    context->collection_similarity = std::make_shared<CollectionSimilarity>();
+
+    std::wstring field = 
doris::segment_v2::inverted_index::StringHelper::to_wstring("content");
+
+    std::vector<TermInfo> term_infos;
+
+    TermInfo t1;
+    t1.term = std::vector<std::string> {"quick", "fast", "speedy"};
+    t1.position = 0;
+    term_infos.push_back(t1);
+
+    TermInfo t2;
+    t2.term = std::string("brown");
+    t2.position = 1;
+    term_infos.push_back(t2);
+
+    auto query = 
std::make_shared<doris::segment_v2::inverted_index::query_v2::MultiPhraseQuery>(
+            context, field, term_infos);
+    ASSERT_NE(query, nullptr);
+
+    auto weight = query->weight(false);
+    ASSERT_NE(weight, nullptr);
+
+    auto multi_phrase_weight = std::dynamic_pointer_cast<
+            
doris::segment_v2::inverted_index::query_v2::MultiPhraseWeight>(weight);
+    ASSERT_NE(multi_phrase_weight, nullptr);
+}
 
 } // namespace doris::vectorized


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to