This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 0b78670e83f branch-4.0: [fix](inverted index) fix PhraseScorer seek duplicate position read #59940 (#59949)
0b78670e83f is described below

commit 0b78670e83f0a7f86f609755fe267a86c0c4424b
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri Jan 16 14:59:23 2026 +0800

    branch-4.0: [fix](inverted index) fix PhraseScorer seek duplicate position read #59940 (#59949)
    
    Cherry-picked from #59940
    
    Co-authored-by: zzzxl <[email protected]>
---
 .../query_v2/phrase_query/phrase_scorer.cpp        |   7 +
 .../occur_boolean_query_real_index_test.cpp        | 347 +++++++++++++++++++++
 2 files changed, 354 insertions(+)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_scorer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_scorer.cpp
index bccd9fd85a4..77d57fa2e15 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_scorer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_scorer.cpp
@@ -64,6 +64,13 @@ uint32_t PhraseScorer<TPostings>::advance() {
 template <typename TPostings>
 uint32_t PhraseScorer<TPostings>::seek(uint32_t target) {
     assert(target >= doc());
+    // If the target doc is the same as the current doc, return the current doc directly.
+    // This is important because phrase_match() reads position info from segment postings.
+    // SegmentPostings does not support reading position info multiple times for the same doc.
+    // If we call phrase_match() again for the same doc, it will read wrong position info.
+    if (target <= doc()) {
+        return doc();
+    }
     uint32_t doc = _intersection_docset->seek(target);
     if (doc == TERMINATED || phrase_match()) {
         return doc;
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_real_index_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_real_index_test.cpp
new file mode 100644
index 00000000000..4fc01f43e1d
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_real_index_test.cpp
@@ -0,0 +1,347 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <roaring/roaring.hh>
+#include <string>
+#include <vector>
+
+#include "common/status.h"
+#include "io/fs/local_file_system.h"
+#include "olap/rowset/segment_v2/index_query_context.h"
+#include "olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h"
+#include "olap/rowset/segment_v2/inverted_index/query/query_info.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/all_query/all_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/occur.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/occur_boolean_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query_v2/phrase_query/phrase_query.h"
+#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h"
+
+CL_NS_USE(search)
+CL_NS_USE(store)
+CL_NS_USE(index)
+
+namespace doris::segment_v2 {
+
+using namespace inverted_index;
+using namespace inverted_index::query_v2;
+
+class OccurBooleanQueryRealIndexTest : public testing::Test {
+public:
+    const std::string kTestDir = "./ut_dir/occur_boolean_query_real_index_test";
+
+    void SetUp() override {
+        auto st = io::global_local_filesystem()->delete_directory(kTestDir);
+        ASSERT_TRUE(st.ok()) << st;
+        st = io::global_local_filesystem()->create_directory(kTestDir);
+        ASSERT_TRUE(st.ok()) << st;
+        std::string field_name = "title";
+        create_test_index(field_name, kTestDir);
+    }
+
+    void TearDown() override {
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok());
+    }
+
+private:
+    void create_test_index(const std::string& field_name, const std::string& dir) {
+        std::vector<std::string> test_data = {
+                "apple banana cherry", "apple banana",    "apple",     "banana cherry",
+                "cherry date",         "date elderberry", "fig grape", "apple fig"};
+
+        CustomAnalyzerConfig::Builder builder;
+        builder.with_tokenizer_config("standard", {});
+        auto custom_analyzer_config = builder.build();
+        auto custom_analyzer = CustomAnalyzer::build_custom_analyzer(custom_analyzer_config);
+
+        auto* indexwriter =
+                _CLNEW lucene::index::IndexWriter(dir.c_str(), custom_analyzer.get(), true);
+        indexwriter->setMaxBufferedDocs(100);
+        indexwriter->setRAMBufferSizeMB(-1);
+        indexwriter->setMaxFieldLength(0x7FFFFFFFL);
+        indexwriter->setMergeFactor(1000000000);
+        indexwriter->setUseCompoundFile(false);
+
+        auto char_string_reader = std::make_shared<lucene::util::SStringReader<char>>();
+
+        auto* doc = _CLNEW lucene::document::Document();
+        int32_t field_config = lucene::document::Field::STORE_NO;
+        field_config |= lucene::document::Field::INDEX_NONORMS;
+        field_config |= lucene::document::Field::INDEX_TOKENIZED;
+        auto field_name_w = std::wstring(field_name.begin(), field_name.end());
+        auto* field = _CLNEW lucene::document::Field(field_name_w.c_str(), field_config);
+        field->setOmitTermFreqAndPositions(false);
+        doc->add(*field);
+
+        for (const auto& data : test_data) {
+            char_string_reader->init(data.data(), data.size(), false);
+            auto* stream = custom_analyzer->reusableTokenStream(field->name(), char_string_reader);
+            field->setValue(stream);
+            indexwriter->addDocument(doc);
+        }
+
+        indexwriter->close();
+        _CLLDELETE(indexwriter);
+        _CLLDELETE(doc);
+    }
+};
+
+static std::shared_ptr<lucene::index::IndexReader> make_shared_reader(
+        lucene::index::IndexReader* raw_reader) {
+    return {raw_reader, [](lucene::index::IndexReader* reader) {
+                if (reader != nullptr) {
+                    reader->close();
+                    _CLDELETE(reader);
+                }
+            }};
+}
+
+TEST_F(OccurBooleanQueryRealIndexTest, NotPhraseQuery) {
+    auto context = std::make_shared<IndexQueryContext>();
+    context->collection_statistics = std::make_shared<CollectionStatistics>();
+    context->collection_similarity = std::make_shared<CollectionSimilarity>();
+
+    auto* dir = FSDirectory::getDirectory(kTestDir.c_str());
+    auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true));
+    ASSERT_TRUE(reader_holder != nullptr);
+
+    std::wstring field = StringHelper::to_wstring("title");
+
+    std::vector<std::wstring> phrase_terms = {StringHelper::to_wstring("apple"),
+                                              StringHelper::to_wstring("banana")};
+    std::vector<TermInfo> term_infos;
+    term_infos.reserve(phrase_terms.size());
+    for (size_t i = 0; i < phrase_terms.size(); ++i) {
+        TermInfo term_info;
+        term_info.term = StringHelper::to_string(phrase_terms[i]);
+        term_info.position = static_cast<int32_t>(i);
+        term_infos.push_back(term_info);
+    }
+
+    auto phrase_query = std::make_shared<PhraseQuery>(context, field, term_infos);
+
+    uint32_t max_doc = reader_holder->maxDoc();
+    auto all_query = std::make_shared<AllQuery>(max_doc);
+
+    std::vector<std::pair<Occur, QueryPtr>> clauses;
+    clauses.emplace_back(Occur::SHOULD, all_query);
+    clauses.emplace_back(Occur::MUST_NOT, phrase_query);
+
+    OccurBooleanQuery boolean_query(std::move(clauses));
+    auto weight = boolean_query.weight(false);
+
+    QueryExecutionContext exec_ctx;
+    exec_ctx.segment_num_rows = reader_holder->maxDoc();
+    exec_ctx.readers = {reader_holder};
+    exec_ctx.field_reader_bindings.emplace(field, reader_holder);
+
+    auto scorer = weight->scorer(exec_ctx);
+    ASSERT_NE(scorer, nullptr);
+
+    std::vector<uint32_t> matched_docs;
+    uint32_t doc = scorer->doc();
+    while (doc != TERMINATED) {
+        matched_docs.push_back(doc);
+        doc = scorer->advance();
+    }
+
+    EXPECT_EQ(matched_docs.size(), 6);
+
+    EXPECT_TRUE(std::find(matched_docs.begin(), matched_docs.end(), 0) == matched_docs.end())
+            << "Doc 0 should be excluded (contains 'apple banana')";
+    EXPECT_TRUE(std::find(matched_docs.begin(), matched_docs.end(), 1) == matched_docs.end())
+            << "Doc 1 should be excluded (contains 'apple banana')";
+
+    for (uint32_t expected_id : {2, 3, 4, 5, 6, 7}) {
+        EXPECT_TRUE(std::find(matched_docs.begin(), matched_docs.end(), expected_id) !=
+                    matched_docs.end())
+                << "Doc " << expected_id << " should be included";
+    }
+
+    _CLDECDELETE(dir);
+}
+
+TEST_F(OccurBooleanQueryRealIndexTest, PhraseQueryOnly) {
+    auto context = std::make_shared<IndexQueryContext>();
+    context->collection_statistics = std::make_shared<CollectionStatistics>();
+    context->collection_similarity = std::make_shared<CollectionSimilarity>();
+
+    auto* dir = FSDirectory::getDirectory(kTestDir.c_str());
+    auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true));
+    ASSERT_TRUE(reader_holder != nullptr);
+
+    std::wstring field = StringHelper::to_wstring("title");
+
+    std::vector<std::wstring> phrase_terms = {StringHelper::to_wstring("apple"),
+                                              StringHelper::to_wstring("banana")};
+    std::vector<TermInfo> term_infos;
+    term_infos.reserve(phrase_terms.size());
+    for (size_t i = 0; i < phrase_terms.size(); ++i) {
+        TermInfo term_info;
+        term_info.term = StringHelper::to_string(phrase_terms[i]);
+        term_info.position = static_cast<int32_t>(i);
+        term_infos.push_back(term_info);
+    }
+
+    auto phrase_query = std::make_shared<PhraseQuery>(context, field, term_infos);
+    auto weight = phrase_query->weight(false);
+
+    QueryExecutionContext exec_ctx;
+    exec_ctx.segment_num_rows = reader_holder->maxDoc();
+    exec_ctx.readers = {reader_holder};
+    exec_ctx.field_reader_bindings.emplace(field, reader_holder);
+
+    auto scorer = weight->scorer(exec_ctx);
+    ASSERT_NE(scorer, nullptr);
+
+    std::vector<uint32_t> matched_docs;
+    uint32_t doc = scorer->doc();
+    while (doc != TERMINATED) {
+        matched_docs.push_back(doc);
+        std::cout << "Phrase query matched doc: " << doc << std::endl;
+        doc = scorer->advance();
+    }
+
+    std::cout << "Total matched docs: " << matched_docs.size() << std::endl;
+
+    EXPECT_EQ(matched_docs.size(), 2);
+    if (matched_docs.size() >= 1) {
+        EXPECT_EQ(matched_docs[0], 0);
+    }
+    if (matched_docs.size() >= 2) {
+        EXPECT_EQ(matched_docs[1], 1);
+    }
+
+    _CLDECDELETE(dir);
+}
+
+TEST_F(OccurBooleanQueryRealIndexTest, NotPhraseQueryNonExistent) {
+    auto context = std::make_shared<IndexQueryContext>();
+    context->collection_statistics = std::make_shared<CollectionStatistics>();
+    context->collection_similarity = std::make_shared<CollectionSimilarity>();
+
+    auto* dir = FSDirectory::getDirectory(kTestDir.c_str());
+    auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true));
+    ASSERT_TRUE(reader_holder != nullptr);
+
+    std::wstring field = StringHelper::to_wstring("title");
+
+    std::vector<std::wstring> phrase_terms = {StringHelper::to_wstring("nonexistent"),
+                                              StringHelper::to_wstring("phrase")};
+    std::vector<TermInfo> term_infos;
+    term_infos.reserve(phrase_terms.size());
+    for (size_t i = 0; i < phrase_terms.size(); ++i) {
+        TermInfo term_info;
+        term_info.term = StringHelper::to_string(phrase_terms[i]);
+        term_info.position = static_cast<int32_t>(i);
+        term_infos.push_back(term_info);
+    }
+
+    auto phrase_query = std::make_shared<PhraseQuery>(context, field, term_infos);
+
+    uint32_t max_doc = reader_holder->maxDoc();
+    auto all_query = std::make_shared<AllQuery>(max_doc);
+
+    std::vector<std::pair<Occur, QueryPtr>> clauses;
+    clauses.emplace_back(Occur::SHOULD, all_query);
+    clauses.emplace_back(Occur::MUST_NOT, phrase_query);
+
+    OccurBooleanQuery boolean_query(std::move(clauses));
+    auto weight = boolean_query.weight(false);
+
+    QueryExecutionContext exec_ctx;
+    exec_ctx.segment_num_rows = reader_holder->maxDoc();
+    exec_ctx.readers = {reader_holder};
+    exec_ctx.field_reader_bindings.emplace(field, reader_holder);
+
+    auto scorer = weight->scorer(exec_ctx);
+    ASSERT_NE(scorer, nullptr);
+
+    std::vector<uint32_t> matched_docs;
+    uint32_t doc = scorer->doc();
+    while (doc != TERMINATED) {
+        matched_docs.push_back(doc);
+        doc = scorer->advance();
+    }
+
+    EXPECT_EQ(matched_docs.size(), 8);
+
+    _CLDECDELETE(dir);
+}
+
+TEST_F(OccurBooleanQueryRealIndexTest, NotPhraseQueryExcludesPartial) {
+    auto context = std::make_shared<IndexQueryContext>();
+    context->collection_statistics = std::make_shared<CollectionStatistics>();
+    context->collection_similarity = std::make_shared<CollectionSimilarity>();
+
+    auto* dir = FSDirectory::getDirectory(kTestDir.c_str());
+    auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true));
+    ASSERT_TRUE(reader_holder != nullptr);
+
+    std::wstring field = StringHelper::to_wstring("title");
+
+    std::vector<std::wstring> phrase_terms = {StringHelper::to_wstring("banana"),
+                                              StringHelper::to_wstring("cherry")};
+    std::vector<TermInfo> term_infos;
+    term_infos.reserve(phrase_terms.size());
+    for (size_t i = 0; i < phrase_terms.size(); ++i) {
+        TermInfo term_info;
+        term_info.term = StringHelper::to_string(phrase_terms[i]);
+        term_info.position = static_cast<int32_t>(i);
+        term_infos.push_back(term_info);
+    }
+
+    auto phrase_query = std::make_shared<PhraseQuery>(context, field, term_infos);
+
+    uint32_t max_doc = reader_holder->maxDoc();
+    auto all_query = std::make_shared<AllQuery>(max_doc);
+
+    std::vector<std::pair<Occur, QueryPtr>> clauses;
+    clauses.emplace_back(Occur::SHOULD, all_query);
+    clauses.emplace_back(Occur::MUST_NOT, phrase_query);
+
+    OccurBooleanQuery boolean_query(std::move(clauses));
+    auto weight = boolean_query.weight(false);
+
+    QueryExecutionContext exec_ctx;
+    exec_ctx.segment_num_rows = reader_holder->maxDoc();
+    exec_ctx.readers = {reader_holder};
+    exec_ctx.field_reader_bindings.emplace(field, reader_holder);
+
+    auto scorer = weight->scorer(exec_ctx);
+    ASSERT_NE(scorer, nullptr);
+
+    std::vector<uint32_t> matched_docs;
+    uint32_t doc = scorer->doc();
+    while (doc != TERMINATED) {
+        matched_docs.push_back(doc);
+        doc = scorer->advance();
+    }
+
+    EXPECT_EQ(matched_docs.size(), 6);
+
+    EXPECT_TRUE(std::find(matched_docs.begin(), matched_docs.end(), 0) == matched_docs.end())
+            << "Doc 0 should be excluded (contains 'banana cherry')";
+    EXPECT_TRUE(std::find(matched_docs.begin(), matched_docs.end(), 3) == matched_docs.end())
+            << "Doc 3 should be excluded (contains 'banana cherry')";
+
+    _CLDECDELETE(dir);
+}
+
+} // namespace doris::segment_v2
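
The guard added to PhraseScorer::seek() in the diff above can be illustrated in isolation. Below is a minimal, self-contained C++ sketch, not the Doris implementation; ToyPhraseScorer and all of its members are hypothetical stand-ins that only mimic the constraint described in the new comment: the position list for a doc can be consumed once, so a repeated seek() to the current doc must return early instead of re-running phrase_match().

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

static constexpr uint32_t TERMINATED = UINT32_MAX;

struct ToyPhraseScorer {
    uint32_t current_doc = 5;              // pretend the scorer is positioned on doc 5
    std::vector<uint32_t> positions{3, 4}; // phrase positions recorded for doc 5
    bool positions_consumed = false;       // postings hand out positions only once

    uint32_t doc() const { return current_doc; }

    // Stand-in for phrase_match(): it consumes the position list, so a second
    // call for the same doc would see nothing and wrongly report "no match".
    bool phrase_match() {
        if (positions_consumed) {
            return false;
        }
        positions_consumed = true;
        return !positions.empty();
    }

    uint32_t seek(uint32_t target) {
        assert(target >= doc());
        // The guard from the fix: seeking to the doc we are already on returns
        // immediately instead of re-running phrase_match() and re-reading positions.
        if (target <= doc()) {
            return doc();
        }
        // A real scorer would advance the underlying docset here; this sketch
        // holds a single doc, so any larger target simply terminates.
        return TERMINATED;
    }
};

int main() {
    ToyPhraseScorer scorer;
    bool first_match = scorer.phrase_match(); // first (and only) read of positions for doc 5
    uint32_t same_doc = scorer.seek(5);       // repeated seek to the current doc: early return
    std::cout << "first match: " << first_match << ", seek(5) -> " << same_doc << "\n";
    return 0;
}

The new test file exercises this path through OccurBooleanQuery: with a MUST_NOT phrase clause, the exclusion check can seek the phrase scorer to a doc it has already matched, which is where the early return matters.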


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
