This is an automated email from the ASF dual-hosted git repository.

yangsiyu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 0eaac8c104f [fix](inverted index) implement phrase-level BM25 scoring with phrase frequency (#60331)
0eaac8c104f is described below

commit 0eaac8c104f41700767201fc3fbca33a34600f8e
Author: zzzxl <[email protected]>
AuthorDate: Mon Mar 2 16:34:28 2026 +0800

    [fix](inverted index) implement phrase-level BM25 scoring with phrase frequency (#60331)
---
 .../inverted_index/query/phrase_query.cpp          |  52 +-
 .../segment_v2/inverted_index/query/phrase_query.h |   3 +-
 .../query/phrase_query/exact_phrase_matcher.cpp    |   9 +
 .../query/phrase_query/exact_phrase_matcher.h      |   1 +
 .../phrase_query/ordered_sloppy_phrase_matcher.cpp |  13 +
 .../phrase_query/ordered_sloppy_phrase_matcher.h   |   2 +
 .../query/phrase_query/sloppy_phrase_matcher.cpp   |  13 +
 .../query/phrase_query/sloppy_phrase_matcher.h     |   2 +
 .../inverted_index/util/docid_set_iterator.h       |   7 +
 .../segment_v2/inverted_index/util/mock_iterator.h |   2 +
 .../inverted_index/util/union_term_iterator.h      |   6 +-
 .../query/phrase_query/phrase_freq_test.cpp        | 559 +++++++++++++++++++++
 12 files changed, 642 insertions(+), 27 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
index 9494a3571d2..0e9cbd1fb8c 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
@@ -24,7 +24,6 @@
 #include "CLucene/index/Terms.h"
 #include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
 #include "olap/rowset/segment_v2/inverted_index/query/query.h"
-#include "olap/rowset/segment_v2/inverted_index/query/query_helper.h"
 #include "olap/rowset/segment_v2/inverted_index/util/term_position_iterator.h"
 
 namespace doris::segment_v2 {
@@ -138,29 +137,16 @@ void PhraseQuery::init_ordered_sloppy_phrase_matcher(const InvertedIndexQueryInf
 }
 
 void PhraseQuery::init_similarities(const std::wstring& field_name, bool is_similarity) {
-    // TODO: Current implementation - computes BM25 scores separately for each term
-    // Note: This approach is suitable for TermQuery but does not conform to BM25 specification for PhraseQuery
-    // BM25 phrase query specification requires:
-    //   idf component = sum of idf values for all terms
-    //   tf component = phrase frequency (number of times entire phrase appears in document)
-    //   doc_length = total document length
-    //
-    // Future optimization direction:
-    //   1. Shift to unified phrase scoring: calculate sum of idf for all terms as combined idf
-    //   2. Use phrase frequency instead of individual term frequencies
-    //   3. Maintain document length normalization
-    //   4. Refactor to create a single Similarity object handling the entire phrase
     if (is_similarity) {
-        _similarities.resize(_iterators.size());
-        for (size_t i = 0; i < _iterators.size(); i++) {
-            const auto& iter = _iterators[i];
+        std::vector<std::wstring> all_terms;
+        for (const auto& iter : _iterators) {
             if (std::holds_alternative<TermPositionsIterPtr>(iter)) {
                 const auto& term_iter = std::get<TermPositionsIterPtr>(iter);
-                auto similarity = std::make_unique<BM25Similarity>();
-                similarity->for_one_term(_context, field_name, term_iter->term());
-                _similarities[i] = std::move(similarity);
+                all_terms.push_back(term_iter->term());
             }
         }
+        _phrase_similarity = std::make_unique<BM25Similarity>();
+        _phrase_similarity->for_terms(_context, field_name, all_terms);
     }
 }
 
@@ -176,13 +162,21 @@ void PhraseQuery::search(roaring::Roaring& roaring) {
 void PhraseQuery::search_by_skiplist(roaring::Roaring& roaring) {
     int32_t doc = 0;
     while ((doc = do_next(visit_node(*_lead1, NextDoc {}))) != INT32_MAX) {
-        if (!matches(doc)) {
-            continue;
-        }
-        roaring.add(doc);
+        if (_phrase_similarity) {
+            float phrase_freq = count_phrase_freq(doc);
+            if (phrase_freq <= 0.0F) {
+                continue;
+            }
+            roaring.add(doc);
+            int32_t norm = visit_node(*_lead1, Norm {});
+            float score = _phrase_similarity->score(phrase_freq, static_cast<int64_t>(norm));
 
-        if (!_similarities.empty()) {
-            QueryHelper::collect(_context, _similarities, _iterators, doc);
+            _context->collection_similarity->collect(doc, score);
+        } else {
+            if (!matches(doc)) {
+                continue;
+            }
+            roaring.add(doc);
         }
     }
 }
@@ -230,6 +224,14 @@ bool PhraseQuery::matches(int32_t doc) {
     });
 }
 
+float PhraseQuery::count_phrase_freq(int32_t doc) {
+    float total_freq = 0.0F;
+    for (auto& matcher : _matchers) {
+        total_freq += std::visit([&doc](auto&& m) -> float { return m.phrase_freq(doc); }, matcher);
+    }
+    return total_freq;
+}
+
 void PhraseQuery::parser_slop(std::string& query, InvertedIndexQueryInfo& query_info) {
     auto is_digits = [](const std::string_view& str) {
         return std::all_of(str.begin(), str.end(), [](unsigned char c) { return std::isdigit(c); });
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
index 2251fce7740..aaf21c8b199 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
@@ -51,6 +51,7 @@ private:
 
     int32_t do_next(int32_t doc);
     bool matches(int32_t doc);
+    float count_phrase_freq(int32_t doc);
 
     void init_exact_phrase_matcher(const InvertedIndexQueryInfo& query_info, bool is_similarity);
     void init_sloppy_phrase_matcher(const InvertedIndexQueryInfo& query_info, bool is_similarity);
@@ -78,7 +79,7 @@ private:
 
     std::vector<Matcher> _matchers;
 
-    std::vector<SimilarityPtr> _similarities;
+    SimilarityPtr _phrase_similarity;
 };
 
 } // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.cpp
index e88f2d93652..db4ec1708dd 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.cpp
@@ -89,4 +89,13 @@ bool ExactPhraseMatcher::advance_position(PostingsAndPosition& posting, int32_t
     return true;
 }
 
+float ExactPhraseMatcher::phrase_freq(int32_t doc) {
+    reset(doc);
+    float freq = 0.0F;
+    while (next_match()) {
+        freq += 1.0F;
+    }
+    return freq;
+}
+
 } // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.h
index c6e6c631ab4..b1e5c16b59e 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.h
@@ -27,6 +27,7 @@ public:
 
     void reset(int32_t doc);
     bool next_match();
+    float phrase_freq(int32_t doc);
 
 private:
     bool advance_position(PostingsAndPosition& posting, int32_t target);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.cpp
index 64cbd855cc9..97a03fc8af7 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.cpp
@@ -81,4 +81,17 @@ bool OrderedSloppyPhraseMatcher::advance_position(PostingsAndPosition& posting,
     return true;
 }
 
+float OrderedSloppyPhraseMatcher::sloppy_weight() const {
+    return 1.0F / (1.0F + static_cast<float>(_match_width));
+}
+
+float OrderedSloppyPhraseMatcher::phrase_freq(int32_t doc) {
+    reset(doc);
+    float freq = 0.0F;
+    while (next_match()) {
+        freq += sloppy_weight();
+    }
+    return freq;
+}
+
 } // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.h
index 7ac8b748890..7573d5cae59 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.h
@@ -27,6 +27,8 @@ public:
 
     void reset(int32_t doc);
     bool next_match();
+    float sloppy_weight() const;
+    float phrase_freq(int32_t doc);
 
 private:
     bool stretch_to_order(PostingsAndPosition* prev_posting);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.cpp
index 342e04b1490..e98ad8b9f90 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.cpp
@@ -315,5 +315,18 @@ bool SloppyPhraseMatcher::init_complex() {
     return true;
 }
 
+float SloppyPhraseMatcher::sloppy_weight() const {
+    return 1.0F / (1.0F + static_cast<float>(_match_length));
+}
+
+float SloppyPhraseMatcher::phrase_freq(int32_t doc) {
+    reset(doc);
+    float freq = 0.0F;
+    while (next_match()) {
+        freq += sloppy_weight();
+    }
+    return freq;
+}
+
 #include "common/compile_check_end.h"
 } // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.h
index d008389d6fc..7e2c360692a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.h
@@ -32,6 +32,8 @@ public:
     void reset(int32_t doc);
     bool next_match();
     bool advance_rpts(PhrasePositions* pp);
+    float sloppy_weight() const;
+    float phrase_freq(int32_t doc);
 
 private:
     bool advance_pp(PhrasePositions* pp);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/util/docid_set_iterator.h b/be/src/olap/rowset/segment_v2/inverted_index/util/docid_set_iterator.h
index 7ebda6cbc82..26fbd129d73 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/util/docid_set_iterator.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/util/docid_set_iterator.h
@@ -79,5 +79,12 @@ struct NextPosition {
     }
 };
 
+struct Norm {
+    template <typename T>
+    int32_t operator()(const T& iter) const {
+        return iter->norm();
+    }
+};
+
 #include "common/compile_check_end.h"
 } // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/util/mock_iterator.h b/be/src/olap/rowset/segment_v2/inverted_index/util/mock_iterator.h
index 5e3540b7803..aacfb29ffc1 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/util/mock_iterator.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/util/mock_iterator.h
@@ -89,6 +89,8 @@ public:
         return current_doc->second[pos_idx++];
     }
 
+    int32_t norm() const MOCK_DEFINE(override) { return 1; }
+
     bool read_range(DocRange* doc_range) const MOCK_DEFINE(override) {
         if (!doc_range) {
             return false;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/util/union_term_iterator.h b/be/src/olap/rowset/segment_v2/inverted_index/util/union_term_iterator.h
index 0ed9ec2f539..73ecc87268d 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/util/union_term_iterator.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/util/union_term_iterator.h
@@ -129,7 +129,6 @@ public:
             top = _docs_queue->update_top();
         } while (top->doc_id() == doc);
         return top->doc_id();
-        return 0;
     }
 
     int32_t advance(int32_t target) const {
@@ -143,6 +142,11 @@ public:
 
     int32_t doc_freq() const { return _cost; }
 
+    int32_t norm() const {
+        throw Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
+                        "UnionTermIterator does not support scoring");
+    }
+
 private:
     int32_t _cost = 0;
     int32_t pos_queue_doc = -2;
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query/phrase_freq_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query/phrase_freq_test.cpp
new file mode 100644
index 00000000000..265cc0fef83
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query/phrase_freq_test.cpp
@@ -0,0 +1,559 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.h"
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.h"
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.h"
+#include "olap/rowset/segment_v2/inverted_index/util/docid_set_iterator.h"
+#include "olap/rowset/segment_v2/inverted_index/util/mock_iterator.h"
+#include "olap/rowset/segment_v2/inverted_index/util/union_term_iterator.h"
+
+namespace doris::segment_v2 {
+
+using namespace inverted_index;
+
+class PhraseFreqTest : public ::testing::Test {
+protected:
+    DISI create_mock_disi(std::map<int32_t, std::vector<int32_t>> postings) {
+        auto mock = std::make_shared<MockIterator>();
+        mock->set_postings(postings);
+        return mock;
+    }
+};
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_SingleMatch) {
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {1}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    ExactPhraseMatcher matcher(std::move(postings));
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_MultipleMatches) {
+    auto disi1 = create_mock_disi({{1, {0, 2}}});
+    auto disi2 = create_mock_disi({{1, {1, 3}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    ExactPhraseMatcher matcher(std::move(postings));
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 2.0F);
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_NoMatch) {
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {2}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    ExactPhraseMatcher matcher(std::move(postings));
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_ThreeTermPhrase) {
+    auto disi1 = create_mock_disi({{1, {1}}});
+    auto disi2 = create_mock_disi({{1, {2}}});
+    auto disi3 = create_mock_disi({{1, {3}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+    postings.emplace_back(disi3, 2);
+
+    ExactPhraseMatcher matcher(std::move(postings));
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_ExactMatch) {
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {1}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(postings), 2);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_WithSlop) {
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {2}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(postings), 2);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_GT(freq, 0.0F);
+    EXPECT_LE(freq, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_SloppyWeight_AfterMatch) {
+    // Test sloppy_weight after next_match() returns true
+    // Positions: term1 at 0, term2 at 1 (exact match, match_width = 0)
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {1}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(postings), 5);
+    matcher.reset(1);
+    ASSERT_TRUE(matcher.next_match());
+
+    // match_width = 0 for exact match, so weight = 1/(1+0) = 1.0
+    float weight = matcher.sloppy_weight();
+    EXPECT_FLOAT_EQ(weight, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_SloppyWeight_WithGap) {
+    // Test sloppy_weight with gap between terms
+    // Positions: term1 at 0, term2 at 3 (gap of 2, match_width = 2)
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {3}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(postings), 5);
+    matcher.reset(1);
+    ASSERT_TRUE(matcher.next_match());
+
+    // match_width = 2, so weight = 1/(1+2) = 0.333...
+    float weight = matcher.sloppy_weight();
+    EXPECT_FLOAT_EQ(weight, 1.0F / 3.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_ExceedsSlop) {
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {5}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(postings), 2);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_ExactMatch) {
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {1}}});
+
+    std::vector<PostingsAndFreq> postings;
+    postings.emplace_back(disi1, 0, std::vector<std::string> {"big"});
+    postings.emplace_back(disi2, 1, std::vector<std::string> {"red"});
+
+    SloppyPhraseMatcher matcher(postings, 2);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_GT(freq, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_SloppyWeight_AfterMatch) {
+    // Test sloppy_weight after next_match() returns true
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {1}}});
+
+    std::vector<PostingsAndFreq> postings;
+    postings.emplace_back(disi1, 0, std::vector<std::string> {"big"});
+    postings.emplace_back(disi2, 1, std::vector<std::string> {"red"});
+
+    SloppyPhraseMatcher matcher(postings, 5);
+    matcher.reset(1);
+    ASSERT_TRUE(matcher.next_match());
+
+    float weight = matcher.sloppy_weight();
+    EXPECT_GT(weight, 0.0F);
+    EXPECT_LE(weight, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_ReorderedTerms) {
+    auto disi1 = create_mock_disi({{1, {1}}});
+    auto disi2 = create_mock_disi({{1, {0}}});
+
+    std::vector<PostingsAndFreq> postings;
+    postings.emplace_back(disi1, 0, std::vector<std::string> {"big"});
+    postings.emplace_back(disi2, 1, std::vector<std::string> {"red"});
+
+    SloppyPhraseMatcher matcher(postings, 3);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_GE(freq, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, NormVisitor_MockIterator) {
+    auto mock = std::make_shared<MockIterator>();
+    mock->set_postings({{1, {0, 1, 2}}});
+
+    int32_t norm = mock->norm();
+    EXPECT_EQ(norm, 1);
+}
+
+TEST_F(PhraseFreqTest, NormVisitor_WithDISI) {
+    auto disi = create_mock_disi({{1, {0}}});
+
+    int32_t norm = visit_node(disi, Norm {});
+    EXPECT_EQ(norm, 1);
+}
+
+TEST_F(PhraseFreqTest, UnionTermIterator_NormThrowsException) {
+    auto mock1 = std::make_shared<MockIterator>();
+    mock1->set_postings({{1, {0}}});
+    auto mock2 = std::make_shared<MockIterator>();
+    mock2->set_postings({{1, {1}}});
+
+    std::vector<std::shared_ptr<MockIterator>> subs = {mock1, mock2};
+    UnionTermIterator<MockIterator> union_iter(subs);
+
+    EXPECT_THROW(union_iter.norm(), Exception);
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_MultipleDocuments) {
+    {
+        auto d1 = create_mock_disi({{1, {0}}});
+        auto d2 = create_mock_disi({{1, {1}}});
+        std::vector<PostingsAndPosition> postings;
+        postings.emplace_back(d1, 0);
+        postings.emplace_back(d2, 1);
+        ExactPhraseMatcher matcher(std::move(postings));
+        EXPECT_FLOAT_EQ(matcher.phrase_freq(1), 1.0F);
+    }
+
+    {
+        auto d1 = create_mock_disi({{3, {0, 2}}});
+        auto d2 = create_mock_disi({{3, {1, 3}}});
+        std::vector<PostingsAndPosition> postings;
+        postings.emplace_back(d1, 0);
+        postings.emplace_back(d2, 1);
+        ExactPhraseMatcher matcher(std::move(postings));
+        EXPECT_FLOAT_EQ(matcher.phrase_freq(3), 2.0F);
+    }
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_MultipleMatches) {
+    auto disi1 = create_mock_disi({{1, {0, 3}}});
+    auto disi2 = create_mock_disi({{1, {2, 5}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(postings), 3);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_GT(freq, 0.0F);
+}
+
+// ==================== Additional Coverage Tests ====================
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_FourTermPhrase) {
+    // Test longer phrase matching
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {1}}});
+    auto disi3 = create_mock_disi({{1, {2}}});
+    auto disi4 = create_mock_disi({{1, {3}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+    postings.emplace_back(disi3, 2);
+    postings.emplace_back(disi4, 3);
+
+    ExactPhraseMatcher matcher(std::move(postings));
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_PartialMatch) {
+    // First two terms match but third doesn't
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {1}}});
+    auto disi3 = create_mock_disi({{1, {5}}}); // Gap breaks the phrase
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+    postings.emplace_back(disi3, 2);
+
+    ExactPhraseMatcher matcher(std::move(postings));
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_OverlappingMatches) {
+    // Positions that could form overlapping phrases: "a b a b"
+    // Phrase "a b" appears at positions (0,1) and (2,3)
+    auto disi1 = create_mock_disi({{1, {0, 2}}});
+    auto disi2 = create_mock_disi({{1, {1, 3}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    ExactPhraseMatcher matcher(std::move(postings));
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 2.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_ThreeTerms) {
+    // Three term phrase with slop
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {2}}});
+    auto disi3 = create_mock_disi({{1, {4}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+    postings.emplace_back(disi3, 2);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(postings), 3);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_GT(freq, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_ThreeTerms_ExceedsSlop) {
+    // Three term phrase exceeds slop
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {5}}});
+    auto disi3 = create_mock_disi({{1, {10}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+    postings.emplace_back(disi3, 2);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(postings), 2);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_PhraseFreqAccumulation) {
+    // Multiple matches with different sloppy weights
+    // Match 1: positions 0, 1 (match_width=0, weight=1.0)
+    // Match 2: positions 3, 5 (match_width=1, weight=0.5)
+    auto disi1 = create_mock_disi({{1, {0, 3}}});
+    auto disi2 = create_mock_disi({{1, {1, 5}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(postings), 5);
+    float freq = matcher.phrase_freq(1);
+
+    // Expected: 1.0 + 0.5 = 1.5
+    EXPECT_FLOAT_EQ(freq, 1.5F);
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_NoMatch) {
+    // Terms too far apart
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {100}}});
+
+    std::vector<PostingsAndFreq> postings;
+    postings.emplace_back(disi1, 0, std::vector<std::string> {"hello"});
+    postings.emplace_back(disi2, 1, std::vector<std::string> {"world"});
+
+    SloppyPhraseMatcher matcher(postings, 2);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_ThreeTerms) {
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {1}}});
+    auto disi3 = create_mock_disi({{1, {2}}});
+
+    std::vector<PostingsAndFreq> postings;
+    postings.emplace_back(disi1, 0, std::vector<std::string> {"the"});
+    postings.emplace_back(disi2, 1, std::vector<std::string> {"quick"});
+    postings.emplace_back(disi3, 2, std::vector<std::string> {"fox"});
+
+    SloppyPhraseMatcher matcher(postings, 3);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_GT(freq, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_MultipleMatches) {
+    // Document with phrase appearing twice
+    auto disi1 = create_mock_disi({{1, {0, 5}}});
+    auto disi2 = create_mock_disi({{1, {1, 6}}});
+
+    std::vector<PostingsAndFreq> postings;
+    postings.emplace_back(disi1, 0, std::vector<std::string> {"hello"});
+    postings.emplace_back(disi2, 1, std::vector<std::string> {"world"});
+
+    SloppyPhraseMatcher matcher(postings, 2);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_GT(freq, 1.0F); // Should have accumulated weight from multiple matches
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_Matches_Consistency) {
+    // Verify matches() and phrase_freq() are consistent
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {1}}});
+
+    {
+        std::vector<PostingsAndPosition> postings;
+        postings.emplace_back(disi1, 0);
+        postings.emplace_back(disi2, 1);
+        ExactPhraseMatcher matcher(std::move(postings));
+        EXPECT_TRUE(matcher.matches(1));
+    }
+
+    // Reset iterators
+    disi1 = create_mock_disi({{1, {0}}});
+    disi2 = create_mock_disi({{1, {1}}});
+
+    {
+        std::vector<PostingsAndPosition> postings;
+        postings.emplace_back(disi1, 0);
+        postings.emplace_back(disi2, 1);
+        ExactPhraseMatcher matcher(std::move(postings));
+        EXPECT_GT(matcher.phrase_freq(1), 0.0F);
+    }
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_Matches_Consistency) {
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {2}}});
+
+    {
+        std::vector<PostingsAndPosition> postings;
+        postings.emplace_back(disi1, 0);
+        postings.emplace_back(disi2, 1);
+        OrderedSloppyPhraseMatcher matcher(std::move(postings), 3);
+        EXPECT_TRUE(matcher.matches(1));
+    }
+
+    disi1 = create_mock_disi({{1, {0}}});
+    disi2 = create_mock_disi({{1, {2}}});
+
+    {
+        std::vector<PostingsAndPosition> postings;
+        postings.emplace_back(disi1, 0);
+        postings.emplace_back(disi2, 1);
+        OrderedSloppyPhraseMatcher matcher(std::move(postings), 3);
+        EXPECT_GT(matcher.phrase_freq(1), 0.0F);
+    }
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_Matches_Consistency) {
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {1}}});
+
+    {
+        std::vector<PostingsAndFreq> postings;
+        postings.emplace_back(disi1, 0, std::vector<std::string> {"a"});
+        postings.emplace_back(disi2, 1, std::vector<std::string> {"b"});
+        SloppyPhraseMatcher matcher(postings, 2);
+        EXPECT_TRUE(matcher.matches(1));
+    }
+
+    disi1 = create_mock_disi({{1, {0}}});
+    disi2 = create_mock_disi({{1, {1}}});
+
+    {
+        std::vector<PostingsAndFreq> postings;
+        postings.emplace_back(disi1, 0, std::vector<std::string> {"a"});
+        postings.emplace_back(disi2, 1, std::vector<std::string> {"b"});
+        SloppyPhraseMatcher matcher(postings, 2);
+        EXPECT_GT(matcher.phrase_freq(1), 0.0F);
+    }
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_HighFrequencyTerms) {
+    // Test with terms appearing many times
+    auto disi1 = create_mock_disi({{1, {0, 2, 4, 6, 8}}});
+    auto disi2 = create_mock_disi({{1, {1, 3, 5, 7, 9}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    ExactPhraseMatcher matcher(std::move(postings));
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 5.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_ZeroSlop) {
+    // Zero slop should behave like exact match
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {1}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(postings), 0);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_ZeroSlop_NoMatch) {
+    // Zero slop with gap should not match
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {2}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(postings), 0);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 0.0F);
+}
+
+} // namespace doris::segment_v2


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to