This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 441d45f7490 [fix](inverted index) fix boolean query for NOT operator
(#56329)
441d45f7490 is described below
commit 441d45f749071012e3540b3f6e287e61649b03d6
Author: Jack <[email protected]>
AuthorDate: Tue Sep 23 17:39:50 2025 +0800
[fix](inverted index) fix boolean query for NOT operator (#56329)
Problem Summary:
This PR implements support for NOT operations in the boolean query
system for inverted index queries. The changes add proper handling of
negation operations that were previously missing or incomplete.
Adds MatchAllDocsScorer class to handle matching all documents in an
index
Implements AndNotScorer for excluding documents based on exclusion
criteria
Updates BooleanWeight to properly handle NOT operations and combinations
with AND/OR
---
.../query_v2/boolean_query/boolean_weight.h | 83 ++++++++++++++++--
.../inverted_index/query_v2/composite_reader.h | 25 +++++-
.../query_v2/intersection_scorer.cpp | 75 ++++++++++++++++-
.../inverted_index/query_v2/intersection_scorer.h | 21 ++++-
.../query_v2/match_all_docs_scorer.h | 98 ++++++++++++++++++++++
.../inverted_index/query_v2/boolean_query_test.cpp | 78 +++++++++++++++++
6 files changed, 370 insertions(+), 10 deletions(-)
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_weight.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_weight.h
index 7437757a972..9f9290267da 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_weight.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_weight.h
@@ -17,10 +17,12 @@
#pragma once
+#include <utility>
#include <vector>
#include
"olap/rowset/segment_v2/inverted_index/query_v2/buffered_union_scorer.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/intersection_scorer.h"
+#include
"olap/rowset/segment_v2/inverted_index/query_v2/match_all_docs_scorer.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/operator.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/weight.h"
@@ -37,20 +39,87 @@ public:
~BooleanWeight() override = default;
ScorerPtr scorer(const CompositeReaderPtr& composite_reader) override {
- std::vector<ScorerPtr> sub_scorers = per_scorers(composite_reader);
- if (_type == OperatorType::OP_AND) {
- return intersection_scorer_build(sub_scorers);
- } else if (_type == OperatorType::OP_OR) {
+ const auto make_empty = []() -> ScorerPtr { return
std::make_shared<EmptyScorer>(); };
+
+ auto collect_and_scorers = [&]() {
+ std::pair<std::vector<ScorerPtr>, std::vector<ScorerPtr>> result;
+ result.first.reserve(_sub_weights.size());
+ result.second.reserve(_sub_weights.size());
+
+ for (const auto& sub_weight : _sub_weights) {
+ auto boolean_weight =
+
std::dynamic_pointer_cast<BooleanWeight<ScoreCombinerPtrT>>(sub_weight);
+ if (boolean_weight != nullptr && boolean_weight->_type ==
OperatorType::OP_NOT) {
+ auto excludes =
boolean_weight->per_scorers(composite_reader);
+ for (auto& exclude : excludes) {
+ if (exclude != nullptr) {
+ result.second.emplace_back(std::move(exclude));
+ }
+ }
+ continue;
+ }
+
+ auto scorer = sub_weight->scorer(composite_reader);
+ if (scorer != nullptr) {
+ result.first.emplace_back(std::move(scorer));
+ }
+ }
+
+ return result;
+ };
+
+ switch (_type) {
+ case OperatorType::OP_AND: {
+ auto [include_scorers, exclude_scorers] = collect_and_scorers();
+ if (include_scorers.empty()) {
+ return make_empty();
+ }
+
+ auto intersection = intersection_scorer_build(include_scorers);
+ if (exclude_scorers.empty()) {
+ return intersection;
+ }
+
+ return std::make_shared<AndNotScorer>(std::move(intersection),
+ std::move(exclude_scorers));
+ }
+ case OperatorType::OP_NOT: {
+ uint32_t max_doc = composite_reader->max_doc();
+ if (max_doc == 0) {
+ return make_empty();
+ }
+ auto match_all =
+ std::make_shared<MatchAllDocsScorer>(max_doc,
composite_reader->readers());
+ if (_sub_weights.empty()) {
+ return match_all;
+ }
+ auto excludes = per_scorers(composite_reader);
+ if (excludes.empty()) {
+ return match_all;
+ }
+ return std::make_shared<AndNotScorer>(std::move(match_all),
std::move(excludes));
+ }
+ case OperatorType::OP_OR: {
+ auto sub_scorers = per_scorers(composite_reader);
+ if (sub_scorers.empty()) {
+ return make_empty();
+ }
return buffered_union_scorer_build<ScoreCombinerPtrT>(sub_scorers,
_score_combiner);
}
- return nullptr;
+ default:
+ return make_empty();
+ }
}
private:
std::vector<ScorerPtr> per_scorers(const CompositeReaderPtr&
composite_reader) {
std::vector<ScorerPtr> sub_scorers;
+ sub_scorers.reserve(_sub_weights.size());
for (const auto& sub_weight : _sub_weights) {
- sub_scorers.emplace_back(sub_weight->scorer(composite_reader));
+ auto scorer = sub_weight->scorer(composite_reader);
+ if (scorer != nullptr) {
+ sub_scorers.emplace_back(std::move(scorer));
+ }
}
return sub_scorers;
}
@@ -60,4 +129,4 @@ private:
ScoreCombinerPtrT _score_combiner;
};
-} // namespace doris::segment_v2::inverted_index::query_v2
\ No newline at end of file
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/composite_reader.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/composite_reader.h
index 67161d9910b..543f8211ff3 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/composite_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/composite_reader.h
@@ -20,7 +20,10 @@
#include <CLucene.h>
#include <CLucene/index/IndexReader.h>
+#include <algorithm>
#include <ranges>
+#include <unordered_map>
+#include <vector>
#include "common/exception.h"
#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h"
@@ -56,9 +59,29 @@ public:
}
}
+ [[nodiscard]] uint32_t max_doc() const {
+ if (_field_readers.empty()) {
+ throw Exception(ErrorCode::INDEX_INVALID_PARAMETERS,
"CompositeReader has no readers");
+ }
+ uint32_t max_doc = 0;
+ for (auto* reader : std::views::values(_field_readers)) {
+ max_doc = std::max(max_doc,
static_cast<uint32_t>(reader->maxDoc()));
+ }
+ return max_doc;
+ }
+
+ [[nodiscard]] std::vector<lucene::index::IndexReader*> readers() const {
+ std::vector<lucene::index::IndexReader*> readers;
+ readers.reserve(_field_readers.size());
+ for (auto* reader : std::views::values(_field_readers)) {
+ readers.push_back(reader);
+ }
+ return readers;
+ }
+
private:
std::unordered_map<std::wstring, lucene::index::IndexReader*>
_field_readers;
};
using CompositeReaderPtr = std::unique_ptr<CompositeReader>;
-} // namespace doris::segment_v2::inverted_index::query_v2
\ No newline at end of file
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/intersection_scorer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/intersection_scorer.cpp
index f5e75a1c77b..330192929b3 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/intersection_scorer.cpp
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/intersection_scorer.cpp
@@ -19,6 +19,7 @@
#include <algorithm>
#include <cassert>
+#include <utility>
#include
"olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_scorer.h"
@@ -90,6 +91,78 @@ ScorerPtr intersection_scorer_build(std::vector<ScorerPtr>&
scorers) {
std::move(others));
}
+AndNotScorer::AndNotScorer(ScorerPtr include, std::vector<ScorerPtr> excludes)
+ : _include(std::move(include)), _excludes(std::move(excludes)) {
+ if (_include != nullptr) {
+ _align(_include->doc());
+ }
+}
+
+uint32_t AndNotScorer::advance() {
+ if (_include == nullptr) {
+ return TERMINATED;
+ }
+ return _align(_include->advance());
+}
+
+uint32_t AndNotScorer::seek(uint32_t target) {
+ if (_include == nullptr) {
+ return TERMINATED;
+ }
+ return _align(_include->seek(target));
+}
+
+uint32_t AndNotScorer::doc() const {
+ if (_include == nullptr) {
+ return TERMINATED;
+ }
+ return _include->doc();
+}
+
+uint32_t AndNotScorer::size_hint() const {
+ if (_include == nullptr) {
+ return 0;
+ }
+ return _include->size_hint();
+}
+
+float AndNotScorer::score() {
+ if (_include == nullptr) {
+ return 0.0F;
+ }
+ return _include->score();
+}
+
+uint32_t AndNotScorer::_align(uint32_t candidate) {
+ if (_include == nullptr) {
+ return TERMINATED;
+ }
+
+ while (candidate != TERMINATED) {
+ bool excluded = false;
+ for (auto& scorer : _excludes) {
+ if (scorer == nullptr) {
+ continue;
+ }
+ uint32_t exclude_doc = scorer->doc();
+ if (exclude_doc < candidate) {
+ exclude_doc = scorer->seek(candidate);
+ }
+ if (exclude_doc == candidate) {
+ candidate = _include->advance();
+ excluded = true;
+ break;
+ }
+ }
+
+ if (!excluded) {
+ return candidate;
+ }
+ }
+
+ return TERMINATED;
+}
+
template <typename PivotScorerPtr>
IntersectionScorer<PivotScorerPtr>::IntersectionScorer(PivotScorerPtr left,
PivotScorerPtr right,
std::vector<ScorerPtr>
others)
@@ -155,4 +228,4 @@ float IntersectionScorer<PivotScorerPtr>::score() {
return total;
}
-} // namespace doris::segment_v2::inverted_index::query_v2
\ No newline at end of file
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/intersection_scorer.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/intersection_scorer.h
index 45b40ad9c33..ee11c6b0dbe 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/intersection_scorer.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/intersection_scorer.h
@@ -25,6 +25,25 @@ namespace doris::segment_v2::inverted_index::query_v2 {
ScorerPtr intersection_scorer_build(std::vector<ScorerPtr>& scorers);
+class AndNotScorer final : public Scorer {
+public:
+ AndNotScorer(ScorerPtr include, std::vector<ScorerPtr> excludes);
+ ~AndNotScorer() override = default;
+
+ uint32_t advance() override;
+ uint32_t seek(uint32_t target) override;
+ uint32_t doc() const override;
+ uint32_t size_hint() const override;
+
+ float score() override;
+
+private:
+ uint32_t _align(uint32_t candidate);
+
+ ScorerPtr _include;
+ std::vector<ScorerPtr> _excludes;
+};
+
template <typename PivotScorerPtr>
class IntersectionScorer final : public Scorer {
public:
@@ -44,4 +63,4 @@ private:
std::vector<ScorerPtr> _others;
};
-} // namespace doris::segment_v2::inverted_index::query_v2
\ No newline at end of file
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/match_all_docs_scorer.h
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/match_all_docs_scorer.h
new file mode 100644
index 00000000000..775cb9ef6fd
--- /dev/null
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/match_all_docs_scorer.h
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+#include "CLucene.h" // IWYU pragma: keep
+#include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h"
+
+namespace doris::segment_v2::inverted_index::query_v2 {
+
+class MatchAllDocsScorer : public Scorer {
+public:
+ MatchAllDocsScorer(uint32_t max_doc,
std::vector<lucene::index::IndexReader*> readers)
+ : _max_doc(max_doc), _readers(std::move(readers)) {
+ if (_max_doc == 0) {
+ _doc = TERMINATED;
+ } else {
+ _doc = _next_live_doc(0);
+ }
+ }
+ ~MatchAllDocsScorer() override = default;
+
+ uint32_t advance() override {
+ if (_doc == TERMINATED) {
+ return TERMINATED;
+ }
+ uint32_t candidate = _doc + 1;
+ if (candidate >= _max_doc) {
+ _doc = TERMINATED;
+ return TERMINATED;
+ }
+ _doc = _next_live_doc(candidate);
+ return _doc;
+ }
+
+ uint32_t seek(uint32_t target) override {
+ if (_doc == TERMINATED) {
+ return TERMINATED;
+ }
+ if (target <= _doc) {
+ return _doc;
+ }
+ if (target >= _max_doc) {
+ _doc = TERMINATED;
+ return TERMINATED;
+ }
+ _doc = _next_live_doc(target);
+ return _doc;
+ }
+
+ uint32_t doc() const override { return _doc; }
+
+ uint32_t size_hint() const override { return _max_doc; }
+
+ float score() override { return 1.0F; }
+
+private:
+ [[nodiscard]] uint32_t _next_live_doc(uint32_t start) const {
+ for (uint32_t doc = start; doc < _max_doc; ++doc) {
+ if (_is_live(doc)) {
+ return doc;
+ }
+ }
+ return TERMINATED;
+ }
+
+ [[nodiscard]] bool _is_live(uint32_t doc) const {
+ for (auto* reader : _readers) {
+ if (reader != nullptr &&
reader->isDeleted(static_cast<int32_t>(doc))) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ uint32_t _max_doc;
+ std::vector<lucene::index::IndexReader*> _readers;
+ uint32_t _doc = TERMINATED;
+};
+
+} // namespace doris::segment_v2::inverted_index::query_v2
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query_test.cpp
index 33683deaa09..5144f5fc72f 100644
---
a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query_test.cpp
+++
b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query_test.cpp
@@ -250,6 +250,84 @@ TEST_F(BooleanQueryTest, test_boolean_query_or_operation) {
_CLDECDELETE(dir);
}
+TEST_F(BooleanQueryTest, test_boolean_query_not_operation) {
+ std::wstring field = StringHelper::to_wstring("name1");
+
+ auto context = std::make_shared<IndexQueryContext>();
+ context->collection_statistics = std::make_shared<CollectionStatistics>();
+ context->collection_similarity = std::make_shared<CollectionSimilarity>();
+
+ auto* dir = FSDirectory::getDirectory(kTestDir1.c_str());
+ auto* reader = IndexReader::open(dir, true);
+ ASSERT_TRUE(reader != nullptr);
+
+ auto composite_reader = std::make_unique<query_v2::CompositeReader>();
+ composite_reader->set_reader(field, reader);
+
+ query_v2::BooleanQuery::Builder builder(query_v2::OperatorType::OP_NOT);
+ builder.add(std::make_shared<query_v2::TermQuery>(context, field,
+
StringHelper::to_wstring("apple")));
+ auto query = builder.build();
+
+ auto weight = query->weight(false);
+ auto scorer = weight->scorer(composite_reader);
+
+ uint32_t doc = scorer->doc();
+ uint32_t count = 0;
+ while (doc != query_v2::TERMINATED) {
+ ++count;
+ doc = scorer->advance();
+ }
+
+ EXPECT_EQ(count, 40);
+
+ reader->close();
+ _CLLDELETE(reader);
+ _CLDECDELETE(dir);
+}
+
+TEST_F(BooleanQueryTest, test_boolean_query_or_with_not_operation) {
+ std::wstring field = StringHelper::to_wstring("name1");
+
+ auto context = std::make_shared<IndexQueryContext>();
+ context->collection_statistics = std::make_shared<CollectionStatistics>();
+ context->collection_similarity = std::make_shared<CollectionSimilarity>();
+
+ auto* dir = FSDirectory::getDirectory(kTestDir1.c_str());
+ auto* reader = IndexReader::open(dir, true);
+ ASSERT_TRUE(reader != nullptr);
+
+ auto composite_reader = std::make_unique<query_v2::CompositeReader>();
+ composite_reader->set_reader(field, reader);
+
+ query_v2::BooleanQuery::Builder builder(query_v2::OperatorType::OP_OR);
+ builder.add(std::make_shared<query_v2::TermQuery>(context, field,
+
StringHelper::to_wstring("apple")));
+ {
+ query_v2::BooleanQuery::Builder
not_builder(query_v2::OperatorType::OP_NOT);
+ not_builder.add(std::make_shared<query_v2::TermQuery>(context, field,
+
StringHelper::to_wstring("banana")));
+ builder.add(not_builder.build());
+ }
+ auto query = builder.build();
+
+ auto weight = query->weight(false);
+ auto scorer = weight->scorer(composite_reader);
+
+ uint32_t doc = scorer->doc();
+ uint32_t count = 0;
+ while (doc != query_v2::TERMINATED) {
+ ++count;
+ doc = scorer->advance();
+ }
+
+ EXPECT_EQ(count, 50);
+
+ reader->close();
+ _CLLDELETE(reader);
+ _CLDECDELETE(dir);
+}
+
TEST_F(BooleanQueryTest, test_boolean_query_scoring_or) {
std::wstring field = StringHelper::to_wstring("name1");
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]