This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new ac01e67c533 [fix](inverted index) Add CompositeReader to support
multi-field boolean queries (#55960)
ac01e67c533 is described below
commit ac01e67c53393e64fbf7ee3ae36a012b428a5acc
Author: zzzxl <[email protected]>
AuthorDate: Sun Sep 14 16:19:48 2025 +0800
[fix](inverted index) Add CompositeReader to support multi-field boolean
queries (#55960)
---
.../query_v2/boolean_query/boolean_weight.h | 8 +-
.../inverted_index/query_v2/composite_reader.h | 64 ++++++++++
.../query_v2/term_query/term_weight.h | 3 +-
.../segment_v2/inverted_index/query_v2/weight.h | 8 +-
.../inverted_index/query_v2/boolean_query_test.cpp | 131 +++++++++++++++++----
.../query_v2/composite_reader_test.cpp | 80 +++++++++++++
6 files changed, 259 insertions(+), 35 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_weight.h
index dd5b65837b8..7437757a972 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_weight.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_weight.h
@@ -36,8 +36,8 @@ public:
_score_combiner(std::move(score_combiner)) {}
~BooleanWeight() override = default;
- ScorerPtr scorer(lucene::index::IndexReader* reader) override {
- std::vector<ScorerPtr> sub_scorers = per_scorers(reader);
+ ScorerPtr scorer(const CompositeReaderPtr& composite_reader) override {
+ std::vector<ScorerPtr> sub_scorers = per_scorers(composite_reader);
if (_type == OperatorType::OP_AND) {
return intersection_scorer_build(sub_scorers);
} else if (_type == OperatorType::OP_OR) {
@@ -47,10 +47,10 @@ public:
}
private:
- std::vector<ScorerPtr> per_scorers(lucene::index::IndexReader* reader) {
+ std::vector<ScorerPtr> per_scorers(const CompositeReaderPtr& composite_reader) {
std::vector<ScorerPtr> sub_scorers;
for (const auto& sub_weight : _sub_weights) {
- sub_scorers.emplace_back(sub_weight->scorer(reader));
+ sub_scorers.emplace_back(sub_weight->scorer(composite_reader));
}
return sub_scorers;
}
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/composite_reader.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/composite_reader.h
new file mode 100644
index 00000000000..67161d9910b
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/composite_reader.h
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <CLucene.h>
+#include <CLucene/index/IndexReader.h>
+
+#include <ranges>
+
+#include "common/exception.h"
+#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h"
+
+CL_NS_USE(index)
+
+namespace doris::segment_v2::inverted_index::query_v2 {
+
+class CompositeReader {
+public:
+ CompositeReader() = default;
+ ~CompositeReader() = default;
+
+ void set_reader(const std::wstring& field, lucene::index::IndexReader* reader) {
+ if (_field_readers.contains(field)) {
+ throw Exception(ErrorCode::INDEX_INVALID_PARAMETERS, "Field {} already exists",
+ StringHelper::to_string(field));
+ }
+ _field_readers[field] = reader;
+ }
+
+ lucene::index::IndexReader* get_reader(const std::wstring& field) {
+ if (!_field_readers.contains(field)) {
+ throw Exception(ErrorCode::NOT_FOUND, "Field {} not found",
+ StringHelper::to_string(field));
+ }
+ return _field_readers[field];
+ }
+
+ void close() {
+ for (auto* reader : std::views::values(_field_readers)) {
+ reader->close();
+ }
+ }
+
+private:
+ std::unordered_map<std::wstring, lucene::index::IndexReader*> _field_readers;
+};
+using CompositeReaderPtr = std::unique_ptr<CompositeReader>;
+
+} // namespace doris::segment_v2::inverted_index::query_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_weight.h
index fb1985b7fd8..af899af3f9e 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_weight.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_weight.h
@@ -35,8 +35,9 @@ public:
_enable_scoring(enable_scoring) {}
~TermWeight() override = default;
- ScorerPtr scorer(lucene::index::IndexReader* reader) override {
+ ScorerPtr scorer(const CompositeReaderPtr& composite_reader) override {
auto t = make_term_ptr(_field.c_str(), _term.c_str());
+ auto* reader = composite_reader->get_reader(_field);
auto iter = make_term_doc_ptr(reader, t.get(), _enable_scoring,
_context->io_ctx);
auto make_scorer = [this](auto segment_postings) -> ScorerPtr {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h
index a82bbeb57df..480c95c1a4d 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/weight.h
@@ -17,13 +17,9 @@
#pragma once
-#include <CLucene.h>
-#include <CLucene/index/IndexReader.h>
-
+#include "olap/rowset/segment_v2/inverted_index/query_v2/composite_reader.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h"
-CL_NS_USE(index)
-
namespace doris::segment_v2::inverted_index::query_v2 {
class Weight {
@@ -31,7 +27,7 @@ public:
Weight() = default;
virtual ~Weight() = default;
- virtual ScorerPtr scorer(lucene::index::IndexReader* reader) = 0;
+ virtual ScorerPtr scorer(const CompositeReaderPtr& composite_reader) = 0;
};
using WeightPtr = std::shared_ptr<Weight>;
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query_test.cpp
index 71c79789c1c..33683deaa09 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query_test.cpp
@@ -39,23 +39,34 @@ using namespace inverted_index;
class BooleanQueryTest : public testing::Test {
public:
- const std::string kTestDir = "./ut_dir/query_test";
- std::string field_name = "name";
+ const std::string kTestDir1 = "./ut_dir/query_test1";
+ const std::string kTestDir2 = "./ut_dir/query_test2";
void SetUp() override {
- auto st = io::global_local_filesystem()->delete_directory(kTestDir);
- ASSERT_TRUE(st.ok()) << st;
- st = io::global_local_filesystem()->create_directory(kTestDir);
- ASSERT_TRUE(st.ok()) << st;
-
- create_test_index();
+ {
+ auto st = io::global_local_filesystem()->delete_directory(kTestDir1);
+ ASSERT_TRUE(st.ok()) << st;
+ st = io::global_local_filesystem()->create_directory(kTestDir1);
+ ASSERT_TRUE(st.ok()) << st;
+ std::string field_name1 = "name1";
+ create_test_index(field_name1, kTestDir1);
+ }
+ {
+ auto st = io::global_local_filesystem()->delete_directory(kTestDir2);
+ ASSERT_TRUE(st.ok()) << st;
+ st = io::global_local_filesystem()->create_directory(kTestDir2);
+ ASSERT_TRUE(st.ok()) << st;
+ std::string field_name2 = "name2";
+ create_test_index(field_name2, kTestDir2);
+ }
}
void TearDown() override {
- EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok());
+ EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir1).ok());
+ EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir2).ok());
}
private:
- void create_test_index() {
+ void create_test_index(const std::string& field_name, const std::string& dir) {
std::vector<std::string> test_data = {"apple banana orange", "apple cherry grape",
"banana cherry kiwi", "orange grape strawberry",
"apple orange kiwi", "cherry banana grape",
@@ -67,7 +78,7 @@ private:
auto custom_analyzer =
CustomAnalyzer::build_custom_analyzer(custom_analyzer_config);
auto* indexwriter =
- _CLNEW lucene::index::IndexWriter(kTestDir.c_str(), custom_analyzer.get(), true);
+ _CLNEW lucene::index::IndexWriter(dir.c_str(), custom_analyzer.get(), true);
indexwriter->setMaxBufferedDocs(100);
indexwriter->setRAMBufferSizeMB(-1);
indexwriter->setMaxFieldLength(0x7FFFFFFFL);
@@ -109,6 +120,9 @@ static Status boolean_query_search(
query_v2::OperatorType op, roaring::Roaring& out_bitmap) {
std::wstring field = StringHelper::to_wstring(name);
+ auto composite_reader = std::make_unique<query_v2::CompositeReader>();
+ composite_reader->set_reader(field, reader);
+
auto context = std::make_shared<IndexQueryContext>();
context->collection_statistics = std::make_shared<CollectionStatistics>();
context->collection_similarity = std::make_shared<CollectionSimilarity>();
@@ -136,7 +150,7 @@ static Status boolean_query_search(
}
auto boolean_query = builder.build();
auto weight = boolean_query->weight(false);
- auto scorer = weight->scorer(reader);
+ auto scorer = weight->scorer(composite_reader);
uint32_t doc = scorer->doc();
while (doc != query_v2::TERMINATED) {
@@ -173,7 +187,7 @@ TEST_F(BooleanQueryTest, test_boolean_query) {
{{"cherry"}, {"strawberry"}},
{{"apple", "banana"}, {"kiwi"}}};
- auto* dir = FSDirectory::getDirectory(kTestDir.c_str());
+ auto* dir = FSDirectory::getDirectory(kTestDir1.c_str());
auto* reader = IndexReader::open(dir, true);
ASSERT_TRUE(reader != nullptr) << "Failed to open index reader";
@@ -186,7 +200,7 @@ TEST_F(BooleanQueryTest, test_boolean_query) {
roaring::Roaring result;
try {
- Status res = boolean_query_search(field_name, reader, terms,
+ Status res = boolean_query_search("name1", reader, terms,
query_v2::OperatorType::OP_AND, result);
EXPECT_TRUE(res.ok()) << "Boolean query case " << i << " should execute successfully";
EXPECT_EQ(result.cardinality(), expected_cards[i])
@@ -210,7 +224,7 @@ TEST_F(BooleanQueryTest, test_boolean_query_or_operation) {
std::vector<std::pair<std::vector<std::string>, std::vector<std::string>>>
test_cases = {
{{"apple"}, {"banana"}}, {{"nonexistent"}, {"apple"}}};
- auto* dir = FSDirectory::getDirectory(kTestDir.c_str());
+ auto* dir = FSDirectory::getDirectory(kTestDir1.c_str());
auto* reader = IndexReader::open(dir, true);
const std::vector<uint32_t> expected_cards = {70, 40};
@@ -220,8 +234,8 @@ TEST_F(BooleanQueryTest, test_boolean_query_or_operation) {
roaring::Roaring result;
try {
- Status res = boolean_query_search(field_name, reader, terms,
- query_v2::OperatorType::OP_OR, result);
+ Status res = boolean_query_search("name1", reader, terms, query_v2::OperatorType::OP_OR,
+ result);
EXPECT_TRUE(res.ok()) << "Boolean OR query case " << i
<< " should execute successfully";
EXPECT_EQ(result.cardinality(), expected_cards[i])
@@ -237,13 +251,13 @@ TEST_F(BooleanQueryTest, test_boolean_query_or_operation) {
}
TEST_F(BooleanQueryTest, test_boolean_query_scoring_or) {
- std::wstring field = StringHelper::to_wstring(field_name);
+ std::wstring field = StringHelper::to_wstring("name1");
auto context = std::make_shared<IndexQueryContext>();
context->collection_statistics = std::make_shared<CollectionStatistics>();
context->collection_similarity = std::make_shared<CollectionSimilarity>();
- std::wstring ws_field = StringHelper::to_wstring(field_name);
+ std::wstring ws_field = StringHelper::to_wstring("name1");
// 直接访问成员填充统计信息
context->collection_statistics->_total_num_docs = 80;
context->collection_statistics->_total_num_tokens[ws_field] = 240; // 80*3
@@ -276,12 +290,15 @@ TEST_F(BooleanQueryTest, test_boolean_query_scoring_or) {
}
auto boolean_query = builder.build();
- auto* dir = FSDirectory::getDirectory(kTestDir.c_str());
+ auto* dir = FSDirectory::getDirectory(kTestDir1.c_str());
auto* reader = IndexReader::open(dir, true);
ASSERT_TRUE(reader != nullptr);
+ auto composite_reader = std::make_unique<query_v2::CompositeReader>();
+ composite_reader->set_reader(field, reader);
+
auto weight = boolean_query->weight(true);
- auto scorer = weight->scorer(reader);
+ auto scorer = weight->scorer(composite_reader);
uint32_t doc = scorer->doc();
uint32_t count = 0;
@@ -300,9 +317,6 @@ TEST_F(BooleanQueryTest, test_boolean_query_scoring_or) {
doc = scorer->advance();
}
- std::cout << "count: " << count << std::endl;
- std::cout << "score_single: " << score_single << std::endl;
- std::cout << "score_both: " << score_both << std::endl;
EXPECT_EQ(count, 50);
EXPECT_GT(score_single, 0.0F);
EXPECT_GT(score_both, 0.0F);
@@ -313,4 +327,73 @@ TEST_F(BooleanQueryTest, test_boolean_query_scoring_or) {
_CLDECDELETE(dir);
}
+TEST_F(BooleanQueryTest, test_boolean_query_cross_fields_with_composite_reader) {
+ std::string field_name1 = "name1";
+ std::string field_name2 = "name2";
+ std::wstring wfield1 = StringHelper::to_wstring(field_name1);
+ std::wstring wfield2 = StringHelper::to_wstring(field_name2);
+
+ auto* dir1 = FSDirectory::getDirectory(kTestDir1.c_str());
+ auto* dir2 = FSDirectory::getDirectory(kTestDir2.c_str());
+ auto* ir1 = IndexReader::open(dir1, true);
+ auto* ir2 = IndexReader::open(dir2, true);
+ ASSERT_TRUE(ir1 != nullptr);
+ ASSERT_TRUE(ir2 != nullptr);
+ EXPECT_EQ(ir1->numDocs(), 80);
+ EXPECT_EQ(ir2->numDocs(), 80);
+
+ auto composite_reader = std::make_unique<query_v2::CompositeReader>();
+ composite_reader->set_reader(wfield1, ir1);
+ composite_reader->set_reader(wfield2, ir2);
+
+ auto context = std::make_shared<IndexQueryContext>();
+ context->collection_statistics = std::make_shared<CollectionStatistics>();
+ context->collection_similarity = std::make_shared<CollectionSimilarity>();
+
+ {
+ query_v2::BooleanQuery::Builder b(query_v2::OperatorType::OP_AND);
+ b.add(std::make_shared<query_v2::TermQuery>(context, wfield1,
+ StringHelper::to_wstring("apple")));
+ b.add(std::make_shared<query_v2::TermQuery>(context, wfield2,
+ StringHelper::to_wstring("banana")));
+ auto q = b.build();
+ auto w = q->weight(false);
+ auto s = w->scorer(composite_reader);
+
+ uint32_t doc = s->doc();
+ uint32_t count = 0;
+ while (doc != query_v2::TERMINATED) {
+ ++count;
+ doc = s->advance();
+ }
+ EXPECT_EQ(count, 10);
+ }
+
+ {
+ query_v2::BooleanQuery::Builder b(query_v2::OperatorType::OP_OR);
+ b.add(std::make_shared<query_v2::TermQuery>(context, wfield1,
+ StringHelper::to_wstring("apple")));
+ b.add(std::make_shared<query_v2::TermQuery>(context, wfield2,
+ StringHelper::to_wstring("banana")));
+ auto q = b.build();
+ auto w = q->weight(false);
+ auto s = w->scorer(composite_reader);
+
+ uint32_t doc = s->doc();
+ uint32_t count = 0;
+ while (doc != query_v2::TERMINATED) {
+ ++count;
+ doc = s->advance();
+ }
+ EXPECT_EQ(count, 70);
+ }
+
+ ir1->close();
+ ir2->close();
+ _CLLDELETE(ir1);
+ _CLLDELETE(ir2);
+ _CLDECDELETE(dir1);
+ _CLDECDELETE(dir2);
+}
+
} // namespace doris::segment_v2
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/composite_reader_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/composite_reader_test.cpp
new file mode 100644
index 00000000000..653df299aa1
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/composite_reader_test.cpp
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/inverted_index/query_v2/composite_reader.h"
+
+#include <gtest/gtest.h>
+
+#include <string>
+
+#include "common/exception.h"
+#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h"
+
+namespace doris::segment_v2 {
+
+using namespace inverted_index;
+
+TEST(CompositeReaderTest, SetAndGetNullptr) {
+ query_v2::CompositeReader cr;
+
+ std::wstring field = StringHelper::to_wstring("f1");
+ lucene::index::IndexReader* reader = nullptr;
+
+ cr.set_reader(field, reader);
+ auto* got = cr.get_reader(field);
+ EXPECT_EQ(got, reader);
+}
+
+TEST(CompositeReaderTest, GetNonExistingThrowsNotFound) {
+ query_v2::CompositeReader cr;
+
+ std::wstring field = StringHelper::to_wstring("no_such_field");
+
+ try {
+ (void)cr.get_reader(field);
+ FAIL() << "Expected doris::Exception to be thrown";
+ } catch (const doris::Exception& e) {
+ EXPECT_EQ(e.code(), doris::ErrorCode::NOT_FOUND);
+ } catch (...) {
+ FAIL() << "Unexpected exception type";
+ }
+}
+
+TEST(CompositeReaderTest, DuplicateSetThrowsIndexInvalidParameters) {
+ query_v2::CompositeReader cr;
+
+ std::wstring field = StringHelper::to_wstring("dup");
+ lucene::index::IndexReader* reader = nullptr;
+
+ cr.set_reader(field, reader);
+
+ try {
+ cr.set_reader(field, reader);
+ FAIL() << "Expected doris::Exception to be thrown";
+ } catch (const doris::Exception& e) {
+ EXPECT_EQ(e.code(), doris::ErrorCode::INDEX_INVALID_PARAMETERS);
+ } catch (...) {
+ FAIL() << "Unexpected exception type";
+ }
+}
+
+TEST(CompositeReaderTest, CloseOnEmptyDoesNotCrash) {
+ query_v2::CompositeReader cr;
+ cr.close();
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]