This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 96bce532895 [opt](match_phrase) Optimizing match_phrase with a new
algorithm (#29444)
96bce532895 is described below
commit 96bce532895b6d17c943ec0203d219dff3caa498
Author: zzzxl <[email protected]>
AuthorDate: Mon Jan 8 11:30:41 2024 +0800
[opt](match_phrase) Optimizing match_phrase with a new algorithm (#29444)
Based on the latest Lucene algorithm
---
be/src/clucene | 2 +-
.../inverted_index/query/conjunction_query.cpp | 3 +-
.../inverted_index/query/phrase_query.cpp | 201 +++++++++++++++++++++
.../segment_v2/inverted_index/query/phrase_query.h | 77 ++++++++
.../rowset/segment_v2/inverted_index_reader.cpp | 34 ++--
.../olap/rowset/segment_v2/inverted_index_reader.h | 6 +
6 files changed, 308 insertions(+), 15 deletions(-)
diff --git a/be/src/clucene b/be/src/clucene
index d05cb8154ef..1b59ae8184c 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit d05cb8154ef4368bd40c43c94e8e3c679e13c490
+Subproject commit 1b59ae8184ca5f37f400e02aa4a2dd5af290e8d0
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
index b2448a8fa8e..56cccdf3e3f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
@@ -123,8 +123,7 @@ void ConjunctionQuery::search_by_bitmap(roaring::Roaring&
roaring) {
void ConjunctionQuery::search_by_skiplist(roaring::Roaring& roaring) {
int32_t doc = 0;
- int32_t first_doc = _lead1.nextDoc();
- while ((doc = do_next(first_doc)) != INT32_MAX) {
+ while ((doc = do_next(_lead1.nextDoc())) != INT32_MAX) {
roaring.add(doc);
}
}
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
new file mode 100644
index 00000000000..527e89a8878
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "phrase_query.h"
+
+namespace doris::segment_v2 {
+
+PhraseQuery::~PhraseQuery() {
+ for (auto& term_doc : _term_docs) {
+ if (term_doc) {
+ _CLDELETE(term_doc);
+ }
+ }
+ for (auto& term : _terms) {
+ if (term) {
+ _CLDELETE(term);
+ }
+ }
+}
+
+void PhraseQuery::add(const std::wstring& field_name, const
std::vector<std::string>& terms) {
+ if (terms.empty()) {
+ _CLTHROWA(CL_ERR_IllegalArgument, "PhraseQuery::add: terms empty");
+ }
+
+ if (terms.size() == 1) {
+ std::wstring ws_term = StringUtil::string_to_wstring(terms[0]);
+ Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
+ _terms.push_back(t);
+ TermDocs* term_doc = _searcher->getReader()->termDocs(t);
+ _term_docs.push_back(term_doc);
+ _lead1 = TermIterator(term_doc);
+ return;
+ }
+
+ std::vector<TermIterator> iterators;
+ for (size_t i = 0; i < terms.size(); i++) {
+ std::wstring ws_term = StringUtil::string_to_wstring(terms[i]);
+ Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
+ _terms.push_back(t);
+ TermPositions* term_pos = _searcher->getReader()->termPositions(t);
+ _term_docs.push_back(term_pos);
+ iterators.emplace_back(term_pos);
+ _postings.emplace_back(term_pos, i);
+ }
+
+ std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a,
const TermIterator& b) {
+ return a.docFreq() < b.docFreq();
+ });
+
+ _lead1 = iterators[0];
+ _lead2 = iterators[1];
+ for (int32_t i = 2; i < _terms.size(); i++) {
+ _others.push_back(iterators[i]);
+ }
+}
+
+void PhraseQuery::search(roaring::Roaring& roaring) {
+ if (_lead1.isEmpty()) {
+ return;
+ }
+ if (_lead2.isEmpty()) {
+ search_by_bitmap(roaring);
+ return;
+ }
+ search_by_skiplist(roaring);
+}
+
+void PhraseQuery::search_by_bitmap(roaring::Roaring& roaring) {
+ DocRange doc_range;
+ while (_lead1.readRange(&doc_range)) {
+ if (doc_range.type_ == DocRangeType::kMany) {
+ roaring.addMany(doc_range.doc_many_size_,
doc_range.doc_many->data());
+ } else {
+ roaring.addRange(doc_range.doc_range.first,
doc_range.doc_range.second);
+ }
+ }
+}
+
+void PhraseQuery::search_by_skiplist(roaring::Roaring& roaring) {
+ int32_t doc = 0;
+ while ((doc = do_next(_lead1.nextDoc())) != INT32_MAX) {
+ reset();
+ if (next_match()) {
+ roaring.add(doc);
+ }
+ }
+}
+
+int32_t PhraseQuery::do_next(int32_t doc) {
+ while (true) {
+ assert(doc == _lead1.docID());
+
+ // the skip list is used to find the two smallest inverted lists
+ int32_t next2 = _lead2.advance(doc);
+ if (next2 != doc) {
+ doc = _lead1.advance(next2);
+ if (next2 != doc) {
+ continue;
+ }
+ }
+
+ // if both lead1 and lead2 exist, use skip list to lookup other
inverted indexes
+ bool advance_head = false;
+ for (auto& other : _others) {
+ if (other.isEmpty()) {
+ continue;
+ }
+
+ if (other.docID() < doc) {
+ int32_t next = other.advance(doc);
+ if (next > doc) {
+ doc = _lead1.advance(next);
+ advance_head = true;
+ break;
+ }
+ }
+ }
+ if (advance_head) {
+ continue;
+ }
+
+ return doc;
+ }
+}
+
+bool PhraseQuery::next_match() {
+ PostingsAndPosition& lead = _postings[0];
+ if (lead._upTo < lead._freq) {
+ lead._pos = lead._postings.nextPosition();
+ lead._upTo += 1;
+ } else {
+ return false;
+ }
+
+ while (true) {
+ int32_t phrasePos = lead._pos - lead._offset;
+
+ bool advance_head = false;
+ for (size_t j = 1; j < _postings.size(); ++j) {
+ PostingsAndPosition& posting = _postings[j];
+ int32_t expectedPos = phrasePos + posting._offset;
+ // advance up to the same position as the lead
+ if (!advance_position(posting, expectedPos)) {
+ return false;
+ }
+
+ if (posting._pos != expectedPos) { // we advanced too far
+ if (advance_position(lead, posting._pos - posting._offset +
lead._offset)) {
+ advance_head = true;
+ break;
+ } else {
+ return false;
+ }
+ }
+ }
+ if (advance_head) {
+ continue;
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+bool PhraseQuery::advance_position(PostingsAndPosition& posting, int32_t
target) {
+ while (posting._pos < target) {
+ if (posting._upTo == posting._freq) {
+ return false;
+ } else {
+ posting._pos = posting._postings.nextPosition();
+ posting._upTo += 1;
+ }
+ }
+ return true;
+}
+
+void PhraseQuery::reset() {
+ for (PostingsAndPosition& posting : _postings) {
+ posting._freq = posting._postings.freq();
+ posting._pos = -1;
+ posting._upTo = 0;
+ }
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
new file mode 100644
index 00000000000..f4b464ce358
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <CLucene.h>
+#include <CLucene/index/IndexReader.h>
+#include <CLucene/index/Term.h>
+#include <CLucene/search/query/TermIterator.h>
+#include <CLucene/search/query/TermPositionIterator.h>
+
+#include <memory>
+
+#include "roaring/roaring.hh"
+
+CL_NS_USE(index)
+
+namespace doris::segment_v2 {
+
+class PhraseQuery {
+public:
+ PhraseQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher)
+ : _searcher(searcher) {}
+ ~PhraseQuery();
+
+ void add(const std::wstring& field_name, const std::vector<std::string>&
terms);
+ void search(roaring::Roaring& roaring);
+
+private:
+ class PostingsAndPosition {
+ public:
+ PostingsAndPosition(const TermPositionIterator& postings, int32_t
offset)
+ : _postings(postings), _offset(offset) {}
+
+ TermPositionIterator _postings;
+ int32_t _offset = 0;
+ int32_t _freq = 0;
+ int32_t _upTo = 0;
+ int32_t _pos = 0;
+ };
+
+ void search_by_bitmap(roaring::Roaring& roaring);
+ void search_by_skiplist(roaring::Roaring& roaring);
+
+ int32_t do_next(int32_t doc);
+ bool next_match();
+ bool advance_position(PostingsAndPosition& posting, int32_t target);
+ void reset();
+
+private:
+ std::shared_ptr<lucene::search::IndexSearcher> _searcher;
+
+ TermIterator _lead1;
+ TermIterator _lead2;
+ std::vector<TermIterator> _others;
+
+ std::vector<PostingsAndPosition> _postings;
+
+ std::vector<Term*> _terms;
+ std::vector<TermDocs*> _term_docs;
+};
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index b8406156f82..5eaacb4640f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -60,6 +60,7 @@
#include
"olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h"
#include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h"
#include "olap/rowset/segment_v2/inverted_index/query/regexp_query.h"
#include "olap/rowset/segment_v2/inverted_index_cache.h"
#include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
@@ -329,18 +330,9 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
Status res = Status::OK();
if (query_type ==
InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
- auto* phrase_query = new lucene::search::PhraseQuery();
- for (auto& token : analyse_result) {
- std::wstring wtoken =
StringUtil::string_to_wstring(token);
- auto* term =
- _CLNEW
lucene::index::Term(field_ws.c_str(), wtoken.c_str());
- phrase_query->add(term);
- _CLDECDELETE(term);
- }
- query.reset(phrase_query);
- res = normal_index_search(stats, query_type,
*searcher_ptr,
- null_bitmap_already_read,
query,
- term_match_bitmap);
+ res = match_phrase_index_search(stats, runtime_state,
field_ws,
+ analyse_result,
*searcher_ptr,
+ term_match_bitmap);
} else if (query_type ==
InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) {
res = match_phrase_prefix_index_search(stats,
runtime_state, field_ws,
analyse_result,
*searcher_ptr,
@@ -526,6 +518,24 @@ Status FullTextIndexReader::match_all_index_search(
return Status::OK();
}
+Status FullTextIndexReader::match_phrase_index_search(
+ OlapReaderStatistics* stats, RuntimeState* runtime_state, const
std::wstring& field_ws,
+ const std::vector<std::string>& analyse_result,
+ const FulltextIndexSearcherPtr& index_searcher,
+ const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
+ TQueryOptions queryOptions = runtime_state->query_options();
+ try {
+ SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
+ PhraseQuery query(index_searcher);
+ query.add(field_ws, analyse_result);
+ query.search(*term_match_bitmap);
+ } catch (const CLuceneError& e) {
+ return
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured:
{}",
+
e.what());
+ }
+ return Status::OK();
+}
+
Status FullTextIndexReader::match_phrase_prefix_index_search(
OlapReaderStatistics* stats, RuntimeState* runtime_state, const
std::wstring& field_ws,
const std::vector<std::string>& analyse_result,
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 67fe0e4ae63..0d81022d5ba 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -167,6 +167,12 @@ private:
const FulltextIndexSearcherPtr&
index_searcher,
const std::shared_ptr<roaring::Roaring>&
term_match_bitmap);
+ Status match_phrase_index_search(OlapReaderStatistics* stats,
RuntimeState* runtime_state,
+ const std::wstring& field_ws,
+ const std::vector<std::string>&
analyse_result,
+ const FulltextIndexSearcherPtr&
index_searcher,
+ const std::shared_ptr<roaring::Roaring>&
term_match_bitmap);
+
Status match_phrase_prefix_index_search(
OlapReaderStatistics* stats, RuntimeState* runtime_state, const
std::wstring& field_ws,
const std::vector<std::string>& analyse_result,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]