This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 96bce532895 [opt](match_phrase) Optimizing match_phrase with a new 
algorithm (#29444)
96bce532895 is described below

commit 96bce532895b6d17c943ec0203d219dff3caa498
Author: zzzxl <[email protected]>
AuthorDate: Mon Jan 8 11:30:41 2024 +0800

    [opt](match_phrase) Optimizing match_phrase with a new algorithm (#29444)
    
    Based on the latest Lucene algorithm
---
 be/src/clucene                                     |   2 +-
 .../inverted_index/query/conjunction_query.cpp     |   3 +-
 .../inverted_index/query/phrase_query.cpp          | 201 +++++++++++++++++++++
 .../segment_v2/inverted_index/query/phrase_query.h |  77 ++++++++
 .../rowset/segment_v2/inverted_index_reader.cpp    |  34 ++--
 .../olap/rowset/segment_v2/inverted_index_reader.h |   6 +
 6 files changed, 308 insertions(+), 15 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index d05cb8154ef..1b59ae8184c 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit d05cb8154ef4368bd40c43c94e8e3c679e13c490
+Subproject commit 1b59ae8184ca5f37f400e02aa4a2dd5af290e8d0
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
index b2448a8fa8e..56cccdf3e3f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
@@ -123,8 +123,7 @@ void ConjunctionQuery::search_by_bitmap(roaring::Roaring& 
roaring) {
 
 void ConjunctionQuery::search_by_skiplist(roaring::Roaring& roaring) {
     int32_t doc = 0;
-    int32_t first_doc = _lead1.nextDoc();
-    while ((doc = do_next(first_doc)) != INT32_MAX) {
+    while ((doc = do_next(_lead1.nextDoc())) != INT32_MAX) {
         roaring.add(doc);
     }
 }
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
new file mode 100644
index 00000000000..527e89a8878
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "phrase_query.h"
+
+namespace doris::segment_v2 {
+
+PhraseQuery::~PhraseQuery() {
+    for (auto& term_doc : _term_docs) {
+        if (term_doc) {
+            _CLDELETE(term_doc);
+        }
+    }
+    for (auto& term : _terms) {
+        if (term) {
+            _CLDELETE(term);
+        }
+    }
+}
+
+void PhraseQuery::add(const std::wstring& field_name, const 
std::vector<std::string>& terms) {
+    if (terms.empty()) {
+        _CLTHROWA(CL_ERR_IllegalArgument, "PhraseQuery::add: terms empty");
+    }
+
+    if (terms.size() == 1) {
+        std::wstring ws_term = StringUtil::string_to_wstring(terms[0]);
+        Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
+        _terms.push_back(t);
+        TermDocs* term_doc = _searcher->getReader()->termDocs(t);
+        _term_docs.push_back(term_doc);
+        _lead1 = TermIterator(term_doc);
+        return;
+    }
+
+    std::vector<TermIterator> iterators;
+    for (size_t i = 0; i < terms.size(); i++) {
+        std::wstring ws_term = StringUtil::string_to_wstring(terms[i]);
+        Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
+        _terms.push_back(t);
+        TermPositions* term_pos = _searcher->getReader()->termPositions(t);
+        _term_docs.push_back(term_pos);
+        iterators.emplace_back(term_pos);
+        _postings.emplace_back(term_pos, i);
+    }
+
+    std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a, 
const TermIterator& b) {
+        return a.docFreq() < b.docFreq();
+    });
+
+    _lead1 = iterators[0];
+    _lead2 = iterators[1];
+    for (int32_t i = 2; i < _terms.size(); i++) {
+        _others.push_back(iterators[i]);
+    }
+}
+
+void PhraseQuery::search(roaring::Roaring& roaring) {
+    if (_lead1.isEmpty()) {
+        return;
+    }
+    if (_lead2.isEmpty()) {
+        search_by_bitmap(roaring);
+        return;
+    }
+    search_by_skiplist(roaring);
+}
+
+void PhraseQuery::search_by_bitmap(roaring::Roaring& roaring) {
+    DocRange doc_range;
+    while (_lead1.readRange(&doc_range)) {
+        if (doc_range.type_ == DocRangeType::kMany) {
+            roaring.addMany(doc_range.doc_many_size_, 
doc_range.doc_many->data());
+        } else {
+            roaring.addRange(doc_range.doc_range.first, 
doc_range.doc_range.second);
+        }
+    }
+}
+
+void PhraseQuery::search_by_skiplist(roaring::Roaring& roaring) {
+    int32_t doc = 0;
+    while ((doc = do_next(_lead1.nextDoc())) != INT32_MAX) {
+        reset();
+        if (next_match()) {
+            roaring.add(doc);
+        }
+    }
+}
+
+int32_t PhraseQuery::do_next(int32_t doc) {
+    while (true) {
+        assert(doc == _lead1.docID());
+
+        // the skip list is used to find the two smallest inverted lists
+        int32_t next2 = _lead2.advance(doc);
+        if (next2 != doc) {
+            doc = _lead1.advance(next2);
+            if (next2 != doc) {
+                continue;
+            }
+        }
+
+        // if both lead1 and lead2 exist, use skip list to lookup other 
inverted indexes
+        bool advance_head = false;
+        for (auto& other : _others) {
+            if (other.isEmpty()) {
+                continue;
+            }
+
+            if (other.docID() < doc) {
+                int32_t next = other.advance(doc);
+                if (next > doc) {
+                    doc = _lead1.advance(next);
+                    advance_head = true;
+                    break;
+                }
+            }
+        }
+        if (advance_head) {
+            continue;
+        }
+
+        return doc;
+    }
+}
+
+bool PhraseQuery::next_match() {
+    PostingsAndPosition& lead = _postings[0];
+    if (lead._upTo < lead._freq) {
+        lead._pos = lead._postings.nextPosition();
+        lead._upTo += 1;
+    } else {
+        return false;
+    }
+
+    while (true) {
+        int32_t phrasePos = lead._pos - lead._offset;
+
+        bool advance_head = false;
+        for (size_t j = 1; j < _postings.size(); ++j) {
+            PostingsAndPosition& posting = _postings[j];
+            int32_t expectedPos = phrasePos + posting._offset;
+            // advance up to the same position as the lead
+            if (!advance_position(posting, expectedPos)) {
+                return false;
+            }
+
+            if (posting._pos != expectedPos) { // we advanced too far
+                if (advance_position(lead, posting._pos - posting._offset + 
lead._offset)) {
+                    advance_head = true;
+                    break;
+                } else {
+                    return false;
+                }
+            }
+        }
+        if (advance_head) {
+            continue;
+        }
+
+        return true;
+    }
+
+    return false;
+}
+
+bool PhraseQuery::advance_position(PostingsAndPosition& posting, int32_t 
target) {
+    while (posting._pos < target) {
+        if (posting._upTo == posting._freq) {
+            return false;
+        } else {
+            posting._pos = posting._postings.nextPosition();
+            posting._upTo += 1;
+        }
+    }
+    return true;
+}
+
+void PhraseQuery::reset() {
+    for (PostingsAndPosition& posting : _postings) {
+        posting._freq = posting._postings.freq();
+        posting._pos = -1;
+        posting._upTo = 0;
+    }
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
new file mode 100644
index 00000000000..f4b464ce358
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <CLucene.h>
+#include <CLucene/index/IndexReader.h>
+#include <CLucene/index/Term.h>
+#include <CLucene/search/query/TermIterator.h>
+#include <CLucene/search/query/TermPositionIterator.h>
+
+#include <memory>
+
+#include "roaring/roaring.hh"
+
+CL_NS_USE(index)
+
+namespace doris::segment_v2 {
+
+class PhraseQuery {
+public:
+    PhraseQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher)
+            : _searcher(searcher) {}
+    ~PhraseQuery();
+
+    void add(const std::wstring& field_name, const std::vector<std::string>& 
terms);
+    void search(roaring::Roaring& roaring);
+
+private:
+    class PostingsAndPosition {
+    public:
+        PostingsAndPosition(const TermPositionIterator& postings, int32_t 
offset)
+                : _postings(postings), _offset(offset) {}
+
+        TermPositionIterator _postings;
+        int32_t _offset = 0;
+        int32_t _freq = 0;
+        int32_t _upTo = 0;
+        int32_t _pos = 0;
+    };
+
+    void search_by_bitmap(roaring::Roaring& roaring);
+    void search_by_skiplist(roaring::Roaring& roaring);
+
+    int32_t do_next(int32_t doc);
+    bool next_match();
+    bool advance_position(PostingsAndPosition& posting, int32_t target);
+    void reset();
+
+private:
+    std::shared_ptr<lucene::search::IndexSearcher> _searcher;
+
+    TermIterator _lead1;
+    TermIterator _lead2;
+    std::vector<TermIterator> _others;
+
+    std::vector<PostingsAndPosition> _postings;
+
+    std::vector<Term*> _terms;
+    std::vector<TermDocs*> _term_docs;
+};
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index b8406156f82..5eaacb4640f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -60,6 +60,7 @@
 #include 
"olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h"
 #include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h"
 #include "olap/rowset/segment_v2/inverted_index/query/regexp_query.h"
 #include "olap/rowset/segment_v2/inverted_index_cache.h"
 #include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
@@ -329,18 +330,9 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
 
                     Status res = Status::OK();
                     if (query_type == 
InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
-                        auto* phrase_query = new lucene::search::PhraseQuery();
-                        for (auto& token : analyse_result) {
-                            std::wstring wtoken = 
StringUtil::string_to_wstring(token);
-                            auto* term =
-                                    _CLNEW 
lucene::index::Term(field_ws.c_str(), wtoken.c_str());
-                            phrase_query->add(term);
-                            _CLDECDELETE(term);
-                        }
-                        query.reset(phrase_query);
-                        res = normal_index_search(stats, query_type, 
*searcher_ptr,
-                                                  null_bitmap_already_read, 
query,
-                                                  term_match_bitmap);
+                        res = match_phrase_index_search(stats, runtime_state, 
field_ws,
+                                                        analyse_result, 
*searcher_ptr,
+                                                        term_match_bitmap);
                     } else if (query_type == 
InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) {
                         res = match_phrase_prefix_index_search(stats, 
runtime_state, field_ws,
                                                                analyse_result, 
*searcher_ptr,
@@ -526,6 +518,24 @@ Status FullTextIndexReader::match_all_index_search(
     return Status::OK();
 }
 
+Status FullTextIndexReader::match_phrase_index_search(
+        OlapReaderStatistics* stats, RuntimeState* runtime_state, const 
std::wstring& field_ws,
+        const std::vector<std::string>& analyse_result,
+        const FulltextIndexSearcherPtr& index_searcher,
+        const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
+    TQueryOptions queryOptions = runtime_state->query_options();
+    try {
+        SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
+        PhraseQuery query(index_searcher);
+        query.add(field_ws, analyse_result);
+        query.search(*term_match_bitmap);
+    } catch (const CLuceneError& e) {
+        return 
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: 
{}",
+                                                                      
e.what());
+    }
+    return Status::OK();
+}
+
 Status FullTextIndexReader::match_phrase_prefix_index_search(
         OlapReaderStatistics* stats, RuntimeState* runtime_state, const 
std::wstring& field_ws,
         const std::vector<std::string>& analyse_result,
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 67fe0e4ae63..0d81022d5ba 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -167,6 +167,12 @@ private:
                                   const FulltextIndexSearcherPtr& 
index_searcher,
                                   const std::shared_ptr<roaring::Roaring>& 
term_match_bitmap);
 
+    Status match_phrase_index_search(OlapReaderStatistics* stats, 
RuntimeState* runtime_state,
+                                     const std::wstring& field_ws,
+                                     const std::vector<std::string>& 
analyse_result,
+                                     const FulltextIndexSearcherPtr& 
index_searcher,
+                                     const std::shared_ptr<roaring::Roaring>& 
term_match_bitmap);
+
     Status match_phrase_prefix_index_search(
             OlapReaderStatistics* stats, RuntimeState* runtime_state, const 
std::wstring& field_ws,
             const std::vector<std::string>& analyse_result,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to