This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new ee83a20a33a [opt](invert index) optimize the code structure of inverted index queries (#29771)
ee83a20a33a is described below

commit ee83a20a33abc27ab1e5e0b2d5c83c34380d18b9
Author: zzzxl <[email protected]>
AuthorDate: Tue Jan 16 15:03:49 2024 +0800

    [opt](invert index) optimize the code structure of inverted index queries (#29771)
---
 .../inverted_index/query/conjunction_query.cpp     |  15 ++--
 .../inverted_index/query/conjunction_query.h       |  32 +++----
 .../inverted_index/query/disjunction_query.cpp     |  10 ++-
 .../inverted_index/query/disjunction_query.h       |  27 +++---
 .../inverted_index/query/phrase_prefix_query.cpp   |  27 +++---
 .../inverted_index/query/phrase_prefix_query.h     |  31 +++----
 .../inverted_index/query/phrase_query.cpp          |   4 +
 .../segment_v2/inverted_index/query/phrase_query.h |  23 ++---
 .../inverted_index/query/prefix_query.cpp          |   4 +-
 .../segment_v2/inverted_index/query/prefix_query.h |   6 +-
 .../query/{conjunction_query.h => query.h}         |  48 ++++------
 .../inverted_index/query/query_factory.h           |  53 +++++++++++
 .../inverted_index/query/regexp_query.cpp          |  19 ++--
 .../segment_v2/inverted_index/query/regexp_query.h |  18 ++--
 .../rowset/segment_v2/inverted_index_reader.cpp    | 100 ++++-----------------
 .../olap/rowset/segment_v2/inverted_index_reader.h |  27 ++----
 docs/en/docs/data-table/index/inverted-index.md    |  12 +++
 docs/zh-CN/docs/data-table/index/inverted-index.md |  14 ++-
 18 files changed, 223 insertions(+), 247 deletions(-)

diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
index 56cccdf3e3f..fb247951716 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
@@ -17,12 +17,13 @@
 
 #include "conjunction_query.h"
 
-#include <cstdint>
+namespace doris::segment_v2 {
 
-namespace doris {
-
-ConjunctionQuery::ConjunctionQuery(IndexReader* reader)
-        : _reader(reader), _index_version(reader->getIndexVersion()) {}
+ConjunctionQuery::ConjunctionQuery(const 
std::shared_ptr<lucene::search::IndexSearcher>& searcher,
+                                   const TQueryOptions& query_options)
+        : _searcher(searcher),
+          _index_version(_searcher->getReader()->getIndexVersion()),
+          
_conjunction_ratio(query_options.inverted_index_conjunction_opt_threshold) {}
 
 ConjunctionQuery::~ConjunctionQuery() {
     for (auto& term_doc : _term_docs) {
@@ -47,7 +48,7 @@ void ConjunctionQuery::add(const std::wstring& field_name, 
const std::vector<std
         std::wstring ws_term = StringUtil::string_to_wstring(term);
         Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
         _terms.push_back(t);
-        TermDocs* term_doc = _reader->termDocs(t);
+        TermDocs* term_doc = _searcher->getReader()->termDocs(t);
         _term_docs.push_back(term_doc);
         iterators.emplace_back(term_doc);
     }
@@ -165,4 +166,4 @@ int32_t ConjunctionQuery::do_next(int32_t doc) {
     }
 }
 
-} // namespace doris
\ No newline at end of file
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h
index 36d9478c20d..2571392d529 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h
@@ -17,29 +17,21 @@
 
 #pragma once
 
-#include <CLucene.h>
-#include <CLucene/index/IndexReader.h>
-#include <CLucene/index/IndexVersion.h>
-#include <CLucene/index/Term.h>
-#include <CLucene/search/query/TermIterator.h>
-
-#include "roaring/roaring.hh"
+#include "olap/rowset/segment_v2/inverted_index/query/query.h"
 
 CL_NS_USE(index)
+CL_NS_USE(search)
 
-namespace doris {
+namespace doris::segment_v2 {
 
-class ConjunctionQuery {
+class ConjunctionQuery : public Query {
 public:
-    ConjunctionQuery(IndexReader* reader);
-    ~ConjunctionQuery();
-
-    void set_conjunction_ratio(int32_t conjunction_ratio) {
-        _conjunction_ratio = conjunction_ratio;
-    }
+    ConjunctionQuery(const std::shared_ptr<lucene::search::IndexSearcher>& 
searcher,
+                     const TQueryOptions& query_options);
+    ~ConjunctionQuery() override;
 
-    void add(const std::wstring& field_name, const std::vector<std::string>& 
terms);
-    void search(roaring::Roaring& roaring);
+    void add(const std::wstring& field_name, const std::vector<std::string>& 
terms) override;
+    void search(roaring::Roaring& roaring) override;
 
 private:
     void search_by_bitmap(roaring::Roaring& roaring);
@@ -47,7 +39,9 @@ private:
 
     int32_t do_next(int32_t doc);
 
-    IndexReader* _reader = nullptr;
+public:
+    std::shared_ptr<lucene::search::IndexSearcher> _searcher;
+
     IndexVersion _index_version = IndexVersion::kV0;
     int32_t _conjunction_ratio = 1000;
     bool _use_skip = false;
@@ -60,4 +54,4 @@ private:
     std::vector<TermDocs*> _term_docs;
 };
 
-} // namespace doris
\ No newline at end of file
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp
index 7b797d7b54a..0514e1a372a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp
@@ -17,9 +17,11 @@
 
 #include "disjunction_query.h"
 
-namespace doris {
+namespace doris::segment_v2 {
 
-DisjunctionQuery::DisjunctionQuery(IndexReader* reader) : _reader(reader) {}
+DisjunctionQuery::DisjunctionQuery(const 
std::shared_ptr<lucene::search::IndexSearcher>& searcher,
+                                   const TQueryOptions& query_options)
+        : _searcher(searcher) {}
 
 DisjunctionQuery::~DisjunctionQuery() {
     for (auto& term_doc : _term_docs) {
@@ -43,7 +45,7 @@ void DisjunctionQuery::add(const std::wstring& field_name, 
const std::vector<std
         std::wstring ws_term = StringUtil::string_to_wstring(term);
         Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
         _terms.push_back(t);
-        TermDocs* term_doc = _reader->termDocs(t);
+        TermDocs* term_doc = _searcher->getReader()->termDocs(t);
         _term_docs.push_back(term_doc);
         _term_iterators.emplace_back(term_doc);
     }
@@ -77,4 +79,4 @@ void DisjunctionQuery::search(roaring::Roaring& roaring) {
     }
 }
 
-} // namespace doris
\ No newline at end of file
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
index bb0a837f42a..9a1e5df759c 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
@@ -17,31 +17,28 @@
 
 #pragma once
 
-#include <CLucene.h>
-#include <CLucene/index/IndexReader.h>
-#include <CLucene/index/IndexVersion.h>
-#include <CLucene/index/Term.h>
-#include <CLucene/search/query/TermIterator.h>
-
-#include "roaring/roaring.hh"
+#include "olap/rowset/segment_v2/inverted_index/query/query.h"
 
 CL_NS_USE(index)
+CL_NS_USE(search)
 
-namespace doris {
+namespace doris::segment_v2 {
 
-class DisjunctionQuery {
+class DisjunctionQuery : public Query {
 public:
-    DisjunctionQuery(IndexReader* reader);
-    ~DisjunctionQuery();
+    DisjunctionQuery(const std::shared_ptr<lucene::search::IndexSearcher>& 
searcher,
+                     const TQueryOptions& query_options);
+    ~DisjunctionQuery() override;
 
-    void add(const std::wstring& field_name, const std::vector<std::string>& 
terms);
-    void search(roaring::Roaring& roaring);
+    void add(const std::wstring& field_name, const std::vector<std::string>& 
terms) override;
+    void search(roaring::Roaring& roaring) override;
 
 private:
-    IndexReader* _reader = nullptr;
+    std::shared_ptr<lucene::search::IndexSearcher> _searcher;
+
     std::vector<Term*> _terms;
     std::vector<TermDocs*> _term_docs;
     std::vector<TermIterator> _term_iterators;
 };
 
-} // namespace doris
\ No newline at end of file
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp
index 4b0340cda4a..7920336c752 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp
@@ -19,24 +19,25 @@
 
 #include "olap/rowset//segment_v2/inverted_index/query/prefix_query.h"
 
-namespace doris {
+namespace doris::segment_v2 {
 
-namespace segment_v2 {
-
-PhrasePrefixQuery::PhrasePrefixQuery(const 
std::shared_ptr<lucene::search::IndexSearcher>& searcher)
-        : _searcher(searcher) {}
+PhrasePrefixQuery::PhrasePrefixQuery(const 
std::shared_ptr<lucene::search::IndexSearcher>& searcher,
+                                     const TQueryOptions& query_options)
+        : _searcher(searcher),
+          _query(std::make_unique<CL_NS(search)::MultiPhraseQuery>()),
+          _max_expansions(query_options.inverted_index_max_expansions) {}
 
 void PhrasePrefixQuery::add(const std::wstring& field_name, const 
std::vector<std::string>& terms) {
     if (terms.empty()) {
-        return;
+        _CLTHROWA(CL_ERR_IllegalArgument, "PhrasePrefixQuery::add: terms 
empty");
     }
 
     for (size_t i = 0; i < terms.size(); i++) {
         if (i < terms.size() - 1) {
             std::wstring ws = StringUtil::string_to_wstring(terms[i]);
             Term* t = _CLNEW Term(field_name.c_str(), ws.c_str());
-            _query.add(t);
-            _CLDECDELETE(t);
+            _query->add(t);
+            _CLLDECDELETE(t);
         } else {
             std::vector<CL_NS(index)::Term*> prefix_terms;
             PrefixQuery::get_prefix_terms(_searcher->getReader(), field_name, 
terms[i],
@@ -44,20 +45,18 @@ void PhrasePrefixQuery::add(const std::wstring& field_name, 
const std::vector<st
             if (prefix_terms.empty()) {
                 continue;
             }
-            _query.add(prefix_terms);
+            _query->add(prefix_terms);
             for (auto& t : prefix_terms) {
-                _CLDECDELETE(t);
+                _CLLDECDELETE(t);
             }
         }
     }
 }
 
 void PhrasePrefixQuery::search(roaring::Roaring& roaring) {
-    _searcher->_search(&_query, [&roaring](const int32_t docid, const float_t 
/*score*/) {
+    _searcher->_search(_query.get(), [&roaring](const int32_t docid, const 
float_t /*score*/) {
         roaring.add(docid);
     });
 }
 
-} // namespace segment_v2
-
-} // namespace doris
\ No newline at end of file
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h
index 28007620ce5..e565c0409cf 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h
@@ -17,38 +17,31 @@
 
 #pragma once
 
-#include <CLucene.h>
-#include <CLucene/index/IndexReader.h>
-
 #include <memory>
 
+// clang-format off
+#include "olap/rowset/segment_v2/inverted_index/query/query.h"
 #include "CLucene/search/MultiPhraseQuery.h"
-#include "roaring/roaring.hh"
+// clang-format on
 
-CL_NS_USE(index)
 CL_NS_USE(search)
 
-namespace doris {
-
-namespace segment_v2 {
+namespace doris::segment_v2 {
 
-class PhrasePrefixQuery {
+class PhrasePrefixQuery : public Query {
 public:
-    PhrasePrefixQuery(const std::shared_ptr<lucene::search::IndexSearcher>& 
searcher);
-    ~PhrasePrefixQuery() = default;
+    PhrasePrefixQuery(const std::shared_ptr<lucene::search::IndexSearcher>& 
searcher,
+                      const TQueryOptions& query_options);
+    ~PhrasePrefixQuery() override = default;
 
-    void set_max_expansions(int32_t max_expansions) { _max_expansions = 
max_expansions; }
-
-    void add(const std::wstring& field_name, const std::vector<std::string>& 
terms);
-    void search(roaring::Roaring& roaring);
+    void add(const std::wstring& field_name, const std::vector<std::string>& 
terms) override;
+    void search(roaring::Roaring& roaring) override;
 
 private:
     std::shared_ptr<lucene::search::IndexSearcher> _searcher;
-    MultiPhraseQuery _query;
 
+    std::unique_ptr<CL_NS(search)::MultiPhraseQuery> _query;
     int32_t _max_expansions = 50;
 };
 
-} // namespace segment_v2
-
-} // namespace doris
\ No newline at end of file
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
index 527e89a8878..a4b7f7502d1 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
@@ -19,6 +19,10 @@
 
 namespace doris::segment_v2 {
 
+PhraseQuery::PhraseQuery(const std::shared_ptr<lucene::search::IndexSearcher>& 
searcher,
+                         const TQueryOptions& query_options)
+        : _searcher(searcher) {}
+
 PhraseQuery::~PhraseQuery() {
     for (auto& term_doc : _term_docs) {
         if (term_doc) {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
index f4b464ce358..8f62989d86b 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
@@ -17,28 +17,21 @@
 
 #pragma once
 
-#include <CLucene.h>
-#include <CLucene/index/IndexReader.h>
-#include <CLucene/index/Term.h>
-#include <CLucene/search/query/TermIterator.h>
-#include <CLucene/search/query/TermPositionIterator.h>
-
-#include <memory>
-
-#include "roaring/roaring.hh"
+#include "olap/rowset/segment_v2/inverted_index/query/query.h"
 
 CL_NS_USE(index)
+CL_NS_USE(search)
 
 namespace doris::segment_v2 {
 
-class PhraseQuery {
+class PhraseQuery : public Query {
 public:
-    PhraseQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher)
-            : _searcher(searcher) {}
-    ~PhraseQuery();
+    PhraseQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher,
+                const TQueryOptions& query_options);
+    ~PhraseQuery() override;
 
-    void add(const std::wstring& field_name, const std::vector<std::string>& 
terms);
-    void search(roaring::Roaring& roaring);
+    void add(const std::wstring& field_name, const std::vector<std::string>& 
terms) override;
+    void search(roaring::Roaring& roaring) override;
 
 private:
     class PostingsAndPosition {
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp
index 7d23d6eb60f..14006227352 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp
@@ -17,7 +17,7 @@
 
 #include "prefix_query.h"
 
-namespace doris {
+namespace doris::segment_v2 {
 
 void PrefixQuery::get_prefix_terms(IndexReader* reader, const std::wstring& 
field_name,
                                    const std::string& prefix,
@@ -77,4 +77,4 @@ void PrefixQuery::get_prefix_terms(IndexReader* reader, const 
std::wstring& fiel
     });
 }
 
-} // namespace doris
\ No newline at end of file
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h
index 5deb0c1a362..9a33b13dd8d 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h
@@ -24,12 +24,12 @@
 
 CL_NS_USE(index)
 
-namespace doris {
+namespace doris::segment_v2 {
 
 class PrefixQuery {
 public:
     PrefixQuery() = default;
-    ~PrefixQuery() = default;
+    virtual ~PrefixQuery() = default;
 
     static void get_prefix_terms(IndexReader* reader, const std::wstring& 
field_name,
                                  const std::string& prefix,
@@ -37,4 +37,4 @@ public:
                                  int32_t max_expansions = 50);
 };
 
-} // namespace doris
\ No newline at end of file
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h
similarity index 52%
copy from be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h
copy to be/src/olap/rowset/segment_v2/inverted_index/query/query.h
index 36d9478c20d..091ba7d3958 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h
@@ -17,47 +17,35 @@
 
 #pragma once
 
-#include <CLucene.h>
+#include <CLucene.h> // IWYU pragma: keep
 #include <CLucene/index/IndexReader.h>
-#include <CLucene/index/IndexVersion.h>
 #include <CLucene/index/Term.h>
 #include <CLucene/search/query/TermIterator.h>
+#include <CLucene/search/query/TermPositionIterator.h>
+#include <gen_cpp/PaloInternalService_types.h>
+
+#include <memory>
 
 #include "roaring/roaring.hh"
 
 CL_NS_USE(index)
+CL_NS_USE(search)
+CL_NS_USE(util)
 
-namespace doris {
+namespace doris::segment_v2 {
 
-class ConjunctionQuery {
+class Query {
 public:
-    ConjunctionQuery(IndexReader* reader);
-    ~ConjunctionQuery();
-
-    void set_conjunction_ratio(int32_t conjunction_ratio) {
-        _conjunction_ratio = conjunction_ratio;
-    }
-
-    void add(const std::wstring& field_name, const std::vector<std::string>& 
terms);
-    void search(roaring::Roaring& roaring);
-
-private:
-    void search_by_bitmap(roaring::Roaring& roaring);
-    void search_by_skiplist(roaring::Roaring& roaring);
-
-    int32_t do_next(int32_t doc);
-
-    IndexReader* _reader = nullptr;
-    IndexVersion _index_version = IndexVersion::kV0;
-    int32_t _conjunction_ratio = 1000;
-    bool _use_skip = false;
+    virtual ~Query() = default;
 
-    TermIterator _lead1;
-    TermIterator _lead2;
-    std::vector<TermIterator> _others;
+    // a unified data preparation interface that provides the field names to 
be queried and the terms for the query.
+    // @param field_name The name of the field within the data source to 
search against.
+    // @param terms a vector of tokenized strings that represent the search 
terms.
+    virtual void add(const std::wstring& field_name, const 
std::vector<std::string>& terms) = 0;
 
-    std::vector<Term*> _terms;
-    std::vector<TermDocs*> _term_docs;
+    // a unified query interface for retrieving the ids obtained from the 
search.
+    // @param roaring a Roaring bitmap to be populated with the search results,
+    virtual void search(roaring::Roaring& roaring) = 0;
 };
 
-} // namespace doris
\ No newline at end of file
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query_factory.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/query_factory.h
new file mode 100644
index 00000000000..6de8a7e4c25
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query_factory.h
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "olap/rowset/segment_v2/inverted_index_query_type.h"
+
+namespace doris::segment_v2 {
+
+class Query;
+class DisjunctionQuery;
+class ConjunctionQuery;
+class PhraseQuery;
+class PhrasePrefixQuery;
+class RegexpQuery;
+
+class QueryFactory {
+public:
+    template <typename... Args>
+    static std::unique_ptr<Query> create(InvertedIndexQueryType query_type, 
Args&&... args) {
+        switch (query_type) {
+        case InvertedIndexQueryType::MATCH_ANY_QUERY:
+        case InvertedIndexQueryType::EQUAL_QUERY:
+            return 
std::make_unique<DisjunctionQuery>(std::forward<Args>(args)...);
+        case InvertedIndexQueryType::MATCH_ALL_QUERY:
+            return 
std::make_unique<ConjunctionQuery>(std::forward<Args>(args)...);
+        case InvertedIndexQueryType::MATCH_PHRASE_QUERY:
+            return std::make_unique<PhraseQuery>(std::forward<Args>(args)...);
+        case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY:
+            return 
std::make_unique<PhrasePrefixQuery>(std::forward<Args>(args)...);
+        case InvertedIndexQueryType::MATCH_REGEXP_QUERY:
+            return std::make_unique<RegexpQuery>(std::forward<Args>(args)...);
+        default:
+            return nullptr;
+        }
+    }
+};
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp
index ee959e1da73..007da8289dc 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp
@@ -24,10 +24,19 @@
 
 namespace doris::segment_v2 {
 
-RegexpQuery::RegexpQuery(const std::shared_ptr<lucene::search::IndexSearcher>& 
searcher)
-        : _searcher(searcher), query(searcher->getReader()) {}
+RegexpQuery::RegexpQuery(const std::shared_ptr<lucene::search::IndexSearcher>& 
searcher,
+                         const TQueryOptions& query_options)
+        : _searcher(searcher),
+          _max_expansions(query_options.inverted_index_max_expansions),
+          _query(searcher, query_options) {}
+
+void RegexpQuery::add(const std::wstring& field_name, const 
std::vector<std::string>& patterns) {
+    if (patterns.size() != 1) {
+        _CLTHROWA(CL_ERR_IllegalArgument, "RegexpQuery::add: terms size != 1");
+    }
+
+    const std::string& pattern = patterns[0];
 
-void RegexpQuery::add(const std::wstring& field_name, const std::string& 
pattern) {
     hs_database_t* database = nullptr;
     hs_compile_error_t* compile_err = nullptr;
     hs_scratch_t* scratch = nullptr;
@@ -94,11 +103,11 @@ void RegexpQuery::add(const std::wstring& field_name, 
const std::string& pattern
         return;
     }
 
-    query.add(field_name, terms);
+    _query.add(field_name, terms);
 }
 
 void RegexpQuery::search(roaring::Roaring& roaring) {
-    query.search(roaring);
+    _query.search(roaring);
 }
 
 } // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h
index 3791ad50d8f..336b2d0b6a6 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h
@@ -17,30 +17,28 @@
 
 #pragma once
 
-#include <memory>
-
 #include "olap/rowset/segment_v2/inverted_index/query/disjunction_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/query.h"
 
 CL_NS_USE(index)
 CL_NS_USE(search)
 
 namespace doris::segment_v2 {
 
-class RegexpQuery {
+class RegexpQuery : public Query {
 public:
-    RegexpQuery(const std::shared_ptr<lucene::search::IndexSearcher>& 
searcher);
-    ~RegexpQuery() = default;
-
-    void set_max_expansions(int32_t max_expansions) { _max_expansions = 
max_expansions; }
+    RegexpQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher,
+                const TQueryOptions& query_options);
+    ~RegexpQuery() override = default;
 
-    void add(const std::wstring& field_name, const std::string& pattern);
-    void search(roaring::Roaring& roaring);
+    void add(const std::wstring& field_name, const std::vector<std::string>& 
patterns) override;
+    void search(roaring::Roaring& roaring) override;
 
 private:
     std::shared_ptr<lucene::search::IndexSearcher> _searcher;
 
     int32_t _max_expansions = 50;
-    DisjunctionQuery query;
+    DisjunctionQuery _query;
 };
 
 } // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 5eaacb4640f..172679f1fa6 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -61,6 +61,7 @@
 #include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h"
 #include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h"
 #include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/query_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/query/regexp_query.h"
 #include "olap/rowset/segment_v2/inverted_index_cache.h"
 #include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
@@ -328,20 +329,10 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
                             
std::get_if<FulltextIndexSearcherPtr>(&searcher_variant)) {
                     term_match_bitmap = std::make_shared<roaring::Roaring>();
 
-                    Status res = Status::OK();
-                    if (query_type == 
InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
-                        res = match_phrase_index_search(stats, runtime_state, 
field_ws,
-                                                        analyse_result, 
*searcher_ptr,
-                                                        term_match_bitmap);
-                    } else if (query_type == 
InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) {
-                        res = match_phrase_prefix_index_search(stats, 
runtime_state, field_ws,
-                                                               analyse_result, 
*searcher_ptr,
-                                                               
term_match_bitmap);
-                    } else {
-                        res = match_all_index_search(stats, runtime_state, 
field_ws, analyse_result,
-                                                     *searcher_ptr, 
term_match_bitmap);
-                    }
-                    if (!res.ok()) {
+                    Status res =
+                            match_index_search(stats, runtime_state, 
query_type, field_ws,
+                                               analyse_result, *searcher_ptr, 
term_match_bitmap);
+                    if (!res) {
                         return res;
                     }
 
@@ -352,8 +343,6 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
             }
             query_match_bitmap = *term_match_bitmap;
         } else if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
-            const std::string& pattern = analyse_result[0];
-
             std::shared_ptr<roaring::Roaring> term_match_bitmap = nullptr;
             auto* cache = InvertedIndexQueryCache::instance();
 
@@ -361,7 +350,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
             cache_key.index_path = index_file_path;
             cache_key.column_name = column_name;
             cache_key.query_type = query_type;
-            cache_key.value = pattern;
+            cache_key.value = analyse_result[0];
             InvertedIndexQueryCacheHandle cache_handle;
             if (cache->lookup(cache_key, &cache_handle)) {
                 stats->inverted_index_query_cache_hit++;
@@ -377,8 +366,9 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
                             
std::get_if<FulltextIndexSearcherPtr>(&searcher_variant)) {
                     term_match_bitmap = std::make_shared<roaring::Roaring>();
 
-                    Status res = match_regexp_index_search(stats, 
runtime_state, field_ws, pattern,
-                                                           *searcher_ptr, 
term_match_bitmap);
+                    Status res =
+                            match_index_search(stats, runtime_state, 
query_type, field_ws,
+                                               analyse_result, *searcher_ptr, 
term_match_bitmap);
                     if (!res.ok()) {
                         return res;
                     }
@@ -499,73 +489,21 @@ Status FullTextIndexReader::normal_index_search(
     return Status::OK();
 }
 
-Status FullTextIndexReader::match_all_index_search(
-        OlapReaderStatistics* stats, RuntimeState* runtime_state, const 
std::wstring& field_ws,
-        const std::vector<std::string>& analyse_result,
-        const FulltextIndexSearcherPtr& index_searcher,
-        const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
-    TQueryOptions queryOptions = runtime_state->query_options();
-    try {
-        SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
-        ConjunctionQuery query(index_searcher->getReader());
-        
query.set_conjunction_ratio(queryOptions.inverted_index_conjunction_opt_threshold);
-        query.add(field_ws, analyse_result);
-        query.search(*term_match_bitmap);
-    } catch (const CLuceneError& e) {
-        return 
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: 
{}",
-                                                                      
e.what());
-    }
-    return Status::OK();
-}
-
-Status FullTextIndexReader::match_phrase_index_search(
-        OlapReaderStatistics* stats, RuntimeState* runtime_state, const 
std::wstring& field_ws,
-        const std::vector<std::string>& analyse_result,
-        const FulltextIndexSearcherPtr& index_searcher,
-        const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
-    TQueryOptions queryOptions = runtime_state->query_options();
-    try {
-        SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
-        PhraseQuery query(index_searcher);
-        query.add(field_ws, analyse_result);
-        query.search(*term_match_bitmap);
-    } catch (const CLuceneError& e) {
-        return 
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: 
{}",
-                                                                      
e.what());
-    }
-    return Status::OK();
-}
-
-Status FullTextIndexReader::match_phrase_prefix_index_search(
-        OlapReaderStatistics* stats, RuntimeState* runtime_state, const 
std::wstring& field_ws,
-        const std::vector<std::string>& analyse_result,
+Status FullTextIndexReader::match_index_search(
+        OlapReaderStatistics* stats, RuntimeState* runtime_state, 
InvertedIndexQueryType query_type,
+        const std::wstring& field_ws, const std::vector<std::string>& 
analyse_result,
         const FulltextIndexSearcherPtr& index_searcher,
         const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
     TQueryOptions queryOptions = runtime_state->query_options();
     try {
         SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
-        PhrasePrefixQuery query(index_searcher);
-        query.set_max_expansions(queryOptions.inverted_index_max_expansions);
-        query.add(field_ws, analyse_result);
-        query.search(*term_match_bitmap);
-    } catch (const CLuceneError& e) {
-        return 
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: 
{}",
-                                                                      
e.what());
-    }
-    return Status::OK();
-}
-
-Status FullTextIndexReader::match_regexp_index_search(
-        OlapReaderStatistics* stats, RuntimeState* runtime_state, const 
std::wstring& field_ws,
-        const std::string& pattern, const FulltextIndexSearcherPtr& 
index_searcher,
-        const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
-    TQueryOptions queryOptions = runtime_state->query_options();
-    try {
-        SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
-        RegexpQuery query(index_searcher);
-        query.set_max_expansions(queryOptions.inverted_index_max_expansions);
-        query.add(field_ws, pattern);
-        query.search(*term_match_bitmap);
+        auto query = QueryFactory::create(query_type, index_searcher, 
queryOptions);
+        if (!query) {
+            return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
+                    "query type " + query_type_to_string(query_type) + ", 
query is nullptr");
+        }
+        query->add(field_ws, analyse_result);
+        query->search(*term_match_bitmap);
     } catch (const CLuceneError& e) {
         return 
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: 
{}",
                                                                       
e.what());
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 0d81022d5ba..430ec917329 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -161,28 +161,11 @@ private:
                                const std::unique_ptr<lucene::search::Query>& 
query,
                                const std::shared_ptr<roaring::Roaring>& 
term_match_bitmap);
 
-    Status match_all_index_search(OlapReaderStatistics* stats, RuntimeState* 
runtime_state,
-                                  const std::wstring& field_ws,
-                                  const std::vector<std::string>& 
analyse_result,
-                                  const FulltextIndexSearcherPtr& 
index_searcher,
-                                  const std::shared_ptr<roaring::Roaring>& 
term_match_bitmap);
-
-    Status match_phrase_index_search(OlapReaderStatistics* stats, 
RuntimeState* runtime_state,
-                                     const std::wstring& field_ws,
-                                     const std::vector<std::string>& 
analyse_result,
-                                     const FulltextIndexSearcherPtr& 
index_searcher,
-                                     const std::shared_ptr<roaring::Roaring>& 
term_match_bitmap);
-
-    Status match_phrase_prefix_index_search(
-            OlapReaderStatistics* stats, RuntimeState* runtime_state, const 
std::wstring& field_ws,
-            const std::vector<std::string>& analyse_result,
-            const FulltextIndexSearcherPtr& index_searcher,
-            const std::shared_ptr<roaring::Roaring>& term_match_bitmap);
-
-    Status match_regexp_index_search(OlapReaderStatistics* stats, 
RuntimeState* runtime_state,
-                                     const std::wstring& field_ws, const 
std::string& pattern,
-                                     const FulltextIndexSearcherPtr& 
index_searcher,
-                                     const std::shared_ptr<roaring::Roaring>& 
term_match_bitmap);
+    Status match_index_search(OlapReaderStatistics* stats, RuntimeState* 
runtime_state,
+                              InvertedIndexQueryType query_type, const 
std::wstring& field_ws,
+                              const std::vector<std::string>& analyse_result,
+                              const FulltextIndexSearcherPtr& index_searcher,
+                              const std::shared_ptr<roaring::Roaring>& 
term_match_bitmap);
 
     void check_null_bitmap(const FulltextIndexSearcherPtr& index_searcher,
                            bool& null_bitmap_already_read);
diff --git a/docs/en/docs/data-table/index/inverted-index.md 
b/docs/en/docs/data-table/index/inverted-index.md
index 75a8f6a3b1b..2f511f312f4 100644
--- a/docs/en/docs/data-table/index/inverted-index.md
+++ b/docs/en/docs/data-table/index/inverted-index.md
@@ -51,6 +51,9 @@ The features for inverted index is as follows:
 
 - add fulltext search on text(string, varchar, char) field
   - MATCH_ALL matches all keywords, MATCH_ANY matches any keywords
+  - support phrase query MATCH_PHRASE
+  - support phrase + prefix query MATCH_PHRASE_PREFIX
+  - support regexp query MATCH_REGEXP
   - support fulltext on array of text field
   - support english, chinese and mixed unicode word parser
 - accelerate normal equal, range query, replacing bitmap index in the future
@@ -181,6 +184,15 @@ SELECT * FROM table_name WHERE logmsg MATCH_ALL 'keyword1 
keyword2';
 -- 1.4 find rows that logmsg contains both keyword1 and keyword2, and in the 
order of keyword1 appearing first and keyword2 appearing later.
 SELECT * FROM table_name WHERE logmsg MATCH_PHRASE 'keyword1 keyword2';
 
+-- 1.5 perform prefix matching on the last word "keyword2" while maintaining 
the order of words, defaulting to finding 50 prefix words (controlled by the 
session variable inverted_index_max_expansions)
+SELECT * FROM table_name WHERE logmsg MATCH_PHRASE_PREFIX 'keyword1 keyword2';
+
+-- 1.6 If only one word is entered, it degrades to a prefix query, defaulting 
to finding 50 prefix words (controlled by the session variable 
inverted_index_max_expansions)
+SELECT * FROM table_name WHERE logmsg MATCH_PHRASE_PREFIX 'keyword1';
+
+-- 1.7 perform regex matching on the tokenized words, defaulting to matching 
50 tokens (controlled by the session variable inverted_index_max_expansions)
+SELECT * FROM table_name WHERE logmsg MATCH_REGEXP 'key*';
+
 -- 2. normal equal, range query
 SELECT * FROM table_name WHERE id = 123;
 SELECT * FROM table_name WHERE ts > '2023-01-01 00:00:00';
diff --git a/docs/zh-CN/docs/data-table/index/inverted-index.md 
b/docs/zh-CN/docs/data-table/index/inverted-index.md
index 2f4c3f85bfe..ff29c75d6db 100644
--- a/docs/zh-CN/docs/data-table/index/inverted-index.md
+++ b/docs/zh-CN/docs/data-table/index/inverted-index.md
@@ -50,7 +50,10 @@ under the License.
 Doris倒排索引的功能简要介绍如下:
 
 - 增加了字符串类型的全文检索
-  - 支持字符串全文检索,包括同时匹配多个关键字MATCH_ALL、匹配任意一个关键字MATCH_ANY、匹配短语词组MATCH_PHRASE
+  - 支持字符串全文检索,包括同时匹配多个关键字MATCH_ALL、匹配任意一个关键字MATCH_ANY
+  - 支持短语查询 MATCH_PHRASE
+  - 支持短语+前缀 MATCH_PHRASE_PREFIX
+  - 支持正则查询 MATCH_REGEXP
   - 支持字符串数组类型的全文检索
   - 支持英文、中文以及Unicode多语言分词
 - 加速普通等值、范围查询,覆盖bitmap索引的功能,未来会代替bitmap索引
@@ -179,6 +182,15 @@ SELECT * FROM table_name WHERE logmsg MATCH_ALL 'keyword1 
keyword2';
 -- 1.4 logmsg中同时包含keyword1和keyword2的行,并且按照keyword1在前,keyword2在后的顺序
 SELECT * FROM table_name WHERE logmsg MATCH_PHRASE 'keyword1 keyword2';
 
+-- 1.5 
在保持词顺序的前提下,对最后一个词keyword2做前缀匹配,默认找50个前缀词(session变量inverted_index_max_expansions控制)
+SELECT * FROM table_name WHERE logmsg MATCH_PHRASE_PREFIX 'keyword1 keyword2';
+
+-- 1.6 如果只填一个词会退化为前缀查询,默认找50个前缀词(session变量inverted_index_max_expansions控制)
+SELECT * FROM table_name WHERE logmsg MATCH_PHRASE_PREFIX 'keyword1';
+
+-- 1.7 对分词后的词进行正则匹配,默认匹配50个(session变量inverted_index_max_expansions控制)
+SELECT * FROM table_name WHERE logmsg MATCH_REGEXP 'key*';
+
 
 -- 2. 普通等值、范围、IN、NOT IN,正常的SQL语句即可,例如
 SELECT * FROM table_name WHERE id = 123;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to