This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 6708cb6c3bb [feature](invert index) add match_phrase_prefix and
match_regexp #27404 #28257 (#28715)
6708cb6c3bb is described below
commit 6708cb6c3bb848b95c4ca2946d234347630474da
Author: zzzxl <[email protected]>
AuthorDate: Thu Dec 21 20:32:34 2023 +0800
[feature](invert index) add match_phrase_prefix and match_regexp #27404
#28257 (#28715)
---
be/src/clucene | 2 +-
be/src/exec/olap_common.h | 4 +
be/src/exec/olap_utils.h | 17 ++-
be/src/olap/match_predicate.cpp | 8 +-
.../inverted_index/query/conjunction_query.cpp | 6 +-
.../inverted_index/query/disjunction_query.cpp | 17 ++-
.../inverted_index/query/disjunction_query.h | 1 -
.../inverted_index/query/phrase_prefix_query.cpp | 63 ++++++++++
.../{disjunction_query.h => phrase_prefix_query.h} | 28 +++--
.../inverted_index/query/prefix_query.cpp | 80 +++++++++++++
.../query/{disjunction_query.h => prefix_query.h} | 26 ++--
.../inverted_index/query/regexp_query.cpp | 98 +++++++++++++++
.../query/{disjunction_query.h => regexp_query.h} | 32 +++--
.../rowset/segment_v2/inverted_index_query_type.h | 8 ++
.../rowset/segment_v2/inverted_index_reader.cpp | 131 +++++++++++++++++----
.../olap/rowset/segment_v2/inverted_index_reader.h | 21 +++-
be/src/vec/functions/function_tokenize.cpp | 8 +-
be/src/vec/functions/match.cpp | 39 +++---
be/src/vec/functions/match.h | 34 ++++++
.../antlr4/org/apache/doris/nereids/DorisLexer.g4 | 2 +
.../antlr4/org/apache/doris/nereids/DorisParser.g4 | 2 +-
fe/fe-core/src/main/cup/sql_parser.cup | 8 +-
.../org/apache/doris/analysis/MatchPredicate.java | 22 ++++
.../doris/nereids/parser/LogicalPlanBuilder.java | 14 +++
.../doris/nereids/trees/expressions/Match.java | 4 +
.../trees/expressions/MatchPhrasePrefix.java | 49 ++++++++
.../nereids/trees/expressions/MatchRegexp.java | 49 ++++++++
.../expressions/visitor/ExpressionVisitor.java | 10 ++
.../java/org/apache/doris/qe/SessionVariable.java | 8 ++
fe/fe-core/src/main/jflex/sql_scanner.flex | 2 +
gensrc/thrift/Opcodes.thrift | 2 +
gensrc/thrift/PaloInternalService.thrift | 2 +
.../test_index_match_phrase_prefix.out | 31 +++++
.../inverted_index_p0/test_index_match_regexp.out | 16 +++
.../test_index_match_phrase_prefix.groovy | 98 +++++++++++++++
.../test_index_match_regexp.groovy | 89 ++++++++++++++
36 files changed, 919 insertions(+), 112 deletions(-)
diff --git a/be/src/clucene b/be/src/clucene
index 70c1a692bbb..6f8a21ffe15 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 70c1a692bbb1277f107ff2ddedda41b3a223c632
+Subproject commit 6f8a21ffe15bd78a1cd3e685067ee5c9ed071827
diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h
index 5bd06d8d540..7133f40e7ad 100644
--- a/be/src/exec/olap_common.h
+++ b/be/src/exec/olap_common.h
@@ -304,6 +304,10 @@ public:
condition.__set_condition_op("match_all");
} else if (value.first == MatchType::MATCH_PHRASE) {
condition.__set_condition_op("match_phrase");
+ } else if (value.first == MatchType::MATCH_PHRASE_PREFIX) {
+ condition.__set_condition_op("match_phrase_prefix");
+ } else if (value.first == MatchType::MATCH_REGEXP) {
+ condition.__set_condition_op("match_regexp");
} else if (value.first == MatchType::MATCH_ELEMENT_EQ) {
condition.__set_condition_op("match_element_eq");
} else if (value.first == MatchType::MATCH_ELEMENT_LT) {
diff --git a/be/src/exec/olap_utils.h b/be/src/exec/olap_utils.h
index 1d6bdf95930..1b8525dc1b9 100644
--- a/be/src/exec/olap_utils.h
+++ b/be/src/exec/olap_utils.h
@@ -170,6 +170,8 @@ enum class MatchType {
MATCH_ELEMENT_GT = 5,
MATCH_ELEMENT_LE = 6,
MATCH_ELEMENT_GE = 7,
+ MATCH_PHRASE_PREFIX = 8,
+ MATCH_REGEXP = 9,
};
inline MatchType to_match_type(TExprOpcode::type type) {
@@ -183,6 +185,12 @@ inline MatchType to_match_type(TExprOpcode::type type) {
case TExprOpcode::type::MATCH_PHRASE:
return MatchType::MATCH_PHRASE;
break;
+ case TExprOpcode::type::MATCH_PHRASE_PREFIX:
+ return MatchType::MATCH_PHRASE_PREFIX;
+ break;
+ case TExprOpcode::type::MATCH_REGEXP:
+ return MatchType::MATCH_REGEXP;
+ break;
case TExprOpcode::type::MATCH_ELEMENT_EQ:
return MatchType::MATCH_ELEMENT_EQ;
break;
@@ -212,6 +220,10 @@ inline MatchType to_match_type(const std::string&
condition_op) {
return MatchType::MATCH_ALL;
} else if (condition_op.compare("match_phrase") == 0) {
return MatchType::MATCH_PHRASE;
+ } else if (condition_op.compare("match_phrase_prefix") == 0) {
+ return MatchType::MATCH_PHRASE_PREFIX;
+ } else if (condition_op.compare("match_regexp") == 0) {
+ return MatchType::MATCH_REGEXP;
} else if (condition_op.compare("match_element_eq") == 0) {
return MatchType::MATCH_ELEMENT_EQ;
} else if (condition_op.compare("match_element_lt") == 0) {
@@ -229,6 +241,8 @@ inline MatchType to_match_type(const std::string&
condition_op) {
inline bool is_match_condition(const std::string& op) {
if (0 == strcasecmp(op.c_str(), "match_any") || 0 ==
strcasecmp(op.c_str(), "match_all") ||
0 == strcasecmp(op.c_str(), "match_phrase") ||
+ 0 == strcasecmp(op.c_str(), "match_phrase_prefix") ||
+ 0 == strcasecmp(op.c_str(), "match_regexp") ||
0 == strcasecmp(op.c_str(), "match_element_eq") ||
0 == strcasecmp(op.c_str(), "match_element_lt") ||
0 == strcasecmp(op.c_str(), "match_element_gt") ||
@@ -241,7 +255,8 @@ inline bool is_match_condition(const std::string& op) {
inline bool is_match_operator(const TExprOpcode::type& op_type) {
return TExprOpcode::MATCH_ANY == op_type || TExprOpcode::MATCH_ALL ==
op_type ||
- TExprOpcode::MATCH_PHRASE == op_type ||
TExprOpcode::MATCH_ELEMENT_EQ == op_type ||
+ TExprOpcode::MATCH_PHRASE == op_type ||
TExprOpcode::MATCH_PHRASE_PREFIX == op_type ||
+ TExprOpcode::MATCH_REGEXP == op_type ||
TExprOpcode::MATCH_ELEMENT_EQ == op_type ||
TExprOpcode::MATCH_ELEMENT_LT == op_type ||
TExprOpcode::MATCH_ELEMENT_GT == op_type ||
TExprOpcode::MATCH_ELEMENT_LE == op_type ||
TExprOpcode::MATCH_ELEMENT_GE == op_type;
}
diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp
index 61d25723155..8ffd6d99936 100644
--- a/be/src/olap/match_predicate.cpp
+++ b/be/src/olap/match_predicate.cpp
@@ -95,6 +95,12 @@ InvertedIndexQueryType
MatchPredicate::_to_inverted_index_query_type(MatchType m
case MatchType::MATCH_PHRASE:
ret = InvertedIndexQueryType::MATCH_PHRASE_QUERY;
break;
+ case MatchType::MATCH_PHRASE_PREFIX:
+ ret = InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY;
+ break;
+ case MatchType::MATCH_REGEXP:
+ ret = InvertedIndexQueryType::MATCH_REGEXP_QUERY;
+ break;
case MatchType::MATCH_ELEMENT_EQ:
ret = InvertedIndexQueryType::EQUAL_QUERY;
break;
@@ -117,7 +123,7 @@ InvertedIndexQueryType
MatchPredicate::_to_inverted_index_query_type(MatchType m
}
bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const {
- if (_match_type == MatchType::MATCH_PHRASE &&
+ if ((_match_type == MatchType::MATCH_PHRASE || _match_type ==
MatchType::MATCH_PHRASE_PREFIX) &&
iterator->get_inverted_index_reader_type() ==
InvertedIndexReaderType::FULLTEXT &&
get_parser_phrase_support_string_from_properties(iterator->get_index_properties())
==
INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) {
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
index b77edc79ade..b2448a8fa8e 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
@@ -38,12 +38,12 @@ ConjunctionQuery::~ConjunctionQuery() {
}
void ConjunctionQuery::add(const std::wstring& field_name, const
std::vector<std::string>& terms) {
- if (terms.size() < 1) {
- _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size()
< 1");
+ if (terms.empty()) {
+ _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms
empty");
}
std::vector<TermIterator> iterators;
- for (auto& term : terms) {
+ for (const auto& term : terms) {
std::wstring ws_term = StringUtil::string_to_wstring(term);
Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
_terms.push_back(t);
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp
index 07a159b3222..7b797d7b54a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp
@@ -22,26 +22,25 @@ namespace doris {
DisjunctionQuery::DisjunctionQuery(IndexReader* reader) : _reader(reader) {}
DisjunctionQuery::~DisjunctionQuery() {
- for (auto& term : _terms) {
- if (term) {
- _CLDELETE(term);
- }
- }
for (auto& term_doc : _term_docs) {
if (term_doc) {
_CLDELETE(term_doc);
}
}
+ for (auto& term : _terms) {
+ if (term) {
+ _CLDELETE(term);
+ }
+ }
}
void DisjunctionQuery::add(const std::wstring& field_name, const
std::vector<std::string>& terms) {
- if (terms.size() < 1) {
- _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size()
< 1");
+ if (terms.empty()) {
+ _CLTHROWA(CL_ERR_IllegalArgument, "DisjunctionQuery::add: terms
empty");
}
- for (auto& term : terms) {
+ for (const auto& term : terms) {
std::wstring ws_term = StringUtil::string_to_wstring(term);
- _wsterms.emplace_back(&ws_term);
Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
_terms.push_back(t);
TermDocs* term_doc = _reader->termDocs(t);
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
index f42fd69dabc..bb0a837f42a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
@@ -39,7 +39,6 @@ public:
private:
IndexReader* _reader = nullptr;
- std::vector<std::wstring*> _wsterms;
std::vector<Term*> _terms;
std::vector<TermDocs*> _term_docs;
std::vector<TermIterator> _term_iterators;
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp
new file mode 100644
index 00000000000..4b0340cda4a
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "phrase_prefix_query.h"
+
+#include "olap/rowset//segment_v2/inverted_index/query/prefix_query.h"
+
+namespace doris {
+
+namespace segment_v2 {
+
+// Creates a phrase-prefix query that runs against `searcher`. The shared_ptr is copied into
+// the query object, which uses it in add() to reach the index reader and in search() to
+// execute the query.
+PhrasePrefixQuery::PhrasePrefixQuery(
+        const std::shared_ptr<lucene::search::IndexSearcher>& searcher)
+        : _searcher(searcher) {}
+
+// Populates the underlying MultiPhraseQuery from the analysed tokens of the search string.
+//
+// Every token except the last is added as an exact term at its own phrase position. The
+// last token is treated as a prefix: PrefixQuery::get_prefix_terms expands it (bounded by
+// _max_expansions) into indexed terms starting with it, and those alternatives are added
+// together as the final position.
+//
+// field_name  field the terms belong to
+// terms       analysed tokens, in phrase order; an empty vector leaves the query untouched
+void PhrasePrefixQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms) {
+    if (terms.empty()) {
+        return;
+    }
+
+    const size_t last = terms.size() - 1;
+
+    // Leading tokens: one exact term per position.
+    for (size_t i = 0; i < last; i++) {
+        std::wstring ws = StringUtil::string_to_wstring(terms[i]);
+        Term* t = _CLNEW Term(field_name.c_str(), ws.c_str());
+        _query.add(t);
+        // Release the reference taken by _CLNEW; the query is expected to hold its own.
+        _CLDECDELETE(t);
+    }
+
+    // Trailing token: expand the prefix into the matching indexed terms.
+    std::vector<CL_NS(index)::Term*> prefix_terms;
+    PrefixQuery::get_prefix_terms(_searcher->getReader(), field_name, terms[last], prefix_terms,
+                                  _max_expansions);
+    if (prefix_terms.empty()) {
+        // No indexed term starts with the prefix. Skipping the position here would shrink
+        // the phrase to the leading tokens only (or to an empty query), which can match
+        // rows that do not contain the prefix at all. Keep the position by adding the
+        // literal token instead, so the phrase still requires something at that slot.
+        std::wstring ws = StringUtil::string_to_wstring(terms[last]);
+        prefix_terms.push_back(_CLNEW Term(field_name.c_str(), ws.c_str()));
+    }
+    _query.add(prefix_terms);
+    for (auto& t : prefix_terms) {
+        _CLDECDELETE(t);
+    }
+}
+
+// Executes the phrase-prefix query through the searcher and records every reported
+// document id in `roaring`. The score passed to the callback is ignored.
+void PhrasePrefixQuery::search(roaring::Roaring& roaring) {
+    auto collect_doc = [&roaring](const int32_t docid, const float_t /*score*/) {
+        roaring.add(docid);
+    };
+    _searcher->_search(&_query, collect_doc);
+}
+
+} // namespace segment_v2
+
+} // namespace doris
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h
similarity index 67%
copy from be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
copy to be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h
index f42fd69dabc..28007620ce5 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h
@@ -19,30 +19,36 @@
#include <CLucene.h>
#include <CLucene/index/IndexReader.h>
-#include <CLucene/index/IndexVersion.h>
-#include <CLucene/index/Term.h>
-#include <CLucene/search/query/TermIterator.h>
+#include <memory>
+
+#include "CLucene/search/MultiPhraseQuery.h"
#include "roaring/roaring.hh"
CL_NS_USE(index)
+CL_NS_USE(search)
namespace doris {
-class DisjunctionQuery {
+namespace segment_v2 {
+
+class PhrasePrefixQuery {
public:
- DisjunctionQuery(IndexReader* reader);
- ~DisjunctionQuery();
+ PhrasePrefixQuery(const std::shared_ptr<lucene::search::IndexSearcher>&
searcher);
+ ~PhrasePrefixQuery() = default;
+
+ void set_max_expansions(int32_t max_expansions) { _max_expansions =
max_expansions; }
void add(const std::wstring& field_name, const std::vector<std::string>&
terms);
void search(roaring::Roaring& roaring);
private:
- IndexReader* _reader = nullptr;
- std::vector<std::wstring*> _wsterms;
- std::vector<Term*> _terms;
- std::vector<TermDocs*> _term_docs;
- std::vector<TermIterator> _term_iterators;
+ std::shared_ptr<lucene::search::IndexSearcher> _searcher;
+ MultiPhraseQuery _query;
+
+ int32_t _max_expansions = 50;
};
+} // namespace segment_v2
+
} // namespace doris
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp
new file mode 100644
index 00000000000..7d23d6eb60f
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "prefix_query.h"
+
+namespace doris {
+
+// Collects the indexed terms of `field_name` whose text starts with `prefix`.
+//
+// reader          index reader whose term enumeration is scanned
+// field_name      field to look in
+// prefix          prefix text (converted with StringUtil::string_to_wstring)
+// prefix_terms    output; each match is appended as a newly created Term. The caller is
+//                 responsible for releasing these references (PhrasePrefixQuery::add does
+//                 so with _CLDECDELETE).
+// max_expansions  upper bound on the number of appended terms; a value <= 0 disables the
+//                 bound
+//
+// The scan starts at the enumeration position returned by reader->terms(prefix_term) and
+// stops at the first term that is in another field or does not carry the prefix.
+// NOTE(review): stopping at the first mismatch relies on the enumeration being ordered by
+// field and text - presumably guaranteed by CLucene; confirm.
+void PrefixQuery::get_prefix_terms(IndexReader* reader, const std::wstring& field_name,
+                                   const std::string& prefix,
+                                   std::vector<CL_NS(index)::Term*>& prefix_terms,
+                                   int32_t max_expansions) {
+    std::wstring ws_prefix = StringUtil::string_to_wstring(prefix);
+
+    Term* prefix_term = _CLNEW Term(field_name.c_str(), ws_prefix.c_str());
+    TermEnum* enumerator = nullptr;
+    Term* lastTerm = nullptr;
+    int32_t count = 0;
+
+    try {
+        // Acquired inside the guarded region so that prefix_term is still released by the
+        // cleanup below if terms() throws.
+        enumerator = reader->terms(prefix_term);
+
+        const TCHAR* prefixText = prefix_term->text();
+        const TCHAR* prefixField = prefix_term->field();
+        const size_t prefixLen = prefix_term->textLength();
+
+        do {
+            lastTerm = enumerator->term();
+            // Field names are compared by pointer, exactly as before.
+            // NOTE(review): presumably CLucene interns field names - confirm.
+            if (lastTerm == nullptr || lastTerm->field() != prefixField) {
+                break;
+            }
+            if (prefixLen > lastTerm->textLength()) {
+                break;
+            }
+
+            const TCHAR* text = lastTerm->text();
+            bool has_prefix = true;
+            for (size_t i = 0; i < prefixLen; ++i) {
+                if (text[i] != prefixText[i]) {
+                    has_prefix = false;
+                    break;
+                }
+            }
+            if (!has_prefix) {
+                break;
+            }
+
+            if (max_expansions > 0 && count >= max_expansions) {
+                break;
+            }
+
+            prefix_terms.push_back(_CLNEW Term(field_name.c_str(), text));
+            count++;
+
+            // Drop the reference handed out by term() before advancing.
+            _CLDECDELETE(lastTerm);
+        } while (enumerator->next());
+    }
+    _CLFINALLY({
+        if (enumerator != nullptr) {
+            enumerator->close();
+            _CLDELETE(enumerator);
+        }
+        _CLDECDELETE(lastTerm);
+        _CLDECDELETE(prefix_term);
+    });
+}
+
+} // namespace doris
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h
similarity index 60%
copy from be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
copy to be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h
index f42fd69dabc..5deb0c1a362 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h
@@ -19,30 +19,22 @@
#include <CLucene.h>
#include <CLucene/index/IndexReader.h>
-#include <CLucene/index/IndexVersion.h>
-#include <CLucene/index/Term.h>
-#include <CLucene/search/query/TermIterator.h>
-#include "roaring/roaring.hh"
+#include <cstdint>
CL_NS_USE(index)
namespace doris {
-class DisjunctionQuery {
+class PrefixQuery {
public:
- DisjunctionQuery(IndexReader* reader);
- ~DisjunctionQuery();
-
- void add(const std::wstring& field_name, const std::vector<std::string>&
terms);
- void search(roaring::Roaring& roaring);
-
-private:
- IndexReader* _reader = nullptr;
- std::vector<std::wstring*> _wsterms;
- std::vector<Term*> _terms;
- std::vector<TermDocs*> _term_docs;
- std::vector<TermIterator> _term_iterators;
+ PrefixQuery() = default;
+ ~PrefixQuery() = default;
+
+ static void get_prefix_terms(IndexReader* reader, const std::wstring&
field_name,
+ const std::string& prefix,
+ std::vector<CL_NS(index)::Term*>&
prefix_terms,
+ int32_t max_expansions = 50);
};
} // namespace doris
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp
new file mode 100644
index 00000000000..83c5401bac0
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "regexp_query.h"
+
+#include <CLucene/config/repl_wchar.h>
+#include <hs/hs.h>
+
+#include "common/logging.h"
+
+namespace doris::segment_v2 {
+
+// Creates a regexp query on top of `searcher`. The shared_ptr is kept so add() can
+// enumerate the indexed terms, and the wrapped DisjunctionQuery is constructed on the
+// searcher's reader.
+RegexpQuery::RegexpQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher)
+        : _searcher(searcher), query(searcher->getReader()) {}
+
+// Expands `pattern` into the indexed terms it matches and hands them to the wrapped
+// DisjunctionQuery for `field_name`.
+//
+// The pattern is compiled with Hyperscan (block mode, DOTALL | ALLOWEMPTY | UTF8) and run
+// against the UTF-8 text of each term produced by the reader's term enumeration. Matching
+// terms are collected until _max_expansions is reached; a value <= 0 means no limit, the
+// same convention PrefixQuery::get_prefix_terms uses.
+//
+// Nothing is added to the query when the pattern does not compile, scratch space cannot
+// be allocated, or no term matches. If a scan fails, enumeration stops and the terms
+// collected so far are used.
+//
+// NOTE(review): the enumeration is not restricted to `field_name`; every term the reader
+// exposes is tested. Confirm the reader only ever holds this one field.
+void RegexpQuery::add(const std::wstring& field_name, const std::string& pattern) {
+    hs_database_t* database = nullptr;
+    hs_compile_error_t* compile_err = nullptr;
+    hs_scratch_t* scratch = nullptr;
+
+    if (hs_compile(pattern.c_str(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
+                   HS_MODE_BLOCK, nullptr, &database, &compile_err) != HS_SUCCESS) {
+        LOG(ERROR) << "hyperscan compilation failed: " << compile_err->message;
+        hs_free_compile_error(compile_err);
+        return;
+    }
+
+    if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) {
+        LOG(ERROR) << "hyperscan could not allocate scratch space.";
+        hs_free_database(database);
+        return;
+    }
+
+    // Match callback: `context` points at the caller's bool flag, which is set on any
+    // match. Returning 0 lets the scan run to completion.
+    auto on_match = [](unsigned int id, unsigned long long from, unsigned long long to,
+                       unsigned int flags, void* context) -> int {
+        *static_cast<bool*>(context) = true;
+        return 0;
+    };
+
+    Term* term = nullptr;
+    TermEnum* enumerator = nullptr;
+    std::vector<std::string> terms;
+    int32_t count = 0;
+
+    try {
+        enumerator = _searcher->getReader()->terms();
+        while (enumerator->next()) {
+            term = enumerator->term();
+            std::string input = lucene_wcstoutf8string(term->text(), term->textLength());
+
+            bool is_match = false;
+            if (hs_scan(database, input.data(), input.size(), 0, scratch, on_match,
+                        static_cast<void*>(&is_match)) != HS_SUCCESS) {
+                LOG(ERROR) << "hyperscan match failed: " << input;
+                break;
+            }
+
+            if (is_match) {
+                terms.emplace_back(std::move(input));
+                ++count;
+                // Only enforce the bound when it is positive; otherwise a limit of 0
+                // would stop after the very first match.
+                if (_max_expansions > 0 && count >= _max_expansions) {
+                    break;
+                }
+            }
+
+            _CLDECDELETE(term);
+        }
+    }
+    _CLFINALLY({
+        _CLDECDELETE(term);
+        // terms() may have thrown before the enumerator was assigned.
+        if (enumerator != nullptr) {
+            enumerator->close();
+            _CLDELETE(enumerator);
+        }
+
+        hs_free_scratch(scratch);
+        hs_free_database(database);
+    })
+
+    // DisjunctionQuery::add throws CL_ERR_IllegalArgument on an empty term list, so a
+    // pattern that matches nothing must not be forwarded.
+    if (terms.empty()) {
+        return;
+    }
+
+    query.add(field_name, terms);
+}
+
+// Delegates to the wrapped DisjunctionQuery that add() populated; `roaring` is handed
+// through to it unchanged.
+void RegexpQuery::search(roaring::Roaring& roaring) {
+    query.search(roaring);
+}
+
+} // namespace doris::segment_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h
similarity index 59%
copy from be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
copy to be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h
index f42fd69dabc..3791ad50d8f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h
@@ -17,32 +17,30 @@
#pragma once
-#include <CLucene.h>
-#include <CLucene/index/IndexReader.h>
-#include <CLucene/index/IndexVersion.h>
-#include <CLucene/index/Term.h>
-#include <CLucene/search/query/TermIterator.h>
+#include <memory>
-#include "roaring/roaring.hh"
+#include "olap/rowset/segment_v2/inverted_index/query/disjunction_query.h"
CL_NS_USE(index)
+CL_NS_USE(search)
-namespace doris {
+namespace doris::segment_v2 {
-class DisjunctionQuery {
+class RegexpQuery {
public:
- DisjunctionQuery(IndexReader* reader);
- ~DisjunctionQuery();
+ RegexpQuery(const std::shared_ptr<lucene::search::IndexSearcher>&
searcher);
+ ~RegexpQuery() = default;
- void add(const std::wstring& field_name, const std::vector<std::string>&
terms);
+ void set_max_expansions(int32_t max_expansions) { _max_expansions =
max_expansions; }
+
+ void add(const std::wstring& field_name, const std::string& pattern);
void search(roaring::Roaring& roaring);
private:
- IndexReader* _reader = nullptr;
- std::vector<std::wstring*> _wsterms;
- std::vector<Term*> _terms;
- std::vector<TermDocs*> _term_docs;
- std::vector<TermIterator> _term_iterators;
+ std::shared_ptr<lucene::search::IndexSearcher> _searcher;
+
+ int32_t _max_expansions = 50;
+ DisjunctionQuery query;
};
-} // namespace doris
\ No newline at end of file
+} // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h
b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h
index 1ebfe635918..3037f979f6e 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h
@@ -32,6 +32,8 @@ enum class InvertedIndexQueryType {
MATCH_ANY_QUERY = 5,
MATCH_ALL_QUERY = 6,
MATCH_PHRASE_QUERY = 7,
+ MATCH_PHRASE_PREFIX_QUERY = 8,
+ MATCH_REGEXP_QUERY = 9,
};
inline std::string InvertedIndexQueryType_toString(InvertedIndexQueryType
query_type) {
@@ -63,6 +65,12 @@ inline std::string
InvertedIndexQueryType_toString(InvertedIndexQueryType query_
case InvertedIndexQueryType::MATCH_PHRASE_QUERY: {
return "MPHRASE";
}
+ case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: {
+ return "MPHRASEPREFIX";
+ }
+ case InvertedIndexQueryType::MATCH_REGEXP_QUERY: {
+ return "MREGEXP";
+ }
default:
return "";
}
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 7d710d72c38..292884e631b 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -50,12 +50,15 @@
#include "CLucene/analysis/standard95/StandardAnalyzer.h"
#include "common/config.h"
#include "common/logging.h"
+#include "inverted_index_query_type.h"
#include "io/fs/file_system.h"
#include "olap/inverted_index_parser.h"
#include "olap/key_coder.h"
#include "olap/olap_common.h"
#include
"olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/regexp_query.h"
#include "olap/rowset/segment_v2/inverted_index_cache.h"
#include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
#include "olap/rowset/segment_v2/inverted_index_desc.h"
@@ -90,7 +93,9 @@ bool
InvertedIndexReader::_is_range_query(InvertedIndexQueryType query_type) {
bool InvertedIndexReader::_is_match_query(InvertedIndexQueryType query_type) {
return (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
- query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY);
+ query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
+ query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
+ query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY);
}
bool InvertedIndexReader::indexExists(io::Path& index_file_path) {
@@ -141,10 +146,13 @@ std::unique_ptr<lucene::util::Reader>
InvertedIndexReader::create_reader(
return reader;
}
-std::vector<std::string> InvertedIndexReader::get_analyse_result(
- lucene::util::Reader* reader, lucene::analysis::Analyzer* analyzer,
- const std::string& field_name, InvertedIndexQueryType query_type, bool
drop_duplicates) {
- std::vector<std::string> analyse_result;
+void InvertedIndexReader::get_analyse_result(std::vector<std::string>&
analyse_result,
+ lucene::util::Reader* reader,
+ lucene::analysis::Analyzer*
analyzer,
+ const std::string& field_name,
+ InvertedIndexQueryType query_type,
+ bool drop_duplicates) {
+ analyse_result.clear();
std::wstring field_ws = std::wstring(field_name.begin(), field_name.end());
std::unique_ptr<lucene::analysis::TokenStream> token_stream(
@@ -168,8 +176,6 @@ std::vector<std::string>
InvertedIndexReader::get_analyse_result(
std::set<std::string> unrepeated_result(analyse_result.begin(),
analyse_result.end());
analyse_result.assign(unrepeated_result.begin(),
unrepeated_result.end());
}
-
- return analyse_result;
}
Status InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle*
cache_handle,
@@ -246,19 +252,25 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
auto index_file_name =
InvertedIndexDescriptor::get_index_file_name(path.filename(),
_index_meta.index_id());
auto index_file_path = index_dir / index_file_name;
- InvertedIndexCtxSPtr inverted_index_ctx =
std::make_shared<InvertedIndexCtx>();
- inverted_index_ctx->parser_type =
get_inverted_index_parser_type_from_string(
- get_parser_string_from_properties(_index_meta.properties()));
- inverted_index_ctx->parser_mode =
- get_parser_mode_string_from_properties(_index_meta.properties());
- inverted_index_ctx->char_filter_map =
-
get_parser_char_filter_map_from_properties(_index_meta.properties());
+
try {
- auto analyzer = create_analyzer(inverted_index_ctx.get());
- auto reader = create_reader(inverted_index_ctx.get(), search_str);
- inverted_index_ctx->analyzer = analyzer.get();
- std::vector<std::string> analyse_result =
- get_analyse_result(reader.get(), analyzer.get(), column_name,
query_type);
+ std::vector<std::string> analyse_result;
+ if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
+ analyse_result.emplace_back(search_str);
+ } else {
+ InvertedIndexCtxSPtr inverted_index_ctx =
std::make_shared<InvertedIndexCtx>();
+ inverted_index_ctx->parser_type =
get_inverted_index_parser_type_from_string(
+
get_parser_string_from_properties(_index_meta.properties()));
+ inverted_index_ctx->parser_mode =
+
get_parser_mode_string_from_properties(_index_meta.properties());
+ inverted_index_ctx->char_filter_map =
+
get_parser_char_filter_map_from_properties(_index_meta.properties());
+ auto analyzer = create_analyzer(inverted_index_ctx.get());
+ auto reader = create_reader(inverted_index_ctx.get(), search_str);
+ inverted_index_ctx->analyzer = analyzer.get();
+ get_analyse_result(analyse_result, reader.get(), analyzer.get(),
column_name,
+ query_type);
+ }
if (analyse_result.empty()) {
auto msg = fmt::format(
@@ -267,7 +279,9 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
search_str,
get_parser_string_from_properties(_index_meta.properties()));
if (query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
- query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+ query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
+ query_type ==
InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
+ query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
LOG(WARNING) << msg;
return Status::OK();
} else {
@@ -294,6 +308,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
roaring::Roaring query_match_bitmap;
bool null_bitmap_already_read = false;
if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
+ query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
query_type == InvertedIndexQueryType::EQUAL_QUERY) {
std::string str_tokens;
@@ -302,7 +317,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
str_tokens += " ";
}
- auto cache = InvertedIndexQueryCache::instance();
+ auto* cache = InvertedIndexQueryCache::instance();
InvertedIndexQueryCache::CacheKey cache_key;
cache_key.index_path = index_file_path;
cache_key.column_name = column_name;
@@ -333,6 +348,10 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
query.reset(phrase_query);
res = normal_index_search(stats, query_type,
index_searcher,
null_bitmap_already_read, query,
term_match_bitmap);
+ } else if (query_type ==
InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) {
+ res = match_phrase_prefix_index_search(stats,
runtime_state, field_ws,
+ analyse_result,
index_searcher,
+ term_match_bitmap);
} else {
res = match_all_index_search(stats, runtime_state,
field_ws, analyse_result,
index_searcher,
term_match_bitmap);
@@ -346,13 +365,45 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
cache->insert(cache_key, term_match_bitmap, &cache_handle);
}
query_match_bitmap = *term_match_bitmap;
+ } else if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
+ const std::string& pattern = analyse_result[0];
+
+ std::shared_ptr<roaring::Roaring> term_match_bitmap = nullptr;
+ auto* cache = InvertedIndexQueryCache::instance();
+
+ InvertedIndexQueryCache::CacheKey cache_key;
+ cache_key.index_path = index_file_path;
+ cache_key.column_name = column_name;
+ cache_key.query_type = query_type;
+ cache_key.value = pattern;
+ InvertedIndexQueryCacheHandle cache_handle;
+ if (cache->lookup(cache_key, &cache_handle)) {
+ stats->inverted_index_query_cache_hit++;
+ term_match_bitmap = cache_handle.get_bitmap();
+ } else {
+ stats->inverted_index_query_cache_miss++;
+
+ auto index_searcher = get_index_search();
+
+ term_match_bitmap = std::make_shared<roaring::Roaring>();
+
+ Status res = match_regexp_index_search(stats, runtime_state,
field_ws, pattern,
+ index_searcher,
term_match_bitmap);
+ if (!res.ok()) {
+ return res;
+ }
+
+ term_match_bitmap->runOptimize();
+ cache->insert(cache_key, term_match_bitmap, &cache_handle);
+ }
+ query_match_bitmap = *term_match_bitmap;
} else {
bool first = true;
for (auto token : analyse_result) {
std::shared_ptr<roaring::Roaring> term_match_bitmap = nullptr;
// try to get term bitmap match result from cache to avoid query index on cache hit
- auto cache = InvertedIndexQueryCache::instance();
+ auto* cache = InvertedIndexQueryCache::instance();
// use EQUAL_QUERY type here since cache is for each term/token
//auto token = lucene_wcstoutf8string(token_ws.c_str(), token_ws.length());
std::wstring token_ws = StringUtil::string_to_wstring(token);
@@ -471,6 +522,42 @@ Status FullTextIndexReader::match_all_index_search(
return Status::OK();
}
+Status FullTextIndexReader::match_phrase_prefix_index_search(
+ OlapReaderStatistics* stats, RuntimeState* runtime_state, const
std::wstring& field_ws,
+ const std::vector<std::string>& analyse_result, const
IndexSearcherPtr& index_searcher,
+ const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
+ TQueryOptions queryOptions = runtime_state->query_options();
+ try {
+ SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
+ PhrasePrefixQuery query(index_searcher);
+ query.set_max_expansions(queryOptions.inverted_index_max_expansions);
+ query.add(field_ws, analyse_result);
+ query.search(*term_match_bitmap);
+ } catch (const CLuceneError& e) {
+ return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: {}",
+ e.what());
+ }
+ return Status::OK();
+}
+
+Status FullTextIndexReader::match_regexp_index_search(
+ OlapReaderStatistics* stats, RuntimeState* runtime_state, const
std::wstring& field_ws,
+ const std::string& pattern, const IndexSearcherPtr& index_searcher,
+ const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
+ TQueryOptions queryOptions = runtime_state->query_options();
+ try {
+ SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
+ RegexpQuery query(index_searcher);
+ query.set_max_expansions(queryOptions.inverted_index_max_expansions);
+ query.add(field_ws, pattern);
+ query.search(*term_match_bitmap);
+ } catch (const CLuceneError& e) {
+ return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: {}",
+ e.what());
+ }
+ return Status::OK();
+}
+
void FullTextIndexReader::check_null_bitmap(const IndexSearcherPtr&
index_searcher,
bool& null_bitmap_already_read) {
// try to reuse index_searcher's directory to read null_bitmap to cache
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 20c5c731f9e..8b5c786f36f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -97,11 +97,12 @@ public:
return _index_meta.properties();
}
- static std::vector<std::string> get_analyse_result(lucene::util::Reader*
reader,
-
lucene::analysis::Analyzer* analyzer,
- const std::string&
field_name,
- InvertedIndexQueryType
query_type,
- bool drop_duplicates =
true);
+ static void get_analyse_result(std::vector<std::string>& analyse_result,
+ lucene::util::Reader* reader,
+ lucene::analysis::Analyzer* analyzer,
+ const std::string& field_name,
InvertedIndexQueryType query_type,
+ bool drop_duplicates = true);
+
static std::unique_ptr<lucene::util::Reader>
create_reader(InvertedIndexCtx* inverted_index_ctx,
const
std::string& value);
static std::unique_ptr<lucene::analysis::Analyzer> create_analyzer(
@@ -153,6 +154,16 @@ private:
const std::shared_ptr<roaring::Roaring>&
term_match_bitmap);
void check_null_bitmap(const IndexSearcherPtr& index_searcher, bool&
null_bitmap_already_read);
+
+ Status match_phrase_prefix_index_search(
+ OlapReaderStatistics* stats, RuntimeState* runtime_state, const
std::wstring& field_ws,
+ const std::vector<std::string>& analyse_result, const
IndexSearcherPtr& index_searcher,
+ const std::shared_ptr<roaring::Roaring>& term_match_bitmap);
+
+ Status match_regexp_index_search(OlapReaderStatistics* stats,
RuntimeState* runtime_state,
+ const std::wstring& field_ws, const
std::string& pattern,
+ const IndexSearcherPtr& index_searcher,
+ const std::shared_ptr<roaring::Roaring>&
term_match_bitmap);
};
class StringTypeInvertedIndexReader : public InvertedIndexReader {
diff --git a/be/src/vec/functions/function_tokenize.cpp
b/be/src/vec/functions/function_tokenize.cpp
index 11760a30f50..54d9bee4ae9 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -79,10 +79,10 @@ void FunctionTokenize::_do_tokenize(const ColumnString&
src_column_string,
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
&inverted_index_ctx, tokenize_str.to_string());
- std::vector<std::string> query_tokens =
- doris::segment_v2::InvertedIndexReader::get_analyse_result(
- reader.get(), inverted_index_ctx.analyzer, "tokenize",
-
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
+ std::vector<std::string> query_tokens;
+ doris::segment_v2::InvertedIndexReader::get_analyse_result(
+ query_tokens, reader.get(), inverted_index_ctx.analyzer,
"tokenize",
+ doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
for (auto token : query_tokens) {
const size_t old_size = column_string_chars.size();
const size_t split_part_size = token.length();
diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index 3497b2ef7a9..c81c1617ca6 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -129,10 +129,10 @@ inline std::vector<std::string>
FunctionMatchBase::analyse_data_token(
auto reader =
doris::segment_v2::InvertedIndexReader::create_reader(
inverted_index_ctx, str_ref.to_string());
- std::vector<std::string> element_tokens =
- doris::segment_v2::InvertedIndexReader::get_analyse_result(
- reader.get(), inverted_index_ctx->analyzer,
column_name, query_type,
- false);
+ std::vector<std::string> element_tokens;
+ doris::segment_v2::InvertedIndexReader::get_analyse_result(
+ element_tokens, reader.get(),
inverted_index_ctx->analyzer, column_name,
+ query_type, false);
data_tokens.insert(data_tokens.end(), element_tokens.begin(),
element_tokens.end());
}
} else {
@@ -140,8 +140,9 @@ inline std::vector<std::string>
FunctionMatchBase::analyse_data_token(
auto reader =
doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
str_ref.to_string());
- data_tokens =
doris::segment_v2::InvertedIndexReader::get_analyse_result(
- reader.get(), inverted_index_ctx->analyzer, column_name,
query_type, false);
+
doris::segment_v2::InvertedIndexReader::get_analyse_result(data_tokens,
reader.get(),
+
inverted_index_ctx->analyzer,
+
column_name, query_type, false);
}
return data_tokens;
}
@@ -160,10 +161,10 @@ Status FunctionMatchAny::execute_match(const std::string&
column_name,
<< inverted_index_parser_type_to_string(parser_type);
auto reader =
doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
- std::vector<std::string> query_tokens =
- doris::segment_v2::InvertedIndexReader::get_analyse_result(
- reader.get(), inverted_index_ctx->analyzer, column_name,
-
doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY);
+ std::vector<std::string> query_tokens;
+ doris::segment_v2::InvertedIndexReader::get_analyse_result(
+ query_tokens, reader.get(), inverted_index_ctx->analyzer,
column_name,
+ doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY);
if (query_tokens.empty()) {
LOG(WARNING) << fmt::format(
"token parser result is empty for query, "
@@ -205,10 +206,10 @@ Status FunctionMatchAll::execute_match(const std::string&
column_name,
<< inverted_index_parser_type_to_string(parser_type);
auto reader =
doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
- std::vector<std::string> query_tokens =
- doris::segment_v2::InvertedIndexReader::get_analyse_result(
- reader.get(), inverted_index_ctx->analyzer, column_name,
-
doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY);
+ std::vector<std::string> query_tokens;
+ doris::segment_v2::InvertedIndexReader::get_analyse_result(
+ query_tokens, reader.get(), inverted_index_ctx->analyzer,
column_name,
+ doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY);
if (query_tokens.empty()) {
LOG(WARNING) << fmt::format(
"token parser result is empty for query, "
@@ -256,10 +257,10 @@ Status FunctionMatchPhrase::execute_match(const
std::string& column_name,
<< inverted_index_parser_type_to_string(parser_type);
auto reader =
doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
- std::vector<std::string> query_tokens =
- doris::segment_v2::InvertedIndexReader::get_analyse_result(
- reader.get(), inverted_index_ctx->analyzer, column_name,
-
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
+ std::vector<std::string> query_tokens;
+ doris::segment_v2::InvertedIndexReader::get_analyse_result(
+ query_tokens, reader.get(), inverted_index_ctx->analyzer,
column_name,
+ doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
if (query_tokens.empty()) {
LOG(WARNING) << fmt::format(
"token parser result is empty for query, "
@@ -313,6 +314,8 @@ void register_function_match(SimpleFunctionFactory&
factory) {
factory.register_function<FunctionMatchAny>();
factory.register_function<FunctionMatchAll>();
factory.register_function<FunctionMatchPhrase>();
+ factory.register_function<FunctionMatchPhrasePrefix>();
+ factory.register_function<FunctionMatchRegexp>();
factory.register_function<FunctionMatchElementEQ>();
factory.register_function<FunctionMatchElementLT>();
factory.register_function<FunctionMatchElementGT>();
diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h
index b8e7f91cb01..13701bd2d60 100644
--- a/be/src/vec/functions/match.h
+++ b/be/src/vec/functions/match.h
@@ -128,6 +128,40 @@ public:
ColumnUInt8::Container& result) override;
};
+class FunctionMatchPhrasePrefix : public FunctionMatchBase {
+public:
+ static constexpr auto name = "match_phrase_prefix";
+ static FunctionPtr create() { return
std::make_shared<FunctionMatchPhrasePrefix>(); }
+
+ String get_name() const override { return name; }
+
+ Status execute_match(const std::string& column_name, const std::string&
match_query_str,
+ size_t input_rows_count, const ColumnString*
string_col,
+ InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64* array_offsets,
+ ColumnUInt8::Container& result) override {
+ return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+ "FunctionMatchPhrasePrefix not support execute_match");
+ }
+};
+
+class FunctionMatchRegexp : public FunctionMatchBase {
+public:
+ static constexpr auto name = "match_regexp";
+ static FunctionPtr create() { return
std::make_shared<FunctionMatchRegexp>(); }
+
+ String get_name() const override { return name; }
+
+ Status execute_match(const std::string& column_name, const std::string&
match_query_str,
+ size_t input_rows_count, const ColumnString*
string_col,
+ InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64* array_offsets,
+ ColumnUInt8::Container& result) override {
+ return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+ "FunctionMatchRegexp not support execute_match");
+ }
+};
+
class FunctionMatchElementEQ : public FunctionMatchBase {
public:
static constexpr auto name = "match_element_eq";
diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
index abf6e4bb27f..378a34f3da0 100644
--- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
+++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
@@ -338,6 +338,8 @@ MATCH_ELEMENT_GT: 'ELEMENT_GT';
MATCH_ELEMENT_LE: 'ELEMENT_LE';
MATCH_ELEMENT_LT: 'ELEMENT_LT';
MATCH_PHRASE: 'MATCH_PHRASE';
+MATCH_PHRASE_PREFIX: 'MATCH_PHRASE_PREFIX';
+MATCH_REGEXP: 'MATCH_REGEXP';
MATERIALIZED: 'MATERIALIZED';
MAX: 'MAX';
MAXVALUE: 'MAXVALUE';
diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
index ec4b2672c6d..ba44219e7e0 100644
--- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
+++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
@@ -312,7 +312,7 @@ booleanExpression
predicate
: NOT? kind=BETWEEN lower=valueExpression AND upper=valueExpression
| NOT? kind=(LIKE | REGEXP | RLIKE) pattern=valueExpression
- | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE)
pattern=valueExpression
+ | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE |
MATCH_PHRASE_PREFIX | MATCH_REGEXP) pattern=valueExpression
| NOT? kind=IN LEFT_PAREN query RIGHT_PAREN
| NOT? kind=IN LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN
| IS NOT? kind=NULL
diff --git a/fe/fe-core/src/main/cup/sql_parser.cup
b/fe/fe-core/src/main/cup/sql_parser.cup
index 07345f6b6b9..08ffc389a92 100644
--- a/fe/fe-core/src/main/cup/sql_parser.cup
+++ b/fe/fe-core/src/main/cup/sql_parser.cup
@@ -468,6 +468,8 @@ terminal String
KW_MATCH_ANY,
KW_MATCH_ALL,
KW_MATCH_PHRASE,
+ KW_MATCH_PHRASE_PREFIX,
+ KW_MATCH_REGEXP,
KW_MATCH_ELEMENT_EQ,
KW_MATCH_ELEMENT_LT,
KW_MATCH_ELEMENT_GT,
@@ -959,7 +961,7 @@ precedence left KW_AND;
precedence left KW_NOT, NOT;
precedence left KW_BETWEEN, KW_IN, KW_IS, KW_EXISTS;
precedence left KW_LIKE, KW_REGEXP;
-precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH,
KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT,
KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE;
+precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE,
KW_MATCH_PHRASE_PREFIX, KW_MATCH_REGEXP, KW_MATCH, KW_MATCH_ELEMENT_EQ,
KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE,
KW_MATCH_ELEMENT_GE;
precedence left EQUAL, LESSTHAN, GREATERTHAN;
precedence left ADD, SUBTRACT;
precedence left AT, STAR, DIVIDE, MOD, KW_DIV;
@@ -6985,6 +6987,10 @@ match_predicate ::=
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ALL, e1, e2); :}
| expr:e1 KW_MATCH_PHRASE expr:e2
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE, e1,
e2); :}
+ | expr:e1 KW_MATCH_PHRASE_PREFIX expr:e2
+ {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE_PREFIX,
e1, e2); :}
+ | expr:e1 KW_MATCH_REGEXP expr:e2
+ {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_REGEXP, e1,
e2); :}
| expr:e1 KW_MATCH_ELEMENT_EQ expr:e2
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_EQ, e1,
e2); :}
| expr:e1 KW_MATCH_ELEMENT_LT expr:e2
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
index 10579614524..f106aec956c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
@@ -50,6 +50,8 @@ public class MatchPredicate extends Predicate {
MATCH_ANY("MATCH_ANY", "match_any", TExprOpcode.MATCH_ANY),
MATCH_ALL("MATCH_ALL", "match_all", TExprOpcode.MATCH_ALL),
MATCH_PHRASE("MATCH_PHRASE", "match_phrase", TExprOpcode.MATCH_PHRASE),
+ MATCH_PHRASE_PREFIX("MATCH_PHRASE_PREFIX", "match_phrase_prefix",
TExprOpcode.MATCH_PHRASE_PREFIX),
+ MATCH_REGEXP("MATCH_REGEXP", "match_regexp", TExprOpcode.MATCH_REGEXP),
MATCH_ELEMENT_EQ("MATCH_ELEMENT_EQ", "match_element_eq",
TExprOpcode.MATCH_ELEMENT_EQ),
MATCH_ELEMENT_LT("MATCH_ELEMENT_LT", "match_element_lt",
TExprOpcode.MATCH_ELEMENT_LT),
MATCH_ELEMENT_GT("MATCH_ELEMENT_GT", "match_element_gt",
TExprOpcode.MATCH_ELEMENT_GT),
@@ -147,6 +149,26 @@ public class MatchPredicate extends Predicate {
symbolNotUsed,
Lists.<Type>newArrayList(new ArrayType(t), t),
Type.BOOLEAN));
+
functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+ Operator.MATCH_PHRASE_PREFIX.getName(),
+ symbolNotUsed,
+ Lists.<Type>newArrayList(t, t),
+ Type.BOOLEAN));
+
functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+ Operator.MATCH_PHRASE_PREFIX.getName(),
+ symbolNotUsed,
+ Lists.<Type>newArrayList(new ArrayType(t), t),
+ Type.BOOLEAN));
+
functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+ Operator.MATCH_REGEXP.getName(),
+ symbolNotUsed,
+ Lists.<Type>newArrayList(t, t),
+ Type.BOOLEAN));
+
functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+ Operator.MATCH_REGEXP.getName(),
+ symbolNotUsed,
+ Lists.<Type>newArrayList(new ArrayType(t), t),
+ Type.BOOLEAN));
}
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
index 23cb6c572e2..447f0c28442 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
@@ -158,6 +158,8 @@ import org.apache.doris.nereids.trees.expressions.ListQuery;
import org.apache.doris.nereids.trees.expressions.MatchAll;
import org.apache.doris.nereids.trees.expressions.MatchAny;
import org.apache.doris.nereids.trees.expressions.MatchPhrase;
+import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix;
+import org.apache.doris.nereids.trees.expressions.MatchRegexp;
import org.apache.doris.nereids.trees.expressions.Mod;
import org.apache.doris.nereids.trees.expressions.Multiply;
import org.apache.doris.nereids.trees.expressions.NamedExpression;
@@ -1927,6 +1929,18 @@ public class LogicalPlanBuilder extends
DorisParserBaseVisitor<Object> {
getExpression(ctx.pattern)
);
break;
+ case DorisParser.MATCH_PHRASE_PREFIX:
+ outExpression = new MatchPhrasePrefix(
+ valueExpression,
+ getExpression(ctx.pattern)
+ );
+ break;
+ case DorisParser.MATCH_REGEXP:
+ outExpression = new MatchRegexp(
+ valueExpression,
+ getExpression(ctx.pattern)
+ );
+ break;
default:
throw new ParseException("Unsupported predicate type: " +
ctx.kind.getText(), ctx);
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java
index bc9837eafec..5b3027365a8 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java
@@ -50,6 +50,10 @@ public abstract class Match extends BinaryOperator
implements PropagateNullable
return Operator.MATCH_ALL;
case "MATCH_PHRASE":
return Operator.MATCH_PHRASE;
+ case "MATCH_PHRASE_PREFIX":
+ return Operator.MATCH_PHRASE_PREFIX;
+ case "MATCH_REGEXP":
+ return Operator.MATCH_REGEXP;
default:
throw new AnalysisException("UnSupported type: " + symbol);
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhrasePrefix.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhrasePrefix.java
new file mode 100644
index 00000000000..748da21ce30
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhrasePrefix.java
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions;
+
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * like expression: a MATCH_PHRASE_PREFIX 'hello w'.
+ */
+public class MatchPhrasePrefix extends Match {
+ public MatchPhrasePrefix(Expression left, Expression right) {
+ super(ImmutableList.of(left, right), "MATCH_PHRASE_PREFIX");
+ }
+
+ private MatchPhrasePrefix(List<Expression> children) {
+ super(children, "MATCH_PHRASE_PREFIX");
+ }
+
+ @Override
+ public MatchPhrasePrefix withChildren(List<Expression> children) {
+ Preconditions.checkArgument(children.size() == 2);
+ return new MatchPhrasePrefix(children);
+ }
+
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitMatchPhrasePrefix(this, context);
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java
new file mode 100644
index 00000000000..6bb55aeb897
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions;
+
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * like expression: a MATCH_REGEXP '^h\\w*'.
+ */
+public class MatchRegexp extends Match {
+ public MatchRegexp(Expression left, Expression right) {
+ super(ImmutableList.of(left, right), "MATCH_REGEXP");
+ }
+
+ private MatchRegexp(List<Expression> children) {
+ super(children, "MATCH_REGEXP");
+ }
+
+ @Override
+ public MatchRegexp withChildren(List<Expression> children) {
+ Preconditions.checkArgument(children.size() == 2);
+ return new MatchRegexp(children);
+ }
+
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitMatchRegexp(this, context);
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
index 4d89227ee9f..179570b824e 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
@@ -56,6 +56,8 @@ import org.apache.doris.nereids.trees.expressions.Match;
import org.apache.doris.nereids.trees.expressions.MatchAll;
import org.apache.doris.nereids.trees.expressions.MatchAny;
import org.apache.doris.nereids.trees.expressions.MatchPhrase;
+import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix;
+import org.apache.doris.nereids.trees.expressions.MatchRegexp;
import org.apache.doris.nereids.trees.expressions.Mod;
import org.apache.doris.nereids.trees.expressions.Multiply;
import org.apache.doris.nereids.trees.expressions.NamedExpression;
@@ -454,6 +456,14 @@ public abstract class ExpressionVisitor<R, C>
return visitMatch(matchPhrase, context);
}
+ public R visitMatchPhrasePrefix(MatchPhrasePrefix matchPhrasePrefix, C
context) {
+ return visitMatch(matchPhrasePrefix, context);
+ }
+
+ public R visitMatchRegexp(MatchRegexp matchRegexp, C context) {
+ return visitMatch(matchRegexp, context);
+ }
+
/* ********************************************************************************************
* Unbound expressions
* ********************************************************************************************/
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 2fd5dd7adea..da4212f0418 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -402,6 +402,7 @@ public class SessionVariable implements Serializable,
Writable {
public static final String ENABLE_UNIQUE_KEY_PARTIAL_UPDATE =
"enable_unique_key_partial_update";
public static final String INVERTED_INDEX_CONJUNCTION_OPT_THRESHOLD =
"inverted_index_conjunction_opt_threshold";
+ public static final String INVERTED_INDEX_MAX_EXPANSIONS =
"inverted_index_max_expansions";
public static final String AUTO_ANALYZE_START_TIME =
"auto_analyze_start_time";
@@ -1192,6 +1193,12 @@ public class SessionVariable implements Serializable,
Writable {
flag = VariableMgr.GLOBAL)
public String autoAnalyzeEndTime = "23:59:59";
+ @VariableMgr.VarAttr(name = INVERTED_INDEX_MAX_EXPANSIONS,
+ description = {"这个参数用来限制查询时扩展的词项(terms)的数量,以此来控制查询的性能",
+ "This parameter is used to limit the number of term expansions during a query,"
+ + " thereby controlling query performance"})
+ public int invertedIndexMaxExpansions = 50;
+
@VariableMgr.VarAttr(name = ENABLE_UNIQUE_KEY_PARTIAL_UPDATE, needForward
= true)
public boolean enableUniqueKeyPartialUpdate = false;
@@ -2435,6 +2442,7 @@ public class SessionVariable implements Serializable,
Writable {
tResult.setTruncateCharOrVarcharColumns(truncateCharOrVarcharColumns);
tResult.setInvertedIndexConjunctionOptThreshold(invertedIndexConjunctionOptThreshold);
+ tResult.setInvertedIndexMaxExpansions(invertedIndexMaxExpansions);
tResult.setFasterFloatConvert(fasterFloatConvert);
diff --git a/fe/fe-core/src/main/jflex/sql_scanner.flex
b/fe/fe-core/src/main/jflex/sql_scanner.flex
index 4a19494ce80..86a3dc7482d 100644
--- a/fe/fe-core/src/main/jflex/sql_scanner.flex
+++ b/fe/fe-core/src/main/jflex/sql_scanner.flex
@@ -309,6 +309,8 @@ import org.apache.doris.qe.SqlModeHelper;
keywordMap.put("match_any", new
Integer(SqlParserSymbols.KW_MATCH_ANY));
keywordMap.put("match_all", new
Integer(SqlParserSymbols.KW_MATCH_ALL));
keywordMap.put("match_phrase", new
Integer(SqlParserSymbols.KW_MATCH_PHRASE));
+ keywordMap.put("match_phrase_prefix", new
Integer(SqlParserSymbols.KW_MATCH_PHRASE_PREFIX));
+ keywordMap.put("match_regexp", new
Integer(SqlParserSymbols.KW_MATCH_REGEXP));
keywordMap.put("element_eq", new
Integer(SqlParserSymbols.KW_MATCH_ELEMENT_EQ));
keywordMap.put("element_lt", new
Integer(SqlParserSymbols.KW_MATCH_ELEMENT_LT));
keywordMap.put("element_gt", new
Integer(SqlParserSymbols.KW_MATCH_ELEMENT_GT));
diff --git a/gensrc/thrift/Opcodes.thrift b/gensrc/thrift/Opcodes.thrift
index f6444ebe218..72a1d80e036 100644
--- a/gensrc/thrift/Opcodes.thrift
+++ b/gensrc/thrift/Opcodes.thrift
@@ -93,4 +93,6 @@ enum TExprOpcode {
MATCH_ELEMENT_GT,
MATCH_ELEMENT_LE,
MATCH_ELEMENT_GE,
+ MATCH_PHRASE_PREFIX,
+ MATCH_REGEXP,
}
diff --git a/gensrc/thrift/PaloInternalService.thrift
b/gensrc/thrift/PaloInternalService.thrift
index 62eb5a0827b..9ff6a589d69 100644
--- a/gensrc/thrift/PaloInternalService.thrift
+++ b/gensrc/thrift/PaloInternalService.thrift
@@ -249,6 +249,8 @@ struct TQueryOptions {
86: optional i32 analyze_timeout = 43200;
87: optional bool faster_float_convert = false;
+
+ 88: optional i32 inverted_index_max_expansions = 50;
}
diff --git
a/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix.out
b/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix.out
new file mode 100644
index 00000000000..140fd5ee937
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix.out
@@ -0,0 +1,31 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+863
+
+-- !sql --
+863
+
+-- !sql --
+235
+
+-- !sql --
+235
+
+-- !sql --
+166
+
+-- !sql --
+166
+
+-- !sql --
+56
+
+-- !sql --
+56
+
+-- !sql --
+7
+
+-- !sql --
+7
+
diff --git a/regression-test/data/inverted_index_p0/test_index_match_regexp.out
b/regression-test/data/inverted_index_p0/test_index_match_regexp.out
new file mode 100644
index 00000000000..eab27de65ee
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_match_regexp.out
@@ -0,0 +1,16 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+1000
+
+-- !sql --
+54
+
+-- !sql --
+910
+
+-- !sql --
+60
+
+-- !sql --
+38
+
diff --git
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix.groovy
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix.groovy
new file mode 100644
index 00000000000..b23bc1b5a8b
--- /dev/null
+++
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix.groovy
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_index_match_phrase_prefix", "p0"){
+ def indexTbName1 = "test_index_match_phrase_prefix"
+
+ sql "DROP TABLE IF EXISTS ${indexTbName1}"
+
+ sql """
+ CREATE TABLE ${indexTbName1} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` varchar(20) NULL COMMENT "",
+ `request` text NULL COMMENT "",
+ `status` int(11) NULL COMMENT "",
+ `size` int(11) NULL COMMENT "",
+ INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ def load_httplogs_data = {table_name, label, read_flag, format_flag,
file_name, ignore_failure=false,
+ expected_succ_rows = -1, load_to_single_tablet =
'true' ->
+
+ // load the json data
+ streamLoad {
+ table "${table_name}"
+
+ // set http request header params
+ set 'label', label + "_" + UUID.randomUUID().toString()
+ set 'read_json_by_line', read_flag
+ set 'format', format_flag
+ file file_name // import json file
+ time 10000 // limit inflight 10s
+ if (expected_succ_rows >= 0) {
+ set 'max_filter_ratio', '1'
+ }
+
+ // if a check callback is declared, the default check conditions are
ignored,
+ // so you must check every condition yourself
+ check { result, exception, startTime, endTime ->
+ if (ignore_failure && expected_succ_rows < 0) { return }
+ if (exception != null) {
+ throw exception
+ }
+ log.info("Stream load result: ${result}".toString())
+ def json = parseJson(result)
+ assertEquals("success", json.Status.toLowerCase())
+ if (expected_succ_rows >= 0) {
+ assertEquals(json.NumberLoadedRows, expected_succ_rows)
+ } else {
+ assertEquals(json.NumberTotalRows,
json.NumberLoadedRows + json.NumberUnselectedRows)
+ assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes
> 0)
+ }
+ }
+ }
+ }
+
+ try {
+ load_httplogs_data.call(indexTbName1,
'test_index_match_phrase_prefix', 'true', 'json', 'documents-1000.json')
+
+ qt_sql """ select count() from test_index_match_phrase_prefix where
request match_phrase_prefix 'ima'; """
+ qt_sql """ select count() from test_index_match_phrase_prefix where
request like '%ima%'; """
+
+ qt_sql """ select count() from test_index_match_phrase_prefix where
request match_phrase_prefix 'images/h'; """
+ qt_sql """ select count() from test_index_match_phrase_prefix where
request like '%images/h%'; """
+
+ qt_sql """ select count() from test_index_match_phrase_prefix where
request match_phrase_prefix 'images/hm'; """
+ qt_sql """ select count() from test_index_match_phrase_prefix where
request like '%images/hm%'; """
+
+ qt_sql """ select count() from test_index_match_phrase_prefix where
request match_phrase_prefix '/french/images/n'; """
+ qt_sql """ select count() from test_index_match_phrase_prefix where
request like '%/french/images/n%'; """
+
+ qt_sql """ select count() from test_index_match_phrase_prefix where
request match_phrase_prefix '/french/tickets/images/ti'; """
+ qt_sql """ select count() from test_index_match_phrase_prefix where
request like '%/french/tickets/images/ti%'; """
+ } finally {
+ //try_sql("DROP TABLE IF EXISTS ${indexTbName1}")
+ }
+}
\ No newline at end of file
diff --git
a/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy
b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy
new file mode 100644
index 00000000000..4c1ee1a5b0b
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_index_match_regexp", "p0"){
+ def indexTbName1 = "test_index_match_regexp"
+
+ sql "DROP TABLE IF EXISTS ${indexTbName1}"
+
+ sql """
+ CREATE TABLE ${indexTbName1} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` varchar(20) NULL COMMENT "",
+ `request` text NULL COMMENT "",
+ `status` int(11) NULL COMMENT "",
+ `size` int(11) NULL COMMENT "",
+ INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" =
"english") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ def load_httplogs_data = {table_name, label, read_flag, format_flag,
file_name, ignore_failure=false,
+ expected_succ_rows = -1, load_to_single_tablet =
'true' ->
+
+ // load the json data
+ streamLoad {
+ table "${table_name}"
+
+ // set http request header params
+ set 'label', label + "_" + UUID.randomUUID().toString()
+ set 'read_json_by_line', read_flag
+ set 'format', format_flag
+ file file_name // import json file
+ time 10000 // limit inflight 10s
+ if (expected_succ_rows >= 0) {
+ set 'max_filter_ratio', '1'
+ }
+
+ // if a check callback is declared, the default check conditions are
ignored,
+ // so you must check every condition yourself
+ check { result, exception, startTime, endTime ->
+ if (ignore_failure && expected_succ_rows < 0) { return }
+ if (exception != null) {
+ throw exception
+ }
+ log.info("Stream load result: ${result}".toString())
+ def json = parseJson(result)
+ assertEquals("success", json.Status.toLowerCase())
+ if (expected_succ_rows >= 0) {
+ assertEquals(json.NumberLoadedRows, expected_succ_rows)
+ } else {
+ assertEquals(json.NumberTotalRows,
json.NumberLoadedRows + json.NumberUnselectedRows)
+ assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes
> 0)
+ }
+ }
+ }
+ }
+
+ try {
+ load_httplogs_data.call(indexTbName1, 'test_index_match_regexp',
'true', 'json', 'documents-1000.json')
+
+ qt_sql """ select count() from test_index_match_regexp where request
match_regexp '^h'; """
+ qt_sql """ select count() from test_index_match_regexp where request
match_regexp '^team'; """
+ qt_sql """ select count() from test_index_match_regexp where request
match_regexp 's\$'; """
+ qt_sql """ select count() from test_index_match_regexp where request
match_regexp 'er\$'; """
+ qt_sql """ select count() from test_index_match_regexp where request
match_regexp '.*tickets.*'; """
+ } finally {
+ //try_sql("DROP TABLE IF EXISTS ${indexTbName1}")
+ }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]