[doris] 06/12: [Improve](inverted index) improve match performance without index (#24751)

kxiao Fri, 22 Sep 2023 08:13:19 -0700

This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


commit 3cdc962ecaaf07100b020f5ffee14680dd975705
Author: airborne12 <[email protected]>
AuthorDate: Fri Sep 22 18:45:11 2023 +0800

    [Improve](inverted index) improve match performance without index (#24751)
---
 be/src/olap/inverted_index_parser.h                |  7 ++
 .../inverted_index/query/conjunction_query.cpp     | 10 +--
 .../inverted_index/query/conjunction_query.h       |  2 +-
 .../rowset/segment_v2/inverted_index_reader.cpp    | 75 +++++++++++++---------
 .../olap/rowset/segment_v2/inverted_index_reader.h | 16 +++--
 be/src/vec/exprs/vmatch_predicate.cpp              |  6 ++
 be/src/vec/exprs/vmatch_predicate.h                | 18 ++++--
 be/src/vec/functions/function_tokenize.cpp         | 16 +++--
 be/src/vec/functions/match.cpp                     | 47 ++++++++------
 be/src/vec/functions/match.h                       | 12 ++--
 10 files changed, 130 insertions(+), 79 deletions(-)

diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h
index df4f0769f93..54455bddef8 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -21,6 +21,12 @@
 #include <memory>
 #include <string>
 
+namespace lucene {
+namespace analysis {
+class Analyzer;
+}
+} // namespace lucene
+
 namespace doris {
 
 enum class InvertedIndexParserType {
@@ -38,6 +44,7 @@ struct InvertedIndexCtx {
     InvertedIndexParserType parser_type;
     std::string parser_mode;
     CharFilterMap char_filter_map;
+    lucene::analysis::Analyzer* analyzer;
 };
 
 using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
index 90909045cc1..c5793f93f06 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
@@ -37,15 +37,15 @@ ConjunctionQuery::~ConjunctionQuery() {
     }
 }
 
-void ConjunctionQuery::add(const std::wstring& field_name,
-                           const std::vector<std::wstring>& wterms) {
-    if (wterms.size() < 1) {
+void ConjunctionQuery::add(const std::wstring& field_name, const 
std::vector<std::string>& terms) {
+    if (terms.size() < 1) {
         _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() 
< 1");
     }
 
     std::vector<TermIterator> iterators;
-    for (auto& wterm : wterms) {
-        Term* t = _CLNEW Term(field_name.c_str(), wterm.c_str());
+    for (auto& term : terms) {
+        std::wstring ws_term = StringUtil::string_to_wstring(term);
+        Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
         _terms.push_back(t);
         TermDocs* term_doc = _reader->termDocs(t);
         _term_docs.push_back(term_doc);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h
index bffb12ffb2b..36d9478c20d 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h
@@ -38,7 +38,7 @@ public:
         _conjunction_ratio = conjunction_ratio;
     }
 
-    void add(const std::wstring& field_name, const std::vector<std::wstring>& 
wterms);
+    void add(const std::wstring& field_name, const std::vector<std::string>& 
terms);
     void search(roaring::Roaring& roaring);
 
 private:
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index cef35a9f510..325907fa14d 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -99,21 +99,18 @@ bool InvertedIndexReader::indexExists(io::Path& 
index_file_path) {
     return exists;
 }
 
-std::vector<std::wstring> InvertedIndexReader::get_analyse_result(
-        const std::string& field_name, const std::string& value, 
InvertedIndexQueryType query_type,
-        InvertedIndexCtx* inverted_index_ctx, bool drop_duplicates) {
-    std::vector<std::wstring> analyse_result;
-    std::shared_ptr<lucene::analysis::Analyzer> analyzer;
-    std::unique_ptr<lucene::util::Reader> reader;
+std::unique_ptr<lucene::analysis::Analyzer> 
InvertedIndexReader::create_analyzer(
+        InvertedIndexCtx* inverted_index_ctx) {
+    std::unique_ptr<lucene::analysis::Analyzer> analyzer;
     auto analyser_type = inverted_index_ctx->parser_type;
     if (analyser_type == InvertedIndexParserType::PARSER_STANDARD ||
         analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
-        analyzer = 
std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
+        analyzer = 
std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
     } else if (analyser_type == InvertedIndexParserType::PARSER_ENGLISH) {
-        analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
+        analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
     } else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) {
         auto chinese_analyzer =
-                
std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
+                
std::make_unique<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
         chinese_analyzer->initDict(config::inverted_index_dict_path);
         auto mode = inverted_index_ctx->parser_mode;
         if (mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
@@ -121,32 +118,44 @@ std::vector<std::wstring> 
InvertedIndexReader::get_analyse_result(
         } else {
             chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
         }
-        analyzer = chinese_analyzer;
+        analyzer = std::move(chinese_analyzer);
     } else {
         // default
-        analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
+        analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
     }
-    reader.reset(new lucene::util::SStringReader<char>());
+    return analyzer;
+}
+
+std::unique_ptr<lucene::util::Reader> InvertedIndexReader::create_reader(
+        InvertedIndexCtx* inverted_index_ctx, const std::string& value) {
+    std::unique_ptr<lucene::util::Reader> reader =
+            std::make_unique<lucene::util::SStringReader<char>>();
     CharFilterMap& char_filter_map = inverted_index_ctx->char_filter_map;
     if (!char_filter_map.empty()) {
-        reader.reset(CharFilterFactory::create(
+        reader = 
std::unique_ptr<lucene::util::Reader>(CharFilterFactory::create(
                 char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE], 
reader.release(),
                 char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
                 
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]));
     }
-    reader->init(value.data(), value.size(), false);
+    reader->init(value.data(), value.size(), true);
+    return reader;
+}
+
+std::vector<std::string> InvertedIndexReader::get_analyse_result(
+        lucene::util::Reader* reader, lucene::analysis::Analyzer* analyzer,
+        const std::string& field_name, InvertedIndexQueryType query_type, bool 
drop_duplicates) {
+    std::vector<std::string> analyse_result;
 
     std::wstring field_ws = std::wstring(field_name.begin(), field_name.end());
     std::unique_ptr<lucene::analysis::TokenStream> token_stream(
-            analyzer->tokenStream(field_ws.c_str(), reader.get()));
+            analyzer->tokenStream(field_ws.c_str(), reader));
 
     lucene::analysis::Token token;
 
     while (token_stream->next(&token)) {
         if (token.termLength<char>() != 0) {
-            std::string_view term(token.termBuffer<char>(), 
token.termLength<char>());
-            std::wstring ws_term = StringUtil::string_to_wstring(term);
-            analyse_result.emplace_back(ws_term);
+            analyse_result.emplace_back(
+                    std::string(token.termBuffer<char>(), 
token.termLength<char>()));
         }
     }
 
@@ -156,7 +165,7 @@ std::vector<std::wstring> 
InvertedIndexReader::get_analyse_result(
 
     if (drop_duplicates && (query_type == 
InvertedIndexQueryType::MATCH_ANY_QUERY ||
                             query_type == 
InvertedIndexQueryType::MATCH_ALL_QUERY)) {
-        std::set<std::wstring> unrepeated_result(analyse_result.begin(), 
analyse_result.end());
+        std::set<std::string> unrepeated_result(analyse_result.begin(), 
analyse_result.end());
         analyse_result.assign(unrepeated_result.begin(), 
unrepeated_result.end());
     }
 
@@ -245,8 +254,11 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
     inverted_index_ctx->char_filter_map =
             
get_parser_char_filter_map_from_properties(_index_meta.properties());
     try {
-        std::vector<std::wstring> analyse_result =
-                get_analyse_result(column_name, search_str, query_type, 
inverted_index_ctx.get());
+        auto analyzer = create_analyzer(inverted_index_ctx.get());
+        auto reader = create_reader(inverted_index_ctx.get(), search_str);
+        inverted_index_ctx->analyzer = analyzer.get();
+        std::vector<std::string> analyse_result =
+                get_analyse_result(reader.get(), analyzer.get(), column_name, 
query_type);
 
         if (analyse_result.empty()) {
             auto msg = fmt::format(
@@ -280,11 +292,12 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
         roaring::Roaring query_match_bitmap;
         bool null_bitmap_already_read = false;
         if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
-            query_type == InvertedIndexQueryType::MATCH_ALL_QUERY) {
-            std::wstring wstr_tokens;
+            query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
+            query_type == InvertedIndexQueryType::EQUAL_QUERY) {
+            std::string str_tokens;
             for (auto& token : analyse_result) {
-                wstr_tokens += token;
-                wstr_tokens += L" ";
+                str_tokens += token;
+                str_tokens += " ";
             }
 
             auto cache = InvertedIndexQueryCache::instance();
@@ -292,7 +305,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
             cache_key.index_path = index_file_path;
             cache_key.column_name = column_name;
             cache_key.query_type = query_type;
-            auto str_tokens = lucene_wcstoutf8string(wstr_tokens.c_str(), 
wstr_tokens.length());
+            //auto str_tokens = lucene_wcstoutf8string(wstr_tokens.c_str(), 
wstr_tokens.length());
             cache_key.value.swap(str_tokens);
             InvertedIndexQueryCacheHandle cache_handle;
             std::shared_ptr<roaring::Roaring> term_match_bitmap = nullptr;
@@ -308,7 +321,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
                 if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
                     auto* phrase_query = new lucene::search::PhraseQuery();
                     for (auto& token : analyse_result) {
-                        auto* term = _CLNEW 
lucene::index::Term(field_ws.c_str(), token.c_str());
+                        std::wstring wtoken = 
StringUtil::string_to_wstring(token);
+                        auto* term = _CLNEW 
lucene::index::Term(field_ws.c_str(), wtoken.c_str());
                         phrase_query->add(term);
                         _CLDECDELETE(term);
                     }
@@ -330,13 +344,14 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
             query_match_bitmap = *term_match_bitmap;
         } else {
             bool first = true;
-            for (auto token_ws : analyse_result) {
+            for (auto token : analyse_result) {
                 std::shared_ptr<roaring::Roaring> term_match_bitmap = nullptr;
 
                 // try to get term bitmap match result from cache to avoid 
query index on cache hit
                 auto cache = InvertedIndexQueryCache::instance();
                 // use EQUAL_QUERY type here since cache is for each term/token
-                auto token = lucene_wcstoutf8string(token_ws.c_str(), 
token_ws.length());
+                //auto token = lucene_wcstoutf8string(token_ws.c_str(), 
token_ws.length());
+                std::wstring token_ws = StringUtil::string_to_wstring(token);
 
                 InvertedIndexQueryCache::CacheKey cache_key {
                         index_file_path, column_name, 
InvertedIndexQueryType::EQUAL_QUERY, token};
@@ -439,7 +454,7 @@ Status FullTextIndexReader::normal_index_search(
 
 Status FullTextIndexReader::match_all_index_search(
         OlapReaderStatistics* stats, RuntimeState* runtime_state, const 
std::wstring& field_ws,
-        const std::vector<std::wstring>& analyse_result, const 
IndexSearcherPtr& index_searcher,
+        const std::vector<std::string>& analyse_result, const 
IndexSearcherPtr& index_searcher,
         const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
     TQueryOptions queryOptions = runtime_state->query_options();
     try {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 30269f3b192..20c5c731f9e 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -97,11 +97,15 @@ public:
         return _index_meta.properties();
     }
 
-    static std::vector<std::wstring> get_analyse_result(const std::string& 
field_name,
-                                                        const std::string& 
value,
-                                                        InvertedIndexQueryType 
query_type,
-                                                        InvertedIndexCtx* 
inverted_index_ctx,
-                                                        bool drop_duplicates = 
true);
+    static std::vector<std::string> get_analyse_result(lucene::util::Reader* 
reader,
+                                                       
lucene::analysis::Analyzer* analyzer,
+                                                       const std::string& 
field_name,
+                                                       InvertedIndexQueryType 
query_type,
+                                                       bool drop_duplicates = 
true);
+    static std::unique_ptr<lucene::util::Reader> 
create_reader(InvertedIndexCtx* inverted_index_ctx,
+                                                               const 
std::string& value);
+    static std::unique_ptr<lucene::analysis::Analyzer> create_analyzer(
+            InvertedIndexCtx* inverted_index_ctx);
 
 protected:
     bool _is_range_query(InvertedIndexQueryType query_type);
@@ -144,7 +148,7 @@ private:
 
     Status match_all_index_search(OlapReaderStatistics* stats, RuntimeState* 
runtime_state,
                                   const std::wstring& field_ws,
-                                  const std::vector<std::wstring>& 
analyse_result,
+                                  const std::vector<std::string>& 
analyse_result,
                                   const IndexSearcherPtr& index_searcher,
                                   const std::shared_ptr<roaring::Roaring>& 
term_match_bitmap);
 
diff --git a/be/src/vec/exprs/vmatch_predicate.cpp b/be/src/vec/exprs/vmatch_predicate.cpp
index f6ba52705f6..2a21aba5785 100644
--- a/be/src/vec/exprs/vmatch_predicate.cpp
+++ b/be/src/vec/exprs/vmatch_predicate.cpp
@@ -30,6 +30,7 @@
 #include <vector>
 
 #include "common/status.h"
+#include "olap/rowset/segment_v2/inverted_index_reader.h"
 #include "vec/core/block.h"
 #include "vec/core/column_numbers.h"
 #include "vec/core/column_with_type_and_name.h"
@@ -43,6 +44,7 @@ class RuntimeState;
 } // namespace doris
 
 namespace doris::vectorized {
+using namespace doris::segment_v2;
 
 VMatchPredicate::VMatchPredicate(const TExprNode& node) : VExpr(node) {
     _inverted_index_ctx = std::make_shared<InvertedIndexCtx>();
@@ -50,8 +52,12 @@ VMatchPredicate::VMatchPredicate(const TExprNode& node) : 
VExpr(node) {
             
get_inverted_index_parser_type_from_string(node.match_predicate.parser_type);
     _inverted_index_ctx->parser_mode = node.match_predicate.parser_mode;
     _inverted_index_ctx->char_filter_map = 
node.match_predicate.char_filter_map;
+    _analyzer = 
InvertedIndexReader::create_analyzer(_inverted_index_ctx.get());
+    _inverted_index_ctx->analyzer = _analyzer.get();
 }
 
+VMatchPredicate::~VMatchPredicate() = default;
+
 Status VMatchPredicate::prepare(RuntimeState* state, const RowDescriptor& desc,
                                 VExprContext* context) {
     RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context));
diff --git a/be/src/vec/exprs/vmatch_predicate.h b/be/src/vec/exprs/vmatch_predicate.h
index 2868454db38..75d17298b8c 100644
--- a/be/src/vec/exprs/vmatch_predicate.h
+++ b/be/src/vec/exprs/vmatch_predicate.h
@@ -26,6 +26,11 @@
 #include "vec/exprs/vexpr.h"
 #include "vec/functions/function.h"
 
+namespace lucene {
+namespace analysis {
+class Analyzer;
+}
+} // namespace lucene
 namespace doris {
 class RowDescriptor;
 class RuntimeState;
@@ -43,13 +48,11 @@ class VMatchPredicate final : public VExpr {
 
 public:
     VMatchPredicate(const TExprNode& node);
-    ~VMatchPredicate() override = default;
-    doris::Status execute(VExprContext* context, doris::vectorized::Block* 
block,
-                          int* result_column_id) override;
-    doris::Status prepare(doris::RuntimeState* state, const 
doris::RowDescriptor& desc,
-                          VExprContext* context) override;
-    doris::Status open(doris::RuntimeState* state, VExprContext* context,
-                       FunctionContext::FunctionStateScope scope) override;
+    ~VMatchPredicate() override;
+    Status execute(VExprContext* context, Block* block, int* result_column_id) 
override;
+    Status prepare(RuntimeState* state, const RowDescriptor& desc, 
VExprContext* context) override;
+    Status open(RuntimeState* state, VExprContext* context,
+                FunctionContext::FunctionStateScope scope) override;
     void close(VExprContext* context, FunctionContext::FunctionStateScope 
scope) override;
     VExprSPtr clone() const override { return 
VMatchPredicate::create_shared(*this); }
     const std::string& expr_name() const override;
@@ -64,5 +67,6 @@ private:
     std::string _expr_name;
     std::string _function_name;
     InvertedIndexCtxSPtr _inverted_index_ctx;
+    std::unique_ptr<lucene::analysis::Analyzer> _analyzer;
 };
 } // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp
index 72a400a58d0..62e0a53bccb 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -95,13 +95,14 @@ void FunctionTokenize::_do_tokenize(const ColumnString& 
src_column_string,
             dest_offsets.push_back(dest_pos);
             continue;
         }
-        std::vector<std::wstring> query_tokens =
+        auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
+                &inverted_index_ctx, tokenize_str.to_string());
+
+        std::vector<std::string> query_tokens =
                 doris::segment_v2::InvertedIndexReader::get_analyse_result(
-                        "tokenize", tokenize_str.to_string(),
-                        
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY,
-                        &inverted_index_ctx);
-        for (auto token_ws : query_tokens) {
-            std::string token = lucene_wcstoutf8string(token_ws.data(), 
token_ws.length());
+                        reader.get(), inverted_index_ctx.analyzer, "tokenize",
+                        
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
+        for (auto token : query_tokens) {
             const size_t old_size = column_string_chars.size();
             const size_t split_part_size = token.length();
             if (split_part_size > 0) {
@@ -158,6 +159,9 @@ Status FunctionTokenize::execute_impl(FunctionContext* 
/*context*/, Block& block
             inverted_index_ctx.parser_mode = 
get_parser_mode_string_from_properties(properties);
             inverted_index_ctx.char_filter_map =
                     get_parser_char_filter_map_from_properties(properties);
+            auto analyzer =
+                    
doris::segment_v2::InvertedIndexReader::create_analyzer(&inverted_index_ctx);
+            inverted_index_ctx.analyzer = analyzer.get();
             _do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, 
dest_offsets,
                          dest_nested_null_map);
 
diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index a549762c7da..4267407281f 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -102,26 +102,32 @@ inline doris::segment_v2::InvertedIndexQueryType 
FunctionMatchBase::get_query_ty
     return doris::segment_v2::InvertedIndexQueryType::UNKNOWN_QUERY;
 }
 
-inline std::vector<std::wstring> FunctionMatchBase::analyse_data_token(
+inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
         const std::string& column_name, InvertedIndexCtx* inverted_index_ctx,
         const ColumnString* string_col, int32_t current_block_row_idx,
         const ColumnArray::Offsets64* array_offsets, int32_t& 
current_src_array_offset) {
-    std::vector<std::wstring> data_tokens;
+    std::vector<std::string> data_tokens;
     auto query_type = get_query_type_from_fn_name();
     if (array_offsets) {
         for (auto next_src_array_offset = 
(*array_offsets)[current_block_row_idx];
              current_src_array_offset < next_src_array_offset; 
++current_src_array_offset) {
             const auto& str_ref = 
string_col->get_data_at(current_src_array_offset);
-            std::vector<std::wstring> element_tokens =
+            auto reader = 
doris::segment_v2::InvertedIndexReader::create_reader(
+                    inverted_index_ctx, str_ref.to_string());
+
+            std::vector<std::string> element_tokens =
                     doris::segment_v2::InvertedIndexReader::get_analyse_result(
-                            column_name, str_ref.to_string(), query_type, 
inverted_index_ctx,
+                            reader.get(), inverted_index_ctx->analyzer, 
column_name, query_type,
                             false);
             data_tokens.insert(data_tokens.end(), element_tokens.begin(), 
element_tokens.end());
         }
     } else {
         const auto& str_ref = string_col->get_data_at(current_block_row_idx);
+        auto reader = 
doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
+                                                                            
str_ref.to_string());
+
         data_tokens = 
doris::segment_v2::InvertedIndexReader::get_analyse_result(
-                column_name, str_ref.to_string(), query_type, 
inverted_index_ctx, false);
+                reader.get(), inverted_index_ctx->analyzer, column_name, 
query_type, false);
     }
     return data_tokens;
 }
@@ -138,10 +144,12 @@ Status FunctionMatchAny::execute_match(const std::string& 
column_name,
     }
     VLOG_DEBUG << "begin to run FunctionMatchAny::execute_match, parser_type: "
                << inverted_index_parser_type_to_string(parser_type);
-    std::vector<std::wstring> query_tokens =
+    auto reader = 
doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
+                                                                        
match_query_str);
+    std::vector<std::string> query_tokens =
             doris::segment_v2::InvertedIndexReader::get_analyse_result(
-                    column_name, match_query_str,
-                    
doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY, inverted_index_ctx);
+                    reader.get(), inverted_index_ctx->analyzer, column_name,
+                    
doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY);
     if (query_tokens.empty()) {
         LOG(WARNING) << fmt::format(
                 "token parser result is empty for query, "
@@ -152,7 +160,7 @@ Status FunctionMatchAny::execute_match(const std::string& 
column_name,
 
     auto current_src_array_offset = 0;
     for (int i = 0; i < input_rows_count; i++) {
-        std::vector<std::wstring> data_tokens =
+        std::vector<std::string> data_tokens =
                 analyse_data_token(column_name, inverted_index_ctx, 
string_col, i, array_offsets,
                                    current_src_array_offset);
 
@@ -181,10 +189,12 @@ Status FunctionMatchAll::execute_match(const std::string& 
column_name,
     }
     VLOG_DEBUG << "begin to run FunctionMatchAll::execute_match, parser_type: "
                << inverted_index_parser_type_to_string(parser_type);
-    std::vector<std::wstring> query_tokens =
+    auto reader = 
doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
+                                                                        
match_query_str);
+    std::vector<std::string> query_tokens =
             doris::segment_v2::InvertedIndexReader::get_analyse_result(
-                    column_name, match_query_str,
-                    
doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY, inverted_index_ctx);
+                    reader.get(), inverted_index_ctx->analyzer, column_name,
+                    
doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY);
     if (query_tokens.empty()) {
         LOG(WARNING) << fmt::format(
                 "token parser result is empty for query, "
@@ -195,7 +205,7 @@ Status FunctionMatchAll::execute_match(const std::string& 
column_name,
 
     auto current_src_array_offset = 0;
     for (int i = 0; i < input_rows_count; i++) {
-        std::vector<std::wstring> data_tokens =
+        std::vector<std::string> data_tokens =
                 analyse_data_token(column_name, inverted_index_ctx, 
string_col, i, array_offsets,
                                    current_src_array_offset);
 
@@ -230,11 +240,12 @@ Status FunctionMatchPhrase::execute_match(const 
std::string& column_name,
     }
     VLOG_DEBUG << "begin to run FunctionMatchPhrase::execute_match, 
parser_type: "
                << inverted_index_parser_type_to_string(parser_type);
-    std::vector<std::wstring> query_tokens =
+    auto reader = 
doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
+                                                                        
match_query_str);
+    std::vector<std::string> query_tokens =
             doris::segment_v2::InvertedIndexReader::get_analyse_result(
-                    column_name, match_query_str,
-                    
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY,
-                    inverted_index_ctx);
+                    reader.get(), inverted_index_ctx->analyzer, column_name,
+                    
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
     if (query_tokens.empty()) {
         LOG(WARNING) << fmt::format(
                 "token parser result is empty for query, "
@@ -245,7 +256,7 @@ Status FunctionMatchPhrase::execute_match(const 
std::string& column_name,
 
     auto current_src_array_offset = 0;
     for (int i = 0; i < input_rows_count; i++) {
-        std::vector<std::wstring> data_tokens =
+        std::vector<std::string> data_tokens =
                 analyse_data_token(column_name, inverted_index_ctx, 
string_col, i, array_offsets,
                                    current_src_array_offset);
 
diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h
index 8f66f7866d2..52b0c8ee6d2 100644
--- a/be/src/vec/functions/match.h
+++ b/be/src/vec/functions/match.h
@@ -76,12 +76,12 @@ public:
 
     doris::segment_v2::InvertedIndexQueryType get_query_type_from_fn_name();
 
-    std::vector<std::wstring> analyse_data_token(const std::string& 
column_name,
-                                                 InvertedIndexCtx* 
inverted_index_ctx,
-                                                 const ColumnString* 
string_col,
-                                                 int32_t current_block_row_idx,
-                                                 const ColumnArray::Offsets64* 
array_offsets,
-                                                 int32_t& 
current_src_array_offset);
+    std::vector<std::string> analyse_data_token(const std::string& column_name,
+                                                InvertedIndexCtx* 
inverted_index_ctx,
+                                                const ColumnString* string_col,
+                                                int32_t current_block_row_idx,
+                                                const ColumnArray::Offsets64* 
array_offsets,
+                                                int32_t& 
current_src_array_offset);
 };
 
 class FunctionMatchAny : public FunctionMatchBase {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[doris] 06/12: [Improve](inverted index) improve match performance without index (#24751)

Reply via email to