This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 3cdc962ecaaf07100b020f5ffee14680dd975705 Author: airborne12 <[email protected]> AuthorDate: Fri Sep 22 18:45:11 2023 +0800 [Improve](inverted index) improve match performance without index (#24751) --- be/src/olap/inverted_index_parser.h | 7 ++ .../inverted_index/query/conjunction_query.cpp | 10 +-- .../inverted_index/query/conjunction_query.h | 2 +- .../rowset/segment_v2/inverted_index_reader.cpp | 75 +++++++++++++--------- .../olap/rowset/segment_v2/inverted_index_reader.h | 16 +++-- be/src/vec/exprs/vmatch_predicate.cpp | 6 ++ be/src/vec/exprs/vmatch_predicate.h | 18 ++++-- be/src/vec/functions/function_tokenize.cpp | 16 +++-- be/src/vec/functions/match.cpp | 47 ++++++++------ be/src/vec/functions/match.h | 12 ++-- 10 files changed, 130 insertions(+), 79 deletions(-) diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index df4f0769f93..54455bddef8 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -21,6 +21,12 @@ #include <memory> #include <string> +namespace lucene { +namespace analysis { +class Analyzer; +} +} // namespace lucene + namespace doris { enum class InvertedIndexParserType { @@ -38,6 +44,7 @@ struct InvertedIndexCtx { InvertedIndexParserType parser_type; std::string parser_mode; CharFilterMap char_filter_map; + lucene::analysis::Analyzer* analyzer; }; using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp index 90909045cc1..c5793f93f06 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp @@ -37,15 +37,15 @@ ConjunctionQuery::~ConjunctionQuery() { } } -void ConjunctionQuery::add(const std::wstring& field_name, - const std::vector<std::wstring>& wterms) { - if (wterms.size() < 1) { +void ConjunctionQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms) { + if (terms.size() < 1) { _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() < 1"); } std::vector<TermIterator> iterators; - for (auto& wterm : wterms) { - Term* t = _CLNEW Term(field_name.c_str(), wterm.c_str()); + for (auto& term : terms) { + std::wstring ws_term = StringUtil::string_to_wstring(term); + Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str()); _terms.push_back(t); TermDocs* term_doc = _reader->termDocs(t); _term_docs.push_back(term_doc); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h index bffb12ffb2b..36d9478c20d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h @@ -38,7 +38,7 @@ public: _conjunction_ratio = conjunction_ratio; } - void add(const std::wstring& field_name, const std::vector<std::wstring>& wterms); + void add(const std::wstring& field_name, const std::vector<std::string>& terms); void search(roaring::Roaring& roaring); private: diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index cef35a9f510..325907fa14d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -99,21 +99,18 @@ bool InvertedIndexReader::indexExists(io::Path& index_file_path) { return exists; } -std::vector<std::wstring> InvertedIndexReader::get_analyse_result( - const std::string& field_name, const std::string& value, InvertedIndexQueryType query_type, - InvertedIndexCtx* inverted_index_ctx, bool drop_duplicates) { - std::vector<std::wstring> analyse_result; - std::shared_ptr<lucene::analysis::Analyzer> analyzer; - std::unique_ptr<lucene::util::Reader> reader; +std::unique_ptr<lucene::analysis::Analyzer> InvertedIndexReader::create_analyzer( + InvertedIndexCtx* inverted_index_ctx) { + std::unique_ptr<lucene::analysis::Analyzer> analyzer; auto analyser_type = inverted_index_ctx->parser_type; if (analyser_type == InvertedIndexParserType::PARSER_STANDARD || analyser_type == InvertedIndexParserType::PARSER_UNICODE) { - analyzer = std::make_shared<lucene::analysis::standard95::StandardAnalyzer>(); + analyzer = std::make_unique<lucene::analysis::standard95::StandardAnalyzer>(); } else if (analyser_type == InvertedIndexParserType::PARSER_ENGLISH) { - analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>(); + analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>(); } else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) { auto chinese_analyzer = - std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false); + std::make_unique<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false); chinese_analyzer->initDict(config::inverted_index_dict_path); auto mode = inverted_index_ctx->parser_mode; if (mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) { @@ -121,32 +118,44 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result( } else { chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All); } - analyzer = chinese_analyzer; + analyzer = std::move(chinese_analyzer); } else { // default - analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>(); + analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>(); } - reader.reset(new lucene::util::SStringReader<char>()); + return analyzer; +} + +std::unique_ptr<lucene::util::Reader> InvertedIndexReader::create_reader( + InvertedIndexCtx* inverted_index_ctx, const std::string& value) { + std::unique_ptr<lucene::util::Reader> reader = + std::make_unique<lucene::util::SStringReader<char>>(); CharFilterMap& char_filter_map = inverted_index_ctx->char_filter_map; if (!char_filter_map.empty()) { - reader.reset(CharFilterFactory::create( + reader = std::unique_ptr<lucene::util::Reader>(CharFilterFactory::create( char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE], reader.release(), char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN], char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT])); } - reader->init(value.data(), value.size(), false); + reader->init(value.data(), value.size(), true); + return reader; +} + +std::vector<std::string> InvertedIndexReader::get_analyse_result( + lucene::util::Reader* reader, lucene::analysis::Analyzer* analyzer, + const std::string& field_name, InvertedIndexQueryType query_type, bool drop_duplicates) { + std::vector<std::string> analyse_result; std::wstring field_ws = std::wstring(field_name.begin(), field_name.end()); std::unique_ptr<lucene::analysis::TokenStream> token_stream( - analyzer->tokenStream(field_ws.c_str(), reader.get())); + analyzer->tokenStream(field_ws.c_str(), reader)); lucene::analysis::Token token; while (token_stream->next(&token)) { if (token.termLength<char>() != 0) { - std::string_view term(token.termBuffer<char>(), token.termLength<char>()); - std::wstring ws_term = StringUtil::string_to_wstring(term); - analyse_result.emplace_back(ws_term); + analyse_result.emplace_back( + std::string(token.termBuffer<char>(), token.termLength<char>())); } } @@ -156,7 +165,7 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result( if (drop_duplicates && (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY || query_type == InvertedIndexQueryType::MATCH_ALL_QUERY)) { - std::set<std::wstring> unrepeated_result(analyse_result.begin(), analyse_result.end()); + std::set<std::string> unrepeated_result(analyse_result.begin(), analyse_result.end()); analyse_result.assign(unrepeated_result.begin(), unrepeated_result.end()); } @@ -245,8 +254,11 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run inverted_index_ctx->char_filter_map = get_parser_char_filter_map_from_properties(_index_meta.properties()); try { - std::vector<std::wstring> analyse_result = - get_analyse_result(column_name, search_str, query_type, inverted_index_ctx.get()); + auto analyzer = create_analyzer(inverted_index_ctx.get()); + auto reader = create_reader(inverted_index_ctx.get(), search_str); + inverted_index_ctx->analyzer = analyzer.get(); + std::vector<std::string> analyse_result = + get_analyse_result(reader.get(), analyzer.get(), column_name, query_type); if (analyse_result.empty()) { auto msg = fmt::format( @@ -280,11 +292,12 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run roaring::Roaring query_match_bitmap; bool null_bitmap_already_read = false; if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || - query_type == InvertedIndexQueryType::MATCH_ALL_QUERY) { - std::wstring wstr_tokens; + query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || + query_type == InvertedIndexQueryType::EQUAL_QUERY) { + std::string str_tokens; for (auto& token : analyse_result) { - wstr_tokens += token; - wstr_tokens += L" "; + str_tokens += token; + str_tokens += " "; } auto cache = InvertedIndexQueryCache::instance(); @@ -292,7 +305,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run cache_key.index_path = index_file_path; cache_key.column_name = column_name; cache_key.query_type = query_type; - auto str_tokens = lucene_wcstoutf8string(wstr_tokens.c_str(), wstr_tokens.length()); + //auto str_tokens = lucene_wcstoutf8string(wstr_tokens.c_str(), wstr_tokens.length()); cache_key.value.swap(str_tokens); InvertedIndexQueryCacheHandle cache_handle; std::shared_ptr<roaring::Roaring> term_match_bitmap = nullptr; @@ -308,7 +321,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { auto* phrase_query = new lucene::search::PhraseQuery(); for (auto& token : analyse_result) { - auto* term = _CLNEW lucene::index::Term(field_ws.c_str(), token.c_str()); + std::wstring wtoken = StringUtil::string_to_wstring(token); + auto* term = _CLNEW lucene::index::Term(field_ws.c_str(), wtoken.c_str()); phrase_query->add(term); _CLDECDELETE(term); } @@ -330,13 +344,14 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run query_match_bitmap = *term_match_bitmap; } else { bool first = true; - for (auto token_ws : analyse_result) { + for (auto token : analyse_result) { std::shared_ptr<roaring::Roaring> term_match_bitmap = nullptr; // try to get term bitmap match result from cache to avoid query index on cache hit auto cache = InvertedIndexQueryCache::instance(); // use EQUAL_QUERY type here since cache is for each term/token - auto token = lucene_wcstoutf8string(token_ws.c_str(), token_ws.length()); + //auto token = lucene_wcstoutf8string(token_ws.c_str(), token_ws.length()); + std::wstring token_ws = StringUtil::string_to_wstring(token); InvertedIndexQueryCache::CacheKey cache_key { index_file_path, column_name, InvertedIndexQueryType::EQUAL_QUERY, token}; @@ -439,7 +454,7 @@ Status FullTextIndexReader::normal_index_search( Status FullTextIndexReader::match_all_index_search( OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws, - const std::vector<std::wstring>& analyse_result, const IndexSearcherPtr& index_searcher, + const std::vector<std::string>& analyse_result, const IndexSearcherPtr& index_searcher, const std::shared_ptr<roaring::Roaring>& term_match_bitmap) { TQueryOptions queryOptions = runtime_state->query_options(); try { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 30269f3b192..20c5c731f9e 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -97,11 +97,15 @@ public: return _index_meta.properties(); } - static std::vector<std::wstring> get_analyse_result(const std::string& field_name, - const std::string& value, - InvertedIndexQueryType query_type, - InvertedIndexCtx* inverted_index_ctx, - bool drop_duplicates = true); + static std::vector<std::string> get_analyse_result(lucene::util::Reader* reader, + lucene::analysis::Analyzer* analyzer, + const std::string& field_name, + InvertedIndexQueryType query_type, + bool drop_duplicates = true); + static std::unique_ptr<lucene::util::Reader> create_reader(InvertedIndexCtx* inverted_index_ctx, + const std::string& value); + static std::unique_ptr<lucene::analysis::Analyzer> create_analyzer( + InvertedIndexCtx* inverted_index_ctx); protected: bool _is_range_query(InvertedIndexQueryType query_type); @@ -144,7 +148,7 @@ private: Status match_all_index_search(OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws, - const std::vector<std::wstring>& analyse_result, + const std::vector<std::string>& analyse_result, const IndexSearcherPtr& index_searcher, const std::shared_ptr<roaring::Roaring>& term_match_bitmap); diff --git a/be/src/vec/exprs/vmatch_predicate.cpp b/be/src/vec/exprs/vmatch_predicate.cpp index f6ba52705f6..2a21aba5785 100644 --- a/be/src/vec/exprs/vmatch_predicate.cpp +++ b/be/src/vec/exprs/vmatch_predicate.cpp @@ -30,6 +30,7 @@ #include <vector> #include "common/status.h" +#include "olap/rowset/segment_v2/inverted_index_reader.h" #include "vec/core/block.h" #include "vec/core/column_numbers.h" #include "vec/core/column_with_type_and_name.h" @@ -43,6 +44,7 @@ class RuntimeState; } // namespace doris namespace doris::vectorized { +using namespace doris::segment_v2; VMatchPredicate::VMatchPredicate(const TExprNode& node) : VExpr(node) { _inverted_index_ctx = std::make_shared<InvertedIndexCtx>(); @@ -50,8 +52,12 @@ VMatchPredicate::VMatchPredicate(const TExprNode& node) : VExpr(node) { get_inverted_index_parser_type_from_string(node.match_predicate.parser_type); _inverted_index_ctx->parser_mode = node.match_predicate.parser_mode; _inverted_index_ctx->char_filter_map = node.match_predicate.char_filter_map; + _analyzer = InvertedIndexReader::create_analyzer(_inverted_index_ctx.get()); + _inverted_index_ctx->analyzer = _analyzer.get(); } +VMatchPredicate::~VMatchPredicate() = default; + Status VMatchPredicate::prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) { RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context)); diff --git a/be/src/vec/exprs/vmatch_predicate.h b/be/src/vec/exprs/vmatch_predicate.h index 2868454db38..75d17298b8c 100644 --- a/be/src/vec/exprs/vmatch_predicate.h +++ b/be/src/vec/exprs/vmatch_predicate.h @@ -26,6 +26,11 @@ #include "vec/exprs/vexpr.h" #include "vec/functions/function.h" +namespace lucene { +namespace analysis { +class Analyzer; +} +} // namespace lucene namespace doris { class RowDescriptor; class RuntimeState; @@ -43,13 +48,11 @@ class VMatchPredicate final : public VExpr { public: VMatchPredicate(const TExprNode& node); - ~VMatchPredicate() override = default; - doris::Status execute(VExprContext* context, doris::vectorized::Block* block, - int* result_column_id) override; - doris::Status prepare(doris::RuntimeState* state, const doris::RowDescriptor& desc, - VExprContext* context) override; - doris::Status open(doris::RuntimeState* state, VExprContext* context, - FunctionContext::FunctionStateScope scope) override; + ~VMatchPredicate() override; + Status execute(VExprContext* context, Block* block, int* result_column_id) override; + Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override; + Status open(RuntimeState* state, VExprContext* context, + FunctionContext::FunctionStateScope scope) override; void close(VExprContext* context, FunctionContext::FunctionStateScope scope) override; VExprSPtr clone() const override { return VMatchPredicate::create_shared(*this); } const std::string& expr_name() const override; @@ -64,5 +67,6 @@ private: std::string _expr_name; std::string _function_name; InvertedIndexCtxSPtr _inverted_index_ctx; + std::unique_ptr<lucene::analysis::Analyzer> _analyzer; }; } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp index 72a400a58d0..62e0a53bccb 100644 --- a/be/src/vec/functions/function_tokenize.cpp +++ b/be/src/vec/functions/function_tokenize.cpp @@ -95,13 +95,14 @@ void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string, dest_offsets.push_back(dest_pos); continue; } - std::vector<std::wstring> query_tokens = + auto reader = doris::segment_v2::InvertedIndexReader::create_reader( + &inverted_index_ctx, tokenize_str.to_string()); + + std::vector<std::string> query_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( - "tokenize", tokenize_str.to_string(), - doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY, - &inverted_index_ctx); - for (auto token_ws : query_tokens) { - std::string token = lucene_wcstoutf8string(token_ws.data(), token_ws.length()); + reader.get(), inverted_index_ctx.analyzer, "tokenize", + doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY); + for (auto token : query_tokens) { const size_t old_size = column_string_chars.size(); const size_t split_part_size = token.length(); if (split_part_size > 0) { @@ -158,6 +159,9 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block inverted_index_ctx.parser_mode = get_parser_mode_string_from_properties(properties); inverted_index_ctx.char_filter_map = get_parser_char_filter_map_from_properties(properties); + auto analyzer = + doris::segment_v2::InvertedIndexReader::create_analyzer(&inverted_index_ctx); + inverted_index_ctx.analyzer = analyzer.get(); _do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, dest_offsets, dest_nested_null_map); diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp index a549762c7da..4267407281f 100644 --- a/be/src/vec/functions/match.cpp +++ b/be/src/vec/functions/match.cpp @@ -102,26 +102,32 @@ inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_ty return doris::segment_v2::InvertedIndexQueryType::UNKNOWN_QUERY; } -inline std::vector<std::wstring> FunctionMatchBase::analyse_data_token( +inline std::vector<std::string> FunctionMatchBase::analyse_data_token( const std::string& column_name, InvertedIndexCtx* inverted_index_ctx, const ColumnString* string_col, int32_t current_block_row_idx, const ColumnArray::Offsets64* array_offsets, int32_t& current_src_array_offset) { - std::vector<std::wstring> data_tokens; + std::vector<std::string> data_tokens; auto query_type = get_query_type_from_fn_name(); if (array_offsets) { for (auto next_src_array_offset = (*array_offsets)[current_block_row_idx]; current_src_array_offset < next_src_array_offset; ++current_src_array_offset) { const auto& str_ref = string_col->get_data_at(current_src_array_offset); - std::vector<std::wstring> element_tokens = + auto reader = doris::segment_v2::InvertedIndexReader::create_reader( + inverted_index_ctx, str_ref.to_string()); + + std::vector<std::string> element_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( - column_name, str_ref.to_string(), query_type, inverted_index_ctx, + reader.get(), inverted_index_ctx->analyzer, column_name, query_type, false); data_tokens.insert(data_tokens.end(), element_tokens.begin(), element_tokens.end()); } } else { const auto& str_ref = string_col->get_data_at(current_block_row_idx); + auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx, + str_ref.to_string()); + data_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( - column_name, str_ref.to_string(), query_type, inverted_index_ctx, false); + reader.get(), inverted_index_ctx->analyzer, column_name, query_type, false); } return data_tokens; } @@ -138,10 +144,12 @@ Status FunctionMatchAny::execute_match(const std::string& column_name, } VLOG_DEBUG << "begin to run FunctionMatchAny::execute_match, parser_type: " << inverted_index_parser_type_to_string(parser_type); - std::vector<std::wstring> query_tokens = + auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx, + match_query_str); + std::vector<std::string> query_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( - column_name, match_query_str, - doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY, inverted_index_ctx); + reader.get(), inverted_index_ctx->analyzer, column_name, + doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY); if (query_tokens.empty()) { LOG(WARNING) << fmt::format( "token parser result is empty for query, " @@ -152,7 +160,7 @@ Status FunctionMatchAny::execute_match(const std::string& column_name, auto current_src_array_offset = 0; for (int i = 0; i < input_rows_count; i++) { - std::vector<std::wstring> data_tokens = + std::vector<std::string> data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i, array_offsets, current_src_array_offset); @@ -181,10 +189,12 @@ Status FunctionMatchAll::execute_match(const std::string& column_name, } VLOG_DEBUG << "begin to run FunctionMatchAll::execute_match, parser_type: " << inverted_index_parser_type_to_string(parser_type); - std::vector<std::wstring> query_tokens = + auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx, + match_query_str); + std::vector<std::string> query_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( - column_name, match_query_str, - doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY, inverted_index_ctx); + reader.get(), inverted_index_ctx->analyzer, column_name, + doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY); if (query_tokens.empty()) { LOG(WARNING) << fmt::format( "token parser result is empty for query, " @@ -195,7 +205,7 @@ Status FunctionMatchAll::execute_match(const std::string& column_name, auto current_src_array_offset = 0; for (int i = 0; i < input_rows_count; i++) { - std::vector<std::wstring> data_tokens = + std::vector<std::string> data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i, array_offsets, current_src_array_offset); @@ -230,11 +240,12 @@ Status FunctionMatchPhrase::execute_match(const std::string& column_name, } VLOG_DEBUG << "begin to run FunctionMatchPhrase::execute_match, parser_type: " << inverted_index_parser_type_to_string(parser_type); - std::vector<std::wstring> query_tokens = + auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx, + match_query_str); + std::vector<std::string> query_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( - column_name, match_query_str, - doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY, - inverted_index_ctx); + reader.get(), inverted_index_ctx->analyzer, column_name, + doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY); if (query_tokens.empty()) { LOG(WARNING) << fmt::format( "token parser result is empty for query, " @@ -245,7 +256,7 @@ Status FunctionMatchPhrase::execute_match(const std::string& column_name, auto current_src_array_offset = 0; for (int i = 0; i < input_rows_count; i++) { - std::vector<std::wstring> data_tokens = + std::vector<std::string> data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i, array_offsets, current_src_array_offset); diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h index 8f66f7866d2..52b0c8ee6d2 100644 --- a/be/src/vec/functions/match.h +++ b/be/src/vec/functions/match.h @@ -76,12 +76,12 @@ public: doris::segment_v2::InvertedIndexQueryType get_query_type_from_fn_name(); - std::vector<std::wstring> analyse_data_token(const std::string& column_name, - InvertedIndexCtx* inverted_index_ctx, - const ColumnString* string_col, - int32_t current_block_row_idx, - const ColumnArray::Offsets64* array_offsets, - int32_t& current_src_array_offset); + std::vector<std::string> analyse_data_token(const std::string& column_name, + InvertedIndexCtx* inverted_index_ctx, + const ColumnString* string_col, + int32_t current_block_row_idx, + const ColumnArray::Offsets64* array_offsets, + int32_t& current_src_array_offset); }; class FunctionMatchAny : public FunctionMatchBase { --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
