This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 9c6353f2ee5 [refact](inverted index) use inverted index context for
query (#58981)
9c6353f2ee5 is described below
commit 9c6353f2ee5c3f0be553e056b3124d4bf84d8e7d
Author: Jack <[email protected]>
AuthorDate: Wed Dec 17 14:09:23 2025 +0800
[refact](inverted index) use inverted index context for query (#58981)
---
be/src/olap/comparison_predicate.h | 2 +-
be/src/olap/inverted_index_parser.h | 28 ++++-
.../inverted_index/analyzer/analyzer.cpp | 57 ++++-----
.../segment_v2/inverted_index/analyzer/analyzer.h | 4 +-
.../rowset/segment_v2/inverted_index_iterator.cpp | 3 +-
.../rowset/segment_v2/inverted_index_iterator.h | 6 +
.../rowset/segment_v2/inverted_index_reader.cpp | 21 +++-
.../olap/rowset/segment_v2/inverted_index_reader.h | 12 +-
.../rowset/segment_v2/inverted_index_writer.cpp | 30 ++---
.../olap/rowset/segment_v2/inverted_index_writer.h | 4 +-
be/src/vec/exprs/vexpr.cpp | 9 +-
be/src/vec/exprs/vexpr_context.h | 20 ++++
be/src/vec/exprs/vmatch_predicate.cpp | 36 ++++--
be/src/vec/exprs/vmatch_predicate.h | 6 +-
be/src/vec/functions/array/function_array_index.h | 4 +-
.../vec/functions/array/function_arrays_overlap.h | 5 +-
be/src/vec/functions/function.h | 8 +-
be/src/vec/functions/function_ip.h | 1 +
be/src/vec/functions/function_multi_match.cpp | 4 +-
be/src/vec/functions/function_multi_match.h | 1 +
be/src/vec/functions/function_search.cpp | 1 +
be/src/vec/functions/function_search.h | 1 +
be/src/vec/functions/function_tokenize.cpp | 48 ++++----
be/src/vec/functions/function_tokenize.h | 4 +-
be/src/vec/functions/functions_comparison.h | 4 +-
be/src/vec/functions/in.h | 4 +-
be/src/vec/functions/is_not_null.h | 1 +
be/src/vec/functions/is_null.h | 1 +
be/src/vec/functions/match.cpp | 128 +++++++++++----------
be/src/vec/functions/match.h | 25 ++--
be/test/olap/inverted_index_parser_test.cpp | 34 ++----
.../rowset/segment_v2/index_reader_helper_test.cpp | 3 +-
.../inverted_index/ananlyzer/analyzer_test.cpp | 126 ++++++++++----------
.../segment_v2/inverted_index/util/reader_test.cpp | 20 ++--
be/test/vec/function/function_is_null_test.cpp | 16 +--
be/test/vec/function/function_match_test.cpp | 19 +--
be/test/vec/function/function_multi_match_test.cpp | 2 +-
be/test/vec/function/function_search_test.cpp | 4 +-
.../test_index_lowercase_fault_injection.out | 2 +-
39 files changed, 412 insertions(+), 292 deletions(-)
diff --git a/be/src/olap/comparison_predicate.h
b/be/src/olap/comparison_predicate.h
index b575f1cd8cb..f4cf674ab82 100644
--- a/be/src/olap/comparison_predicate.h
+++ b/be/src/olap/comparison_predicate.h
@@ -104,7 +104,7 @@ public:
param.query_type = query_type;
param.num_rows = num_rows;
param.roaring = std::make_shared<roaring::Roaring>();
- RETURN_IF_ERROR(iterator->read_from_index(&param));
+ RETURN_IF_ERROR(iterator->read_from_index(segment_v2::IndexParam
{&param}));
// mask out null_bitmap, since NULL cmp VALUE will produce NULL
// and be treated as false in WHERE
diff --git a/be/src/olap/inverted_index_parser.h
b/be/src/olap/inverted_index_parser.h
index b7f547882ca..a7eba8f6418 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -45,18 +45,34 @@ enum class InvertedIndexParserType {
using CharFilterMap = std::map<std::string, std::string>;
-struct InvertedIndexCtx {
+// Configuration for creating analyzer (SRP: only used during analyzer
creation)
+// This is typically a stack-allocated temporary object, discarded after use
+struct InvertedIndexAnalyzerConfig {
std::string analyzer_name;
- InvertedIndexParserType parser_type;
+ InvertedIndexParserType parser_type =
InvertedIndexParserType::PARSER_UNKNOWN;
std::string parser_mode;
- std::string support_phrase;
- CharFilterMap char_filter_map;
std::string lower_case;
std::string stop_words;
- lucene::analysis::Analyzer* analyzer = nullptr;
+ CharFilterMap char_filter_map;
};
-using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>;
+// Runtime context for analyzer
+// Contains only the fields needed at runtime
+struct InvertedIndexAnalyzerCtx {
+ // Used by execute_column path to determine if tokenization should be
skipped
+ std::string analyzer_name;
+ InvertedIndexParserType parser_type =
InvertedIndexParserType::PARSER_UNKNOWN;
+
+ // Used for creating reader and tokenization
+ CharFilterMap char_filter_map;
+ lucene::analysis::Analyzer* analyzer = nullptr;
+
+ // Helper method: returns true if tokenization should be performed
+ bool should_tokenize() const {
+ return !(parser_type == InvertedIndexParserType::PARSER_NONE &&
analyzer_name.empty());
+ }
+};
+using InvertedIndexAnalyzerCtxSPtr = std::shared_ptr<InvertedIndexAnalyzerCtx>;
const std::string INVERTED_INDEX_PARSER_TRUE = "true";
const std::string INVERTED_INDEX_PARSER_FALSE = "false";
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
index 640362d4bcc..24a114b7bd9 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
@@ -17,6 +17,8 @@
#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
+#include <glog/logging.h>
+
#include "CLucene.h"
#include "CLucene/analysis/LanguageBasedAnalyzer.h"
@@ -42,14 +44,19 @@
namespace doris::segment_v2::inverted_index {
#include "common/compile_check_begin.h"
-ReaderPtr InvertedIndexAnalyzer::create_reader(CharFilterMap& char_filter_map)
{
+ReaderPtr InvertedIndexAnalyzer::create_reader(const CharFilterMap&
char_filter_map) {
ReaderPtr reader = std::make_shared<lucene::util::SStringReader<char>>();
if (!char_filter_map.empty()) {
- if (char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE] ==
- INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) {
- reader = std::make_shared<CharReplaceCharFilter>(
- reader,
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
-
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]);
+ auto it_type =
char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE);
+ if (it_type != char_filter_map.end() &&
+ it_type->second == INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) {
+ auto it_pattern =
char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
+ auto it_replacement =
+
char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT);
+ if (it_pattern != char_filter_map.end() && it_replacement !=
char_filter_map.end()) {
+ reader = std::make_shared<CharReplaceCharFilter>(reader,
it_pattern->second,
+
it_replacement->second);
+ }
}
}
return reader;
@@ -123,21 +130,19 @@ AnalyzerPtr
InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserTy
return analyzer;
}
-std::shared_ptr<lucene::analysis::Analyzer>
InvertedIndexAnalyzer::create_analyzer(
- const InvertedIndexCtx* inverted_index_ctx) {
- const std::string& analyzer_name = inverted_index_ctx->analyzer_name;
+AnalyzerPtr InvertedIndexAnalyzer::create_analyzer(const
InvertedIndexAnalyzerConfig* config) {
+ DCHECK(config != nullptr);
+ const std::string& analyzer_name = config->analyzer_name;
if (analyzer_name.empty()) {
- return create_builtin_analyzer(
- inverted_index_ctx->parser_type,
inverted_index_ctx->parser_mode,
- inverted_index_ctx->lower_case,
inverted_index_ctx->stop_words);
+ return create_builtin_analyzer(config->parser_type,
config->parser_mode, config->lower_case,
+ config->stop_words);
}
if (is_builtin_analyzer(analyzer_name)) {
InvertedIndexParserType parser_type =
get_inverted_index_parser_type_from_string(analyzer_name);
- return create_builtin_analyzer(parser_type,
inverted_index_ctx->parser_mode,
- inverted_index_ctx->lower_case,
- inverted_index_ctx->stop_words);
+ return create_builtin_analyzer(parser_type, config->parser_mode,
config->lower_case,
+ config->stop_words);
}
auto* index_policy_mgr = doris::ExecEnv::GetInstance()->index_policy_mgr();
@@ -176,18 +181,16 @@ std::vector<TermInfo>
InvertedIndexAnalyzer::get_analyse_result(
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
const std::string& search_str, const std::map<std::string,
std::string>& properties) {
- InvertedIndexCtxSPtr inverted_index_ctx =
std::make_shared<InvertedIndexCtx>(
- get_analyzer_name_from_properties(properties),
- get_inverted_index_parser_type_from_string(
- get_parser_string_from_properties(properties)),
- get_parser_mode_string_from_properties(properties),
- get_parser_phrase_support_string_from_properties(properties),
- get_parser_char_filter_map_from_properties(properties),
- get_parser_lowercase_from_properties(properties),
- get_parser_stopwords_from_properties(properties));
- auto analyzer = create_analyzer(inverted_index_ctx.get());
- inverted_index_ctx->analyzer = analyzer.get();
- auto reader = create_reader(inverted_index_ctx->char_filter_map);
+ InvertedIndexAnalyzerConfig config;
+ config.analyzer_name = get_analyzer_name_from_properties(properties);
+ config.parser_type = get_inverted_index_parser_type_from_string(
+ get_parser_string_from_properties(properties));
+ config.parser_mode = get_parser_mode_string_from_properties(properties);
+ config.lower_case = get_parser_lowercase_from_properties(properties);
+ config.stop_words = get_parser_stopwords_from_properties(properties);
+ config.char_filter_map =
get_parser_char_filter_map_from_properties(properties);
+ auto analyzer = create_analyzer(&config);
+ auto reader = create_reader(config.char_filter_map);
reader->init(search_str.data(), static_cast<int32_t>(search_str.size()),
true);
return get_analyse_result(reader, analyzer.get());
}
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
index 71f575c4c83..b8a49e03810 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
@@ -41,14 +41,14 @@ using AnalyzerPtr =
std::shared_ptr<lucene::analysis::Analyzer>;
class InvertedIndexAnalyzer {
public:
- static ReaderPtr create_reader(CharFilterMap& char_filter_map);
+ static ReaderPtr create_reader(const CharFilterMap& char_filter_map);
static bool is_builtin_analyzer(const std::string& analyzer_name);
static AnalyzerPtr create_builtin_analyzer(InvertedIndexParserType
parser_type,
const std::string& parser_mode,
const std::string& lower_case,
const std::string& stop_words);
- static AnalyzerPtr create_analyzer(const InvertedIndexCtx*
inverted_index_ctx);
+ static AnalyzerPtr create_analyzer(const InvertedIndexAnalyzerConfig*
config);
static std::vector<TermInfo> get_analyse_result(ReaderPtr reader,
lucene::analysis::Analyzer* analyzer);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp
index 78caacaf927..ce3584b0161 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp
@@ -65,9 +65,10 @@ Status InvertedIndexIterator::read_from_index(const
IndexParam& param) {
}
}
+ // Note: analyzer_ctx is now passed via i_param->analyzer_ctx
auto execute_query = [&]() {
return reader->query(_context, i_param->column_name,
i_param->query_value,
- i_param->query_type, i_param->roaring);
+ i_param->query_type, i_param->roaring,
i_param->analyzer_ctx);
};
if (runtime_state->query_options().enable_profile) {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_iterator.h
b/be/src/olap/rowset/segment_v2/inverted_index_iterator.h
index 66d231c4a2a..f2f72cc58e5 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_iterator.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_iterator.h
@@ -17,6 +17,7 @@
#pragma once
+#include "olap/inverted_index_parser.h"
#include "olap/rowset/segment_v2/index_iterator.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
@@ -30,6 +31,10 @@ struct InvertedIndexParam {
uint32_t num_rows;
std::shared_ptr<roaring::Roaring> roaring;
bool skip_try = false;
+
+ // Pointer to analyzer context (can be nullptr if not needed)
+ // Used by FullTextIndexReader for tokenization
+ const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr;
};
class InvertedIndexIterator : public IndexIterator {
@@ -39,6 +44,7 @@ public:
void add_reader(InvertedIndexReaderType type, const
InvertedIndexReaderPtr& reader);
+ // Note: analyzer_ctx is now passed via InvertedIndexParam.analyzer_ctx
Status read_from_index(const IndexParam& param) override;
Status read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle)
override;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index d9eeba27ffd..c6a681a2444 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -292,7 +292,8 @@ Status
FullTextIndexReader::new_iterator(std::unique_ptr<IndexIterator>* iterato
Status FullTextIndexReader::query(const IndexQueryContextPtr& context,
const std::string& column_name, const void*
query_value,
InvertedIndexQueryType query_type,
- std::shared_ptr<roaring::Roaring>& bit_map) {
+ std::shared_ptr<roaring::Roaring>& bit_map,
+ const InvertedIndexAnalyzerCtx*
analyzer_ctx) {
SCOPED_RAW_TIMER(&context->stats->inverted_index_query_timer);
std::string search_str = *reinterpret_cast<const
std::string*>(query_value);
@@ -313,8 +314,16 @@ Status FullTextIndexReader::query(const
IndexQueryContextPtr& context,
query_info);
} else {
SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer);
- query_info.term_infos =
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
- search_str, _index_meta.properties());
+ if (analyzer_ctx != nullptr && analyzer_ctx->analyzer != nullptr) {
+ auto reader =
inverted_index::InvertedIndexAnalyzer::create_reader(
+ analyzer_ctx->char_filter_map);
+ reader->init(search_str.data(),
static_cast<int32_t>(search_str.size()), true);
+ query_info.term_infos =
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
+ reader, analyzer_ctx->analyzer);
+ } else {
+ query_info.term_infos =
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
+ search_str, _index_meta.properties());
+ }
}
if (query_info.term_infos.empty()) {
@@ -394,7 +403,8 @@ Status
StringTypeInvertedIndexReader::new_iterator(std::unique_ptr<IndexIterator
Status StringTypeInvertedIndexReader::query(const IndexQueryContextPtr&
context,
const std::string& column_name,
const void* query_value,
InvertedIndexQueryType query_type,
- std::shared_ptr<roaring::Roaring>&
bit_map) {
+ std::shared_ptr<roaring::Roaring>&
bit_map,
+ const InvertedIndexAnalyzerCtx*
/*analyzer_ctx*/) {
SCOPED_RAW_TIMER(&context->stats->inverted_index_query_timer);
std::string search_str = *reinterpret_cast<const
std::string*>(query_value);
@@ -688,7 +698,8 @@ Status BkdIndexReader::try_query(const
IndexQueryContextPtr& context,
Status BkdIndexReader::query(const IndexQueryContextPtr& context, const
std::string& column_name,
const void* query_value, InvertedIndexQueryType
query_type,
- std::shared_ptr<roaring::Roaring>& bit_map) {
+ std::shared_ptr<roaring::Roaring>& bit_map,
+ const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/)
{
SCOPED_RAW_TIMER(&context->stats->inverted_index_query_timer);
try {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 918054f472b..2d0911b0adc 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -194,7 +194,8 @@ public:
virtual Status query(const IndexQueryContextPtr& context, const
std::string& column_name,
const void* query_value, InvertedIndexQueryType
query_type,
- std::shared_ptr<roaring::Roaring>& bit_map) = 0;
+ std::shared_ptr<roaring::Roaring>& bit_map,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx =
nullptr) = 0;
virtual Status try_query(const IndexQueryContextPtr& context, const
std::string& column_name,
const void* query_value, InvertedIndexQueryType
query_type,
size_t* count) = 0;
@@ -255,7 +256,8 @@ public:
Status new_iterator(std::unique_ptr<IndexIterator>* iterator) override;
Status query(const IndexQueryContextPtr& context, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType query_type,
- std::shared_ptr<roaring::Roaring>& bit_map) override;
+ std::shared_ptr<roaring::Roaring>& bit_map,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr)
override;
Status try_query(const IndexQueryContextPtr& context, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
size_t* count) override {
@@ -279,7 +281,8 @@ public:
Status new_iterator(std::unique_ptr<IndexIterator>* iterator) override;
Status query(const IndexQueryContextPtr& context, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType query_type,
- std::shared_ptr<roaring::Roaring>& bit_map) override;
+ std::shared_ptr<roaring::Roaring>& bit_map,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr)
override;
Status try_query(const IndexQueryContextPtr& context, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
size_t* count) override {
@@ -338,7 +341,8 @@ public:
Status new_iterator(std::unique_ptr<IndexIterator>* iterator) override;
Status query(const IndexQueryContextPtr& context, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType query_type,
- std::shared_ptr<roaring::Roaring>& bit_map) override;
+ std::shared_ptr<roaring::Roaring>& bit_map,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr)
override;
Status try_query(const IndexQueryContextPtr& context, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
size_t* count) override;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index c47891f25d8..ffb6f983565 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -191,9 +191,9 @@ Status
InvertedIndexColumnWriter<field_type>::create_field(lucene::document::Fie
template <FieldType field_type>
Result<std::shared_ptr<lucene::analysis::Analyzer>>
InvertedIndexColumnWriter<field_type>::create_analyzer(
- std::shared_ptr<InvertedIndexCtx>& inverted_index_ctx) {
+ const InvertedIndexAnalyzerConfig& analyzer_config) {
try {
- return
inverted_index::InvertedIndexAnalyzer::create_analyzer(inverted_index_ctx.get());
+ return
inverted_index::InvertedIndexAnalyzer::create_analyzer(&analyzer_config);
} catch (CLuceneError& e) {
return
ResultError(Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
"inverted index create analyzer failed: {}", e.what()));
@@ -205,20 +205,20 @@ InvertedIndexColumnWriter<field_type>::create_analyzer(
template <FieldType field_type>
Status InvertedIndexColumnWriter<field_type>::init_fulltext_index() {
- _inverted_index_ctx = std::make_shared<InvertedIndexCtx>(
- get_analyzer_name_from_properties(_index_meta->properties()),
- get_inverted_index_parser_type_from_string(
-
get_parser_string_from_properties(_index_meta->properties())),
- get_parser_mode_string_from_properties(_index_meta->properties()),
-
get_parser_phrase_support_string_from_properties(_index_meta->properties()),
-
get_parser_char_filter_map_from_properties(_index_meta->properties()),
-
get_parser_lowercase_from_properties<true>(_index_meta->properties()),
- get_parser_stopwords_from_properties(_index_meta->properties()));
+ _analyzer_config.analyzer_name =
get_analyzer_name_from_properties(_index_meta->properties());
+ _analyzer_config.parser_type = get_inverted_index_parser_type_from_string(
+ get_parser_string_from_properties(_index_meta->properties()));
+ _analyzer_config.parser_mode =
+ get_parser_mode_string_from_properties(_index_meta->properties());
+ _analyzer_config.char_filter_map =
+
get_parser_char_filter_map_from_properties(_index_meta->properties());
+ _analyzer_config.lower_case =
+
get_parser_lowercase_from_properties<true>(_index_meta->properties());
+ _analyzer_config.stop_words =
get_parser_stopwords_from_properties(_index_meta->properties());
RETURN_IF_ERROR(open_index_directory());
- _char_string_reader =
-
DORIS_TRY(create_char_string_reader(_inverted_index_ctx->char_filter_map));
+ _char_string_reader =
DORIS_TRY(create_char_string_reader(_analyzer_config.char_filter_map));
if (_should_analyzer) {
- _analyzer = DORIS_TRY(create_analyzer(_inverted_index_ctx));
+ _analyzer = DORIS_TRY(create_analyzer(_analyzer_config));
}
_similarity = std::make_unique<lucene::search::LengthSimilarity>();
_index_writer = create_index_writer();
@@ -448,7 +448,7 @@ Status
InvertedIndexColumnWriter<field_type>::add_array_values(size_t field_size
// stream can not reuse for different field
bool own_token_stream = true;
ReaderPtr char_string_reader = DORIS_TRY(
-
create_char_string_reader(_inverted_index_ctx->char_filter_map));
+
create_char_string_reader(_analyzer_config.char_filter_map));
char_string_reader->init(v->get_data(),
cast_set<int32_t>(v->get_size()),
false);
ts = _analyzer->tokenStream(new_field->name(),
char_string_reader);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.h
b/be/src/olap/rowset/segment_v2/inverted_index_writer.h
index 5835b09b0d6..5aad91a0371 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.h
@@ -58,7 +58,7 @@ public:
std::unique_ptr<lucene::index::IndexWriter> create_index_writer();
Status create_field(lucene::document::Field** field);
Result<std::shared_ptr<lucene::analysis::Analyzer>> create_analyzer(
- std::shared_ptr<InvertedIndexCtx>& inverted_index_ctx);
+ const InvertedIndexAnalyzerConfig& analyzer_config);
Status init_fulltext_index();
Status add_document();
Status add_null_document();
@@ -97,7 +97,7 @@ private:
std::unique_ptr<lucene::search::Similarity> _similarity = nullptr;
ReaderPtr _char_string_reader = nullptr;
std::shared_ptr<lucene::util::bkd::bkd_writer> _bkd_writer = nullptr;
- InvertedIndexCtxSPtr _inverted_index_ctx = nullptr;
+ InvertedIndexAnalyzerConfig _analyzer_config;
const KeyCoder* _value_key_coder;
const TabletIndex* _index_meta;
std::wstring _field_name;
diff --git a/be/src/vec/exprs/vexpr.cpp b/be/src/vec/exprs/vexpr.cpp
index e6b0ce0ea59..ba3a82d3ebc 100644
--- a/be/src/vec/exprs/vexpr.cpp
+++ b/be/src/vec/exprs/vexpr.cpp
@@ -28,11 +28,13 @@
#include <cstdint>
#include <memory>
#include <stack>
+#include <string_view>
#include <utility>
#include "common/config.h"
#include "common/exception.h"
#include "common/status.h"
+#include "olap/inverted_index_parser.h"
#include "olap/rowset/segment_v2/ann_index/ann_search_params.h"
#include "olap/rowset/segment_v2/ann_index/ann_topn_runtime.h"
#include "pipeline/pipeline_task.h"
@@ -958,9 +960,14 @@ Status VExpr::_evaluate_inverted_index(VExprContext*
context, const FunctionBase
return Status::OK(); // Nothing to evaluate or no literals to compare
against
}
+ const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr;
+ if (auto index_ctx = context->get_index_context(); index_ctx != nullptr) {
+ analyzer_ctx = index_ctx->get_analyzer_ctx_for_expr(this);
+ }
+
auto result_bitmap = segment_v2::InvertedIndexResultBitmap();
auto res = function->evaluate_inverted_index(arguments,
data_type_with_names, iterators,
- segment_num_rows,
result_bitmap);
+ segment_num_rows,
analyzer_ctx, result_bitmap);
if (!res.ok()) {
return res;
}
diff --git a/be/src/vec/exprs/vexpr_context.h b/be/src/vec/exprs/vexpr_context.h
index 6859a7cdf01..3d9abfd8d12 100644
--- a/be/src/vec/exprs/vexpr_context.h
+++ b/be/src/vec/exprs/vexpr_context.h
@@ -22,6 +22,7 @@
#include <algorithm>
#include <cstddef>
#include <memory>
+#include <unordered_map>
#include <utility>
#include <vector>
@@ -139,6 +140,22 @@ public:
ScoreRuntimeSPtr get_score_runtime() const { return _score_runtime; }
+ void set_analyzer_ctx_for_expr(const vectorized::VExpr* expr,
+ InvertedIndexAnalyzerCtxSPtr analyzer_ctx) {
+ if (expr == nullptr || analyzer_ctx == nullptr) {
+ return;
+ }
+ _expr_analyzer_ctx[expr] = std::move(analyzer_ctx);
+ }
+
+ const InvertedIndexAnalyzerCtx* get_analyzer_ctx_for_expr(const
vectorized::VExpr* expr) const {
+ auto iter = _expr_analyzer_ctx.find(expr);
+ if (iter == _expr_analyzer_ctx.end()) {
+ return nullptr;
+ }
+ return iter->second.get();
+ }
+
private:
// A reference to a vector of column IDs for the current expression's
output columns.
const std::vector<ColumnId>& _col_ids;
@@ -156,6 +173,9 @@ private:
// A map of expressions to their corresponding result columns.
std::unordered_map<const vectorized::VExpr*, ColumnPtr>
_index_result_column;
+ // Per-expression analyzer context for inverted index evaluation.
+ std::unordered_map<const vectorized::VExpr*, InvertedIndexAnalyzerCtxSPtr>
_expr_analyzer_ctx;
+
// A reference to a map of common expressions to their inverted index
evaluation status.
std::unordered_map<ColumnId, std::unordered_map<const vectorized::VExpr*,
bool>>&
_expr_index_status;
diff --git a/be/src/vec/exprs/vmatch_predicate.cpp
b/be/src/vec/exprs/vmatch_predicate.cpp
index 806fbbd97e2..a02704c1b84 100644
--- a/be/src/vec/exprs/vmatch_predicate.cpp
+++ b/be/src/vec/exprs/vmatch_predicate.cpp
@@ -57,20 +57,31 @@ namespace doris::vectorized {
using namespace doris::segment_v2;
VMatchPredicate::VMatchPredicate(const TExprNode& node) : VExpr(node) {
- _inverted_index_ctx = std::make_shared<InvertedIndexCtx>();
- _inverted_index_ctx->analyzer_name = node.match_predicate.analyzer_name;
- _inverted_index_ctx->parser_type =
+ // Step 1: Create configuration (stack-allocated temporary, follows SRP)
+ InvertedIndexAnalyzerConfig config;
+ config.analyzer_name = node.match_predicate.analyzer_name;
+ config.parser_type =
get_inverted_index_parser_type_from_string(node.match_predicate.parser_type);
- _inverted_index_ctx->parser_mode = node.match_predicate.parser_mode;
- _inverted_index_ctx->char_filter_map =
node.match_predicate.char_filter_map;
+ config.parser_mode = node.match_predicate.parser_mode;
+ config.char_filter_map = node.match_predicate.char_filter_map;
if (node.match_predicate.parser_lowercase) {
- _inverted_index_ctx->lower_case = INVERTED_INDEX_PARSER_TRUE;
+ config.lower_case = INVERTED_INDEX_PARSER_TRUE;
} else {
- _inverted_index_ctx->lower_case = INVERTED_INDEX_PARSER_FALSE;
+ config.lower_case = INVERTED_INDEX_PARSER_FALSE;
}
- _inverted_index_ctx->stop_words = node.match_predicate.parser_stopwords;
- _analyzer =
inverted_index::InvertedIndexAnalyzer::create_analyzer(_inverted_index_ctx.get());
- _inverted_index_ctx->analyzer = _analyzer.get();
+
DBUG_EXECUTE_IF("inverted_index_parser.get_parser_lowercase_from_properties",
+ { config.lower_case = ""; })
+ config.stop_words = node.match_predicate.parser_stopwords;
+
+ // Step 2: Use config to create analyzer (factory method)
+ _analyzer =
inverted_index::InvertedIndexAnalyzer::create_analyzer(&config);
+
+ // Step 3: Create runtime context (only extract runtime-needed info)
+ _analyzer_ctx = std::make_shared<InvertedIndexAnalyzerCtx>();
+ _analyzer_ctx->analyzer_name = config.analyzer_name;
+ _analyzer_ctx->parser_type = config.parser_type;
+ _analyzer_ctx->char_filter_map = std::move(config.char_filter_map);
+ _analyzer_ctx->analyzer = _analyzer.get();
}
VMatchPredicate::~VMatchPredicate() = default;
@@ -115,7 +126,7 @@ Status VMatchPredicate::open(RuntimeState* state,
VExprContext* context,
}
RETURN_IF_ERROR(VExpr::init_function_context(state, context, scope,
_function));
if (scope == FunctionContext::THREAD_LOCAL || scope ==
FunctionContext::FRAGMENT_LOCAL) {
- context->fn_context(_fn_context_index)->set_function_state(scope,
_inverted_index_ctx);
+ context->fn_context(_fn_context_index)->set_function_state(scope,
_analyzer_ctx);
}
if (scope == FunctionContext::FRAGMENT_LOCAL) {
RETURN_IF_ERROR(VExpr::get_const_col(context, nullptr));
@@ -131,6 +142,9 @@ void VMatchPredicate::close(VExprContext* context,
FunctionContext::FunctionStat
Status VMatchPredicate::evaluate_inverted_index(VExprContext* context,
uint32_t segment_num_rows) {
DCHECK_EQ(get_num_children(), 2);
+ if (context != nullptr && context->get_index_context() != nullptr &&
_analyzer_ctx != nullptr) {
+ context->get_index_context()->set_analyzer_ctx_for_expr(this,
_analyzer_ctx);
+ }
return _evaluate_inverted_index(context, _function, segment_num_rows);
}
diff --git a/be/src/vec/exprs/vmatch_predicate.h
b/be/src/vec/exprs/vmatch_predicate.h
index bef42c6205d..326a76a05f4 100644
--- a/be/src/vec/exprs/vmatch_predicate.h
+++ b/be/src/vec/exprs/vmatch_predicate.h
@@ -67,7 +67,11 @@ private:
FunctionBasePtr _function;
std::string _expr_name;
std::string _function_name;
- InvertedIndexCtxSPtr _inverted_index_ctx;
+
+ // Lifecycle management: holds ownership of the analyzer
std::shared_ptr<lucene::analysis::Analyzer> _analyzer;
+
+ // Runtime context: holds raw pointer to analyzer and necessary runtime
info
+ InvertedIndexAnalyzerCtxSPtr _analyzer_ctx;
};
} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/functions/array/function_array_index.h
b/be/src/vec/functions/array/function_array_index.h
index 50a75043ab9..fc49a0566bf 100644
--- a/be/src/vec/functions/array/function_array_index.h
+++ b/be/src/vec/functions/array/function_array_index.h
@@ -130,6 +130,7 @@ public:
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t
num_rows,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const
override {
DCHECK(arguments.size() == 1);
DCHECK(data_type_with_names.size() == 1);
@@ -173,7 +174,8 @@ public:
param.query_type = segment_v2::InvertedIndexQueryType::EQUAL_QUERY;
param.num_rows = num_rows;
param.roaring = std::make_shared<roaring::Roaring>();
- RETURN_IF_ERROR(iter->read_from_index(&param));
+ param.analyzer_ctx = analyzer_ctx;
+ RETURN_IF_ERROR(iter->read_from_index(segment_v2::IndexParam
{&param}));
// here debug for check array_contains function really filter rows by
inverted index correctly
DBUG_EXECUTE_IF("array_func.array_contains", {
auto result_bitmap =
DebugPoints::instance()->get_debug_param_or_default<int32_t>(
diff --git a/be/src/vec/functions/array/function_arrays_overlap.h
b/be/src/vec/functions/array/function_arrays_overlap.h
index dc47a51c793..a75bed7197a 100644
--- a/be/src/vec/functions/array/function_arrays_overlap.h
+++ b/be/src/vec/functions/array/function_arrays_overlap.h
@@ -165,6 +165,7 @@ public:
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t
num_rows,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const
override {
DCHECK(arguments.size() == 1);
DCHECK(data_type_with_names.size() == 1);
@@ -226,8 +227,8 @@ public:
nested_param_type, &nested_query_val, query_param));
param.query_value = query_param->get_value();
param.roaring = std::make_shared<roaring::Roaring>();
- ;
- RETURN_IF_ERROR(iter->read_from_index(&param));
+ param.analyzer_ctx = analyzer_ctx;
+ RETURN_IF_ERROR(iter->read_from_index(segment_v2::IndexParam
{&param}));
*roaring |= *param.roaring;
}
diff --git a/be/src/vec/functions/function.h b/be/src/vec/functions/function.h
index 8aa77991e04..05d98e5bcb1 100644
--- a/be/src/vec/functions/function.h
+++ b/be/src/vec/functions/function.h
@@ -45,6 +45,10 @@
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/data_type_struct.h"
+namespace doris {
+struct InvertedIndexAnalyzerCtx;
+} // namespace doris
+
namespace doris::vectorized {
struct FunctionAttr {
@@ -191,6 +195,7 @@ public:
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t
num_rows,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const {
return Status::OK();
}
@@ -458,9 +463,10 @@ public:
const ColumnsWithTypeAndName& args,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t
num_rows,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const
override {
return function->evaluate_inverted_index(args, data_type_with_names,
iterators, num_rows,
- bitmap_result);
+ analyzer_ctx, bitmap_result);
}
bool is_use_default_implementation_for_constants() const override {
diff --git a/be/src/vec/functions/function_ip.h
b/be/src/vec/functions/function_ip.h
index 90bd34b00f2..4ee333a8415 100644
--- a/be/src/vec/functions/function_ip.h
+++ b/be/src/vec/functions/function_ip.h
@@ -647,6 +647,7 @@ public:
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t
num_rows,
+ const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const
override {
DCHECK(arguments.size() == 1);
DCHECK(data_type_with_names.size() == 1);
diff --git a/be/src/vec/functions/function_multi_match.cpp
b/be/src/vec/functions/function_multi_match.cpp
index f84c50cc135..55368eff4bc 100644
--- a/be/src/vec/functions/function_multi_match.cpp
+++ b/be/src/vec/functions/function_multi_match.cpp
@@ -57,6 +57,7 @@ Status FunctionMultiMatch::evaluate_inverted_index(
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t num_rows,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const {
DCHECK(arguments.size() == 2);
std::shared_ptr<roaring::Roaring> roaring =
std::make_shared<roaring::Roaring>();
@@ -95,7 +96,8 @@ Status FunctionMultiMatch::evaluate_inverted_index(
param.column_name = column_name;
param.roaring = std::make_shared<roaring::Roaring>();
- RETURN_IF_ERROR(iter->read_from_index(&param));
+ param.analyzer_ctx = analyzer_ctx;
+ RETURN_IF_ERROR(iter->read_from_index(segment_v2::IndexParam
{&param}));
*roaring |= *param.roaring;
}
segment_v2::InvertedIndexResultBitmap result(roaring, null_bitmap);
diff --git a/be/src/vec/functions/function_multi_match.h
b/be/src/vec/functions/function_multi_match.h
index a76166f1264..6af9df27f01 100644
--- a/be/src/vec/functions/function_multi_match.h
+++ b/be/src/vec/functions/function_multi_match.h
@@ -54,6 +54,7 @@ public:
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t
num_rows,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const
override;
};
diff --git a/be/src/vec/functions/function_search.cpp
b/be/src/vec/functions/function_search.cpp
index 7774ed80e25..4e69ab470a1 100644
--- a/be/src/vec/functions/function_search.cpp
+++ b/be/src/vec/functions/function_search.cpp
@@ -187,6 +187,7 @@ Status FunctionSearch::evaluate_inverted_index(
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<IndexIterator*> iterators, uint32_t num_rows,
+ const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/,
InvertedIndexResultBitmap& bitmap_result) const {
return Status::OK();
}
diff --git a/be/src/vec/functions/function_search.h
b/be/src/vec/functions/function_search.h
index 96e93220f44..b1ec1f63895 100644
--- a/be/src/vec/functions/function_search.h
+++ b/be/src/vec/functions/function_search.h
@@ -152,6 +152,7 @@ public:
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<IndexIterator*> iterators, uint32_t num_rows,
+ const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/,
InvertedIndexResultBitmap& bitmap_result) const override;
Status evaluate_inverted_index_with_search_param(
diff --git a/be/src/vec/functions/function_tokenize.cpp
b/be/src/vec/functions/function_tokenize.cpp
index 7364947abec..5f0c05e9844 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -93,7 +93,8 @@ void FunctionTokenize::_do_tokenize_none(const ColumnString&
src_column_string,
}
void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string,
- InvertedIndexCtx& inverted_index_ctx,
+ const InvertedIndexAnalyzerCtx&
analyzer_ctx,
+ bool support_phrase,
const MutableColumnPtr& dest_column_ptr)
const {
ColumnArray::Offset64 src_offsets_size =
src_column_string.get_offsets().size();
for (size_t i = 0; i < src_offsets_size; i++) {
@@ -103,10 +104,10 @@ void FunctionTokenize::_do_tokenize(const ColumnString&
src_column_string,
continue;
}
- auto reader =
InvertedIndexAnalyzer::create_reader(inverted_index_ctx.char_filter_map);
+ auto reader =
InvertedIndexAnalyzer::create_reader(analyzer_ctx.char_filter_map);
reader->init(tokenize_str.data, (int)tokenize_str.size, true);
auto analyzer_tokens =
- InvertedIndexAnalyzer::get_analyse_result(reader,
inverted_index_ctx.analyzer);
+ InvertedIndexAnalyzer::get_analyse_result(reader,
analyzer_ctx.analyzer);
rapidjson::Document doc;
doc.SetArray();
@@ -117,7 +118,7 @@ void FunctionTokenize::_do_tokenize(const ColumnString&
src_column_string,
"token",
rapidjson::Value(analyzer_token.get_single_term().c_str(),
allocator).Move(),
allocator);
- if (inverted_index_ctx.support_phrase ==
INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES) {
+ if (support_phrase) {
obj.AddMember("position", analyzer_token.position, allocator);
}
doc.PushBack(obj, allocator);
@@ -146,42 +147,41 @@ Status FunctionTokenize::execute_impl(FunctionContext*
/*context*/, Block& block
if (const auto* col_left =
check_and_get_column<ColumnString>(src_column.get())) {
if (const auto* col_right =
check_and_get_column<ColumnString>(right_column.get())) {
- InvertedIndexCtx inverted_index_ctx;
std::map<std::string, std::string> properties;
auto st = parse(col_right->get_data_at(0).to_string(), properties);
if (!st.ok()) {
return st;
}
- inverted_index_ctx.analyzer_name =
get_analyzer_name_from_properties(properties);
- inverted_index_ctx.parser_type =
get_inverted_index_parser_type_from_string(
+ InvertedIndexAnalyzerConfig config;
+ config.analyzer_name =
get_analyzer_name_from_properties(properties);
+ config.parser_type = get_inverted_index_parser_type_from_string(
get_parser_string_from_properties(properties));
- if (inverted_index_ctx.parser_type ==
InvertedIndexParserType::PARSER_UNKNOWN) {
+ if (config.parser_type == InvertedIndexParserType::PARSER_UNKNOWN)
{
return
Status::Error<doris::ErrorCode::INDEX_INVALID_PARAMETERS>(
"unsupported parser type. currently, only 'english',
'chinese', "
"'unicode', 'icu', 'basic' and 'ik' analyzers are
supported.");
}
// Special handling for PARSER_NONE: return original string as
single token
- if (inverted_index_ctx.analyzer_name.empty() &&
- inverted_index_ctx.parser_type ==
InvertedIndexParserType::PARSER_NONE) {
+ if (config.analyzer_name.empty() &&
+ config.parser_type == InvertedIndexParserType::PARSER_NONE) {
_do_tokenize_none(*col_left, dest_column_ptr);
block.replace_by_position(result, std::move(dest_column_ptr));
return Status::OK();
}
- inverted_index_ctx.parser_mode =
get_parser_mode_string_from_properties(properties);
- inverted_index_ctx.support_phrase =
-
get_parser_phrase_support_string_from_properties(properties);
- inverted_index_ctx.char_filter_map =
- get_parser_char_filter_map_from_properties(properties);
- inverted_index_ctx.lower_case =
get_parser_lowercase_from_properties(properties);
- inverted_index_ctx.stop_words =
get_parser_stopwords_from_properties(properties);
+ config.parser_mode =
get_parser_mode_string_from_properties(properties);
+ config.char_filter_map =
get_parser_char_filter_map_from_properties(properties);
+ config.lower_case =
get_parser_lowercase_from_properties(properties);
+ config.stop_words =
get_parser_stopwords_from_properties(properties);
+ bool support_phrase =
get_parser_phrase_support_string_from_properties(properties) ==
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES;
- std::shared_ptr<lucene::analysis::Analyzer> analyzer;
+ std::shared_ptr<lucene::analysis::Analyzer> analyzer_holder;
try {
- analyzer =
+ analyzer_holder =
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_analyzer(
- &inverted_index_ctx);
+ &config);
} catch (CLuceneError& e) {
return
Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
"inverted index create analyzer failed: {}", e.what());
@@ -190,8 +190,12 @@ Status FunctionTokenize::execute_impl(FunctionContext*
/*context*/, Block& block
"inverted index create analyzer failed: {}", e.what());
}
- inverted_index_ctx.analyzer = analyzer.get();
- _do_tokenize(*col_left, inverted_index_ctx, dest_column_ptr);
+ InvertedIndexAnalyzerCtx analyzer_ctx;
+ analyzer_ctx.analyzer_name = config.analyzer_name;
+ analyzer_ctx.parser_type = config.parser_type;
+ analyzer_ctx.char_filter_map = config.char_filter_map;
+ analyzer_ctx.analyzer = analyzer_holder.get();
+ _do_tokenize(*col_left, analyzer_ctx, support_phrase,
dest_column_ptr);
block.replace_by_position(result, std::move(dest_column_ptr));
return Status::OK();
diff --git a/be/src/vec/functions/function_tokenize.h
b/be/src/vec/functions/function_tokenize.h
index 4985b0a01c0..6748784ed8f 100644
--- a/be/src/vec/functions/function_tokenize.h
+++ b/be/src/vec/functions/function_tokenize.h
@@ -24,6 +24,7 @@
#include <vector>
#include "common/status.h"
+#include "olap/inverted_index_parser.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "udf/udf.h"
#include "vec/columns/column_array.h"
@@ -65,7 +66,8 @@ public:
<< " and arguments[1] is " << arguments[1]->get_name();
return std::make_shared<DataTypeString>();
}
- void _do_tokenize(const ColumnString& src_column_string, InvertedIndexCtx&
inverted_index_ctx,
+ void _do_tokenize(const ColumnString& src_column_string,
+ const InvertedIndexAnalyzerCtx& analyzer_ctx, bool
support_phrase,
const MutableColumnPtr& dest_column_ptr) const;
void _do_tokenize_none(const ColumnString& src_column_string,
const MutableColumnPtr& dest_column_ptr) const;
diff --git a/be/src/vec/functions/functions_comparison.h
b/be/src/vec/functions/functions_comparison.h
index 0fd0c46590c..467fcfce58d 100644
--- a/be/src/vec/functions/functions_comparison.h
+++ b/be/src/vec/functions/functions_comparison.h
@@ -452,6 +452,7 @@ public:
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t
num_rows,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const
override {
DCHECK(arguments.size() == 1);
DCHECK(data_type_with_names.size() == 1);
@@ -499,7 +500,8 @@ public:
param.query_type = query_type;
param.num_rows = num_rows;
param.roaring = std::make_shared<roaring::Roaring>();
- RETURN_IF_ERROR(iter->read_from_index(&param));
+ param.analyzer_ctx = analyzer_ctx;
+ RETURN_IF_ERROR(iter->read_from_index(segment_v2::IndexParam
{&param}));
std::shared_ptr<roaring::Roaring> null_bitmap =
std::make_shared<roaring::Roaring>();
if (iter->has_null()) {
segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
diff --git a/be/src/vec/functions/in.h b/be/src/vec/functions/in.h
index e52841df682..34c11cac7cf 100644
--- a/be/src/vec/functions/in.h
+++ b/be/src/vec/functions/in.h
@@ -138,6 +138,7 @@ public:
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t
num_rows,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const
override {
DCHECK(data_type_with_names.size() == 1);
DCHECK(iterators.size() == 1);
@@ -181,7 +182,8 @@ public:
param.query_type = query_type;
param.num_rows = num_rows;
param.roaring = std::make_shared<roaring::Roaring>();
- RETURN_IF_ERROR(iter->read_from_index(&param));
+ param.analyzer_ctx = analyzer_ctx;
+ RETURN_IF_ERROR(iter->read_from_index(segment_v2::IndexParam
{&param}));
*roaring |= *param.roaring;
}
segment_v2::InvertedIndexResultBitmap result(roaring, null_bitmap);
diff --git a/be/src/vec/functions/is_not_null.h
b/be/src/vec/functions/is_not_null.h
index a56bd428088..83f127627d9 100644
--- a/be/src/vec/functions/is_not_null.h
+++ b/be/src/vec/functions/is_not_null.h
@@ -87,6 +87,7 @@ public:
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t
num_rows,
+ const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const
override {
if (iterators.empty() || iterators[0] == nullptr) {
return Status::OK();
diff --git a/be/src/vec/functions/is_null.h b/be/src/vec/functions/is_null.h
index 46d746e185b..366997c0ce6 100644
--- a/be/src/vec/functions/is_null.h
+++ b/be/src/vec/functions/is_null.h
@@ -78,6 +78,7 @@ public:
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t
num_rows,
+ const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const
override {
if (iterators.empty() || iterators[0] == nullptr) {
return Status::OK();
diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index 78354aa74ae..2350570341d 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -27,10 +27,29 @@
namespace doris::vectorized {
#include "common/compile_check_begin.h"
+
+namespace {
+
+const InvertedIndexAnalyzerCtx* get_match_analyzer_ctx(FunctionContext*
context) {
+ if (context == nullptr) {
+ return nullptr;
+ }
+ auto* analyzer_ctx = reinterpret_cast<const InvertedIndexAnalyzerCtx*>(
+ context->get_function_state(FunctionContext::THREAD_LOCAL));
+ if (analyzer_ctx == nullptr) {
+ analyzer_ctx = reinterpret_cast<const InvertedIndexAnalyzerCtx*>(
+ context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
+ }
+ return analyzer_ctx;
+}
+
+} // namespace
+
Status FunctionMatchBase::evaluate_inverted_index(
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t num_rows,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const {
DCHECK(arguments.size() == 1);
DCHECK(data_type_with_names.size() == 1);
@@ -72,13 +91,8 @@ Status FunctionMatchBase::evaluate_inverted_index(
param.query_type = get_query_type_from_fn_name();
param.num_rows = num_rows;
param.roaring = std::make_shared<roaring::Roaring>();
- if (is_string_type(param_type)) {
- RETURN_IF_ERROR(iter->read_from_index(&param));
- } else {
- return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
- "invalid params type for
FunctionMatchBase::evaluate_inverted_index {}",
- param_type);
- }
+ param.analyzer_ctx = analyzer_ctx;
+ RETURN_IF_ERROR(iter->read_from_index(segment_v2::IndexParam {&param}));
std::shared_ptr<roaring::Roaring> null_bitmap =
std::make_shared<roaring::Roaring>();
if (iter->has_null()) {
segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
@@ -106,12 +120,7 @@ Status FunctionMatchBase::execute_impl(FunctionContext*
context, Block& block,
std::string column_name = block.get_by_position(arguments[0]).name;
VLOG_DEBUG << "begin to execute match directly, column_name=" <<
column_name
<< ", match_query_str=" << match_query_str;
- auto* inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>(
- context->get_function_state(FunctionContext::THREAD_LOCAL));
- if (inverted_index_ctx == nullptr) {
- inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>(
- context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
- }
+ auto* analyzer_ctx = get_match_analyzer_ctx(context);
const ColumnPtr source_col =
block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
@@ -150,8 +159,8 @@ Status FunctionMatchBase::execute_impl(FunctionContext*
context, Block& block,
// set default value to 0, and match functions only need to set 1/true
vec_res.resize_fill(input_rows_count);
RETURN_IF_ERROR(execute_match(context, column_name, match_query_str,
input_rows_count, values,
- inverted_index_ctx,
- (array_col ? &(array_col->get_offsets()) :
nullptr), vec_res));
+ analyzer_ctx, (array_col ?
&(array_col->get_offsets()) : nullptr),
+ vec_res));
block.replace_by_position(result, std::move(res));
return Status::OK();
@@ -177,64 +186,61 @@ inline doris::segment_v2::InvertedIndexQueryType
FunctionMatchBase::get_query_ty
}
std::vector<TermInfo> FunctionMatchBase::analyse_query_str_token(
- InvertedIndexCtx* inverted_index_ctx, const std::string&
match_query_str,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx, const std::string&
match_query_str,
const std::string& column_name) const {
- VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
- <<
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);
std::vector<TermInfo> query_tokens;
- if (inverted_index_ctx == nullptr) {
+ if (analyzer_ctx == nullptr) {
return query_tokens;
}
- // parse is none and custom analyzer is empty mean no analyzer is set
- if (inverted_index_ctx->parser_type ==
InvertedIndexParserType::PARSER_NONE &&
- inverted_index_ctx->analyzer_name.empty()) {
+ VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
+ <<
inverted_index_parser_type_to_string(analyzer_ctx->parser_type);
+ if (!analyzer_ctx->should_tokenize()) {
query_tokens.emplace_back(match_query_str);
return query_tokens;
}
auto reader =
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
- inverted_index_ctx->char_filter_map);
+ analyzer_ctx->char_filter_map);
reader->init(match_query_str.data(), (int)match_query_str.size(), true);
query_tokens =
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
- reader, inverted_index_ctx->analyzer);
+ reader, analyzer_ctx->analyzer);
return query_tokens;
}
inline std::vector<TermInfo> FunctionMatchBase::analyse_data_token(
- const std::string& column_name, InvertedIndexCtx* inverted_index_ctx,
+ const std::string& column_name, const InvertedIndexAnalyzerCtx*
analyzer_ctx,
const ColumnString* string_col, int32_t current_block_row_idx,
const ColumnArray::Offsets64* array_offsets, int32_t&
current_src_array_offset) const {
std::vector<TermInfo> data_tokens;
+ if (analyzer_ctx == nullptr) {
+ return data_tokens;
+ }
if (array_offsets) {
for (auto next_src_array_offset =
(*array_offsets)[current_block_row_idx];
current_src_array_offset < next_src_array_offset;
++current_src_array_offset) {
const auto& str_ref =
string_col->get_data_at(current_src_array_offset);
- // parse is none and custom analyzer is empty mean no analyzer is
set
- if (inverted_index_ctx->parser_type ==
InvertedIndexParserType::PARSER_NONE &&
- inverted_index_ctx->analyzer_name.empty()) {
+ if (!analyzer_ctx->should_tokenize()) {
data_tokens.emplace_back(str_ref.to_string());
continue;
}
auto reader =
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
- inverted_index_ctx->char_filter_map);
+ analyzer_ctx->char_filter_map);
reader->init(str_ref.data, (int)str_ref.size, true);
data_tokens =
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
- reader, inverted_index_ctx->analyzer);
+ reader, analyzer_ctx->analyzer);
}
} else {
const auto& str_ref = string_col->get_data_at(current_block_row_idx);
- // parse is none and custom analyzer is empty mean no analyzer is set
- if (inverted_index_ctx->parser_type ==
InvertedIndexParserType::PARSER_NONE &&
- inverted_index_ctx->analyzer_name.empty()) {
+ if (!analyzer_ctx->should_tokenize()) {
data_tokens.emplace_back(str_ref.to_string());
} else {
auto reader =
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
- inverted_index_ctx->char_filter_map);
+ analyzer_ctx->char_filter_map);
reader->init(str_ref.data, (int)str_ref.size, true);
data_tokens =
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
- reader, inverted_index_ctx->analyzer);
+ reader, analyzer_ctx->analyzer);
}
}
return data_tokens;
@@ -257,24 +263,25 @@ Status FunctionMatchBase::check(FunctionContext* context,
const std::string& fun
Status FunctionMatchAny::execute_match(FunctionContext* context, const
std::string& column_name,
const std::string& match_query_str,
size_t input_rows_count,
const ColumnString* string_col,
- InvertedIndexCtx* inverted_index_ctx,
+ const InvertedIndexAnalyzerCtx*
analyzer_ctx,
const ColumnArray::Offsets64*
array_offsets,
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));
- auto query_tokens = analyse_query_str_token(inverted_index_ctx,
match_query_str, column_name);
+ auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str,
column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str,
-
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
+ analyzer_ctx ?
inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
+ : "unknown");
return Status::OK();
}
auto current_src_array_offset = 0;
for (int i = 0; i < input_rows_count; i++) {
- auto data_tokens = analyse_data_token(column_name, inverted_index_ctx,
string_col, i,
+ auto data_tokens = analyse_data_token(column_name, analyzer_ctx,
string_col, i,
array_offsets,
current_src_array_offset);
// TODO: more efficient impl
@@ -296,24 +303,25 @@ Status FunctionMatchAny::execute_match(FunctionContext*
context, const std::stri
Status FunctionMatchAll::execute_match(FunctionContext* context, const
std::string& column_name,
const std::string& match_query_str,
size_t input_rows_count,
const ColumnString* string_col,
- InvertedIndexCtx* inverted_index_ctx,
+ const InvertedIndexAnalyzerCtx*
analyzer_ctx,
const ColumnArray::Offsets64*
array_offsets,
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));
- auto query_tokens = analyse_query_str_token(inverted_index_ctx,
match_query_str, column_name);
+ auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str,
column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str,
-
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
+ analyzer_ctx ?
inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
+ : "unknown");
return Status::OK();
}
auto current_src_array_offset = 0;
for (int i = 0; i < input_rows_count; i++) {
- auto data_tokens = analyse_data_token(column_name, inverted_index_ctx,
string_col, i,
+ auto data_tokens = analyse_data_token(column_name, analyzer_ctx,
string_col, i,
array_offsets,
current_src_array_offset);
// TODO: more efficient impl
@@ -341,24 +349,25 @@ Status FunctionMatchAll::execute_match(FunctionContext*
context, const std::stri
Status FunctionMatchPhrase::execute_match(FunctionContext* context, const
std::string& column_name,
const std::string& match_query_str,
size_t input_rows_count, const
ColumnString* string_col,
- InvertedIndexCtx* inverted_index_ctx,
+ const InvertedIndexAnalyzerCtx*
analyzer_ctx,
const ColumnArray::Offsets64*
array_offsets,
ColumnUInt8::Container& result)
const {
RETURN_IF_ERROR(check(context, name));
- auto query_tokens = analyse_query_str_token(inverted_index_ctx,
match_query_str, column_name);
+ auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str,
column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str,
-
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
+ analyzer_ctx ?
inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
+ : "unknown");
return Status::OK();
}
auto current_src_array_offset = 0;
for (int i = 0; i < input_rows_count; i++) {
- auto data_tokens = analyse_data_token(column_name, inverted_index_ctx,
string_col, i,
+ auto data_tokens = analyse_data_token(column_name, analyzer_ctx,
string_col, i,
array_offsets,
current_src_array_offset);
// TODO: more efficient impl
@@ -402,23 +411,24 @@ Status
FunctionMatchPhrase::execute_match(FunctionContext* context, const std::s
Status FunctionMatchPhrasePrefix::execute_match(
FunctionContext* context, const std::string& column_name,
const std::string& match_query_str, size_t input_rows_count, const
ColumnString* string_col,
- InvertedIndexCtx* inverted_index_ctx, const ColumnArray::Offsets64*
array_offsets,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx, const
ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));
- auto query_tokens = analyse_query_str_token(inverted_index_ctx,
match_query_str, column_name);
+ auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str,
column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str,
-
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
+ analyzer_ctx ?
inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
+ : "unknown");
return Status::OK();
}
int32_t current_src_array_offset = 0;
for (int i = 0; i < input_rows_count; i++) {
- auto data_tokens = analyse_data_token(column_name, inverted_index_ctx,
string_col, i,
+ auto data_tokens = analyse_data_token(column_name, analyzer_ctx,
string_col, i,
array_offsets,
current_src_array_offset);
int64_t dis_count = data_tokens.size() - query_tokens.size();
@@ -459,13 +469,14 @@ Status FunctionMatchPhrasePrefix::execute_match(
Status FunctionMatchRegexp::execute_match(FunctionContext* context, const
std::string& column_name,
const std::string& match_query_str,
size_t input_rows_count, const
ColumnString* string_col,
- InvertedIndexCtx* inverted_index_ctx,
+ const InvertedIndexAnalyzerCtx*
analyzer_ctx,
const ColumnArray::Offsets64*
array_offsets,
ColumnUInt8::Container& result)
const {
RETURN_IF_ERROR(check(context, name));
VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match,
parser_type: "
- <<
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);
+ << (analyzer_ctx ?
inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
+ : "unknown");
const std::string& pattern = match_query_str;
@@ -498,7 +509,7 @@ Status FunctionMatchRegexp::execute_match(FunctionContext*
context, const std::s
try {
auto current_src_array_offset = 0;
for (int i = 0; i < input_rows_count; i++) {
- auto data_tokens = analyse_data_token(column_name,
inverted_index_ctx, string_col, i,
+ auto data_tokens = analyse_data_token(column_name, analyzer_ctx,
string_col, i,
array_offsets,
current_src_array_offset);
for (auto& input : data_tokens) {
@@ -528,23 +539,24 @@ Status
FunctionMatchRegexp::execute_match(FunctionContext* context, const std::s
Status FunctionMatchPhraseEdge::execute_match(
FunctionContext* context, const std::string& column_name,
const std::string& match_query_str, size_t input_rows_count, const
ColumnString* string_col,
- InvertedIndexCtx* inverted_index_ctx, const ColumnArray::Offsets64*
array_offsets,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx, const
ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));
- auto query_tokens = analyse_query_str_token(inverted_index_ctx,
match_query_str, column_name);
+ auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str,
column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str,
-
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
+ analyzer_ctx ?
inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
+ : "unknown");
return Status::OK();
}
int32_t current_src_array_offset = 0;
for (int i = 0; i < input_rows_count; i++) {
- auto data_tokens = analyse_data_token(column_name, inverted_index_ctx,
string_col, i,
+ auto data_tokens = analyse_data_token(column_name, analyzer_ctx,
string_col, i,
array_offsets,
current_src_array_offset);
int64_t dis_count = data_tokens.size() - query_tokens.size();
diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h
index d66c03332c8..55f4c9f407c 100644
--- a/be/src/vec/functions/match.h
+++ b/be/src/vec/functions/match.h
@@ -77,18 +77,18 @@ public:
virtual Status execute_match(FunctionContext* context, const std::string&
column_name,
const std::string& match_query_str, size_t
input_rows_count,
const ColumnString* string_col,
- InvertedIndexCtx* inverted_index_ctx,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) const = 0;
doris::segment_v2::InvertedIndexQueryType get_query_type_from_fn_name()
const;
- std::vector<TermInfo> analyse_query_str_token(InvertedIndexCtx*
inverted_index_ctx,
+ std::vector<TermInfo> analyse_query_str_token(const
InvertedIndexAnalyzerCtx* analyzer_ctx,
const std::string&
match_query_str,
const std::string&
field_name) const;
std::vector<TermInfo> analyse_data_token(const std::string& column_name,
- InvertedIndexCtx*
inverted_index_ctx,
+ const InvertedIndexAnalyzerCtx*
analyzer_ctx,
const ColumnString* string_col,
int32_t current_block_row_idx,
const ColumnArray::Offsets64*
array_offsets,
@@ -100,6 +100,7 @@ public:
const ColumnsWithTypeAndName& arguments,
const std::vector<vectorized::IndexFieldNameAndTypePair>&
data_type_with_names,
std::vector<segment_v2::IndexIterator*> iterators, uint32_t
num_rows,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
segment_v2::InvertedIndexResultBitmap& bitmap_result) const
override;
};
@@ -112,7 +113,8 @@ public:
Status execute_match(FunctionContext* context, const std::string&
column_name,
const std::string& match_query_str, size_t
input_rows_count,
- const ColumnString* string_col, InvertedIndexCtx*
inverted_index_ctx,
+ const ColumnString* string_col,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) const override;
};
@@ -126,7 +128,8 @@ public:
Status execute_match(FunctionContext* context, const std::string&
column_name,
const std::string& match_query_str, size_t
input_rows_count,
- const ColumnString* string_col, InvertedIndexCtx*
inverted_index_ctx,
+ const ColumnString* string_col,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) const override;
};
@@ -140,7 +143,8 @@ public:
Status execute_match(FunctionContext* context, const std::string&
column_name,
const std::string& match_query_str, size_t
input_rows_count,
- const ColumnString* string_col, InvertedIndexCtx*
inverted_index_ctx,
+ const ColumnString* string_col,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) const override;
};
@@ -154,7 +158,8 @@ public:
Status execute_match(FunctionContext* context, const std::string&
column_name,
const std::string& match_query_str, size_t
input_rows_count,
- const ColumnString* string_col, InvertedIndexCtx*
inverted_index_ctx,
+ const ColumnString* string_col,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) const override;
};
@@ -168,7 +173,8 @@ public:
Status execute_match(FunctionContext* context, const std::string&
column_name,
const std::string& match_query_str, size_t
input_rows_count,
- const ColumnString* string_col, InvertedIndexCtx*
inverted_index_ctx,
+ const ColumnString* string_col,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) const override;
};
@@ -182,7 +188,8 @@ public:
Status execute_match(FunctionContext* context, const std::string&
column_name,
const std::string& match_query_str, size_t
input_rows_count,
- const ColumnString* string_col, InvertedIndexCtx*
inverted_index_ctx,
+ const ColumnString* string_col,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx,
const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) const override;
};
diff --git a/be/test/olap/inverted_index_parser_test.cpp
b/be/test/olap/inverted_index_parser_test.cpp
index 07520c2ef27..eeb9b262dab 100644
--- a/be/test/olap/inverted_index_parser_test.cpp
+++ b/be/test/olap/inverted_index_parser_test.cpp
@@ -263,32 +263,18 @@ TEST_F(InvertedIndexParserTest,
TestGetAnalyzerNameFromProperties) {
EXPECT_EQ(get_analyzer_name_from_properties(properties),
"another_analyzer");
}
-// Test InvertedIndexCtx structure
-TEST_F(InvertedIndexParserTest, TestInvertedIndexCtxStructure) {
- InvertedIndexCtx ctx;
+TEST_F(InvertedIndexParserTest, TestInvertedIndexAnalyzerCtxShouldTokenize) {
+ InvertedIndexAnalyzerCtx ctx;
+ ctx.parser_type = InvertedIndexParserType::PARSER_NONE;
+ ctx.analyzer_name.clear();
+ EXPECT_FALSE(ctx.should_tokenize());
- // Test default initialization
ctx.parser_type = InvertedIndexParserType::PARSER_ENGLISH;
- ctx.parser_mode = INVERTED_INDEX_PARSER_FINE_GRANULARITY;
- ctx.lower_case = INVERTED_INDEX_PARSER_TRUE;
- ctx.stop_words = "a,an,the";
- ctx.analyzer = nullptr;
-
- EXPECT_EQ(ctx.parser_type, InvertedIndexParserType::PARSER_ENGLISH);
- EXPECT_EQ(ctx.parser_mode, INVERTED_INDEX_PARSER_FINE_GRANULARITY);
- EXPECT_EQ(ctx.lower_case, INVERTED_INDEX_PARSER_TRUE);
- EXPECT_EQ(ctx.stop_words, "a,an,the");
- EXPECT_EQ(ctx.analyzer, nullptr);
-
- // Test char_filter_map
- ctx.char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE] =
"char_replace";
- ctx.char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN] = "._";
- ctx.char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT] = " ";
-
- EXPECT_EQ(ctx.char_filter_map.size(), 3);
- EXPECT_EQ(ctx.char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE],
"char_replace");
- EXPECT_EQ(ctx.char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
"._");
-
EXPECT_EQ(ctx.char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT], "
");
+ EXPECT_TRUE(ctx.should_tokenize());
+
+ ctx.parser_type = InvertedIndexParserType::PARSER_NONE;
+ ctx.analyzer_name = "custom_analyzer";
+ EXPECT_TRUE(ctx.should_tokenize());
}
// Test constants
diff --git a/be/test/olap/rowset/segment_v2/index_reader_helper_test.cpp
b/be/test/olap/rowset/segment_v2/index_reader_helper_test.cpp
index 6d0b6c2cb02..3500c25c33b 100644
--- a/be/test/olap/rowset/segment_v2/index_reader_helper_test.cpp
+++ b/be/test/olap/rowset/segment_v2/index_reader_helper_test.cpp
@@ -64,7 +64,8 @@ public:
MOCK_FUNCTION Status query(const IndexQueryContextPtr& context, const
std::string& column_name,
const void* query_value, InvertedIndexQueryType
query_type,
- std::shared_ptr<roaring::Roaring>& bit_map)
override {
+ std::shared_ptr<roaring::Roaring>& bit_map,
+ const InvertedIndexAnalyzerCtx* analyzer_ctx =
nullptr) override {
return Status::OK();
}
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp
index 8c6e2328b93..75590e6ccc1 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp
@@ -158,27 +158,27 @@ TEST_F(AnalyzerTest, TestBuiltinAnalyzers) {
TEST_F(AnalyzerTest, TestCreateAnalyzer) {
// Test Case 1: Empty custom_analyzer, use builtin parser_type
{
- InvertedIndexCtx ctx;
- ctx.analyzer_name = "";
- ctx.parser_type = InvertedIndexParserType::PARSER_STANDARD;
- ctx.parser_mode = "";
- ctx.lower_case = INVERTED_INDEX_PARSER_TRUE;
- ctx.stop_words = "none";
-
- auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ InvertedIndexAnalyzerConfig config;
+ config.analyzer_name = "";
+ config.parser_type = InvertedIndexParserType::PARSER_STANDARD;
+ config.parser_mode = "";
+ config.lower_case = INVERTED_INDEX_PARSER_TRUE;
+ config.stop_words = "none";
+
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&config);
EXPECT_NE(analyzer, nullptr);
}
// Test Case 2: custom_analyzer is a builtin name (using one that doesn't
need dict)
{
- InvertedIndexCtx ctx;
- ctx.analyzer_name = INVERTED_INDEX_PARSER_ENGLISH;
- ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
- ctx.parser_mode = "";
- ctx.lower_case = INVERTED_INDEX_PARSER_FALSE;
- ctx.stop_words = "";
-
- auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ InvertedIndexAnalyzerConfig config;
+ config.analyzer_name = INVERTED_INDEX_PARSER_ENGLISH;
+ config.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ config.parser_mode = "";
+ config.lower_case = INVERTED_INDEX_PARSER_FALSE;
+ config.stop_words = "";
+
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&config);
EXPECT_NE(analyzer, nullptr);
}
@@ -194,22 +194,22 @@ TEST_F(AnalyzerTest, TestCreateAnalyzer) {
};
for (const auto& [name, requires_dict] : builtin_names) {
- InvertedIndexCtx ctx;
- ctx.analyzer_name = name;
- ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
- ctx.parser_mode = "";
- ctx.lower_case = "";
- ctx.stop_words = "";
+ InvertedIndexAnalyzerConfig config;
+ config.analyzer_name = name;
+ config.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ config.parser_mode = "";
+ config.lower_case = "";
+ config.stop_words = "";
if (requires_dict) {
try {
- auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ auto analyzer =
InvertedIndexAnalyzer::create_analyzer(&config);
EXPECT_NE(analyzer, nullptr) << "Created analyzer for builtin
name: " << name;
} catch (const std::exception& e) {
LOG(INFO) << "Skipped " << name << " due to missing dict: " <<
e.what();
}
} else {
- auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&config);
EXPECT_NE(analyzer, nullptr) << "Failed for builtin name: " <<
name;
}
}
@@ -226,16 +226,16 @@ TEST_F(AnalyzerTest, TestCreateAnalyzer) {
};
for (const auto& [parser_type, requires_dict] : parser_types) {
- InvertedIndexCtx ctx;
- ctx.analyzer_name = "";
- ctx.parser_type = parser_type;
- ctx.parser_mode = "";
- ctx.lower_case = "";
- ctx.stop_words = "";
+ InvertedIndexAnalyzerConfig config;
+ config.analyzer_name = "";
+ config.parser_type = parser_type;
+ config.parser_mode = "";
+ config.lower_case = "";
+ config.stop_words = "";
if (requires_dict) {
try {
- auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ auto analyzer =
InvertedIndexAnalyzer::create_analyzer(&config);
EXPECT_NE(analyzer, nullptr)
<< "Created analyzer for parser_type: " <<
static_cast<int>(parser_type);
} catch (const std::exception& e) {
@@ -243,7 +243,7 @@ TEST_F(AnalyzerTest, TestCreateAnalyzer) {
<< " due to missing dict: " << e.what();
}
} else {
- auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&config);
EXPECT_NE(analyzer, nullptr)
<< "Failed for parser_type: " <<
static_cast<int>(parser_type);
}
@@ -255,18 +255,18 @@ TEST_F(AnalyzerTest, TestCreateAnalyzer) {
TEST_F(AnalyzerTest, TestCreateAnalyzerWithCustomPolicy) {
// Test when index_policy_mgr is null - should throw exception
{
- InvertedIndexCtx ctx;
- ctx.analyzer_name = "non_existent_custom";
- ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
- ctx.parser_mode = "";
- ctx.lower_case = "";
- ctx.stop_words = "";
+ InvertedIndexAnalyzerConfig config;
+ config.analyzer_name = "non_existent_custom";
+ config.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ config.parser_mode = "";
+ config.lower_case = "";
+ config.stop_words = "";
if (!doris::ExecEnv::GetInstance()->index_policy_mgr()) {
EXPECT_THROW(
{
try {
- InvertedIndexAnalyzer::create_analyzer(&ctx);
+ InvertedIndexAnalyzer::create_analyzer(&config);
} catch (const Exception& e) {
EXPECT_EQ(e.code(),
ErrorCode::INVERTED_INDEX_ANALYZER_ERROR);
EXPECT_TRUE(std::string(e.what()).find(
@@ -286,27 +286,27 @@ TEST_F(AnalyzerTest, TestCreateAnalyzerWithCustomPolicy) {
// Test successful custom analyzer retrieval
{
- InvertedIndexCtx ctx;
- ctx.analyzer_name = "test_custom_analyzer";
- ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
- ctx.parser_mode = "";
- ctx.lower_case = "";
- ctx.stop_words = "";
-
- auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ InvertedIndexAnalyzerConfig config;
+ config.analyzer_name = "test_custom_analyzer";
+ config.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ config.parser_mode = "";
+ config.lower_case = "";
+ config.stop_words = "";
+
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&config);
EXPECT_NE(analyzer, nullptr);
}
// Test non-existent custom analyzer throws exception
{
- InvertedIndexCtx ctx;
- ctx.analyzer_name = "non_existent_analyzer";
- ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
- ctx.parser_mode = "";
- ctx.lower_case = "";
- ctx.stop_words = "";
-
- EXPECT_THROW(InvertedIndexAnalyzer::create_analyzer(&ctx),
Exception);
+ InvertedIndexAnalyzerConfig config;
+ config.analyzer_name = "non_existent_analyzer";
+ config.parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
+ config.parser_mode = "";
+ config.lower_case = "";
+ config.stop_words = "";
+
+ EXPECT_THROW(InvertedIndexAnalyzer::create_analyzer(&config),
Exception);
}
}
}
@@ -315,14 +315,14 @@ TEST_F(AnalyzerTest, TestCreateAnalyzerWithCustomPolicy) {
TEST_F(AnalyzerTest, TestAnalyzerFunctionality) {
// Create an analyzer and test it can tokenize text properly
- InvertedIndexCtx ctx;
- ctx.analyzer_name = "";
- ctx.parser_type = InvertedIndexParserType::PARSER_STANDARD;
- ctx.parser_mode = "";
- ctx.lower_case = INVERTED_INDEX_PARSER_TRUE;
- ctx.stop_words = "none";
-
- auto analyzer = InvertedIndexAnalyzer::create_analyzer(&ctx);
+ InvertedIndexAnalyzerConfig config;
+ config.analyzer_name = "";
+ config.parser_type = InvertedIndexParserType::PARSER_STANDARD;
+ config.parser_mode = "";
+ config.lower_case = INVERTED_INDEX_PARSER_TRUE;
+ config.stop_words = "none";
+
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&config);
ASSERT_NE(analyzer, nullptr);
// Test tokenization
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp
index 8a3450e1e84..304edaabda2 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp
@@ -37,17 +37,15 @@ TEST(ReaderTest, ArrayFieldTokenStreamWorkflow) {
char_filter_map["char_filter_pattern"] = ",";
char_filter_map["char_filter_replacement"] = " ";
- // 正确创建 InvertedIndexCtx
- auto inverted_index_ctx = std::make_shared<InvertedIndexCtx>();
- inverted_index_ctx->analyzer_name = "";
- inverted_index_ctx->parser_type = InvertedIndexParserType::PARSER_STANDARD;
- inverted_index_ctx->parser_mode = "standard";
- inverted_index_ctx->support_phrase = "yes";
- inverted_index_ctx->char_filter_map = char_filter_map;
- inverted_index_ctx->lower_case = "true";
- inverted_index_ctx->stop_words = "";
+ InvertedIndexAnalyzerConfig config;
+ config.analyzer_name = "";
+ config.parser_type = InvertedIndexParserType::PARSER_STANDARD;
+ config.parser_mode = "standard";
+ config.char_filter_map = char_filter_map;
+ config.lower_case = "true";
+ config.stop_words = "";
- auto analyzer =
InvertedIndexAnalyzer::create_analyzer(inverted_index_ctx.get());
+ auto analyzer = InvertedIndexAnalyzer::create_analyzer(&config);
ASSERT_NE(analyzer, nullptr);
std::string test_data = "hello,world,test";
@@ -71,7 +69,7 @@ TEST(ReaderTest, ArrayFieldTokenStreamWorkflow) {
new_field.reset(field);
{
ReaderPtr char_string_reader =
-
InvertedIndexAnalyzer::create_reader(inverted_index_ctx->char_filter_map);
+
InvertedIndexAnalyzer::create_reader(config.char_filter_map);
char_string_reader->init(slice.get_data(),
cast_set<int32_t>(slice.get_size()),
false);
diff --git a/be/test/vec/function/function_is_null_test.cpp
b/be/test/vec/function/function_is_null_test.cpp
index dc9fd9ddafa..c909c5efc46 100644
--- a/be/test/vec/function/function_is_null_test.cpp
+++ b/be/test/vec/function/function_is_null_test.cpp
@@ -199,7 +199,7 @@ TEST_F(FunctionIsNullTest, gc_binlogs_test) {
auto is_null_function = std::make_shared<FunctionIsNull>();
EXPECT_TRUE(is_null_function
->evaluate_inverted_index(arguments,
data_type_with_names,
- iterators, 3,
bitmap_result)
+ iterators, 3,
nullptr, bitmap_result)
.ok());
EXPECT_TRUE(!bitmap_result.is_empty());
EXPECT_EQ(expected_result,
bitmap_result.get_data_bitmap()->cardinality());
@@ -207,7 +207,7 @@ TEST_F(FunctionIsNullTest, gc_binlogs_test) {
auto is_not_null_function = std::make_shared<FunctionIsNotNull>();
EXPECT_TRUE(is_not_null_function
->evaluate_inverted_index(arguments,
data_type_with_names,
- iterators, 3,
bitmap_result)
+ iterators, 3,
nullptr, bitmap_result)
.ok());
EXPECT_TRUE(!bitmap_result.is_empty());
EXPECT_EQ(expected_result,
bitmap_result.get_data_bitmap()->cardinality());
@@ -264,7 +264,7 @@ TEST_F(FunctionIsNullTest,
evaluate_inverted_index_corner_cases) {
segment_v2::InvertedIndexResultBitmap bitmap_result;
EXPECT_TRUE(is_null_function
->evaluate_inverted_index(arguments,
data_type_with_names, iterators, 3,
- bitmap_result)
+ nullptr, bitmap_result)
.ok());
EXPECT_TRUE(bitmap_result.is_empty());
}
@@ -273,7 +273,7 @@ TEST_F(FunctionIsNullTest,
evaluate_inverted_index_corner_cases) {
segment_v2::InvertedIndexResultBitmap bitmap_result;
EXPECT_TRUE(is_not_null_function
->evaluate_inverted_index(arguments,
data_type_with_names, iterators, 3,
- bitmap_result)
+ nullptr, bitmap_result)
.ok());
EXPECT_TRUE(bitmap_result.is_empty());
}
@@ -285,7 +285,7 @@ TEST_F(FunctionIsNullTest,
evaluate_inverted_index_corner_cases) {
segment_v2::InvertedIndexResultBitmap bitmap_result;
EXPECT_TRUE(is_null_function
->evaluate_inverted_index(arguments,
data_type_with_names, iterators, 3,
- bitmap_result)
+ nullptr, bitmap_result)
.ok());
EXPECT_TRUE(bitmap_result.is_empty());
}
@@ -295,7 +295,7 @@ TEST_F(FunctionIsNullTest,
evaluate_inverted_index_corner_cases) {
segment_v2::InvertedIndexResultBitmap bitmap_result;
EXPECT_TRUE(is_not_null_function
->evaluate_inverted_index(arguments,
data_type_with_names, iterators, 3,
- bitmap_result)
+ nullptr, bitmap_result)
.ok());
EXPECT_TRUE(bitmap_result.is_empty());
}
@@ -374,7 +374,7 @@ TEST_F(FunctionIsNullTest,
evaluate_inverted_index_corner_cases) {
segment_v2::InvertedIndexResultBitmap bitmap_result1;
EXPECT_TRUE(is_null_function
->evaluate_inverted_index(arguments,
data_type_with_names, iterators, 3,
- bitmap_result1)
+ nullptr, bitmap_result1)
.ok());
// When there's no null data, the result should be empty or have 0
cardinality
// depending on whether has_null() returns false
@@ -382,7 +382,7 @@ TEST_F(FunctionIsNullTest,
evaluate_inverted_index_corner_cases) {
segment_v2::InvertedIndexResultBitmap bitmap_result2;
EXPECT_TRUE(is_not_null_function
->evaluate_inverted_index(arguments,
data_type_with_names, iterators, 3,
- bitmap_result2)
+ nullptr, bitmap_result2)
.ok());
// Similar test for is_not_null
}
diff --git a/be/test/vec/function/function_match_test.cpp
b/be/test/vec/function/function_match_test.cpp
index 59868492dbe..6cf275c4ce8 100644
--- a/be/test/vec/function/function_match_test.cpp
+++ b/be/test/vec/function/function_match_test.cpp
@@ -34,19 +34,20 @@ namespace doris::vectorized {
// Helper structure to manage analyzer lifetime
struct TestInvertedIndexCtx {
- std::unique_ptr<InvertedIndexCtx> ctx;
+ std::unique_ptr<InvertedIndexAnalyzerCtx> ctx;
std::shared_ptr<lucene::analysis::Analyzer> analyzer_holder;
};
// Helper function to create inverted index context
TestInvertedIndexCtx create_inverted_index_ctx(InvertedIndexParserType
parser_type) {
TestInvertedIndexCtx test_ctx;
- test_ctx.ctx = std::make_unique<InvertedIndexCtx>();
+ test_ctx.ctx = std::make_unique<InvertedIndexAnalyzerCtx>();
test_ctx.ctx->parser_type = parser_type;
if (parser_type != InvertedIndexParserType::PARSER_NONE) {
+ InvertedIndexAnalyzerConfig config;
+ config.parser_type = parser_type;
test_ctx.analyzer_holder =
-
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_analyzer(
- test_ctx.ctx.get());
+
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_analyzer(&config);
test_ctx.ctx->analyzer = test_ctx.analyzer_holder.get();
}
return test_ctx;
@@ -63,7 +64,7 @@ TEST(FunctionMatchTest, analyse_query_str) {
}
{
- auto inverted_index_ctx = std::make_unique<InvertedIndexCtx>();
+ auto inverted_index_ctx = std::make_unique<InvertedIndexAnalyzerCtx>();
inverted_index_ctx->parser_type = InvertedIndexParserType::PARSER_NONE;
auto query_tokens =
func_match_phrase.analyse_query_str_token(inverted_index_ctx.get(),
"a b c",
"name");
@@ -71,10 +72,12 @@ TEST(FunctionMatchTest, analyse_query_str) {
}
{
- auto inverted_index_ctx = std::make_unique<InvertedIndexCtx>();
+ auto inverted_index_ctx = std::make_unique<InvertedIndexAnalyzerCtx>();
inverted_index_ctx->parser_type =
InvertedIndexParserType::PARSER_ENGLISH;
- auto analyzer =
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_analyzer(
- inverted_index_ctx.get());
+ InvertedIndexAnalyzerConfig config;
+ config.parser_type = InvertedIndexParserType::PARSER_ENGLISH;
+ auto analyzer =
+
doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_analyzer(&config);
inverted_index_ctx->analyzer = analyzer.get();
auto query_tokens =
func_match_phrase.analyse_query_str_token(inverted_index_ctx.get(),
"a b c",
"name");
diff --git a/be/test/vec/function/function_multi_match_test.cpp
b/be/test/vec/function/function_multi_match_test.cpp
index bbcadef73c5..1a9faa682bc 100644
--- a/be/test/vec/function/function_multi_match_test.cpp
+++ b/be/test/vec/function/function_multi_match_test.cpp
@@ -63,7 +63,7 @@ TEST_F(FunctionMultiMatchTest,
EvaluateInvertedIndexWithNullIterator) {
FunctionMultiMatch function;
Status status = function.evaluate_inverted_index(arguments,
data_type_with_names, iterators,
- num_rows, bitmap_result);
+ num_rows, nullptr,
bitmap_result);
ASSERT_FALSE(status.ok());
EXPECT_EQ(status.code(), ErrorCode::INVERTED_INDEX_CLUCENE_ERROR);
diff --git a/be/test/vec/function/function_search_test.cpp
b/be/test/vec/function/function_search_test.cpp
index 4b6b27ed861..a4f53068d6f 100644
--- a/be/test/vec/function/function_search_test.cpp
+++ b/be/test/vec/function/function_search_test.cpp
@@ -498,8 +498,8 @@ TEST_F(FunctionSearchTest, TestEvaluateInvertedIndexBasic) {
uint32_t num_rows = 100;
InvertedIndexResultBitmap bitmap_result;
- auto status = function_search->evaluate_inverted_index(arguments,
data_type_with_names,
- iterators,
num_rows, bitmap_result);
+ auto status = function_search->evaluate_inverted_index(
+ arguments, data_type_with_names, iterators, num_rows, nullptr,
bitmap_result);
EXPECT_TRUE(status.ok()); // Should return OK for legacy method
}
diff --git
a/regression-test/data/inverted_index_p0/test_index_lowercase_fault_injection.out
b/regression-test/data/inverted_index_p0/test_index_lowercase_fault_injection.out
index 196077986ec..66536732dfe 100644
---
a/regression-test/data/inverted_index_p0/test_index_lowercase_fault_injection.out
+++
b/regression-test/data/inverted_index_p0/test_index_lowercase_fault_injection.out
@@ -6,7 +6,7 @@
0
-- !sql --
-8
+3
-- !sql --
3
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]