This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 130d9c057e3 [fix](chinese) fix the issue where the be crashes due to
the missing chinese dict (#30820)
130d9c057e3 is described below
commit 130d9c057e3e8e6c1dc18b4edd10d26d246286b3
Author: zzzxl <[email protected]>
AuthorDate: Mon Feb 5 07:15:39 2024 +0800
[fix](chinese) fix the issue where the be crashes due to the missing
chinese dict (#30820)
---
be/src/clucene | 2 +-
be/src/common/status.h | 1 +
.../rowset/segment_v2/inverted_index_writer.cpp | 52 ++++++++++++----------
be/src/vec/functions/function_tokenize.cpp | 12 ++++-
4 files changed, 41 insertions(+), 26 deletions(-)
diff --git a/be/src/clucene b/be/src/clucene
index 2a20b0d72e6..cd0cfe7f2dd 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 2a20b0d72e6d05cb7025137823f874ff062bdebf
+Subproject commit cd0cfe7f2dd05398b37a85aa7f17c46545438d01
diff --git a/be/src/common/status.h b/be/src/common/status.h
index 15ba73cd646..555ee73443d 100644
--- a/be/src/common/status.h
+++ b/be/src/common/status.h
@@ -274,6 +274,7 @@ E(INVERTED_INDEX_EVALUATE_SKIPPED, -6007);
E(INVERTED_INDEX_BUILD_WAITTING, -6008);
E(INVERTED_INDEX_NOT_IMPLEMENTED, -6009);
E(INVERTED_INDEX_COMPACTION_ERROR, -6010);
+E(INVERTED_INDEX_ANALYZER_ERROR, -6011);
E(KEY_NOT_FOUND, -7000);
E(KEY_ALREADY_EXISTS, -7001);
E(ENTRY_NOT_FOUND, -7002);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 9d7a047c606..c99de50d7ac 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -184,32 +184,38 @@ public:
_dir.reset(DorisCompoundDirectoryFactory::getDirectory(
_fs, index_path.c_str(), use_compound_file_writer,
can_use_ram_dir));
- if (_parser_type == InvertedIndexParserType::PARSER_STANDARD ||
- _parser_type == InvertedIndexParserType::PARSER_UNICODE) {
- _analyzer = std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
- } else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
- _analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
- } else if (_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
- auto chinese_analyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer();
- chinese_analyzer->setLanguage(L"chinese");
- chinese_analyzer->initDict(config::inverted_index_dict_path);
- auto mode = get_parser_mode_string_from_properties(_index_meta->properties());
- if (mode == INVERTED_INDEX_PARSER_FINE_GRANULARITY) {
- chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
+ try {
+ if (_parser_type == InvertedIndexParserType::PARSER_STANDARD ||
+ _parser_type == InvertedIndexParserType::PARSER_UNICODE) {
+ _analyzer = std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
+ } else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
+ _analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
+ } else if (_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
+ auto chinese_analyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer();
+ chinese_analyzer->setLanguage(L"chinese");
+ chinese_analyzer->initDict(config::inverted_index_dict_path);
+ auto mode = get_parser_mode_string_from_properties(_index_meta->properties());
+ if (mode == INVERTED_INDEX_PARSER_FINE_GRANULARITY) {
+ chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
+ } else {
+ chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
+ }
+ _analyzer.reset(chinese_analyzer);
} else {
- chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
+ // ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
+ _analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
}
- _analyzer.reset(chinese_analyzer);
- } else {
- // ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
- _analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
- }
- auto lowercase = get_parser_lowercase_from_properties(_index_meta->properties());
- if (lowercase == "true") {
- _analyzer->set_lowercase(true);
- } else if (lowercase == "false") {
- _analyzer->set_lowercase(false);
+ auto lowercase = get_parser_lowercase_from_properties(_index_meta->properties());
+ if (lowercase == "true") {
+ _analyzer->set_lowercase(true);
+ } else if (lowercase == "false") {
+ _analyzer->set_lowercase(false);
+ }
+ } catch (CLuceneError& e) {
+ return Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
+ "inverted index create analyzer failed: {}", e.what());
}
+
_index_writer = std::make_unique<lucene::index::IndexWriter>(_dir.get(), _analyzer.get(),
create, true);
_index_writer->setRAMBufferSizeMB(config::inverted_index_ram_buffer_size);
diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp
index 54d9bee4ae9..1e7a5d3c9bb 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -140,8 +140,16 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block
inverted_index_ctx.parser_mode = get_parser_mode_string_from_properties(properties);
inverted_index_ctx.char_filter_map =
get_parser_char_filter_map_from_properties(properties);
- auto analyzer =
- doris::segment_v2::InvertedIndexReader::create_analyzer(&inverted_index_ctx);
+
+ std::unique_ptr<lucene::analysis::Analyzer> analyzer;
+ try {
+ analyzer = doris::segment_v2::InvertedIndexReader::create_analyzer(
+ &inverted_index_ctx);
+ } catch (CLuceneError& e) {
+ return Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
+ "inverted index create analyzer failed: {}", e.what());
+ }
+
inverted_index_ctx.analyzer = analyzer.get();
_do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, dest_offsets,
dest_nested_null_map);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org