This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 130d9c057e3 [fix](chinese) fix the issue where the be crashes due to the missing chinese dict (#30820)
130d9c057e3 is described below

commit 130d9c057e3e8e6c1dc18b4edd10d26d246286b3
Author: zzzxl <[email protected]>
AuthorDate: Mon Feb 5 07:15:39 2024 +0800

    [fix](chinese) fix the issue where the be crashes due to the missing chinese dict (#30820)
---
 be/src/clucene                                     |  2 +-
 be/src/common/status.h                             |  1 +
 .../rowset/segment_v2/inverted_index_writer.cpp    | 52 ++++++++++++----------
 be/src/vec/functions/function_tokenize.cpp         | 12 ++++-
 4 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index 2a20b0d72e6..cd0cfe7f2dd 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 2a20b0d72e6d05cb7025137823f874ff062bdebf
+Subproject commit cd0cfe7f2dd05398b37a85aa7f17c46545438d01
diff --git a/be/src/common/status.h b/be/src/common/status.h
index 15ba73cd646..555ee73443d 100644
--- a/be/src/common/status.h
+++ b/be/src/common/status.h
@@ -274,6 +274,7 @@ E(INVERTED_INDEX_EVALUATE_SKIPPED, -6007);
 E(INVERTED_INDEX_BUILD_WAITTING, -6008);
 E(INVERTED_INDEX_NOT_IMPLEMENTED, -6009);
 E(INVERTED_INDEX_COMPACTION_ERROR, -6010);
+E(INVERTED_INDEX_ANALYZER_ERROR, -6011);
 E(KEY_NOT_FOUND, -7000);
 E(KEY_ALREADY_EXISTS, -7001);
 E(ENTRY_NOT_FOUND, -7002);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 9d7a047c606..c99de50d7ac 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -184,32 +184,38 @@ public:
         _dir.reset(DorisCompoundDirectoryFactory::getDirectory(
                 _fs, index_path.c_str(), use_compound_file_writer, 
can_use_ram_dir));
 
-        if (_parser_type == InvertedIndexParserType::PARSER_STANDARD ||
-            _parser_type == InvertedIndexParserType::PARSER_UNICODE) {
-            _analyzer = 
std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
-        } else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
-            _analyzer = 
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
-        } else if (_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
-            auto chinese_analyzer = _CLNEW 
lucene::analysis::LanguageBasedAnalyzer();
-            chinese_analyzer->setLanguage(L"chinese");
-            chinese_analyzer->initDict(config::inverted_index_dict_path);
-            auto mode = 
get_parser_mode_string_from_properties(_index_meta->properties());
-            if (mode == INVERTED_INDEX_PARSER_FINE_GRANULARITY) {
-                chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
+        try {
+            if (_parser_type == InvertedIndexParserType::PARSER_STANDARD ||
+                _parser_type == InvertedIndexParserType::PARSER_UNICODE) {
+                _analyzer = 
std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
+            } else if (_parser_type == 
InvertedIndexParserType::PARSER_ENGLISH) {
+                _analyzer = 
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
+            } else if (_parser_type == 
InvertedIndexParserType::PARSER_CHINESE) {
+                auto chinese_analyzer = _CLNEW 
lucene::analysis::LanguageBasedAnalyzer();
+                chinese_analyzer->setLanguage(L"chinese");
+                chinese_analyzer->initDict(config::inverted_index_dict_path);
+                auto mode = 
get_parser_mode_string_from_properties(_index_meta->properties());
+                if (mode == INVERTED_INDEX_PARSER_FINE_GRANULARITY) {
+                    
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
+                } else {
+                    
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
+                }
+                _analyzer.reset(chinese_analyzer);
             } else {
-                
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
+                // ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
+                _analyzer = 
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
             }
-            _analyzer.reset(chinese_analyzer);
-        } else {
-            // ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
-            _analyzer = 
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
-        }
-        auto lowercase = 
get_parser_lowercase_from_properties(_index_meta->properties());
-        if (lowercase == "true") {
-            _analyzer->set_lowercase(true);
-        } else if (lowercase == "false") {
-            _analyzer->set_lowercase(false);
+            auto lowercase = 
get_parser_lowercase_from_properties(_index_meta->properties());
+            if (lowercase == "true") {
+                _analyzer->set_lowercase(true);
+            } else if (lowercase == "false") {
+                _analyzer->set_lowercase(false);
+            }
+        } catch (CLuceneError& e) {
+            return 
Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
+                    "inverted index create analyzer failed: {}", e.what());
         }
+
         _index_writer = 
std::make_unique<lucene::index::IndexWriter>(_dir.get(), _analyzer.get(),
                                                                      create, 
true);
         
_index_writer->setRAMBufferSizeMB(config::inverted_index_ram_buffer_size);
diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp
index 54d9bee4ae9..1e7a5d3c9bb 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -140,8 +140,16 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block
             inverted_index_ctx.parser_mode = 
get_parser_mode_string_from_properties(properties);
             inverted_index_ctx.char_filter_map =
                     get_parser_char_filter_map_from_properties(properties);
-            auto analyzer =
-                    
doris::segment_v2::InvertedIndexReader::create_analyzer(&inverted_index_ctx);
+
+            std::unique_ptr<lucene::analysis::Analyzer> analyzer;
+            try {
+                analyzer = 
doris::segment_v2::InvertedIndexReader::create_analyzer(
+                        &inverted_index_ctx);
+            } catch (CLuceneError& e) {
+                return 
Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
+                        "inverted index create analyzer failed: {}", e.what());
+            }
+
             inverted_index_ctx.analyzer = analyzer.get();
             _do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, 
dest_offsets,
                          dest_nested_null_map);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to