This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 6f5672f3184 [Refact](inverted index) refactor inverted index writer init (#29072)
6f5672f3184 is described below

commit 6f5672f3184ee71e63537f9e16c3810248f77039
Author: airborne12 <[email protected]>
AuthorDate: Wed Dec 27 12:49:26 2023 +0800

    [Refact](inverted index) refactor inverted index writer init (#29072)
---
 .../rowset/segment_v2/inverted_index_writer.cpp    | 319 +++++++++------------
 1 file changed, 137 insertions(+), 182 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index c2cc0bbbefa..13a31e24768 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -72,7 +72,6 @@ const int32_t MERGE_FACTOR = 100000000;
 const int32_t MAX_LEAF_COUNT = 1024;
 const float MAXMBSortInHeap = 512.0 * 8;
 const int DIMS = 1;
-const std::string empty_value;
 
 template <FieldType field_type>
 class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
@@ -139,9 +138,38 @@ public:
         return Status::OK();
     }
 
-    Status init_fulltext_index() {
-        bool create = true;
+    std::unique_ptr<lucene::analysis::Analyzer> create_chinese_analyzer() {
+        auto chinese_analyzer = 
std::make_unique<lucene::analysis::LanguageBasedAnalyzer>();
+        chinese_analyzer->setLanguage(L"chinese");
+        chinese_analyzer->initDict(config::inverted_index_dict_path);
+
+        auto mode = 
get_parser_mode_string_from_properties(_index_meta->properties());
+        if (mode == INVERTED_INDEX_PARSER_FINE_GRANULARITY) {
+            chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
+        } else {
+            chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
+        }
+
+        return chinese_analyzer;
+    }
+
+    Status create_char_string_reader(std::unique_ptr<lucene::util::Reader>& 
string_reader) {
+        CharFilterMap char_filter_map =
+                
get_parser_char_filter_map_from_properties(_index_meta->properties());
+        if (!char_filter_map.empty()) {
+            string_reader = 
std::unique_ptr<lucene::util::Reader>(CharFilterFactory::create(
+                    char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE],
+                    new lucene::util::SStringReader<char>(),
+                    char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
+                    
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]));
+        } else {
+            string_reader = 
std::make_unique<lucene::util::SStringReader<char>>();
+        }
+        return Status::OK();
+    }
 
+    Status create_index_directory(std::unique_ptr<DorisCompoundDirectory>& 
dir) {
+        bool create = true;
         auto index_path = InvertedIndexDescriptor::get_temporary_index_path(
                 _directory + "/" + _segment_file_name, _index_meta->index_id(),
                 _index_meta->get_index_suffix());
@@ -149,8 +177,7 @@ public:
         bool exists = false;
         auto st = _fs->exists(index_path.c_str(), &exists);
         if (!st.ok()) {
-            LOG(ERROR) << "index_path:"
-                       << " exists error:" << st;
+            LOG(ERROR) << "index_path: exists error:" << st;
             return st;
         }
         if (exists) {
@@ -158,69 +185,76 @@ public:
             return Status::InternalError("init_fulltext_index directory 
already exists");
         }
 
-        _char_string_reader = 
std::make_unique<lucene::util::SStringReader<char>>();
-        CharFilterMap char_filter_map =
-                
get_parser_char_filter_map_from_properties(_index_meta->properties());
-        if (!char_filter_map.empty()) {
-            _char_string_reader.reset(CharFilterFactory::create(
-                    char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE],
-                    _char_string_reader.release(),
-                    char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
-                    
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]));
-        }
+        dir = std::unique_ptr<DorisCompoundDirectory>(
+                DorisCompoundDirectory::getDirectory(_fs, index_path.c_str(), 
create));
+        return Status::OK();
+    }
 
-        _doc = std::make_unique<lucene::document::Document>();
-        _dir.reset(DorisCompoundDirectory::getDirectory(_fs, 
index_path.c_str(), true));
-
-        if (_parser_type == InvertedIndexParserType::PARSER_STANDARD ||
-            _parser_type == InvertedIndexParserType::PARSER_UNICODE) {
-            _analyzer = 
std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
-        } else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
-            _analyzer = 
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
-        } else if (_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
-            auto chinese_analyzer = _CLNEW 
lucene::analysis::LanguageBasedAnalyzer();
-            chinese_analyzer->setLanguage(L"chinese");
-            chinese_analyzer->initDict(config::inverted_index_dict_path);
-            auto mode = 
get_parser_mode_string_from_properties(_index_meta->properties());
-            if (mode == INVERTED_INDEX_PARSER_FINE_GRANULARITY) {
-                chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
-            } else {
-                
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
-            }
-            _analyzer.reset(chinese_analyzer);
-        } else {
-            // ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
-            _analyzer = 
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
+    Status create_index_writer(std::unique_ptr<lucene::index::IndexWriter>& 
index_writer) {
+        bool create_index = true;
+        bool close_dir_on_shutdown = true;
+        index_writer = std::make_unique<lucene::index::IndexWriter>(
+                _dir.get(), _analyzer.get(), create_index, 
close_dir_on_shutdown);
+        index_writer->setMaxBufferedDocs(MAX_BUFFER_DOCS);
+        
index_writer->setRAMBufferSizeMB(config::inverted_index_ram_buffer_size);
+        index_writer->setMaxFieldLength(MAX_FIELD_LEN);
+        index_writer->setMergeFactor(MERGE_FACTOR);
+        index_writer->setUseCompoundFile(false);
+
+        return Status::OK();
+    }
+
+    Status create_field(lucene::document::Field** field) {
+        int field_config = int(lucene::document::Field::STORE_NO) |
+                           int(lucene::document::Field::INDEX_NONORMS);
+        field_config |= (_parser_type == InvertedIndexParserType::PARSER_NONE)
+                                ? 
int(lucene::document::Field::INDEX_UNTOKENIZED)
+                                : 
int(lucene::document::Field::INDEX_TOKENIZED);
+        *field = new lucene::document::Field(_field_name.c_str(), 
field_config);
+        (*field)->setOmitTermFreqAndPositions(
+                
get_parser_phrase_support_string_from_properties(_index_meta->properties()) ==
+                                INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES
+                        ? false
+                        : true);
+        return Status::OK();
+    }
+
+    Status create_analyzer(std::unique_ptr<lucene::analysis::Analyzer>& 
analyzer) {
+        switch (_parser_type) {
+        case InvertedIndexParserType::PARSER_STANDARD:
+        case InvertedIndexParserType::PARSER_UNICODE:
+            analyzer = 
std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
+            break;
+        case InvertedIndexParserType::PARSER_ENGLISH:
+            analyzer = 
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
+            break;
+        case InvertedIndexParserType::PARSER_CHINESE:
+            analyzer = create_chinese_analyzer();
+            break;
+        default:
+            analyzer = 
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
+            break;
         }
+        setup_analyzer_lowercase(analyzer);
+        return Status::OK();
+    }
+
+    void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& 
analyzer) {
         auto lowercase = 
get_parser_lowercase_from_properties(_index_meta->properties());
         if (lowercase == "true") {
-            _analyzer->set_lowercase(true);
+            analyzer->set_lowercase(true);
         } else if (lowercase == "false") {
-            _analyzer->set_lowercase(false);
+            analyzer->set_lowercase(false);
         }
-        _index_writer = 
std::make_unique<lucene::index::IndexWriter>(_dir.get(), _analyzer.get(),
-                                                                     create, 
true);
-        _index_writer->setMaxBufferedDocs(MAX_BUFFER_DOCS);
-        
_index_writer->setRAMBufferSizeMB(config::inverted_index_ram_buffer_size);
-        _index_writer->setMaxFieldLength(MAX_FIELD_LEN);
-        _index_writer->setMergeFactor(MERGE_FACTOR);
-        _index_writer->setUseCompoundFile(false);
-        _doc->clear();
+    }
 
-        int field_config = int(lucene::document::Field::STORE_NO) |
-                           int(lucene::document::Field::INDEX_NONORMS);
-        if (_parser_type == InvertedIndexParserType::PARSER_NONE) {
-            field_config |= int(lucene::document::Field::INDEX_UNTOKENIZED);
-        } else {
-            field_config |= int(lucene::document::Field::INDEX_TOKENIZED);
-        }
-        _field = new lucene::document::Field(_field_name.c_str(), 
field_config);
-        if 
(get_parser_phrase_support_string_from_properties(_index_meta->properties()) ==
-            INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES) {
-            _field->setOmitTermFreqAndPositions(false);
-        } else {
-            _field->setOmitTermFreqAndPositions(true);
-        }
+    Status init_fulltext_index() {
+        RETURN_IF_ERROR(create_index_directory(_dir));
+        RETURN_IF_ERROR(create_char_string_reader(_char_string_reader));
+        RETURN_IF_ERROR(create_analyzer(_analyzer));
+        RETURN_IF_ERROR(create_index_writer(_index_writer));
+        RETURN_IF_ERROR(create_field(&_field));
+        _doc = std::make_unique<lucene::document::Document>();
         _doc->add(*_field);
         return Status::OK();
     }
@@ -555,12 +589,13 @@ private:
     roaring::Roaring _null_bitmap;
     uint64_t _reverted_index_size;
 
-    std::unique_ptr<lucene::document::Document> _doc {};
-    lucene::document::Field* _field {};
-    std::unique_ptr<lucene::index::IndexWriter> _index_writer {};
-    std::unique_ptr<lucene::analysis::Analyzer> _analyzer {};
-    std::unique_ptr<lucene::util::Reader> _char_string_reader {};
-    std::shared_ptr<lucene::util::bkd::bkd_writer> _bkd_writer;
+    std::unique_ptr<lucene::document::Document> _doc = nullptr;
+    lucene::document::Field* _field = nullptr;
+    std::unique_ptr<lucene::index::IndexWriter> _index_writer = nullptr;
+    std::unique_ptr<lucene::analysis::Analyzer> _analyzer = nullptr;
+    std::unique_ptr<lucene::util::Reader> _char_string_reader = nullptr;
+    std::shared_ptr<lucene::util::bkd::bkd_writer> _bkd_writer = nullptr;
+    std::unique_ptr<DorisCompoundDirectory> _dir = nullptr;
     std::string _segment_file_name;
     std::string _directory;
     io::FileSystemSPtr _fs;
@@ -568,7 +603,6 @@ private:
     const TabletIndex* _index_meta;
     InvertedIndexParserType _parser_type;
     std::wstring _field_name;
-    std::unique_ptr<DorisCompoundDirectory> _dir;
 };
 
 Status InvertedIndexColumnWriter::create(const Field* field,
@@ -576,127 +610,48 @@ Status InvertedIndexColumnWriter::create(const Field* field,
                                          const std::string& segment_file_name,
                                          const std::string& dir, const 
TabletIndex* index_meta,
                                          const io::FileSystemSPtr& fs) {
-    auto typeinfo = field->type_info();
+    const auto* typeinfo = field->type_info();
     FieldType type = typeinfo->type();
     std::string field_name = field->name();
     if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) {
-        const auto array_typeinfo = dynamic_cast<const 
ArrayTypeInfo*>(typeinfo);
-        typeinfo = array_typeinfo->item_type_info();
-        type = typeinfo->type();
+        const auto* array_typeinfo = dynamic_cast<const 
ArrayTypeInfo*>(typeinfo);
+        if (array_typeinfo != nullptr) {
+            typeinfo = array_typeinfo->item_type_info();
+            type = typeinfo->type();
+        } else {
+            return Status::NotSupported("unsupported array type for inverted 
index: " +
+                                        std::to_string(int(type)));
+        }
     }
 
     switch (type) {
-    case FieldType::OLAP_FIELD_TYPE_CHAR: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_CHAR>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_VARCHAR: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_VARCHAR>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_STRING: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_STRING>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_DATETIME: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_DATETIME>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_DATE: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_DATE>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_DATETIMEV2: {
-        *res = std::make_unique<
-                
InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_DATETIMEV2>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_DATEV2: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_DATEV2>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_TINYINT: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_TINYINT>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_SMALLINT: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_SMALLINT>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT: {
-        *res = std::make_unique<
-                
InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT>>(
-                field_name, segment_file_name, dir, fs, index_meta);
+#define M(TYPE)                                                       \
+    case TYPE:                                                        \
+        *res = std::make_unique<InvertedIndexColumnWriterImpl<TYPE>>( \
+                field_name, segment_file_name, dir, fs, index_meta);  \
         break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_INT: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_INT>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_LARGEINT: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_LARGEINT>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_DECIMAL: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_DECIMAL>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_DECIMAL32: {
-        *res = std::make_unique<
-                
InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_DECIMAL32>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_DECIMAL64: {
-        *res = std::make_unique<
-                
InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_DECIMAL64>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_DECIMAL128I: {
-        *res = std::make_unique<
-                
InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_DECIMAL128I>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_DECIMAL256: {
-        *res = std::make_unique<
-                
InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_DECIMAL256>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_BOOL: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_BOOL>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_DOUBLE: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_DOUBLE>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_FLOAT: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_FLOAT>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
-    case FieldType::OLAP_FIELD_TYPE_BIGINT: {
-        *res = 
std::make_unique<InvertedIndexColumnWriterImpl<FieldType::OLAP_FIELD_TYPE_BIGINT>>(
-                field_name, segment_file_name, dir, fs, index_meta);
-        break;
-    }
+        M(FieldType::OLAP_FIELD_TYPE_TINYINT)
+        M(FieldType::OLAP_FIELD_TYPE_SMALLINT)
+        M(FieldType::OLAP_FIELD_TYPE_INT)
+        M(FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT)
+        M(FieldType::OLAP_FIELD_TYPE_BIGINT)
+        M(FieldType::OLAP_FIELD_TYPE_LARGEINT)
+        M(FieldType::OLAP_FIELD_TYPE_CHAR)
+        M(FieldType::OLAP_FIELD_TYPE_VARCHAR)
+        M(FieldType::OLAP_FIELD_TYPE_STRING)
+        M(FieldType::OLAP_FIELD_TYPE_DATE)
+        M(FieldType::OLAP_FIELD_TYPE_DATETIME)
+        M(FieldType::OLAP_FIELD_TYPE_DECIMAL)
+        M(FieldType::OLAP_FIELD_TYPE_DATEV2)
+        M(FieldType::OLAP_FIELD_TYPE_DATETIMEV2)
+        M(FieldType::OLAP_FIELD_TYPE_DECIMAL32)
+        M(FieldType::OLAP_FIELD_TYPE_DECIMAL64)
+        M(FieldType::OLAP_FIELD_TYPE_DECIMAL128I)
+        M(FieldType::OLAP_FIELD_TYPE_DECIMAL256)
+        M(FieldType::OLAP_FIELD_TYPE_BOOL)
+        M(FieldType::OLAP_FIELD_TYPE_DOUBLE)
+        M(FieldType::OLAP_FIELD_TYPE_FLOAT)
+#undef M
     default:
         return Status::NotSupported("unsupported type for inverted index: " +
                                     std::to_string(int(type)));


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to