Re: [PR] [Improve](inverted_index) update clucene and improve array inverted index writer [doris]

via GitHub Tue, 26 Mar 2024 20:57:41 -0700


xiaokang commented on code in PR #32436:
URL: https://github.com/apache/doris/pull/32436#discussion_r1540423275



##########
be/src/olap/rowset/segment_v2/inverted_index_writer.cpp:
##########
@@ -348,39 +367,60 @@ class InvertedIndexColumnWriterImpl : public 
InvertedIndexColumnWriter {
         }
         const auto* offsets = reinterpret_cast<const uint64_t*>(offsets_ptr);
         if constexpr (field_is_slice_type(field_type)) {
-            if (_field == nullptr || _index_writer == nullptr) {
-                LOG(ERROR) << "field or index writer is null in inverted index 
writer.";
-                return Status::InternalError(
-                        "field or index writer is null in inverted index 
writer");
+            if (_index_writer == nullptr) {
+                LOG(ERROR) << "index writer is null in inverted index writer.";
+                return Status::InternalError("index writer is null in inverted 
index writer");
             }
             auto ignore_above_value =
                     
get_parser_ignore_above_value_from_properties(_index_meta->properties());
             auto ignore_above = std::stoi(ignore_above_value);
             for (int i = 0; i < count; ++i) {
                 // offsets[i+1] is now row element count
-                std::vector<std::string> strings;
                 // [0, 3, 6]
                 // [10,20,30] [20,30,40], [30,40,50]
                 auto start_off = offsets[i];
                 auto end_off = offsets[i + 1];
+                // TODO(Amory).later we use object pool to avoid field creation
+                lucene::document::Field* new_field = nullptr;
+                CL_NS(analysis)::TokenStream* ts = nullptr;
                 for (auto j = start_off; j < end_off; ++j) {
                     if (null_map[j] == 1) {
                         continue;
                     }
+                    // now we temp create field . later make a pool
+                    if (Status st = create_field(&new_field); st != 
Status::OK()) {
+                        LOG(ERROR)
+                                << "create field " << 
string(_field_name.begin(), _field_name.end())
+                                << " error:" << st;
+                        return st;
+                    }
                     auto* v = (Slice*)((const uint8_t*)value_ptr + j * 
field_size);
-                    strings.emplace_back(v->get_data(), v->get_size());
-                }
-
-                auto value = join(strings, " ");
-                // only ignore_above UNTOKENIZED strings and empty strings not 
tokenized
-                if ((_parser_type == InvertedIndexParserType::PARSER_NONE &&
-                     value.length() > ignore_above) ||
-                    (_parser_type != InvertedIndexParserType::PARSER_NONE && 
value.empty())) {
-                    RETURN_IF_ERROR(add_null_document());
-                } else {
-                    new_fulltext_field(value.c_str(), value.length());
-                    RETURN_IF_ERROR(add_document());
+                    if ((_parser_type == InvertedIndexParserType::PARSER_NONE 
&&
+                         v->get_size() > ignore_above) ||
+                        (_parser_type != InvertedIndexParserType::PARSER_NONE 
&& v->empty())) {
+                        // is here a null value?
+                        // TODO. Maybe here has performance problem for large 
size string.
+                        continue;
+                    } else {
+                        if (_parser_type == 
InvertedIndexParserType::PARSER_ENGLISH ||

Review Comment:
   Why check for specific parser types? We should only distinguish between NONE and all other parser types.



##########
be/src/olap/rowset/segment_v2/inverted_index_writer.cpp:
##########
@@ -293,6 +301,17 @@ class InvertedIndexColumnWriterImpl : public 
InvertedIndexColumnWriter {
         }
     }
 
+    void new_fulltext_field(const char* field_value_data, size_t 
field_value_size,

Review Comment:
   This function is not used.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [Improve](inverted_index) update clucene and improve array inverted index writer [doris]

Reply via email to