xiaokang commented on code in PR #32436:
URL: https://github.com/apache/doris/pull/32436#discussion_r1540423275
##########
be/src/olap/rowset/segment_v2/inverted_index_writer.cpp:
##########
@@ -348,39 +367,60 @@ class InvertedIndexColumnWriterImpl : public
InvertedIndexColumnWriter {
}
const auto* offsets = reinterpret_cast<const uint64_t*>(offsets_ptr);
if constexpr (field_is_slice_type(field_type)) {
- if (_field == nullptr || _index_writer == nullptr) {
- LOG(ERROR) << "field or index writer is null in inverted index
writer.";
- return Status::InternalError(
- "field or index writer is null in inverted index
writer");
+ if (_index_writer == nullptr) {
+ LOG(ERROR) << "index writer is null in inverted index writer.";
+ return Status::InternalError("index writer is null in inverted
index writer");
}
auto ignore_above_value =
get_parser_ignore_above_value_from_properties(_index_meta->properties());
auto ignore_above = std::stoi(ignore_above_value);
for (int i = 0; i < count; ++i) {
// offsets[i+1] is now row element count
- std::vector<std::string> strings;
// [0, 3, 6]
// [10,20,30] [20,30,40], [30,40,50]
auto start_off = offsets[i];
auto end_off = offsets[i + 1];
+ // TODO(Amory).later we use object pool to avoid field creation
+ lucene::document::Field* new_field = nullptr;
+ CL_NS(analysis)::TokenStream* ts = nullptr;
for (auto j = start_off; j < end_off; ++j) {
if (null_map[j] == 1) {
continue;
}
+ // now we temp create field . later make a pool
+ if (Status st = create_field(&new_field); st !=
Status::OK()) {
+ LOG(ERROR)
+ << "create field " <<
string(_field_name.begin(), _field_name.end())
+ << " error:" << st;
+ return st;
+ }
auto* v = (Slice*)((const uint8_t*)value_ptr + j *
field_size);
- strings.emplace_back(v->get_data(), v->get_size());
- }
-
- auto value = join(strings, " ");
- // only ignore_above UNTOKENIZED strings and empty strings not
tokenized
- if ((_parser_type == InvertedIndexParserType::PARSER_NONE &&
- value.length() > ignore_above) ||
- (_parser_type != InvertedIndexParserType::PARSER_NONE &&
value.empty())) {
- RETURN_IF_ERROR(add_null_document());
- } else {
- new_fulltext_field(value.c_str(), value.length());
- RETURN_IF_ERROR(add_document());
+ if ((_parser_type == InvertedIndexParserType::PARSER_NONE
&&
+ v->get_size() > ignore_above) ||
+ (_parser_type != InvertedIndexParserType::PARSER_NONE
&& v->empty())) {
+ // is here a null value?
+ // TODO. Maybe here has performance problem for large
size string.
+ continue;
+ } else {
+ if (_parser_type ==
InvertedIndexParserType::PARSER_ENGLISH ||
Review Comment:
Why check for specific parser types here? We should only distinguish NONE from all other parser types.
##########
be/src/olap/rowset/segment_v2/inverted_index_writer.cpp:
##########
@@ -293,6 +301,17 @@ class InvertedIndexColumnWriterImpl : public
InvertedIndexColumnWriter {
}
}
+ void new_fulltext_field(const char* field_value_data, size_t
field_value_size,
Review Comment:
This function (`new_fulltext_field`) is not used.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]