eldenmoon commented on code in PR #20078:
URL: https://github.com/apache/doris/pull/20078#discussion_r1221037246
##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1370,53 +1374,40 @@ Status
NewJsonReader::_simdjson_handle_nested_complex_json(
return Status::OK();
}
-size_t NewJsonReader::_column_index(const StringRef& name, size_t key_index) {
- /// Optimization by caching the order of fields (which is almost always
the same)
- /// and a quick check to match the next expected field, instead of
searching the hash table.
- if (_prev_positions.size() > key_index && _prev_positions[key_index] &&
- name == _prev_positions[key_index]->get_first()) {
- return _prev_positions[key_index]->get_second();
- } else {
- auto* it = _slot_desc_index.find(name);
- if (it) {
- if (key_index < _prev_positions.size()) {
- _prev_positions[key_index] = it;
- }
- return it->get_second();
- } else {
- return size_t(-1);
- }
- }
-}
-
Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object*
value, Block& block,
const
std::vector<SlotDescriptor*>& slot_descs,
bool* valid) {
// set
_seen_columns.assign(block.columns(), false);
size_t cur_row_count = block.rows();
bool has_valid_value = false;
- // iterate through object, simdjson::ondemond will parsing on the fly
- size_t key_index = 0;
- for (auto field : *value) {
- std::string_view key = field.unescaped_key();
- StringRef name_ref(key.data(), key.size());
- const size_t column_index = _column_index(name_ref, key_index++);
- if (UNLIKELY(ssize_t(column_index) < 0)) {
- // This key is not exist in slot desc, just ignore
+ for (size_t i = 0; i < slot_descs.size(); ++i) {
+ auto slot_desc = slot_descs[i];
+ if (!slot_desc->is_materialized()) {
continue;
}
- simdjson::ondemand::value val = field.value();
- auto* column_ptr =
block.get_by_position(column_index).column->assume_mutable().get();
- RETURN_IF_ERROR(
- _simdjson_write_data_to_column(val, slot_descs[column_index],
column_ptr, valid));
- if (!(*valid)) {
- return Status::OK();
+ auto* column_ptr =
block.get_by_position(i).column->assume_mutable().get();
+ auto field = value->find_field_unordered(slot_desc->col_name());
Review Comment:
This is much slower than iterating with `for (auto field : *value)`; that
loop takes advantage of SIMD instructions.
##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1730,4 +1726,52 @@ Status
NewJsonReader::_simdjson_write_columns_by_jsonpath(
return Status::OK();
}
+Status NewJsonReader::_get_column_default_value(
+ const std::vector<SlotDescriptor*>& slot_descs,
+ const std::unordered_map<std::string, vectorized::VExprContext*>&
col_default_value_ctx) {
+ for (auto slot_desc : slot_descs) {
+ auto it = col_default_value_ctx.find(slot_desc->col_name());
+ if (it != col_default_value_ctx.end() && it->second != nullptr) {
+ auto* ctx = it->second;
+ // empty block to save default value of slot_desc->col_name()
+ Block block;
+ // If block is empty, some functions will produce no result. So we
insert a column with
+ // single value here.
+ block.insert({ColumnUInt8::create(1),
std::make_shared<DataTypeUInt8>(), ""});
+ int result = -1;
+ RETURN_IF_ERROR(ctx->execute(&block, &result));
+ DCHECK(result != -1);
+ auto column = block.get_by_position(result).column;
+ DCHECK(column->size() == 1);
+ _col_default_value_map.emplace(slot_desc->col_name(),
+ column->get_data_at(0).to_string());
+ }
+ }
+ return Status::OK();
+}
+
+Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc,
+ vectorized::IColumn* column_ptr,
bool* valid) {
+ if (slot_desc->is_nullable()) {
+ vectorized::ColumnNullable* nullable_column =
+ reinterpret_cast<vectorized::ColumnNullable*>(column_ptr);
+ column_ptr = &nullable_column->get_nested_column();
+ auto col_value = _col_default_value_map.find(slot_desc->col_name());
+ if (col_value == _col_default_value_map.end()) {
+ nullable_column->insert_default();
+ } else {
+ const std::string& v_str = col_value->second;
+ nullable_column->get_null_map_data().push_back(0);
+ assert_cast<ColumnString*>(column_ptr)->insert_data(v_str.c_str(),
v_str.size());
Review Comment:
What if the default value is `CURRENT_TIMESTAMP`? Will `v_str` still be OK?
##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -832,19 +836,19 @@ Status NewJsonReader::_set_column_value(rapidjson::Value&
objectValue, Block& bl
return Status::OK();
}
has_valid_value = true;
- } else { // not found
- // When the entire row has no valid value, this row should be
filtered,
- // so the default value cannot be directly inserted here
- if (!slot_desc->is_nullable()) {
- RETURN_IF_ERROR(_append_error_msg(
- objectValue,
- "The column `{}` is not nullable, but it's not found
in jsondata.",
- slot_desc->col_name(), valid));
- break;
+ } else {
+ // not found, filling with default value
+ RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr,
valid));
+ if (!(*valid)) {
+ return Status::OK();
}
}
}
if (!has_valid_value) {
+ for (int i = 0; i < block.columns(); ++i) {
+ auto column = block.get_by_position(i).column->assume_mutable();
+ column->pop_back(1);
Review Comment:
Could you add a comment explaining why we need to `pop_back` here?
##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1730,4 +1726,52 @@ Status
NewJsonReader::_simdjson_write_columns_by_jsonpath(
return Status::OK();
}
+Status NewJsonReader::_get_column_default_value(
+ const std::vector<SlotDescriptor*>& slot_descs,
+ const std::unordered_map<std::string, vectorized::VExprContext*>&
col_default_value_ctx) {
+ for (auto slot_desc : slot_descs) {
+ auto it = col_default_value_ctx.find(slot_desc->col_name());
+ if (it != col_default_value_ctx.end() && it->second != nullptr) {
+ auto* ctx = it->second;
+ // empty block to save default value of slot_desc->col_name()
+ Block block;
+ // If block is empty, some functions will produce no result. So we
insert a column with
+ // single value here.
+ block.insert({ColumnUInt8::create(1),
std::make_shared<DataTypeUInt8>(), ""});
+ int result = -1;
+ RETURN_IF_ERROR(ctx->execute(&block, &result));
+ DCHECK(result != -1);
+ auto column = block.get_by_position(result).column;
+ DCHECK(column->size() == 1);
+ _col_default_value_map.emplace(slot_desc->col_name(),
+ column->get_data_at(0).to_string());
+ }
+ }
+ return Status::OK();
+}
+
+Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc,
+ vectorized::IColumn* column_ptr,
bool* valid) {
+ if (slot_desc->is_nullable()) {
+ vectorized::ColumnNullable* nullable_column =
+ reinterpret_cast<vectorized::ColumnNullable*>(column_ptr);
+ column_ptr = &nullable_column->get_nested_column();
+ auto col_value = _col_default_value_map.find(slot_desc->col_name());
+ if (col_value == _col_default_value_map.end()) {
+ nullable_column->insert_default();
+ } else {
+ const std::string& v_str = col_value->second;
+ nullable_column->get_null_map_data().push_back(0);
+ assert_cast<ColumnString*>(column_ptr)->insert_data(v_str.c_str(),
v_str.size());
Review Comment:
we should add test cases
##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1370,53 +1374,40 @@ Status
NewJsonReader::_simdjson_handle_nested_complex_json(
return Status::OK();
}
-size_t NewJsonReader::_column_index(const StringRef& name, size_t key_index) {
Review Comment:
This function accelerates parsing; please do not delete it.
##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1730,4 +1726,52 @@ Status
NewJsonReader::_simdjson_write_columns_by_jsonpath(
return Status::OK();
}
+Status NewJsonReader::_get_column_default_value(
Review Comment:
We should also call `_fill_missing_column` in the `fill missing slot` step:
```
// fill missing slot
int ctx_idx = 0;
int nullcount = 0;
for (auto slot_desc : slot_descs) {
if (!slot_desc->is_materialized()) {
continue;
}
int dest_index = ctx_idx++;
auto* column_ptr =
block.get_by_position(dest_index).column->assume_mutable().get();
if (column_ptr->size() < cur_row_count + 1) {
// ..._fill_missing_column.. here
++nullcount;
}
DCHECK(column_ptr->size() == cur_row_count + 1);
}
// There is at least one valid value here
DCHECK(nullcount < block.columns());
*valid = true;
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]