morningman commented on code in PR #43469:
URL: https://github.com/apache/doris/pull/43469#discussion_r1835707840
##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -840,79 +871,145 @@ Status
NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl
}
Status
NewJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value,
- SlotDescriptor* slot_desc,
IColumn* column_ptr,
+ const TypeDescriptor& type_desc,
+ vectorized::IColumn* column_ptr,
+ const std::string& column_name,
DataTypeSerDeSPtr serde,
bool* valid) {
- const char* str_value = nullptr;
- char tmp_buf[128] = {0};
- int32_t wbytes = 0;
- std::string json_str;
-
ColumnNullable* nullable_column = nullptr;
- if (slot_desc->is_nullable()) {
+ vectorized::IColumn* data_column_ptr = column_ptr;
+ DataTypeSerDeSPtr data_serde = serde;
+ vectorized::DataTypeSerDe::FormatOptions _options;
Review Comment:
Looks like this `_options` can be created only once at reuse it in each call.
##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -840,79 +871,145 @@ Status
NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl
}
Status
NewJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value,
- SlotDescriptor* slot_desc,
IColumn* column_ptr,
+ const TypeDescriptor& type_desc,
+ vectorized::IColumn* column_ptr,
+ const std::string& column_name,
DataTypeSerDeSPtr serde,
bool* valid) {
- const char* str_value = nullptr;
- char tmp_buf[128] = {0};
- int32_t wbytes = 0;
- std::string json_str;
-
ColumnNullable* nullable_column = nullptr;
- if (slot_desc->is_nullable()) {
+ vectorized::IColumn* data_column_ptr = column_ptr;
+ DataTypeSerDeSPtr data_serde = serde;
+ vectorized::DataTypeSerDe::FormatOptions _options;
+
+ bool value_is_null = (value == nullptr) || (value->GetType() ==
rapidjson::Type::kNullType);
+
+ if (column_ptr->is_nullable()) {
nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr);
- // kNullType will put 1 into the Null map, so there is no need to push
0 for kNullType.
- if (value->GetType() != rapidjson::Type::kNullType) {
+ data_column_ptr = nullable_column->get_nested_column().get_ptr();
+ data_serde = serde->get_nested_serdes()[0];
+
+ if (value_is_null) {
+ nullable_column->insert_default();
+ return Status::OK();
+ } else {
nullable_column->get_null_map_data().push_back(0);
+ }
+
+ } else if (value_is_null) [[unlikely]] {
+ if (_is_load) {
+ RETURN_IF_ERROR(_append_error_msg(
+ *value, "Json value is null, but the column `{}` is not
nullable.", column_name,
+ valid));
+ return Status::OK();
+
} else {
- nullable_column->insert_default();
+ return Status::DataQualityError(
+ "Json value is null, but the column `{}` is not
nullable.", column_name);
}
- column_ptr = &nullable_column->get_nested_column();
}
- switch (value->GetType()) {
- case rapidjson::Type::kStringType:
- str_value = value->GetString();
- wbytes = value->GetStringLength();
- break;
- case rapidjson::Type::kNumberType:
- if (value->IsUint()) {
- wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u",
value->GetUint());
- } else if (value->IsInt()) {
- wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt());
- } else if (value->IsUint64()) {
- wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64,
value->GetUint64());
- } else if (value->IsInt64()) {
- wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64,
value->GetInt64());
- } else if (value->IsFloat() || value->IsDouble()) {
- auto* end = fmt::format_to(tmp_buf, "{}", value->GetDouble());
- wbytes = end - tmp_buf;
+ if (_is_load || !type_desc.is_complex_type()) {
+ if (value->IsString()) {
+ Slice slice {value->GetString(), value->GetStringLength()};
+ RETURN_IF_ERROR(
+
data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, _options));
+
} else {
- return Status::InternalError<false>("It should not here.");
+ // Maybe we can `switch (value->GetType()) case: kNumberType`.
+ // Note that `if (value->IsInt())`, but column is FloatColumn.
+ auto json_str = NewJsonReader::_print_json_value(*value);
+ Slice slice {json_str.c_str(), json_str.size()};
+ RETURN_IF_ERROR(
+
data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, _options));
}
- str_value = tmp_buf;
- break;
- case rapidjson::Type::kFalseType:
- wbytes = 1;
- str_value = (char*)"0";
- break;
- case rapidjson::Type::kTrueType:
- wbytes = 1;
- str_value = (char*)"1";
- break;
- case rapidjson::Type::kNullType:
- if (!slot_desc->is_nullable()) {
- RETURN_IF_ERROR(_append_error_msg(
- *value, "Json value is null, but the column `{}` is not
nullable.",
- slot_desc->col_name(), valid));
- return Status::OK();
+ } else if (type_desc.type == TYPE_STRUCT) {
Review Comment:
Can we use `switch ... case` for following logic?
##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -756,7 +770,23 @@ Status NewJsonReader::_set_column_value(rapidjson::Value&
objectValue, Block& bl
_append_empty_skip_bitmap_value(block, cur_row_count);
}
- for (auto* slot_desc : slot_descs) {
+ if (_is_hive_table) {
+ //don't like _fuzzy_parse,each line read in must modify name_map once.
+ for (auto* v : slot_descs) {
Review Comment:
I think it will seriously impact the performance.
What if we set a session variable to control it? And by default, in most
cases, there is no duplicate column name in json value.
##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -840,79 +871,145 @@ Status
NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl
}
Status
NewJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value,
- SlotDescriptor* slot_desc,
IColumn* column_ptr,
+ const TypeDescriptor& type_desc,
+ vectorized::IColumn* column_ptr,
+ const std::string& column_name,
DataTypeSerDeSPtr serde,
bool* valid) {
- const char* str_value = nullptr;
- char tmp_buf[128] = {0};
- int32_t wbytes = 0;
- std::string json_str;
-
ColumnNullable* nullable_column = nullptr;
- if (slot_desc->is_nullable()) {
+ vectorized::IColumn* data_column_ptr = column_ptr;
+ DataTypeSerDeSPtr data_serde = serde;
+ vectorized::DataTypeSerDe::FormatOptions _options;
+
+ bool value_is_null = (value == nullptr) || (value->GetType() ==
rapidjson::Type::kNullType);
+
+ if (column_ptr->is_nullable()) {
nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr);
- // kNullType will put 1 into the Null map, so there is no need to push
0 for kNullType.
- if (value->GetType() != rapidjson::Type::kNullType) {
+ data_column_ptr = nullable_column->get_nested_column().get_ptr();
+ data_serde = serde->get_nested_serdes()[0];
+
+ if (value_is_null) {
+ nullable_column->insert_default();
+ return Status::OK();
+ } else {
nullable_column->get_null_map_data().push_back(0);
+ }
+
+ } else if (value_is_null) [[unlikely]] {
+ if (_is_load) {
+ RETURN_IF_ERROR(_append_error_msg(
+ *value, "Json value is null, but the column `{}` is not
nullable.", column_name,
+ valid));
+ return Status::OK();
+
} else {
- nullable_column->insert_default();
+ return Status::DataQualityError(
+ "Json value is null, but the column `{}` is not
nullable.", column_name);
}
- column_ptr = &nullable_column->get_nested_column();
}
- switch (value->GetType()) {
- case rapidjson::Type::kStringType:
- str_value = value->GetString();
- wbytes = value->GetStringLength();
- break;
- case rapidjson::Type::kNumberType:
- if (value->IsUint()) {
- wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u",
value->GetUint());
- } else if (value->IsInt()) {
- wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt());
- } else if (value->IsUint64()) {
- wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64,
value->GetUint64());
- } else if (value->IsInt64()) {
- wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64,
value->GetInt64());
- } else if (value->IsFloat() || value->IsDouble()) {
- auto* end = fmt::format_to(tmp_buf, "{}", value->GetDouble());
- wbytes = end - tmp_buf;
+ if (_is_load || !type_desc.is_complex_type()) {
+ if (value->IsString()) {
+ Slice slice {value->GetString(), value->GetStringLength()};
+ RETURN_IF_ERROR(
+
data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice, _options));
+
} else {
- return Status::InternalError<false>("It should not here.");
+ // Maybe we can `switch (value->GetType()) case: kNumberType`.
Review Comment:
Why not using `switch (value->GetType())`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]