eldenmoon commented on code in PR #20078:
URL: https://github.com/apache/doris/pull/20078#discussion_r1221037246


##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1370,53 +1374,40 @@ Status 
NewJsonReader::_simdjson_handle_nested_complex_json(
     return Status::OK();
 }
 
-size_t NewJsonReader::_column_index(const StringRef& name, size_t key_index) {
-    /// Optimization by caching the order of fields (which is almost always 
the same)
-    /// and a quick check to match the next expected field, instead of 
searching the hash table.
-    if (_prev_positions.size() > key_index && _prev_positions[key_index] &&
-        name == _prev_positions[key_index]->get_first()) {
-        return _prev_positions[key_index]->get_second();
-    } else {
-        auto* it = _slot_desc_index.find(name);
-        if (it) {
-            if (key_index < _prev_positions.size()) {
-                _prev_positions[key_index] = it;
-            }
-            return it->get_second();
-        } else {
-            return size_t(-1);
-        }
-    }
-}
-
 Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* 
value, Block& block,
                                                  const 
std::vector<SlotDescriptor*>& slot_descs,
                                                  bool* valid) {
     // set
     _seen_columns.assign(block.columns(), false);
     size_t cur_row_count = block.rows();
     bool has_valid_value = false;
-    // iterate through object, simdjson::ondemond will parsing on the fly
-    size_t key_index = 0;
-    for (auto field : *value) {
-        std::string_view key = field.unescaped_key();
-        StringRef name_ref(key.data(), key.size());
-        const size_t column_index = _column_index(name_ref, key_index++);
-        if (UNLIKELY(ssize_t(column_index) < 0)) {
-            // This key is not exist in slot desc, just ignore
+    for (size_t i = 0; i < slot_descs.size(); ++i) {
+        auto slot_desc = slot_descs[i];
+        if (!slot_desc->is_materialized()) {
             continue;
         }
-        simdjson::ondemand::value val = field.value();
-        auto* column_ptr = 
block.get_by_position(column_index).column->assume_mutable().get();
-        RETURN_IF_ERROR(
-                _simdjson_write_data_to_column(val, slot_descs[column_index], 
column_ptr, valid));
-        if (!(*valid)) {
-            return Status::OK();
+        auto* column_ptr = 
block.get_by_position(i).column->assume_mutable().get();
+        auto field = value->find_field_unordered(slot_desc->col_name());

Review Comment:
   This is much slower than iterating with `for (auto field : *value)`; that 
loop will utilize SIMD instructions.



##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1730,4 +1726,52 @@ Status 
NewJsonReader::_simdjson_write_columns_by_jsonpath(
     return Status::OK();
 }
 
+Status NewJsonReader::_get_column_default_value(
+        const std::vector<SlotDescriptor*>& slot_descs,
+        const std::unordered_map<std::string, vectorized::VExprContext*>& 
col_default_value_ctx) {
+    for (auto slot_desc : slot_descs) {
+        auto it = col_default_value_ctx.find(slot_desc->col_name());
+        if (it != col_default_value_ctx.end() && it->second != nullptr) {
+            auto* ctx = it->second;
+            // empty block to save default value of slot_desc->col_name()
+            Block block;
+            // If block is empty, some functions will produce no result. So we 
insert a column with
+            // single value here.
+            block.insert({ColumnUInt8::create(1), 
std::make_shared<DataTypeUInt8>(), ""});
+            int result = -1;
+            RETURN_IF_ERROR(ctx->execute(&block, &result));
+            DCHECK(result != -1);
+            auto column = block.get_by_position(result).column;
+            DCHECK(column->size() == 1);
+            _col_default_value_map.emplace(slot_desc->col_name(),
+                                           column->get_data_at(0).to_string());
+        }
+    }
+    return Status::OK();
+}
+
+Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc,
+                                           vectorized::IColumn* column_ptr, 
bool* valid) {
+    if (slot_desc->is_nullable()) {
+        vectorized::ColumnNullable* nullable_column =
+                reinterpret_cast<vectorized::ColumnNullable*>(column_ptr);
+        column_ptr = &nullable_column->get_nested_column();
+        auto col_value = _col_default_value_map.find(slot_desc->col_name());
+        if (col_value == _col_default_value_map.end()) {
+            nullable_column->insert_default();
+        } else {
+            const std::string& v_str = col_value->second;
+            nullable_column->get_null_map_data().push_back(0);
+            assert_cast<ColumnString*>(column_ptr)->insert_data(v_str.c_str(), 
v_str.size());

Review Comment:
   What if the default value is `CURRENT_TIMESTAMP`? Will `v_str` be OK?



##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -832,19 +836,19 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& 
objectValue, Block& bl
                 return Status::OK();
             }
             has_valid_value = true;
-        } else { // not found
-            // When the entire row has no valid value, this row should be 
filtered,
-            // so the default value cannot be directly inserted here
-            if (!slot_desc->is_nullable()) {
-                RETURN_IF_ERROR(_append_error_msg(
-                        objectValue,
-                        "The column `{}` is not nullable, but it's not found 
in jsondata.",
-                        slot_desc->col_name(), valid));
-                break;
+        } else {
+            // not found, filling with default value
+            RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, 
valid));
+            if (!(*valid)) {
+                return Status::OK();
             }
         }
     }
     if (!has_valid_value) {
+        for (int i = 0; i < block.columns(); ++i) {
+            auto column = block.get_by_position(i).column->assume_mutable();
+            column->pop_back(1);

Review Comment:
   Please add a comment explaining why we need to `pop_back` here.



##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1730,4 +1726,52 @@ Status 
NewJsonReader::_simdjson_write_columns_by_jsonpath(
     return Status::OK();
 }
 
+Status NewJsonReader::_get_column_default_value(
+        const std::vector<SlotDescriptor*>& slot_descs,
+        const std::unordered_map<std::string, vectorized::VExprContext*>& 
col_default_value_ctx) {
+    for (auto slot_desc : slot_descs) {
+        auto it = col_default_value_ctx.find(slot_desc->col_name());
+        if (it != col_default_value_ctx.end() && it->second != nullptr) {
+            auto* ctx = it->second;
+            // empty block to save default value of slot_desc->col_name()
+            Block block;
+            // If block is empty, some functions will produce no result. So we 
insert a column with
+            // single value here.
+            block.insert({ColumnUInt8::create(1), 
std::make_shared<DataTypeUInt8>(), ""});
+            int result = -1;
+            RETURN_IF_ERROR(ctx->execute(&block, &result));
+            DCHECK(result != -1);
+            auto column = block.get_by_position(result).column;
+            DCHECK(column->size() == 1);
+            _col_default_value_map.emplace(slot_desc->col_name(),
+                                           column->get_data_at(0).to_string());
+        }
+    }
+    return Status::OK();
+}
+
+Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc,
+                                           vectorized::IColumn* column_ptr, 
bool* valid) {
+    if (slot_desc->is_nullable()) {
+        vectorized::ColumnNullable* nullable_column =
+                reinterpret_cast<vectorized::ColumnNullable*>(column_ptr);
+        column_ptr = &nullable_column->get_nested_column();
+        auto col_value = _col_default_value_map.find(slot_desc->col_name());
+        if (col_value == _col_default_value_map.end()) {
+            nullable_column->insert_default();
+        } else {
+            const std::string& v_str = col_value->second;
+            nullable_column->get_null_map_data().push_back(0);
+            assert_cast<ColumnString*>(column_ptr)->insert_data(v_str.c_str(), 
v_str.size());

Review Comment:
   we should add test cases



##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1370,53 +1374,40 @@ Status 
NewJsonReader::_simdjson_handle_nested_complex_json(
     return Status::OK();
 }
 
-size_t NewJsonReader::_column_index(const StringRef& name, size_t key_index) {

Review Comment:
   This function speeds up parsing; please do not delete it.



##########
be/src/vec/exec/format/json/new_json_reader.cpp:
##########
@@ -1730,4 +1726,52 @@ Status 
NewJsonReader::_simdjson_write_columns_by_jsonpath(
     return Status::OK();
 }
 
+Status NewJsonReader::_get_column_default_value(

Review Comment:
   We should also call `_fill_missing_column` in the `fill missing slot` step:
   
   ```
    // fill missing slot
       int ctx_idx = 0;
       int nullcount = 0;
       for (auto slot_desc : slot_descs) {
           if (!slot_desc->is_materialized()) {
               continue;
           }
           int dest_index = ctx_idx++;
           auto* column_ptr = block.get_by_position(dest_index).column->assume_mutable().get();
           if (column_ptr->size() < cur_row_count + 1) {
              // ..._fill_missing_column.. here
               ++nullcount;
           }
           DCHECK(column_ptr->size() == cur_row_count + 1);
       }
       // There is at least one valid value here
       DCHECK(nullcount < block.columns());
       *valid = true;
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to