github-actions[bot] commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3231738768


##########
be/src/format/parquet/vparquet_column_reader.cpp:
##########
@@ -1001,6 +1441,142 @@ Status StructColumnReader::read_column_data(
     return Status::OK();
 }
 
+Status VariantColumnReader::init(io::FileReaderSPtr file, FieldSchema* field,
+                                 const tparquet::RowGroup& row_group, size_t 
max_buf_size,
+                                 std::unordered_map<int, 
tparquet::OffsetIndex>& col_offsets,
+                                 RuntimeState* state, bool in_collection,
+                                 const std::set<uint64_t>& column_ids,
+                                 const std::set<uint64_t>& filter_column_ids) {
+    _field_schema = field;
+    _variant_struct_field = std::make_unique<FieldSchema>(*field);
+
+    DataTypes child_types;
+    Strings child_names;
+    child_types.reserve(field->children.size());
+    child_names.reserve(field->children.size());
+    for (const auto& child : field->children) {
+        child_types.push_back(make_nullable(child.data_type));
+        child_names.push_back(child.name);
+    }
+    _variant_struct_type = std::make_shared<DataTypeStruct>(child_types, 
child_names);
+    if (field->data_type->is_nullable()) {
+        _variant_struct_type = make_nullable(_variant_struct_type);
+    }
+    _variant_struct_field->data_type = _variant_struct_type;
+
+    RETURN_IF_ERROR(ParquetColumnReader::create(file, 
_variant_struct_field.get(), row_group,
+                                                _row_ranges, _ctz, _io_ctx, 
_struct_reader,
+                                                max_buf_size, col_offsets, 
state, in_collection,
+                                                column_ids, 
filter_column_ids));
+    _struct_reader->set_column_in_nested();
+    return Status::OK();
+}
+
+Status VariantColumnReader::read_column_data(
+        ColumnPtr& doris_column, const DataTypePtr& type,
+        const std::shared_ptr<TableSchemaChangeHelper::Node>& root_node, 
FilterMap& filter_map,
+        size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter,
+        int64_t real_column_size) {
+    (void)root_node;
+    if (remove_nullable(type)->get_primitive_type() != 
PrimitiveType::TYPE_VARIANT) {
+        return Status::Corruption(
+                "Wrong data type for column '{}', expected Variant type, 
actual type: {}.",
+                _field_schema->name, type->get_name());
+    }
+
+    ColumnPtr struct_column = _variant_struct_type->create_column();
+    const size_t old_struct_rows = struct_column->size();
+    auto const_node = TableSchemaChangeHelper::ConstNode::get_instance();
+    RETURN_IF_ERROR(_struct_reader->read_column_data(struct_column, 
_variant_struct_type,
+                                                     const_node, filter_map, 
batch_size, read_rows,
+                                                     eof, is_dict_filter, 
real_column_size));
+
+    const size_t new_struct_rows = struct_column->size() - old_struct_rows;
+    if (new_struct_rows == 0) {
+        return Status::OK();
+    }
+
+    MutableColumnPtr variant_column_ptr;
+    NullMap* null_map_ptr = nullptr;
+    auto mutable_column = doris_column->assume_mutable();
+    if (doris_column->is_nullable()) {
+        auto* nullable_column = 
assert_cast<ColumnNullable*>(mutable_column.get());
+        variant_column_ptr = nullable_column->get_nested_column_ptr();
+        null_map_ptr = &nullable_column->get_null_map_data();
+    } else {
+        if (_field_schema->data_type->is_nullable()) {
+            return Status::Corruption("Not nullable column has null values in 
parquet file");
+        }
+        variant_column_ptr = std::move(mutable_column);
+    }
+    auto* variant_column = 
assert_cast<ColumnVariant*>(variant_column_ptr.get());
+
+    const IColumn* variant_struct_source = struct_column.get();
+    const NullMap* struct_null_map = nullptr;
+    if (const auto* nullable_struct = 
check_and_get_column<ColumnNullable>(variant_struct_source)) {
+        struct_null_map = &nullable_struct->get_null_map_data();
+        variant_struct_source = &nullable_struct->get_nested_column();
+    }
+    const auto& variant_struct_column = assert_cast<const 
ColumnStruct&>(*variant_struct_source);
+
+    const int value_idx = find_child_idx(*_field_schema, "value");
+    const int typed_value_idx = find_child_idx(*_field_schema, "typed_value");
+    if (value_idx < 0 && typed_value_idx >= 0 &&
+        can_direct_read_typed_value(_field_schema->children[typed_value_idx], 
false)) {
+        MutableColumnPtr batch_variant_column =
+                ColumnVariant::create(variant_column->max_subcolumns_count(),
+                                      variant_column->enable_doc_mode(), 
new_struct_rows + 1);
+        auto* batch_variant = 
assert_cast<ColumnVariant*>(batch_variant_column.get());
+        PathInDataBuilder path;
+        RETURN_IF_ERROR(append_direct_typed_column_to_batch(
+                _field_schema->children[typed_value_idx],
+                variant_struct_column.get_column(typed_value_idx), 
old_struct_rows, new_struct_rows,
+                &path, batch_variant, false));
+        variant_column->insert_range_from(*batch_variant_column, 1, 
new_struct_rows);
+        if (null_map_ptr != nullptr) {
+            for (size_t i = old_struct_rows; i < struct_column->size(); ++i) {
+                null_map_ptr->push_back(struct_null_map != nullptr && 
(*struct_null_map)[i]);
+            }
+        }
+#ifndef NDEBUG
+        doris_column->sanity_check();
+#endif
+        return Status::OK();
+    }
+
+    for (size_t i = old_struct_rows; i < struct_column->size(); ++i) {
+        if (struct_null_map != nullptr && (*struct_null_map)[i]) {
+            if (null_map_ptr != nullptr) {
+                variant_column->insert_default();
+                null_map_ptr->push_back(1);
+                continue;
+            }
+        }
+        VariantMap values;
+        bool present = false;
+        PathInDataBuilder path;
+        RETURN_IF_ERROR(variant_to_variant_map(*_field_schema, 
(*struct_column)[i], nullptr, &path,
+                                               &values, &present));

Review Comment:
   This turns a missing reconstructed VARIANT into SQL NULL whenever the Doris
   output slot is nullable. However, the Parquet Variant Shredding spec only
   allows `value == null && typed_value == null` to mean "missing" for shredded
   object fields; if a Variant is missing in a context where the value is
   required, readers must return a Variant null (`00`) instead. For example, in
   the spec's required shredded primitive layout (`required group measurement
   (VARIANT) { required binary metadata; optional binary value; optional int64
   typed_value; }`), a required Variant null is stored with both optional fields
   null. This branch materializes that row as SQL NULL for nullable scan slots
   instead of inserting `values[PathInData()] = FieldWithDataType{.field =
   Field()}`. Please distinguish SQL-null top-level groups from required missing
   Variant payloads, and add coverage for a required shredded Variant row with
   both `value` and `typed_value` absent.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to