eldenmoon commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3234504354
##########
be/src/format/parquet/vparquet_column_reader.cpp:
##########
@@ -103,6 +119,804 @@ static void fill_array_offset(FieldSchema* field,
ColumnArray::Offsets64& offset
}
}
+static constexpr int64_t UNIX_EPOCH_DAYNR = 719528;
+static constexpr int64_t MICROS_PER_SECOND = 1000000;
+
+static int64_t variant_date_value(const VecDateTimeValue& value) {
+ return value.daynr() - UNIX_EPOCH_DAYNR;
+}
+
+static int64_t variant_date_value(const DateV2Value<DateV2ValueType>& value) {
+ return value.daynr() - UNIX_EPOCH_DAYNR;
+}
+
+static int64_t variant_datetime_value(const VecDateTimeValue& value) {
+ int64_t timestamp = 0;
+ value.unix_timestamp(&timestamp, cctz::utc_time_zone());
+ return timestamp * MICROS_PER_SECOND;
+}
+
+static int64_t variant_datetime_value(const DateV2Value<DateTimeV2ValueType>&
value) {
+ int64_t timestamp = 0;
+ value.unix_timestamp(&timestamp, cctz::utc_time_zone());
+ return timestamp * MICROS_PER_SECOND + value.microsecond();
+}
+
+static int64_t variant_datetime_value(const TimestampTzValue& value) {
+ int64_t timestamp = 0;
+ value.unix_timestamp(&timestamp, cctz::utc_time_zone());
+ return timestamp * MICROS_PER_SECOND + value.microsecond();
+}
+
+static int find_child_idx(const FieldSchema& field, std::string_view name) {
+ for (int i = 0; i < field.children.size(); ++i) {
+ if (field.children[i].lower_case_name == name) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+static bool is_variant_wrapper_typed_value_child(const FieldSchema& field) {
+ auto type = remove_nullable(field.data_type);
+ return type->get_primitive_type() == TYPE_STRUCT ||
type->get_primitive_type() == TYPE_ARRAY;
+}
+
+static bool is_variant_wrapper_field(const FieldSchema& field,
+ bool
allow_scalar_typed_value_only_wrapper) {
+ auto type = remove_nullable(field.data_type);
+ if (type->get_primitive_type() != TYPE_STRUCT &&
type->get_primitive_type() != TYPE_VARIANT) {
+ return false;
+ }
+
+ bool has_metadata = false;
+ bool has_value = false;
+ const FieldSchema* typed_value = nullptr;
+ for (const auto& child : field.children) {
+ if (child.lower_case_name == "metadata") {
+ if (child.physical_type != tparquet::Type::BYTE_ARRAY) {
+ return false;
+ }
+ has_metadata = true;
+ continue;
+ }
+ if (child.lower_case_name == "value") {
+ if (child.physical_type != tparquet::Type::BYTE_ARRAY) {
+ return false;
+ }
+ has_value = true;
+ continue;
+ }
+ if (child.lower_case_name == "typed_value") {
+ typed_value = &child;
+ continue;
+ }
+ return false;
+ }
+ if (has_metadata && has_value) {
+ return true;
+ }
+ if (has_value) {
+ return typed_value != nullptr;
+ }
+ return typed_value != nullptr && (allow_scalar_typed_value_only_wrapper ||
+
is_variant_wrapper_typed_value_child(*typed_value));
+}
+
+static Status get_binary_field(const Field& field, std::string* value, bool*
present) {
+ if (field.is_null()) {
+ *present = false;
+ return Status::OK();
+ }
+ *present = true;
+ switch (field.get_type()) {
+ case TYPE_STRING:
+ *value = field.get<TYPE_STRING>();
+ return Status::OK();
+ case TYPE_CHAR:
+ *value = field.get<TYPE_CHAR>();
+ return Status::OK();
+ case TYPE_VARCHAR:
+ *value = field.get<TYPE_VARCHAR>();
+ return Status::OK();
+ case TYPE_VARBINARY: {
+ auto ref = field.get<TYPE_VARBINARY>().to_string_ref();
+ value->assign(ref.data, ref.size);
+ return Status::OK();
+ }
+ default:
+ return Status::Corruption("Parquet VARIANT binary field has unexpected
Doris type {}",
+ field.get_type_name());
+ }
+}
+
+static PathInData append_path(const PathInData& prefix, const PathInData&
suffix) {
+ if (prefix.empty()) {
+ return suffix;
+ }
+ if (suffix.empty()) {
+ return prefix;
+ }
+ PathInDataBuilder builder;
+ builder.append(prefix.get_parts(), false);
+ builder.append(suffix.get_parts(), false);
+ return builder.build();
+}
+
+static Status parse_json_to_variant_map(const std::string& json, const
PathInData& prefix,
+ VariantMap* values) {
+ auto parsed_column = ColumnVariant::create(0, false);
+ ParseConfig parse_config;
+ StringRef json_ref(json.data(), json.size());
+ RETURN_IF_CATCH_EXCEPTION(
+ variant_util::parse_json_to_variant(*parsed_column, json_ref,
nullptr, parse_config));
+ Field parsed = (*parsed_column)[0];
+ auto& parsed_values = parsed.get<TYPE_VARIANT>();
+ for (auto& [path, value] : parsed_values) {
+ (*values)[append_path(prefix, path)] = std::move(value);
+ }
+ return Status::OK();
+}
+
+static Status variant_map_to_json(VariantMap values, std::string* json) {
+ auto variant_column = ColumnVariant::create(0, false);
+ RETURN_IF_CATCH_EXCEPTION(
+
variant_column->insert(Field::create_field<TYPE_VARIANT>(std::move(values))));
+ DataTypeSerDe::FormatOptions options;
+ variant_column->serialize_one_row_to_string(0, json, options);
+ return Status::OK();
+}
+
+static bool has_direct_typed_parent_null(const std::vector<const NullMap*>&
null_maps, size_t row) {
+ for (const NullMap* null_map : null_maps) {
+ DCHECK_LT(row, null_map->size());
+ if ((*null_map)[row]) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static void insert_direct_typed_leaf_range(const IColumn& column, size_t
start, size_t rows,
+ const std::vector<const NullMap*>&
parent_null_maps,
+ IColumn* variant_leaf) {
+ auto& nullable_leaf = assert_cast<ColumnNullable&>(*variant_leaf);
+ const IColumn* value_column = &column;
+ const NullMap* leaf_null_map = nullptr;
+ if (const auto* nullable_column =
check_and_get_column<ColumnNullable>(&column)) {
+ value_column = &nullable_column->get_nested_column();
+ leaf_null_map = &nullable_column->get_null_map_data();
+ }
+
+ nullable_leaf.get_nested_column().insert_range_from(*value_column, start,
rows);
+ auto& null_map = nullable_leaf.get_null_map_data();
+ null_map.reserve(null_map.size() + rows);
+ for (size_t i = 0; i < rows; ++i) {
+ const size_t row = start + i;
+ const bool leaf_is_null = leaf_null_map != nullptr &&
(*leaf_null_map)[row];
+ null_map.push_back(leaf_is_null ||
has_direct_typed_parent_null(parent_null_maps, row));
+ }
+}
+
+static void append_json_string(std::string_view value, std::string* json) {
+ auto column = ColumnString::create();
+ VectorBufferWriter writer(*column);
+ writer.write_json_string(value);
+ writer.commit();
+ json->append(column->get_data_at(0).data, column->get_data_at(0).size);
+}
+
+static bool is_column_selected(const FieldSchema& field_schema,
+ const std::set<uint64_t>& column_ids) {
+ return column_ids.empty() || column_ids.find(field_schema.get_column_id())
!= column_ids.end();
+}
+
+static bool has_selected_column(const FieldSchema& field_schema,
+ const std::set<uint64_t>& column_ids) {
+ if (is_column_selected(field_schema, column_ids)) {
+ return true;
+ }
+ return std::any_of(field_schema.children.begin(),
field_schema.children.end(),
+ [&column_ids](const FieldSchema& child) {
+ return has_selected_column(child, column_ids);
+ });
+}
+
+static bool is_direct_variant_leaf_type(const DataTypePtr& data_type) {
+ const auto& type = remove_nullable(data_type);
+ switch (type->get_primitive_type()) {
+ case TYPE_BOOLEAN:
+ case TYPE_TINYINT:
+ case TYPE_SMALLINT:
+ case TYPE_INT:
+ case TYPE_BIGINT:
+ case TYPE_LARGEINT:
+ case TYPE_DECIMALV2:
+ case TYPE_DECIMAL32:
+ case TYPE_DECIMAL64:
+ case TYPE_DECIMAL128I:
+ case TYPE_DECIMAL256:
+ case TYPE_STRING:
+ case TYPE_CHAR:
+ case TYPE_VARCHAR:
+ return true;
+ case TYPE_ARRAY: {
+ const auto* array_type = assert_cast<const DataTypeArray*>(type.get());
+ return is_direct_variant_leaf_type(array_type->get_nested_type());
+ }
+ default:
+ return false;
+ }
+}
+
+static bool can_direct_read_typed_value(const FieldSchema& field_schema, bool
allow_variant_wrapper,
+ const std::set<uint64_t>& column_ids) {
+ if (!has_selected_column(field_schema, column_ids)) {
+ return true;
+ }
+ if (allow_variant_wrapper && is_variant_wrapper_field(field_schema,
false)) {
+ const int value_idx = find_child_idx(field_schema, "value");
+ const int typed_value_idx = find_child_idx(field_schema,
"typed_value");
+ return (value_idx < 0 ||
+ !has_selected_column(field_schema.children[value_idx],
column_ids)) &&
+ typed_value_idx >= 0 &&
+
can_direct_read_typed_value(field_schema.children[typed_value_idx], false,
+ column_ids);
+ }
+
+ const auto& type = remove_nullable(field_schema.data_type);
+ if (type->get_primitive_type() == TYPE_STRUCT) {
+ return std::all_of(field_schema.children.begin(),
field_schema.children.end(),
+ [&column_ids](const FieldSchema& child) {
+ return can_direct_read_typed_value(child, true,
column_ids);
+ });
+ }
+ return is_direct_variant_leaf_type(field_schema.data_type);
+}
+
+static bool has_selected_direct_typed_leaf(const FieldSchema& field_schema,
+ bool allow_variant_wrapper,
+ const std::set<uint64_t>&
column_ids) {
+ if (!has_selected_column(field_schema, column_ids)) {
+ return false;
+ }
+ if (allow_variant_wrapper && is_variant_wrapper_field(field_schema,
false)) {
+ const int typed_value_idx = find_child_idx(field_schema,
"typed_value");
+ DCHECK_GE(typed_value_idx, 0);
+ return
has_selected_direct_typed_leaf(field_schema.children[typed_value_idx], false,
+ column_ids);
+ }
+
+ const auto& type = remove_nullable(field_schema.data_type);
+ if (type->get_primitive_type() == TYPE_STRUCT) {
+ return std::any_of(field_schema.children.begin(),
field_schema.children.end(),
+ [&column_ids](const FieldSchema& child) {
+ return has_selected_direct_typed_leaf(child,
true, column_ids);
+ });
+ }
+ return is_direct_variant_leaf_type(field_schema.data_type);
+}
+
+static bool can_use_direct_typed_only_value(const FieldSchema& variant_field,
+ const std::set<uint64_t>&
column_ids) {
+ const int value_idx = find_child_idx(variant_field, "value");
+ const int typed_value_idx = find_child_idx(variant_field, "typed_value");
+ return value_idx < 0 && typed_value_idx >= 0 &&
+
has_selected_direct_typed_leaf(variant_field.children[typed_value_idx], false,
+ column_ids) &&
+
can_direct_read_typed_value(variant_field.children[typed_value_idx], false,
column_ids);
+}
+
+static void fill_variant_field_info(FieldWithDataType* value) {
+ FieldInfo info;
+ variant_util::get_field_info(value->field, &info);
+ DCHECK_LE(info.num_dimensions, std::numeric_limits<uint8_t>::max());
+ value->base_scalar_type_id = info.scalar_type_id;
+ value->num_dimensions = static_cast<uint8_t>(info.num_dimensions);
+}
+
+static Status field_to_variant_field(const FieldSchema& field_schema, const
Field& field,
+ FieldWithDataType* value, bool* present) {
+ if (field.is_null()) {
+ *present = false;
+ return Status::OK();
+ }
+ *present = true;
+ const DataTypePtr& type = remove_nullable(field_schema.data_type);
+ switch (type->get_primitive_type()) {
+ case TYPE_BOOLEAN:
+ case TYPE_TINYINT:
+ case TYPE_SMALLINT:
+ case TYPE_INT:
+ case TYPE_BIGINT:
+ case TYPE_LARGEINT:
+ case TYPE_DECIMALV2:
+ case TYPE_DECIMAL32:
+ case TYPE_DECIMAL64:
+ case TYPE_DECIMAL128I:
+ case TYPE_DECIMAL256:
+ case TYPE_STRING:
+ case TYPE_CHAR:
+ case TYPE_VARCHAR:
+ case TYPE_ARRAY:
+ value->field = field;
+ fill_variant_field_info(value);
+ value->precision = type->get_precision();
+ value->scale = type->get_scale();
+ return Status::OK();
+ case TYPE_FLOAT: {
+ const auto float_value = field.get<TYPE_FLOAT>();
+ value->field = std::isfinite(float_value) ? field : Field();
+ fill_variant_field_info(value);
+ return Status::OK();
+ }
+ case TYPE_DOUBLE: {
+ const auto double_value = field.get<TYPE_DOUBLE>();
+ value->field = std::isfinite(double_value) ? field : Field();
+ fill_variant_field_info(value);
+ return Status::OK();
+ }
+ case TYPE_TIMEV2:
+ value->field = Field::create_field<TYPE_BIGINT>(
+ static_cast<int64_t>(std::llround(field.get<TYPE_TIMEV2>())));
+ value->base_scalar_type_id = TYPE_BIGINT;
+ return Status::OK();
+ case TYPE_DATE:
+ value->field =
Field::create_field<TYPE_BIGINT>(variant_date_value(field.get<TYPE_DATE>()));
+ value->base_scalar_type_id = TYPE_BIGINT;
+ return Status::OK();
+ case TYPE_DATETIME:
+ value->field = Field::create_field<TYPE_BIGINT>(
+ variant_datetime_value(field.get<TYPE_DATETIME>()));
+ value->base_scalar_type_id = TYPE_BIGINT;
+ return Status::OK();
+ case TYPE_DATEV2:
+ value->field =
+
Field::create_field<TYPE_BIGINT>(variant_date_value(field.get<TYPE_DATEV2>()));
+ value->base_scalar_type_id = TYPE_BIGINT;
+ return Status::OK();
+ case TYPE_DATETIMEV2:
+ value->field = Field::create_field<TYPE_BIGINT>(
+ variant_datetime_value(field.get<TYPE_DATETIMEV2>()));
+ value->base_scalar_type_id = TYPE_BIGINT;
+ return Status::OK();
+ case TYPE_TIMESTAMPTZ:
+ value->field = Field::create_field<TYPE_BIGINT>(
+ variant_datetime_value(field.get<TYPE_TIMESTAMPTZ>()));
+ value->base_scalar_type_id = TYPE_BIGINT;
+ return Status::OK();
+ case TYPE_VARBINARY:
+ return Status::NotSupported("Parquet VARIANT binary typed_value is not
supported");
+ default:
+ return Status::Corruption("Unsupported Parquet VARIANT typed_value
Doris type {}",
+ type->get_name());
+ }
+}
+
+static Status typed_value_to_json(const FieldSchema& typed_value_field, const
Field& field,
+ const std::string& metadata, std::string*
json, bool* present);
+
+static Status serialize_field_to_json(const DataTypePtr& data_type, const
Field& field,
+ std::string* json) {
+ MutableColumnPtr column = data_type->create_column();
+ column->insert(field);
+
+ auto json_column = ColumnString::create();
+ VectorBufferWriter writer(*json_column);
+ auto serde = data_type->get_serde();
+ DataTypeSerDe::FormatOptions options;
+ RETURN_IF_ERROR(serde->serialize_one_cell_to_json(*column, 0, writer,
options));
+ writer.commit();
+ *json = json_column->get_data_at(0).to_string();
+ return Status::OK();
+}
+
+static Status scalar_typed_value_to_json(const FieldSchema& field_schema,
const Field& field,
+ std::string* json, bool* present) {
+ FieldWithDataType value;
+ RETURN_IF_ERROR(field_to_variant_field(field_schema, field, &value,
present));
+ if (!*present) {
+ return Status::OK();
+ }
+ if (value.field.is_null()) {
+ *json = "null";
+ return Status::OK();
+ }
+
+ DataTypePtr json_type;
+ if (value.base_scalar_type_id != PrimitiveType::INVALID_TYPE) {
+ json_type =
DataTypeFactory::instance().create_data_type(value.base_scalar_type_id, false,
+
value.precision, value.scale);
+ } else {
+ json_type = remove_nullable(field_schema.data_type);
+ }
+ return serialize_field_to_json(json_type, value.field, json);
+}
+
+static Status variant_to_json(const FieldSchema& variant_field, const Field&
field,
+ const std::string* inherited_metadata,
std::string* json,
+ bool* present) {
+ if (field.is_null()) {
+ *present = false;
+ return Status::OK();
+ }
+
+ const auto& fields = field.get<TYPE_STRUCT>();
+ const int metadata_idx = find_child_idx(variant_field, "metadata");
+ const int value_idx = find_child_idx(variant_field, "value");
+ const int typed_value_idx = find_child_idx(variant_field, "typed_value");
+
+ std::string metadata;
+ bool has_metadata = false;
+ if (inherited_metadata != nullptr) {
+ metadata = *inherited_metadata;
+ has_metadata = true;
+ }
+ if (metadata_idx >= 0) {
+ bool metadata_present = false;
+ RETURN_IF_ERROR(get_binary_field(fields[metadata_idx], &metadata,
&metadata_present));
+ has_metadata = metadata_present;
+ }
+
+ std::string typed_json;
+ bool typed_present = false;
+ if (typed_value_idx >= 0) {
+
RETURN_IF_ERROR(typed_value_to_json(variant_field.children[typed_value_idx],
+ fields[typed_value_idx], metadata,
&typed_json,
+ &typed_present));
+ }
+
+ std::string value_json;
+ bool value_present = false;
+ if (value_idx >= 0) {
+ std::string value;
+ RETURN_IF_ERROR(get_binary_field(fields[value_idx], &value,
&value_present));
+ if (value_present) {
+ if (!has_metadata) {
+ return Status::Corruption("Parquet VARIANT value is present
without metadata");
+ }
+ RETURN_IF_ERROR(parquet::decode_variant_to_json(
+ StringRef(metadata.data(), metadata.size()),
+ StringRef(value.data(), value.size()), &value_json));
+ }
+ }
+
+ if (value_present && typed_present) {
+ VariantMap value_values;
+ RETURN_IF_ERROR(parse_json_to_variant_map(value_json, PathInData(),
&value_values));
+ if (value_values.find(PathInData()) != value_values.end()) {
+ return Status::Corruption(
+ "Parquet VARIANT has conflicting non-object value and
typed_value");
+ }
+ VariantMap typed_values;
+ RETURN_IF_ERROR(parse_json_to_variant_map(typed_json, PathInData(),
&typed_values));
Review Comment:
Fixed in current head `2a6b8af25879042ebdd5d6314d032fd0600a22e3`. The
residual/typed merge now checks for duplicate same-level fields before
`std::map::merge()` in both nested JSON reconstruction and row-wise
`VariantMap` reconstruction, and returns corruption instead of silently keeping
the residual branch. Added
`ParquetVariantReaderTest.NestedWrapperRejectsResidualTypedKeyCollision` and
`ParquetVariantReaderTest.RowWiseRejectsResidualTypedKeyCollision`. Local
verification: `./run-be-ut.sh --run
--filter="ParquetVariantReaderTest.*:NestedColumnAccessHelperTest.*"`,
`./build.sh --be`, regression with and without `-forceGenOut`,
`build-support/check-format.sh`, and `git diff --check`.
##########
be/src/format/table/hive/hive_parquet_nested_column_utils.cpp:
##########
@@ -18,20 +18,291 @@
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
+#include "core/data_type/data_type_nullable.h"
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_structural_name(const FieldSchema&
field_schema,
+ std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+const FieldSchema* find_child_by_exact_name(const FieldSchema& field_schema,
+ std::string_view name) {
+ for (const auto& child : field_schema.children) {
+ if (child.name == name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field,
std::set<uint64_t>& column_ids) {
+ if (const auto* metadata = find_child_by_structural_name(variant_field,
"metadata")) {
+ add_column_id_range(*metadata, column_ids);
+ }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>&
column_ids) {
+ add_variant_metadata(variant_field, column_ids);
+ if (const auto* value = find_child_by_structural_name(variant_field,
"value")) {
+ add_column_id_range(*value, column_ids);
+ }
+}
+
+struct VariantColumnIdExtractionResult {
+ bool has_child_columns = false;
+ bool needs_metadata = false;
+};
+
+bool is_shredded_variant_field(const FieldSchema& field_schema) {
+ bool has_value = false;
+ const FieldSchema* typed_value = nullptr;
+ for (const auto& child : field_schema.children) {
+ if (child.lower_case_name == "value") {
+ if (child.physical_type != tparquet::Type::BYTE_ARRAY) {
+ return false;
+ }
+ has_value = true;
+ continue;
+ }
+ if (child.lower_case_name == "typed_value") {
+ typed_value = &child;
+ continue;
+ }
+ return false;
+ }
+ if (has_value) {
+ return typed_value != nullptr;
+ }
+ if (typed_value == nullptr) {
+ return false;
+ }
+ const auto type = remove_nullable(typed_value->data_type);
+ return type->get_primitive_type() == TYPE_STRUCT ||
type->get_primitive_type() == TYPE_ARRAY;
+}
+
+bool add_shredded_variant_field_value(const FieldSchema& shredded_field,
+ std::set<uint64_t>& column_ids) {
+ if (const auto* value = find_child_by_structural_name(shredded_field,
"value")) {
+ add_column_id_range(*value, column_ids);
+ return true;
+ }
+ return false;
+}
+
+bool contains_inherited_metadata_value(const FieldSchema& field_schema) {
+ if (is_shredded_variant_field(field_schema) &&
+ find_child_by_structural_name(field_schema, "value") != nullptr) {
+ return true;
+ }
+ return std::any_of(
+ field_schema.children.begin(), field_schema.children.end(),
+ [](const FieldSchema& child) { return
contains_inherited_metadata_value(child); });
+}
+
+VariantColumnIdExtractionResult extract_variant_typed_nested_column_ids(
+ const FieldSchema& field_schema, const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids);
+
+VariantColumnIdExtractionResult extract_shredded_variant_field_ids(
+ const FieldSchema& shredded_field, const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids) {
+ const auto* typed_value = find_child_by_structural_name(shredded_field,
"typed_value");
+ VariantColumnIdExtractionResult result;
+
+ for (const auto& path : paths) {
+ if (path.empty()) {
+ add_column_id_range(shredded_field, column_ids);
+ result.has_child_columns = true;
+ result.needs_metadata |=
contains_inherited_metadata_value(shredded_field);
+ continue;
+ }
+
+ result.needs_metadata |=
add_shredded_variant_field_value(shredded_field, column_ids);
+ if (typed_value != nullptr) {
+ if (const auto* typed_child =
find_child_by_exact_name(*typed_value, path[0])) {
+ if (path.size() == 1) {
+ add_column_id_range(*typed_child, column_ids);
+ result.needs_metadata |=
contains_inherited_metadata_value(*typed_child);
+ column_ids.insert(typed_value->get_column_id());
+ } else {
+ std::vector<std::vector<std::string>> child_paths {
+ std::vector<std::string>(path.begin() + 1,
path.end())};
+ auto child_result =
extract_variant_typed_nested_column_ids(
+ *typed_child, child_paths, column_ids);
+ if (child_result.has_child_columns) {
+ column_ids.insert(typed_value->get_column_id());
+ result.needs_metadata |= child_result.needs_metadata;
+ }
+ }
+ }
+ }
+ result.has_child_columns = true;
Review Comment:
Fixed in current head `2a6b8af25879042ebdd5d6314d032fd0600a22e3`. Hive and
Iceberg `extract_shredded_variant_field_ids()` now only report
`has_child_columns` when the field-level residual `value` or typed-child
recursion actually selected columns, so `v['nested']['missing']` on typed-only
nested shredding falls back to the missing-path handling instead of reading
`v.typed_value.nested.typed_value.x`. Added Hive/Iceberg BE UT coverage and a
regression/profile assertion that the nested missing query reads `v.metadata`
and no nested typed leaf. Local verification: `./run-be-ut.sh --run
--filter="ParquetVariantReaderTest.*:NestedColumnAccessHelperTest.*"`,
`./build.sh --be`, regression with and without `-forceGenOut`,
`build-support/check-format.sh`, and `git diff --check`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]