github-actions[bot] commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3229863630
##########
be/src/format/parquet/schema_desc.cpp:
##########
@@ -446,6 +454,32 @@ Status FieldDescriptor::parse_group_field(const
std::vector<tparquet::SchemaElem
return Status::OK();
}
+Status FieldDescriptor::parse_variant_field(const
std::vector<tparquet::SchemaElement>& t_schemas,
+ size_t curr_pos, FieldSchema*
variant_field) {
+ RETURN_IF_ERROR(parse_struct_field(t_schemas, curr_pos, variant_field));
+
+ bool has_metadata = false;
+ bool has_value = false;
+ for (const auto& child : variant_field->children) {
+ if (child.name == "metadata") {
+ has_metadata = child.physical_type == tparquet::Type::BYTE_ARRAY;
+ } else if (child.name == "value") {
+ has_value = child.physical_type == tparquet::Type::BYTE_ARRAY;
+ }
+ }
+ if (!has_metadata || !has_value) {
Review Comment:
This rejects a valid fully-shredded VARIANT schema where the top-level
`value` field is omitted and all data is represented by `typed_value`. The
Parquet shredding layout makes the fallback `value` optional for fully shredded
values, and the reader below already has code paths that can decode wrappers
with `typed_value` but no `value`. With this check, such files fail schema
parsing before reading. Please accept `metadata` plus either `value` or
`typed_value` (validating their physical/logical shape), and add coverage for a
top-level typed-value-only shredded variant.
##########
be/src/format/table/hive/hive_parquet_nested_column_utils.cpp:
##########
@@ -18,20 +18,240 @@
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_structural_name(const FieldSchema&
field_schema,
+ std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+const FieldSchema* find_child_by_exact_name(const FieldSchema& field_schema,
+ std::string_view name) {
+ for (const auto& child : field_schema.children) {
+ if (child.name == name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field,
std::set<uint64_t>& column_ids) {
+ if (const auto* metadata = find_child_by_structural_name(variant_field,
"metadata")) {
+ add_column_id_range(*metadata, column_ids);
+ }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>&
column_ids) {
+ add_variant_metadata(variant_field, column_ids);
+ if (const auto* value = find_child_by_structural_name(variant_field,
"value")) {
+ add_column_id_range(*value, column_ids);
+ }
+}
+
+bool is_shredded_variant_field(const FieldSchema& field_schema) {
+ return find_child_by_structural_name(field_schema, "value") != nullptr &&
Review Comment:
This only treats a nested shredded field as variant-like when both `value`
and `typed_value` are present. A fully shredded field may omit the fallback
`value`, for example `typed_value.metric { typed_value { x ... } }`. For an
access like `v['metric']['x']`, `extract_variant_nested_column_ids()` recurses
into `metric`, this predicate returns false, and the generic struct path looks
for a child named `x` next to `typed_value`, so it returns false and falls back
to top-level `v.value` instead of selecting
`v.typed_value.metric.typed_value.x`. Please also recognize `typed_value`-only
shredded field groups and add a pruning/read regression for that layout.
##########
be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp:
##########
@@ -18,21 +18,241 @@
#include "format/table/iceberg/iceberg_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <iostream>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_structural_name(const FieldSchema&
field_schema,
+ std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+const FieldSchema* find_child_by_exact_name(const FieldSchema& field_schema,
+ std::string_view name) {
+ for (const auto& child : field_schema.children) {
+ if (child.name == name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field,
std::set<uint64_t>& column_ids) {
+ if (const auto* metadata = find_child_by_structural_name(variant_field,
"metadata")) {
+ add_column_id_range(*metadata, column_ids);
+ }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>&
column_ids) {
+ add_variant_metadata(variant_field, column_ids);
+ if (const auto* value = find_child_by_structural_name(variant_field,
"value")) {
+ add_column_id_range(*value, column_ids);
+ }
+}
+
+bool is_shredded_variant_field(const FieldSchema& field_schema) {
Review Comment:
Same typed-value-only pruning issue as the Hive/local helper. Iceberg
VARIANT files can fully shred a nested field and omit its field-level `value`;
then deeper accesses under that field are treated as regular struct paths and
miss `typed_value.<field>.typed_value...`, falling back to the wrong top-level
residual. Please keep the Iceberg path in sync with the Hive/local fix and
cover the typed-value-only nested field case.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]