eldenmoon commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3229682477
##########
be/src/format/table/hive/hive_parquet_nested_column_utils.cpp:
##########
@@ -18,20 +18,176 @@
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema,
std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field,
std::set<uint64_t>& column_ids) {
+ if (const auto* metadata = find_child_by_name(variant_field, "metadata")) {
+ add_column_id_range(*metadata, column_ids);
+ }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>&
column_ids) {
+ add_variant_metadata(variant_field, column_ids);
+ if (const auto* value = find_child_by_name(variant_field, "value")) {
+ add_column_id_range(*value, column_ids);
+ }
+}
+
+bool extract_nested_column_ids_by_name(const FieldSchema& field_schema,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids);
+
+bool extract_variant_nested_column_ids(const FieldSchema& variant_field,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids) {
+ const auto* typed_value = find_child_by_name(variant_field, "typed_value");
+ bool has_child_columns = false;
+
+ for (const auto& path : paths) {
+ if (path.empty()) {
+ add_column_id_range(variant_field, column_ids);
+ has_child_columns = true;
+ continue;
+ }
+
+ add_variant_metadata(variant_field, column_ids);
+ bool found_typed_path = false;
+ if (typed_value != nullptr) {
+ if (const auto* typed_child = find_child_by_name(*typed_value,
path[0])) {
+ if (path.size() == 1) {
+ add_column_id_range(*typed_child, column_ids);
+ found_typed_path = true;
+ } else {
+ std::vector<std::vector<std::string>> child_paths {
+ std::vector<std::string>(path.begin() + 1,
path.end())};
+ found_typed_path =
extract_nested_column_ids_by_name(*typed_child, child_paths,
Review Comment:
Fixed in the latest push. VARIANT user keys now use exact matching, while
structural Parquet fields still use structural lookup. The pruning helper now
treats unannotated value/typed_value shredded field groups as variant-like, so
v['metric']['x'] selects v.typed_value.metric.value instead of the top-level
v.value. Added profile assertions for the deeper residual path.
##########
be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp:
##########
@@ -18,21 +18,177 @@
#include "format/table/iceberg/iceberg_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <iostream>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema,
std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field,
std::set<uint64_t>& column_ids) {
+ if (const auto* metadata = find_child_by_name(variant_field, "metadata")) {
+ add_column_id_range(*metadata, column_ids);
+ }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>&
column_ids) {
+ add_variant_metadata(variant_field, column_ids);
+ if (const auto* value = find_child_by_name(variant_field, "value")) {
+ add_column_id_range(*value, column_ids);
+ }
+}
+
+bool extract_nested_column_ids_by_name(const FieldSchema& field_schema,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids);
+
+bool extract_variant_nested_column_ids(const FieldSchema& variant_field,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids) {
+ const auto* typed_value = find_child_by_name(variant_field, "typed_value");
+ bool has_child_columns = false;
+
+ for (const auto& path : paths) {
+ if (path.empty()) {
+ add_column_id_range(variant_field, column_ids);
+ has_child_columns = true;
+ continue;
+ }
+
+ add_variant_metadata(variant_field, column_ids);
+ bool found_typed_path = false;
+ if (typed_value != nullptr) {
+ if (const auto* typed_child = find_child_by_name(*typed_value,
path[0])) {
+ if (path.size() == 1) {
+ add_column_id_range(*typed_child, column_ids);
+ found_typed_path = true;
+ } else {
+ std::vector<std::vector<std::string>> child_paths {
+ std::vector<std::string>(path.begin() + 1,
path.end())};
+ found_typed_path =
extract_nested_column_ids_by_name(*typed_child, child_paths,
Review Comment:
Fixed in the latest push. VARIANT user keys now use exact matching, while
structural Parquet fields still use structural lookup. The pruning helper now
treats unannotated value/typed_value shredded field groups as variant-like, so
v['metric']['x'] selects v.typed_value.metric.value instead of the top-level
v.value. Added profile assertions for the deeper residual path.
##########
be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp:
##########
@@ -18,21 +18,177 @@
#include "format/table/iceberg/iceberg_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <iostream>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema,
std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
Review Comment:
Fixed in the latest push. VARIANT path keys now use exact matching, so v['Name']
does not match the shredded field `name` and falls back to v.value. Added a
regression profile assertion that requires v.metadata and v.value while
rejecting v.typed_value.name.
##########
be/src/format/table/hive/hive_parquet_nested_column_utils.cpp:
##########
@@ -18,20 +18,176 @@
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema,
std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
Review Comment:
Fixed in the latest push. VARIANT path keys now use exact matching, so v['Name']
does not match the shredded field `name` and falls back to v.value. Added a
regression profile assertion that requires v.metadata and v.value while
rejecting v.typed_value.name.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]