github-actions[bot] commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3253607584
##########
be/src/format/parquet/vparquet_column_reader.cpp:
##########
@@ -103,6 +127,1612 @@ static void fill_array_offset(FieldSchema* field,
ColumnArray::Offsets64& offset
}
}
+static constexpr int64_t UNIX_EPOCH_DAYNR = 719528;
+static constexpr int64_t MICROS_PER_SECOND = 1000000;
+
+static int64_t variant_date_value(const VecDateTimeValue& value) {
+ return value.daynr() - UNIX_EPOCH_DAYNR;
+}
+
+static int64_t variant_date_value(const DateV2Value<DateV2ValueType>& value) {
+ return value.daynr() - UNIX_EPOCH_DAYNR;
+}
+
+static int64_t variant_datetime_value(const VecDateTimeValue& value) {
+ int64_t timestamp = 0;
+ value.unix_timestamp(&timestamp, cctz::utc_time_zone());
+ return timestamp * MICROS_PER_SECOND;
+}
+
+static int64_t variant_datetime_value(const DateV2Value<DateTimeV2ValueType>&
value) {
+ int64_t timestamp = 0;
+ value.unix_timestamp(&timestamp, cctz::utc_time_zone());
+ return timestamp * MICROS_PER_SECOND + value.microsecond();
+}
+
+static int64_t variant_datetime_value(const TimestampTzValue& value) {
+ int64_t timestamp = 0;
+ value.unix_timestamp(&timestamp, cctz::utc_time_zone());
+ return timestamp * MICROS_PER_SECOND + value.microsecond();
+}
+
+static int find_child_idx(const FieldSchema& field, std::string_view name) {
+ for (int i = 0; i < field.children.size(); ++i) {
+ if (field.children[i].lower_case_name == name) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+static bool is_variant_wrapper_typed_value_child(const FieldSchema& field) {
+ auto type = remove_nullable(field.data_type);
+ return type->get_primitive_type() == TYPE_STRUCT ||
type->get_primitive_type() == TYPE_ARRAY;
+}
+
+static bool is_unannotated_variant_value_field(const FieldSchema& field) {
+ // VARIANT residual value is raw binary; annotated strings named value are user fields.
+ return field.lower_case_name == "value" && field.physical_type ==
tparquet::Type::BYTE_ARRAY &&
+ !field.parquet_schema.__isset.logicalType &&
+ !field.parquet_schema.__isset.converted_type;
+}
+
+static bool is_variant_wrapper_field(const FieldSchema& field,
+ bool
allow_scalar_typed_value_only_wrapper) {
+ auto type = remove_nullable(field.data_type);
+ if (type->get_primitive_type() != TYPE_STRUCT &&
type->get_primitive_type() != TYPE_VARIANT) {
+ return false;
+ }
+
+ bool has_metadata = false;
+ bool has_value = false;
+ const FieldSchema* typed_value = nullptr;
+ for (const auto& child : field.children) {
+ if (child.lower_case_name == "metadata") {
+ if (child.physical_type != tparquet::Type::BYTE_ARRAY) {
+ return false;
+ }
+ has_metadata = true;
+ continue;
+ }
+ if (child.lower_case_name == "value") {
+ if (child.physical_type != tparquet::Type::BYTE_ARRAY) {
+ return false;
Review Comment:
This wrapper check still accepts an ordinary annotated user field named
`value` when the same object also has a `typed_value` child. A valid typed-only
VARIANT object such as `{"obj": {"value": "abc", "typed_value": {"x": 1}}}` is
represented under `v.typed_value.obj` with a UTF8/STRING `value` leaf plus a
struct `typed_value` leaf. Row-wise reconstruction calls
`is_variant_wrapper_field(obj, ...)`, this branch only checks that `value` is
`BYTE_ARRAY`, and line 215 returns true because `typed_value != nullptr`;
`variant_to_variant_map()` then tries to decode the UTF8 bytes from `obj.value`
as Parquet VARIANT residual bytes with inherited metadata, causing corruption
or wrong output instead of producing user fields `obj.value` and
`obj.typed_value.x`. This is distinct from the existing value-only and
typed_value-only false-positive threads because the misclassification requires
both user children together. Please require `value` to satisfy
`is_unannotated_variant_value_field()` for
wrapper classification (as the pruning helper already does) and add coverage
for a typed-only object containing both annotated `value` and `typed_value`
user fields.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]