voonhous commented on code in PR #18036:
URL: https://github.com/apache/hudi/pull/18036#discussion_r3040196264
##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -2463,6 +2569,60 @@ public Option<HoodieSchema> getTypedValueField() {
return typedValueSchema;
}
+ /**
+ * Returns the typed_value schema in plain (unwrapped) form suitable for Spark shredding utilities,
+ * i.e. with the per-field {@code value} wrapper removed.
+ *
+ * <p>If the typed_value follows the variant shredding spec (each field is a struct with
+ * {@code {value: bytes, typed_value: <type>}}), this extracts only the inner typed_value types
+ * and returns a record schema containing just those plain types.</p>
+ *
+ * <p>If the typed_value is already in plain form (created with {@code createVariantShredded}),
+ * returns the schema as-is.</p>
+ *
+ * @return Option containing the plain typed_value schema, or Option.empty() if not present
+ */
+ public Option<HoodieSchema> getPlainTypedValueSchema() {
+ if (!typedValueSchema.isPresent()) {
+ return Option.empty();
+ }
+ HoodieSchema tvSchema = typedValueSchema.get();
+ if (tvSchema.getType() != HoodieSchemaType.RECORD) {
+ return typedValueSchema;
+ }
+
+ List<HoodieSchemaField> fields = tvSchema.getFields();
+ // Check if all fields follow the nested shredding pattern: each field is a record with {value, typed_value}
+ boolean isNestedForm = !fields.isEmpty() &&
fields.stream().allMatch(field -> {
+ HoodieSchema fieldSchema = field.schema();
+ if (fieldSchema.isNullable()) {
+ fieldSchema = fieldSchema.getNonNullType();
+ }
+ if (fieldSchema.getType() != HoodieSchemaType.RECORD) {
+ return false;
+ }
+ Option<HoodieSchemaField> valueSubField =
fieldSchema.getField(VARIANT_VALUE_FIELD);
+ Option<HoodieSchemaField> typedValueSubField =
fieldSchema.getField(VARIANT_TYPED_VALUE_FIELD);
+ return valueSubField.isPresent() && typedValueSubField.isPresent()
+ && fieldSchema.getFields().size() == 2;
+ });
Review Comment:
Addressed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]