the-other-tim-brown commented on code in PR #18108:
URL: https://github.com/apache/hudi/pull/18108#discussion_r2828243326
##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -941,6 +963,86 @@ public HoodieSchema getNonNullType() {
return HoodieSchema.createUnion(nonNullTypes);
}
+ boolean containsBlobType() { // Reports whether this schema is a BLOB or reaches one through the nested schemas checked below.
+ if (getType() == HoodieSchemaType.BLOB) {
+ return true;
+ } else if (getType() == HoodieSchemaType.ARRAY) {
+ return getElementType().containsBlobType(); // recurse into the array's element type
+ } else if (getType() == HoodieSchemaType.MAP) {
+ return getValueType().containsBlobType(); // recurse into the map's value type only
+ } else if (getType() == HoodieSchemaType.UNION) {
+ return getTypes().stream().anyMatch(HoodieSchema::containsBlobType); // true if any union branch matches
+ } else if (hasFields()) { // NOTE(review): recursion has no cycle guard - presumably schemas are never self-referential; confirm
+ return getFields().stream().anyMatch(field ->
field.schema().containsBlobType()); // true if any field's schema matches
+ }
+ return false; // none of the branches above applied
+ }
+
+ /**
+ * A convenience method to check if the current field represents a blob type.
+ * This checks if the current schema is a BLOB or if it is an ARRAY or MAP
+ * whose element or value type is a BLOB, respectively.
+ * Both this schema and the ARRAY element / MAP value type are passed through
+ * {@link #getNonNullType()} before their types are compared (presumably so
+ * that nullable wrappers are ignored - confirm against getNonNullType()).
+ * The check goes no deeper than that: a BLOB inside a record field, or nested
+ * more than one ARRAY/MAP level down, is not reported.
+ * @return true if the current schema is a BLOB or an ARRAY/MAP of BLOBs,
+ * false otherwise
+ */
+ public boolean isBlobField() {
+ HoodieSchema nonNullSchema = getNonNullType();
+ HoodieSchemaType nonNullSchemaType = nonNullSchema.getType();
+ return nonNullSchemaType == HoodieSchemaType.BLOB
+ || (nonNullSchemaType == HoodieSchemaType.ARRAY &&
nonNullSchema.getElementType().getNonNullType().getType() ==
HoodieSchemaType.BLOB)
+ || (nonNullSchemaType == HoodieSchemaType.MAP &&
nonNullSchema.getValueType().getNonNullType().getType() ==
HoodieSchemaType.BLOB);
+ }
+
+ /**
+ * Validates that the schema does not contain variants with shredded blob
+ * types.
+ * This method recursively traverses ARRAY element types, MAP value types,
+ * RECORD fields and UNION branches; a null schema is ignored. For each
+ * VARIANT it reaches, the typed_value field (when present) is checked by
+ * calling containsBlobType() on its non-null type. The traversal does not
+ * descend further into a VARIANT beyond that check.
+ *
+ * @param schema the schema to validate
+ * @throws HoodieSchemaException if a variant's typed_value field is or
+ * contains a BLOB type
+ */
+ private static void validateNoBlobsInVariant(HoodieSchema schema) {
+ if (schema == null) {
+ return;
+ }
+
+ HoodieSchemaType type = schema.getType();
+
+ switch (type) {
+ case ARRAY:
+ HoodieSchema elementType = schema.getElementType();
+ validateNoBlobsInVariant(elementType);
+ break;
+ case MAP:
+ HoodieSchema valueType = schema.getValueType();
+ validateNoBlobsInVariant(valueType);
+ break;
+ case VARIANT:
+ HoodieSchema.Variant variantSchema = (HoodieSchema.Variant) schema;
+ variantSchema.getTypedValueField().ifPresent(typedValueField -> {
+ if (typedValueField.getNonNullType().containsBlobType()) {
+ throw new HoodieSchemaException("Variant typed_value field cannot
be or contain a BLOB type");
+ }
+ });
+ break;
+ case RECORD:
+ // Validate all record fields
+ List<HoodieSchemaField> fields = schema.getFields();
+ for (HoodieSchemaField field : fields) {
+ validateNoBlobsInVariant(field.schema());
+ }
+ break;
+ case UNION:
+ // Validate all union types
+ List<HoodieSchema> types = schema.getTypes();
+ for (HoodieSchema unionType : types) {
+ validateNoBlobsInVariant(unionType);
+ }
+ break;
+ // For primitives, BLOB, ENUM, FIXED, NULL - no nested validation needed
+ default:
+ break;
+ }
+ }
Review Comment:
It turns out that Avro parsing will not throw errors on invalid logical
types. The error will not be thrown on parsing but can be thrown when trying to
access the field or when creating the field programmatically.
I think that this case of a blob inside of a variant will be unlikely, so I think
throwing at runtime is a fair tradeoff here to avoid traversing the schema.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]