the-other-tim-brown commented on code in PR #18108:
URL: https://github.com/apache/hudi/pull/18108#discussion_r2828109858
##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -941,6 +963,86 @@ public HoodieSchema getNonNullType() {
return HoodieSchema.createUnion(nonNullTypes);
}
+ boolean containsBlobType() {
+ if (getType() == HoodieSchemaType.BLOB) {
+ return true;
+ } else if (getType() == HoodieSchemaType.ARRAY) {
+ return getElementType().containsBlobType();
+ } else if (getType() == HoodieSchemaType.MAP) {
+ return getValueType().containsBlobType();
+ } else if (getType() == HoodieSchemaType.UNION) {
+ return getTypes().stream().anyMatch(HoodieSchema::containsBlobType);
+ } else if (hasFields()) {
+ return getFields().stream().anyMatch(field ->
field.schema().containsBlobType());
+ }
+ return false;
+ }
+
+ /**
+ * A convenience method to check if the current field represents a blob type.
+ * This checks if the current schema is a BLOB or if it is an ARRAY or MAP
whose element or value type is a BLOB, respectively.
+ * It does not check for BLOB types nested within unions or record fields.
+ * @return true if the current schema is a BLOB or an ARRAY/MAP of BLOBs,
false otherwise
+ */
+ public boolean isBlobField() {
+ HoodieSchema nonNullSchema = getNonNullType();
+ HoodieSchemaType nonNullSchemaType = nonNullSchema.getType();
+ return nonNullSchemaType == HoodieSchemaType.BLOB
+ || (nonNullSchemaType == HoodieSchemaType.ARRAY &&
nonNullSchema.getElementType().getNonNullType().getType() ==
HoodieSchemaType.BLOB)
+ || (nonNullSchemaType == HoodieSchemaType.MAP &&
nonNullSchema.getValueType().getNonNullType().getType() ==
HoodieSchemaType.BLOB);
+ }
+
+ /**
+ * Validates that the schema does not contain variants with shredded blob
types.
+ * This method recursively traverses the schema tree to check for invalid
structures.
+ *
+ * @param schema the schema to validate
+ * @throws HoodieSchemaException if a variant's typed_value field is or
contains a blob type
+ */
+ private static void validateNoBlobsInVariant(HoodieSchema schema) {
+ if (schema == null) {
+ return;
+ }
+
+ HoodieSchemaType type = schema.getType();
+
+ switch (type) {
+ case ARRAY:
+ HoodieSchema elementType = schema.getElementType();
+ validateNoBlobsInVariant(elementType);
+ break;
+ case MAP:
+ HoodieSchema valueType = schema.getValueType();
+ validateNoBlobsInVariant(valueType);
+ break;
+ case VARIANT:
+ HoodieSchema.Variant variantSchema = (HoodieSchema.Variant) schema;
+ variantSchema.getTypedValueField().ifPresent(typedValueField -> {
+ if (typedValueField.getNonNullType().containsBlobType()) {
+ throw new HoodieSchemaException("Variant typed_value field cannot
be or contain a BLOB type");
+ }
+ });
+ break;
+ case RECORD:
+ // Validate all record fields
+ List<HoodieSchemaField> fields = schema.getFields();
+ for (HoodieSchemaField field : fields) {
+ validateNoBlobsInVariant(field.schema());
+ }
+ break;
+ case UNION:
+ // Validate all union types
+ List<HoodieSchema> types = schema.getTypes();
+ for (HoodieSchema unionType : types) {
+ validateNoBlobsInVariant(unionType);
+ }
+ break;
+ // For primitives, BLOB, ENUM, FIXED, NULL - no nested validation needed
+ default:
+ break;
+ }
+ }
Review Comment:
Now that this is limited to inspecting variants, I think I can move the
validation to the variant logical type level, which would make this more
efficient. I will test this out.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]