the-other-tim-brown commented on code in PR #18146:
URL: https://github.com/apache/hudi/pull/18146#discussion_r2793856934
##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -1492,6 +1553,200 @@ public int hashCode() {
}
}
+ public static class Vector extends HoodieSchema {
+ private static final String DEFAULT_NAME = "vector";
+ public static final String VALUES_FIXED_FIELD = "valuesFixed";
+
+ // Element types
+ public static final String ELEMENT_TYPE_FLOAT = "FLOAT";
+ public static final String ELEMENT_TYPE_DOUBLE = "DOUBLE";
+ public static final String ELEMENT_TYPE_INT8 = "INT8";
+
+ // Storage backing types
+ public static final String STORAGE_BACKING_FIXED_BYTES = "FIXED_BYTES";
+
+ private final int dimension;
+ private final String elementType;
+ private final String storageBacking;
+
+ /**
+ * Creates Vector from pre-built schema (used by factory methods).
+ *
+ * @param avroSchema the Avro schema to wrap, must be a valid Vector schema
+ * @throws IllegalArgumentException if avroSchema is null or not a valid
Vector schema
+ */
+ private Vector(Schema avroSchema) {
+ super(avroSchema);
+
+ // Extract properties from LogicalType
+ LogicalType logicalType = avroSchema.getLogicalType();
+ if (!(logicalType instanceof VectorLogicalType)) {
+ throw new IllegalArgumentException(
+ "Schema must have VectorLogicalType, got: " + logicalType);
+ }
+
+ VectorLogicalType vectorLogicalType = (VectorLogicalType) logicalType;
+ this.dimension = vectorLogicalType.getDimension();
+ this.elementType = vectorLogicalType.getElementType();
+ this.storageBacking = vectorLogicalType.getStorageBacking();
+
+ // Validate schema structure
+ validateVectorSchema(avroSchema);
+ }
+
+ @Override
+ public String getName() {
+ return "vector";
+ }
+
+ @Override
+ public HoodieSchemaType getType() {
+ return HoodieSchemaType.VECTOR;
+ }
+
+ /**
+ * Gets the byte size of a single element based on element type.
+ *
+ * @param elementType the element type (FLOAT, DOUBLE, INT8, etc.)
+ * @return number of bytes per element
+ */
+ private static int getElementSize(String elementType) {
+ switch (elementType) {
+ case ELEMENT_TYPE_FLOAT:
+ return 4;
+ case ELEMENT_TYPE_DOUBLE:
+ return 8;
+ case ELEMENT_TYPE_INT8:
+ return 1;
+ default:
+ throw new IllegalArgumentException("Unknown elementType: " +
elementType);
+ }
+ }
+
+ /**
+ * Creates vector schema with specified dimension and element type.
+ *
+ * @param name record name (not null)
+ * @param dimension vector dimension (must be > 0)
+ * @param elementType element type (FLOAT or INT8, defaults to FLOAT if
null)
+ * @return new Vector schema
+ */
+ private static Schema createSchema(String name, int dimension, String
elementType) {
+ ValidationUtils.checkArgument(dimension > 0,
+ () -> "Vector dimension must be positive: " + dimension);
+
+ // Validate elementType
+ String resolvedElementType = elementType != null ? elementType :
ELEMENT_TYPE_FLOAT;
+
+ // Calculate fixed size: dimension × element size in bytes
+ int elementSize = getElementSize(resolvedElementType);
+ int fixedSize = dimension * elementSize;
+
+ // Create FIXED type for vector bytes
+ Schema fixedSchema = Schema.createFixed(name + "_bytes", null, null,
fixedSize);
+ Schema nullableFixedSchema =
AvroSchemaUtils.createNullableSchema(fixedSchema);
+
+ // Create RECORD wrapper with only valuesFixed field
+ Schema vectorSchema = Schema.createRecord(name, null, null, false);
Review Comment:
We don't need to nest this within another record. This is essentially a
logical type on a fixed-size bytes field.
##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -1492,6 +1553,200 @@ public int hashCode() {
}
}
+ public static class Vector extends HoodieSchema {
+ private static final String DEFAULT_NAME = "vector";
+ public static final String VALUES_FIXED_FIELD = "valuesFixed";
+
+ // Element types
+ public static final String ELEMENT_TYPE_FLOAT = "FLOAT";
+ public static final String ELEMENT_TYPE_DOUBLE = "DOUBLE";
+ public static final String ELEMENT_TYPE_INT8 = "INT8";
+
+ // Storage backing types
+ public static final String STORAGE_BACKING_FIXED_BYTES = "FIXED_BYTES";
+
+ private final int dimension;
+ private final String elementType;
+ private final String storageBacking;
+
+ /**
+ * Creates Vector from pre-built schema (used by factory methods).
+ *
+ * @param avroSchema the Avro schema to wrap, must be a valid Vector schema
+ * @throws IllegalArgumentException if avroSchema is null or not a valid
Vector schema
+ */
+ private Vector(Schema avroSchema) {
+ super(avroSchema);
+
+ // Extract properties from LogicalType
+ LogicalType logicalType = avroSchema.getLogicalType();
+ if (!(logicalType instanceof VectorLogicalType)) {
+ throw new IllegalArgumentException(
+ "Schema must have VectorLogicalType, got: " + logicalType);
+ }
+
+ VectorLogicalType vectorLogicalType = (VectorLogicalType) logicalType;
+ this.dimension = vectorLogicalType.getDimension();
+ this.elementType = vectorLogicalType.getElementType();
+ this.storageBacking = vectorLogicalType.getStorageBacking();
+
+ // Validate schema structure
+ validateVectorSchema(avroSchema);
+ }
+
+ @Override
+ public String getName() {
+ return "vector";
+ }
+
+ @Override
+ public HoodieSchemaType getType() {
+ return HoodieSchemaType.VECTOR;
+ }
+
+ /**
+ * Gets the byte size of a single element based on element type.
+ *
+ * @param elementType the element type (FLOAT, DOUBLE, INT8, etc.)
+ * @return number of bytes per element
+ */
+ private static int getElementSize(String elementType) {
+ switch (elementType) {
+ case ELEMENT_TYPE_FLOAT:
+ return 4;
+ case ELEMENT_TYPE_DOUBLE:
+ return 8;
+ case ELEMENT_TYPE_INT8:
+ return 1;
+ default:
+ throw new IllegalArgumentException("Unknown elementType: " +
elementType);
+ }
+ }
+
+ /**
+ * Creates vector schema with specified dimension and element type.
+ *
+ * @param name record name (not null)
+ * @param dimension vector dimension (must be > 0)
+ * @param elementType element type (FLOAT or INT8, defaults to FLOAT if
null)
+ * @return new Vector schema
+ */
+ private static Schema createSchema(String name, int dimension, String
elementType) {
+ ValidationUtils.checkArgument(dimension > 0,
+ () -> "Vector dimension must be positive: " + dimension);
+
+ // Validate elementType
+ String resolvedElementType = elementType != null ? elementType :
ELEMENT_TYPE_FLOAT;
+
+ // Calculate fixed size: dimension × element size in bytes
+ int elementSize = getElementSize(resolvedElementType);
+ int fixedSize = dimension * elementSize;
+
+ // Create FIXED type for vector bytes
+ Schema fixedSchema = Schema.createFixed(name + "_bytes", null, null,
fixedSize);
+ Schema nullableFixedSchema =
AvroSchemaUtils.createNullableSchema(fixedSchema);
+
+ // Create RECORD wrapper with only valuesFixed field
+ Schema vectorSchema = Schema.createRecord(name, null, null, false);
+ List<Schema.Field> fields = Arrays.asList(
+ new Schema.Field(VALUES_FIXED_FIELD, nullableFixedSchema,
+ "vector fixed bytes", Schema.Field.NULL_DEFAULT_VALUE)
+ );
+ vectorSchema.setFields(fields);
+
+ // Apply logical type with properties
+ VectorLogicalType vectorLogicalType = new VectorLogicalType(dimension,
resolvedElementType, STORAGE_BACKING_FIXED_BYTES);
+ vectorLogicalType.addToSchema(vectorSchema);
+
+ return vectorSchema;
+ }
+
+ /**
+ * Validates that the given Avro schema conforms to Vector specification.
+ *
+ * @param avroSchema the schema to validate
+ * @throws IllegalArgumentException if schema is invalid
+ */
+ private void validateVectorSchema(Schema avroSchema) {
+ ValidationUtils.checkArgument(avroSchema.getType() == Schema.Type.RECORD,
+ () -> "Vector schema must be RECORD type, got: " +
avroSchema.getType());
+
+ // Validate valuesFixed field exists and is nullable FIXED
+ Schema.Field valuesFixedField = avroSchema.getField(VALUES_FIXED_FIELD);
+ ValidationUtils.checkArgument(valuesFixedField != null,
+ () -> "Vector schema missing '" + VALUES_FIXED_FIELD + "' field");
+
+ Schema valuesFixedSchema =
AvroSchemaUtils.getNonNullTypeFromUnion(valuesFixedField.schema());
+ ValidationUtils.checkArgument(valuesFixedSchema.getType() ==
Schema.Type.FIXED,
+ () -> "Vector valuesFixed field must be FIXED, got: " +
valuesFixedSchema.getType());
+
+ // Verify FIXED size matches: dimension × elementSize
+ int expectedSize = dimension * getElementSize(elementType);
+ int actualSize = valuesFixedSchema.getFixedSize();
+ ValidationUtils.checkArgument(actualSize == expectedSize,
+ () -> "Vector FIXED size mismatch: expected " + expectedSize + "
bytes (dimension=" + dimension + " × elementSize=" +
getElementSize(elementType) + "), got " + actualSize);
+ }
+
+ /**
+ * Returns the dimension of this vector.
+ *
+ * @return vector dimension (always > 0)
+ */
+ public int getDimension() {
+ return dimension;
+ }
+
+ /**
+ * Returns the element type of this vector.
+ *
+ * @return element type string (e.g., "FLOAT" or "DOUBLE")
+ */
+ public String getVectorElementType() {
+ return elementType;
+ }
+
+ /**
+ * Returns the storage backing type.
+ *
+ * @return storage backing string (e.g., "ARRAY_FLOAT")
Review Comment:
Update this Javadoc example now that `ARRAY_FLOAT` has been removed; the only storage backing left is `FIXED_BYTES`.
##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -1492,6 +1553,200 @@ public int hashCode() {
}
}
+ public static class Vector extends HoodieSchema {
+ private static final String DEFAULT_NAME = "vector";
+ public static final String VALUES_FIXED_FIELD = "valuesFixed";
+
+ // Element types
Review Comment:
Could these element types be an enum instead of `String` constants?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]