the-other-tim-brown commented on code in PR #18146:
URL: https://github.com/apache/hudi/pull/18146#discussion_r2799784858
##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -632,6 +635,54 @@ public static HoodieSchema.Variant
createVariantShredded(String name, String nam
return new HoodieSchema.Variant(recordSchema);
}
+ /**
+ * Creates Vector schema with default name and specified dimension.
+ *
+ * @param dimension vector dimension (must be > 0)
+ * @return new HoodieSchema.Vector
+ */
+ public static HoodieSchema.Vector createVector(int dimension) {
+ return createVector(null, dimension);
+ }
+
+ /**
+ * Creates Vector schema with custom name and dimension.
+ *
+ * @param name record name (null uses default "vector")
+ * @param dimension vector dimension (must be > 0)
+ * @return new HoodieSchema.Vector
+ */
+ public static HoodieSchema.Vector createVector(String name, int dimension) {
+ String vectorName = (name != null && !name.isEmpty()) ? name :
Vector.DEFAULT_NAME;
+ Schema vectorSchema = Vector.createSchema(vectorName, dimension,
Vector.VectorElementType.FLOAT);
+ return new HoodieSchema.Vector(vectorSchema);
Review Comment:
```suggestion
return createVector(name, dimension, Vector.VectorElementType.FLOAT);
```
##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -1492,6 +1551,216 @@ public int hashCode() {
}
}
+ public static class Vector extends HoodieSchema {
+ private static final String DEFAULT_NAME = "vector";
+
+ /**
+ * Enum representing vector element data types.
+ */
+ public enum VectorElementType {
+ FLOAT("FLOAT", 4),
+ DOUBLE("DOUBLE", 8),
+ INT8("INT8", 1);
+
+ private final String name;
Review Comment:
nit: this is overloading the built-in `name()` of an enum. Can we call this
`dataType` or something else?
##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -1492,6 +1551,216 @@ public int hashCode() {
}
}
+ public static class Vector extends HoodieSchema {
+ private static final String DEFAULT_NAME = "vector";
+
+ /**
+ * Enum representing vector element data types.
+ */
+ public enum VectorElementType {
+ FLOAT("FLOAT", 4),
+ DOUBLE("DOUBLE", 8),
+ INT8("INT8", 1);
+
+ private final String name;
+ private final int byteSize;
+
+ VectorElementType(String name, int byteSize) {
+ this.name = name;
+ this.byteSize = byteSize;
+ }
+
+ /**
+ * Returns the string representation for serialization.
+ *
+ * @return element type name
+ */
+ public String getName() {
+ return name;
+ }
+
+ /**
+ * Returns the byte size of this element type.
+ *
+ * @return number of bytes per element
+ */
+ public int getByteSize() {
Review Comment:
nit: would `getElementSize()` be clearer to the caller? Otherwise they
may think it is the vector size if they don't read the docs.
##########
hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchema.java:
##########
@@ -838,6 +839,312 @@ void testCreateDecimalSchema() {
assertEquals(5, decimalFixedSchema.getFixedSize());
}
+ @Test
+ void testCreateVectorWithDimension() {
+ // Create vector with dimension only (defaults to FLOAT)
+ HoodieSchema schema = HoodieSchema.createVector(1536);
+
+ assertTrue(schema instanceof HoodieSchema.Vector);
+ assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+
+ HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+ assertEquals(1536, vectorSchema.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorSchema.getVectorElementType());
+
+ assertTrue(schema.getAvroSchema().getLogicalType() instanceof
VectorLogicalType);
+
+ // Verify properties are at schema level
+ Schema avroSchema = vectorSchema.getAvroSchema();
+ assertEquals(1536, ((Number)
avroSchema.getObjectProp("dimension")).intValue());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getName(),
avroSchema.getProp("elementType"));
+ assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES,
avroSchema.getProp("storageBacking"));
+
+ // Verify Vector is FIXED type (not RECORD)
+ assertEquals(Schema.Type.FIXED, avroSchema.getType());
+ assertFalse(vectorSchema.hasFields());
+
+ // Verify FIXED size = dimension × elementSize (1536 × 4 bytes for FLOAT)
+ assertEquals(1536 * 4, avroSchema.getFixedSize());
+ }
+
+ @Test
+ void testCreateVectorWithNameAndDimension() {
+ // Create vector with custom name and dimension
+ HoodieSchema schema = HoodieSchema.createVector("embeddings", 768);
+ assertTrue(schema instanceof HoodieSchema.Vector);
+ assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+
+ HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+ assertEquals("embeddings", vectorSchema.getAvroSchema().getName());
+ assertEquals(768, vectorSchema.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorSchema.getVectorElementType());
+ }
+
+ @Test
+ void testCreateVectorWithDimensionAndElementType() {
+ // Create vector with DOUBLE element type
+ HoodieSchema schemaDouble = HoodieSchema.createVector(1536,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+ assertTrue(schemaDouble instanceof HoodieSchema.Vector);
+ HoodieSchema.Vector vectorDouble = (HoodieSchema.Vector) schemaDouble;
+ assertEquals(1536, vectorDouble.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
vectorDouble.getVectorElementType());
+ HoodieSchema schemaFloat = HoodieSchema.createVector(512,
HoodieSchema.Vector.VectorElementType.FLOAT);
+
+ assertTrue(schemaFloat instanceof HoodieSchema.Vector);
+ HoodieSchema.Vector vectorFloat = (HoodieSchema.Vector) schemaFloat;
+ assertEquals(512, vectorFloat.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorFloat.getVectorElementType());
+ }
+
+ @Test
+ void testCreateVectorWithAllParameters() {
+ // Create vector with all parameters: custom name, dimension, and element
type
+ HoodieSchema schema = HoodieSchema.createVector("precise_vectors", 512,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+ assertTrue(schema instanceof HoodieSchema.Vector);
+ HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+
+ assertEquals("precise_vectors", vectorSchema.getAvroSchema().getName());
+ assertEquals(512, vectorSchema.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
vectorSchema.getVectorElementType());
+ assertEquals(HoodieSchemaType.VECTOR, vectorSchema.getType());
+ }
+
+ @Test
+ void testVectorInvalidDimension() {
+ // Test zero dimension
+ IllegalArgumentException ex1 = assertThrows(
+ IllegalArgumentException.class,
+ () -> HoodieSchema.createVector(0)
+ );
+ assertTrue(ex1.getMessage().contains("must be positive"));
+
+ // Test negative dimension
+ IllegalArgumentException ex2 = assertThrows(
+ IllegalArgumentException.class,
+ () -> HoodieSchema.createVector(-1)
+ );
+ assertTrue(ex2.getMessage().contains("must be positive"));
+ }
+
+ @Test
+ void testVectorLogicalTypeDetection() {
+ // Create vector schema
+ HoodieSchema schema = HoodieSchema.createVector(1536);
+ assertTrue(schema.getAvroSchema().getLogicalType() instanceof
VectorLogicalType);
+ assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+ }
+
+ @Test
+ void testVectorSchemaValidation() {
+ // Create vector and verify FIXED structure
+ HoodieSchema.Vector vectorSchema = HoodieSchema.createVector(768);
+ Schema avroSchema = vectorSchema.getAvroSchema();
+
+ // Verify Vector is FIXED type
+ assertEquals(Schema.Type.FIXED, avroSchema.getType());
+ assertFalse(vectorSchema.hasFields());
+
+ // Verify dimension, elementType, storageBacking are schema properties
+ assertEquals(768, ((Number)
avroSchema.getObjectProp("dimension")).intValue());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getName(),
avroSchema.getProp("elementType"));
+ assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES,
avroSchema.getProp("storageBacking"));
+
+ // Verify FIXED size = dimension × elementSize (768 × 4 bytes for FLOAT)
+ assertEquals(768 * 4, avroSchema.getFixedSize());
+ assertEquals(768 * 4, vectorSchema.getFixedSize());
+ }
+
+ @Test
+ void testVectorFieldAccess() {
+ // Create vector with FLOAT
+ HoodieSchema.Vector vectorFloat = HoodieSchema.createVector(1536);
+ assertEquals(1536, vectorFloat.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorFloat.getVectorElementType());
+
+ HoodieSchema.Vector vectorDouble = HoodieSchema.createVector(768,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+ assertEquals(768, vectorDouble.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
vectorDouble.getVectorElementType());
+
+ // Verify dimension/elementType/storageBacking are accessible via
properties
+ assertEquals(1536, ((Number)
vectorFloat.getAvroSchema().getObjectProp("dimension")).intValue());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getName(),
vectorFloat.getAvroSchema().getProp("elementType"));
+ assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES,
vectorFloat.getAvroSchema().getProp("storageBacking"));
+
+ // Verify FIXED size access
+ assertEquals(1536 * 4, vectorFloat.getFixedSize()); // FLOAT is 4 bytes
+ assertEquals(768 * 8, vectorDouble.getFixedSize()); // DOUBLE is 8 bytes
+ }
+
+ @Test
+ void testVectorEquality() {
+ HoodieSchema.Vector v1 = HoodieSchema.createVector(1536);
+ HoodieSchema.Vector v2 = HoodieSchema.createVector(1536);
+ HoodieSchema.Vector v3 = HoodieSchema.createVector(768);
+ HoodieSchema.Vector v4 = HoodieSchema.createVector(1536,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+ // Same dimension and element type -> equal
+ assertEquals(v1, v2);
+ assertEquals(v1.hashCode(), v2.hashCode());
+
+ // Different dimension -> not equal
+ assertNotEquals(v1, v3);
+
+ // Different element type -> not equal
+ assertNotEquals(v1, v4);
+
+ // Reflexivity
+ assertEquals(v1, v1);
+
+ // Null check
+ assertNotEquals(v1, null);
+
+ // Different class
+ assertNotEquals(v1, "string");
+ }
+
+ @Test
+ void testVectorRoundTripSerializationToJson() throws Exception {
Review Comment:
Let's do this round trip with more complex schemas. You can add this
round-trip assertion to `testVectorInNestedStructures` to leverage those
more complex cases.
##########
hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchema.java:
##########
@@ -838,6 +839,312 @@ void testCreateDecimalSchema() {
assertEquals(5, decimalFixedSchema.getFixedSize());
}
+ @Test
+ void testCreateVectorWithDimension() {
+ // Create vector with dimension only (defaults to FLOAT)
+ HoodieSchema schema = HoodieSchema.createVector(1536);
+
+ assertTrue(schema instanceof HoodieSchema.Vector);
+ assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+
+ HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+ assertEquals(1536, vectorSchema.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorSchema.getVectorElementType());
+
+ assertTrue(schema.getAvroSchema().getLogicalType() instanceof
VectorLogicalType);
+
+ // Verify properties are at schema level
+ Schema avroSchema = vectorSchema.getAvroSchema();
+ assertEquals(1536, ((Number)
avroSchema.getObjectProp("dimension")).intValue());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getName(),
avroSchema.getProp("elementType"));
+ assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES,
avroSchema.getProp("storageBacking"));
+
+ // Verify Vector is FIXED type (not RECORD)
+ assertEquals(Schema.Type.FIXED, avroSchema.getType());
+ assertFalse(vectorSchema.hasFields());
+
+ // Verify FIXED size = dimension × elementSize (1536 × 4 bytes for FLOAT)
+ assertEquals(1536 * 4, avroSchema.getFixedSize());
+ }
+
+ @Test
+ void testCreateVectorWithNameAndDimension() {
+ // Create vector with custom name and dimension
+ HoodieSchema schema = HoodieSchema.createVector("embeddings", 768);
+ assertTrue(schema instanceof HoodieSchema.Vector);
+ assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+
+ HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+ assertEquals("embeddings", vectorSchema.getAvroSchema().getName());
+ assertEquals(768, vectorSchema.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorSchema.getVectorElementType());
+ }
+
+ @Test
+ void testCreateVectorWithDimensionAndElementType() {
+ // Create vector with DOUBLE element type
+ HoodieSchema schemaDouble = HoodieSchema.createVector(1536,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+ assertTrue(schemaDouble instanceof HoodieSchema.Vector);
+ HoodieSchema.Vector vectorDouble = (HoodieSchema.Vector) schemaDouble;
+ assertEquals(1536, vectorDouble.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
vectorDouble.getVectorElementType());
+ HoodieSchema schemaFloat = HoodieSchema.createVector(512,
HoodieSchema.Vector.VectorElementType.FLOAT);
+
+ assertTrue(schemaFloat instanceof HoodieSchema.Vector);
+ HoodieSchema.Vector vectorFloat = (HoodieSchema.Vector) schemaFloat;
+ assertEquals(512, vectorFloat.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorFloat.getVectorElementType());
+ }
+
+ @Test
+ void testCreateVectorWithAllParameters() {
+ // Create vector with all parameters: custom name, dimension, and element
type
+ HoodieSchema schema = HoodieSchema.createVector("precise_vectors", 512,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+ assertTrue(schema instanceof HoodieSchema.Vector);
+ HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+
+ assertEquals("precise_vectors", vectorSchema.getAvroSchema().getName());
+ assertEquals(512, vectorSchema.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
vectorSchema.getVectorElementType());
+ assertEquals(HoodieSchemaType.VECTOR, vectorSchema.getType());
+ }
+
+ @Test
+ void testVectorInvalidDimension() {
+ // Test zero dimension
+ IllegalArgumentException ex1 = assertThrows(
+ IllegalArgumentException.class,
+ () -> HoodieSchema.createVector(0)
+ );
+ assertTrue(ex1.getMessage().contains("must be positive"));
+
+ // Test negative dimension
+ IllegalArgumentException ex2 = assertThrows(
+ IllegalArgumentException.class,
+ () -> HoodieSchema.createVector(-1)
+ );
+ assertTrue(ex2.getMessage().contains("must be positive"));
+ }
+
+ @Test
+ void testVectorLogicalTypeDetection() {
+ // Create vector schema
+ HoodieSchema schema = HoodieSchema.createVector(1536);
+ assertTrue(schema.getAvroSchema().getLogicalType() instanceof
VectorLogicalType);
+ assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+ }
+
+ @Test
+ void testVectorSchemaValidation() {
+ // Create vector and verify FIXED structure
+ HoodieSchema.Vector vectorSchema = HoodieSchema.createVector(768);
+ Schema avroSchema = vectorSchema.getAvroSchema();
+
+ // Verify Vector is FIXED type
+ assertEquals(Schema.Type.FIXED, avroSchema.getType());
+ assertFalse(vectorSchema.hasFields());
+
+ // Verify dimension, elementType, storageBacking are schema properties
+ assertEquals(768, ((Number)
avroSchema.getObjectProp("dimension")).intValue());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getName(),
avroSchema.getProp("elementType"));
+ assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES,
avroSchema.getProp("storageBacking"));
+
+ // Verify FIXED size = dimension × elementSize (768 × 4 bytes for FLOAT)
+ assertEquals(768 * 4, avroSchema.getFixedSize());
+ assertEquals(768 * 4, vectorSchema.getFixedSize());
+ }
+
+ @Test
+ void testVectorFieldAccess() {
+ // Create vector with FLOAT
+ HoodieSchema.Vector vectorFloat = HoodieSchema.createVector(1536);
+ assertEquals(1536, vectorFloat.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorFloat.getVectorElementType());
+
+ HoodieSchema.Vector vectorDouble = HoodieSchema.createVector(768,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+ assertEquals(768, vectorDouble.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
vectorDouble.getVectorElementType());
+
+ // Verify dimension/elementType/storageBacking are accessible via
properties
+ assertEquals(1536, ((Number)
vectorFloat.getAvroSchema().getObjectProp("dimension")).intValue());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getName(),
vectorFloat.getAvroSchema().getProp("elementType"));
+ assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES,
vectorFloat.getAvroSchema().getProp("storageBacking"));
+
+ // Verify FIXED size access
+ assertEquals(1536 * 4, vectorFloat.getFixedSize()); // FLOAT is 4 bytes
+ assertEquals(768 * 8, vectorDouble.getFixedSize()); // DOUBLE is 8 bytes
+ }
+
+ @Test
+ void testVectorEquality() {
+ HoodieSchema.Vector v1 = HoodieSchema.createVector(1536);
+ HoodieSchema.Vector v2 = HoodieSchema.createVector(1536);
+ HoodieSchema.Vector v3 = HoodieSchema.createVector(768);
+ HoodieSchema.Vector v4 = HoodieSchema.createVector(1536,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+ // Same dimension and element type -> equal
+ assertEquals(v1, v2);
+ assertEquals(v1.hashCode(), v2.hashCode());
+
+ // Different dimension -> not equal
+ assertNotEquals(v1, v3);
+
+ // Different element type -> not equal
+ assertNotEquals(v1, v4);
+
+ // Reflexivity
+ assertEquals(v1, v1);
+
+ // Null check
+ assertNotEquals(v1, null);
+
+ // Different class
+ assertNotEquals(v1, "string");
+ }
+
+ @Test
+ void testVectorRoundTripSerializationToJson() throws Exception {
+ // Create vector with DOUBLE element type
+ HoodieSchema.Vector original = HoodieSchema.createVector(512,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+ // Serialize to JSON
+ String jsonSchema = original.toString();
+ assertNotNull(jsonSchema);
+
+ // Parse from JSON
+ HoodieSchema parsed = HoodieSchema.parse(jsonSchema);
+
+ // Verify
+ assertTrue(parsed instanceof HoodieSchema.Vector);
+ assertEquals(original, parsed);
+
+ HoodieSchema.Vector parsedVector = (HoodieSchema.Vector) parsed;
+ assertEquals(512, parsedVector.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
parsedVector.getVectorElementType());
+ }
+
+ @Test
+ void testVectorSerialization() throws Exception {
+ // Create vector with DOUBLE element type
+ HoodieSchema.Vector original = HoodieSchema.createVector(768,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+ // Java serialize
+ ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
+ ObjectOutputStream out = new ObjectOutputStream(byteOut);
+ out.writeObject(original);
+ out.close();
+
+ // Java deserialize
+ ByteArrayInputStream byteIn = new
ByteArrayInputStream(byteOut.toByteArray());
+ ObjectInputStream in = new ObjectInputStream(byteIn);
+ HoodieSchema deserialized = (HoodieSchema) in.readObject();
+ in.close();
+
+ // Verify
+ assertTrue(deserialized instanceof HoodieSchema.Vector);
+ assertEquals(original, deserialized);
+
+ HoodieSchema.Vector deserializedVector = (HoodieSchema.Vector)
deserialized;
+ assertEquals(768, deserializedVector.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
deserializedVector.getVectorElementType());
+ }
+
+ @Test
+ void testVectorInNestedStructures() throws Exception {
+ // Create vector schema
+ HoodieSchema.Vector vectorSchema = HoodieSchema.createVector(128,
HoodieSchema.Vector.VectorElementType.FLOAT);
+
+ // Test vector in record - verify it can be used as a field
+ List<HoodieSchemaField> fields = Arrays.asList(
+ HoodieSchemaField.of("id", HoodieSchema.create(HoodieSchemaType.INT)),
+ HoodieSchemaField.of("embedding", vectorSchema)
+ );
+ HoodieSchema recordSchema = HoodieSchema.createRecord("TestRecord", null,
null, fields);
+ assertEquals(HoodieSchemaType.RECORD, recordSchema.getType());
+
+ // Verify vector field is preserved in the Avro schema
+ Schema.Field embeddingField =
recordSchema.getAvroSchema().getField("embedding");
+ assertNotNull(embeddingField);
+ HoodieSchema embeddingSchema =
HoodieSchema.fromAvroSchema(embeddingField.schema());
+ assertTrue(embeddingSchema instanceof HoodieSchema.Vector);
+ assertEquals(128, ((HoodieSchema.Vector) embeddingSchema).getDimension());
+
+ // Test vector in array
+ HoodieSchema arraySchema = HoodieSchema.createArray(vectorSchema);
+ assertEquals(HoodieSchemaType.ARRAY, arraySchema.getType());
+ HoodieSchema arrayElement = arraySchema.getElementType();
+ assertTrue(arrayElement instanceof HoodieSchema.Vector);
+ assertEquals(128, ((HoodieSchema.Vector) arrayElement).getDimension());
+
+ // Test vector in map
+ HoodieSchema mapSchema = HoodieSchema.createMap(vectorSchema);
+ assertEquals(HoodieSchemaType.MAP, mapSchema.getType());
+ HoodieSchema mapValue = mapSchema.getValueType();
+ assertTrue(mapValue instanceof HoodieSchema.Vector);
+ assertEquals(128, ((HoodieSchema.Vector) mapValue).getDimension());
+ }
+
+ @Test
+ void testVectorElementTypes() {
+ // Create FLOAT vector
+ HoodieSchema.Vector vectorFloat = HoodieSchema.createVector(1536,
HoodieSchema.Vector.VectorElementType.FLOAT);
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorFloat.getVectorElementType());
+ assertEquals("FLOAT",
HoodieSchema.Vector.VectorElementType.FLOAT.getName());
+
+ // Create DOUBLE vector
+ HoodieSchema.Vector vectorDouble = HoodieSchema.createVector(1536,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
vectorDouble.getVectorElementType());
+ assertEquals("DOUBLE",
HoodieSchema.Vector.VectorElementType.DOUBLE.getName());
+
+ // FLOAT and DOUBLE vectors should not be equal (different element types)
+ assertNotEquals(vectorFloat, vectorDouble);
+ }
+
+ @Test
+ void testVectorWithDefaultName() {
+ // Create vector with null name
+ HoodieSchema.Vector v1 = HoodieSchema.createVector(null, 1536);
+ assertEquals("vector", v1.getAvroSchema().getName());
+
+ // Create vector with empty string name
+ HoodieSchema.Vector v2 = HoodieSchema.createVector("", 768);
+ assertEquals("vector", v2.getAvroSchema().getName());
+ }
+
+ @Test
+ void testVectorTypeInHoodieSchemaType() {
Review Comment:
Similarly, it looks like these assertions are already covered in other tests?
##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -908,6 +960,13 @@ public boolean isSchemaNull() {
return type == null || type == HoodieSchemaType.NULL;
}
+ public boolean isVector() {
Review Comment:
Do we need this or is it enough to just use `getType() ==
HoodieSchemaType.VECTOR`?
##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -698,7 +749,8 @@ public Option<String> getDoc() {
* @return true if this type can have fields (RECORD or VARIANT)
*/
public boolean hasFields() {
- return type == HoodieSchemaType.RECORD || type == HoodieSchemaType.VARIANT;
+ return type == HoodieSchemaType.RECORD
Review Comment:
nit: Let's remove this change
##########
hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchema.java:
##########
@@ -838,6 +839,312 @@ void testCreateDecimalSchema() {
assertEquals(5, decimalFixedSchema.getFixedSize());
}
+ @Test
+ void testCreateVectorWithDimension() {
+ // Create vector with dimension only (defaults to FLOAT)
+ HoodieSchema schema = HoodieSchema.createVector(1536);
+
+ assertTrue(schema instanceof HoodieSchema.Vector);
+ assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+
+ HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+ assertEquals(1536, vectorSchema.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorSchema.getVectorElementType());
+
+ assertTrue(schema.getAvroSchema().getLogicalType() instanceof
VectorLogicalType);
+
+ // Verify properties are at schema level
+ Schema avroSchema = vectorSchema.getAvroSchema();
+ assertEquals(1536, ((Number)
avroSchema.getObjectProp("dimension")).intValue());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getName(),
avroSchema.getProp("elementType"));
+ assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES,
avroSchema.getProp("storageBacking"));
+
+ // Verify Vector is FIXED type (not RECORD)
+ assertEquals(Schema.Type.FIXED, avroSchema.getType());
+ assertFalse(vectorSchema.hasFields());
+
+ // Verify FIXED size = dimension × elementSize (1536 × 4 bytes for FLOAT)
+ assertEquals(1536 * 4, avroSchema.getFixedSize());
+ }
+
+ @Test
+ void testCreateVectorWithNameAndDimension() {
+ // Create vector with custom name and dimension
+ HoodieSchema schema = HoodieSchema.createVector("embeddings", 768);
+ assertTrue(schema instanceof HoodieSchema.Vector);
+ assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+
+ HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+ assertEquals("embeddings", vectorSchema.getAvroSchema().getName());
+ assertEquals(768, vectorSchema.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorSchema.getVectorElementType());
+ }
+
+ @Test
+ void testCreateVectorWithDimensionAndElementType() {
+ // Create vector with DOUBLE element type
+ HoodieSchema schemaDouble = HoodieSchema.createVector(1536,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+ assertTrue(schemaDouble instanceof HoodieSchema.Vector);
+ HoodieSchema.Vector vectorDouble = (HoodieSchema.Vector) schemaDouble;
+ assertEquals(1536, vectorDouble.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
vectorDouble.getVectorElementType());
+ HoodieSchema schemaFloat = HoodieSchema.createVector(512,
HoodieSchema.Vector.VectorElementType.FLOAT);
+
+ assertTrue(schemaFloat instanceof HoodieSchema.Vector);
+ HoodieSchema.Vector vectorFloat = (HoodieSchema.Vector) schemaFloat;
+ assertEquals(512, vectorFloat.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorFloat.getVectorElementType());
+ }
+
+ @Test
+ void testCreateVectorWithAllParameters() {
+ // Create vector with all parameters: custom name, dimension, and element
type
+ HoodieSchema schema = HoodieSchema.createVector("precise_vectors", 512,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+ assertTrue(schema instanceof HoodieSchema.Vector);
+ HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+
+ assertEquals("precise_vectors", vectorSchema.getAvroSchema().getName());
+ assertEquals(512, vectorSchema.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
vectorSchema.getVectorElementType());
+ assertEquals(HoodieSchemaType.VECTOR, vectorSchema.getType());
+ }
+
+ @Test
+ void testVectorInvalidDimension() {
+ // Test zero dimension
+ IllegalArgumentException ex1 = assertThrows(
+ IllegalArgumentException.class,
+ () -> HoodieSchema.createVector(0)
+ );
+ assertTrue(ex1.getMessage().contains("must be positive"));
+
+ // Test negative dimension
+ IllegalArgumentException ex2 = assertThrows(
+ IllegalArgumentException.class,
+ () -> HoodieSchema.createVector(-1)
+ );
+ assertTrue(ex2.getMessage().contains("must be positive"));
+ }
+
+ @Test
+ void testVectorLogicalTypeDetection() {
+ // Create vector schema
+ HoodieSchema schema = HoodieSchema.createVector(1536);
+ assertTrue(schema.getAvroSchema().getLogicalType() instanceof
VectorLogicalType);
+ assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+ }
+
+ @Test
+ void testVectorSchemaValidation() {
+ // Create vector and verify FIXED structure
+ HoodieSchema.Vector vectorSchema = HoodieSchema.createVector(768);
+ Schema avroSchema = vectorSchema.getAvroSchema();
+
+ // Verify Vector is FIXED type
+ assertEquals(Schema.Type.FIXED, avroSchema.getType());
+ assertFalse(vectorSchema.hasFields());
+
+ // Verify dimension, elementType, storageBacking are schema properties
+ assertEquals(768, ((Number)
avroSchema.getObjectProp("dimension")).intValue());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getName(),
avroSchema.getProp("elementType"));
+ assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES,
avroSchema.getProp("storageBacking"));
+
+ // Verify FIXED size = dimension × elementSize (768 × 4 bytes for FLOAT)
+ assertEquals(768 * 4, avroSchema.getFixedSize());
+ assertEquals(768 * 4, vectorSchema.getFixedSize());
+ }
+
+ @Test
+ void testVectorFieldAccess() {
+ // Create vector with FLOAT
+ HoodieSchema.Vector vectorFloat = HoodieSchema.createVector(1536);
+ assertEquals(1536, vectorFloat.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT,
vectorFloat.getVectorElementType());
+
+ HoodieSchema.Vector vectorDouble = HoodieSchema.createVector(768,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+ assertEquals(768, vectorDouble.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
vectorDouble.getVectorElementType());
+
+ // Verify dimension/elementType/storageBacking are accessible via
properties
+ assertEquals(1536, ((Number)
vectorFloat.getAvroSchema().getObjectProp("dimension")).intValue());
+ assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getName(),
vectorFloat.getAvroSchema().getProp("elementType"));
+ assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES,
vectorFloat.getAvroSchema().getProp("storageBacking"));
+
+ // Verify FIXED size access
+ assertEquals(1536 * 4, vectorFloat.getFixedSize()); // FLOAT is 4 bytes
+ assertEquals(768 * 8, vectorDouble.getFixedSize()); // DOUBLE is 8 bytes
+ }
+
+ @Test
+ void testVectorEquality() {
+ HoodieSchema.Vector v1 = HoodieSchema.createVector(1536);
+ HoodieSchema.Vector v2 = HoodieSchema.createVector(1536);
+ HoodieSchema.Vector v3 = HoodieSchema.createVector(768);
+ HoodieSchema.Vector v4 = HoodieSchema.createVector(1536,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+ // Same dimension and element type -> equal
+ assertEquals(v1, v2);
+ assertEquals(v1.hashCode(), v2.hashCode());
+
+ // Different dimension -> not equal
+ assertNotEquals(v1, v3);
+
+ // Different element type -> not equal
+ assertNotEquals(v1, v4);
+
+ // Reflexivity
+ assertEquals(v1, v1);
+
+ // Null check
+ assertNotEquals(v1, null);
+
+ // Different class
+ assertNotEquals(v1, "string");
+ }
+
+ @Test
+ void testVectorRoundTripSerializationToJson() throws Exception {
+ // Create vector with DOUBLE element type
+ HoodieSchema.Vector original = HoodieSchema.createVector(512,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+ // Serialize to JSON
+ String jsonSchema = original.toString();
+ assertNotNull(jsonSchema);
+
+ // Parse from JSON
+ HoodieSchema parsed = HoodieSchema.parse(jsonSchema);
+
+ // Verify
+ assertTrue(parsed instanceof HoodieSchema.Vector);
+ assertEquals(original, parsed);
+
+ HoodieSchema.Vector parsedVector = (HoodieSchema.Vector) parsed;
+ assertEquals(512, parsedVector.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
parsedVector.getVectorElementType());
+ }
+
+ @Test
+ void testVectorSerialization() throws Exception {
+ // Create vector with DOUBLE element type
+ HoodieSchema.Vector original = HoodieSchema.createVector(768,
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+ // Java serialize
+ ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
+ ObjectOutputStream out = new ObjectOutputStream(byteOut);
+ out.writeObject(original);
+ out.close();
+
+ // Java deserialize
+ ByteArrayInputStream byteIn = new
ByteArrayInputStream(byteOut.toByteArray());
+ ObjectInputStream in = new ObjectInputStream(byteIn);
+ HoodieSchema deserialized = (HoodieSchema) in.readObject();
+ in.close();
+
+ // Verify
+ assertTrue(deserialized instanceof HoodieSchema.Vector);
+ assertEquals(original, deserialized);
+
+ HoodieSchema.Vector deserializedVector = (HoodieSchema.Vector)
deserialized;
+ assertEquals(768, deserializedVector.getDimension());
+ assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE,
deserializedVector.getVectorElementType());
+ }
+
+ @Test
+ void testVectorInNestedStructures() throws Exception {
+ // Create vector schema
+ HoodieSchema.Vector vectorSchema = HoodieSchema.createVector(128,
HoodieSchema.Vector.VectorElementType.FLOAT);
+
+ // Test vector in record - verify it can be used as a field
+ List<HoodieSchemaField> fields = Arrays.asList(
+ HoodieSchemaField.of("id", HoodieSchema.create(HoodieSchemaType.INT)),
+ HoodieSchemaField.of("embedding", vectorSchema)
+ );
+ HoodieSchema recordSchema = HoodieSchema.createRecord("TestRecord", null,
null, fields);
+ assertEquals(HoodieSchemaType.RECORD, recordSchema.getType());
+
+ // Verify vector field is preserved in the Avro schema
+ Schema.Field embeddingField =
recordSchema.getAvroSchema().getField("embedding");
+ assertNotNull(embeddingField);
+ HoodieSchema embeddingSchema =
HoodieSchema.fromAvroSchema(embeddingField.schema());
+ assertTrue(embeddingSchema instanceof HoodieSchema.Vector);
+ assertEquals(128, ((HoodieSchema.Vector) embeddingSchema).getDimension());
+
+ // Test vector in array
+ HoodieSchema arraySchema = HoodieSchema.createArray(vectorSchema);
+ assertEquals(HoodieSchemaType.ARRAY, arraySchema.getType());
+ HoodieSchema arrayElement = arraySchema.getElementType();
+ assertTrue(arrayElement instanceof HoodieSchema.Vector);
+ assertEquals(128, ((HoodieSchema.Vector) arrayElement).getDimension());
+
+ // Test vector in map
+ HoodieSchema mapSchema = HoodieSchema.createMap(vectorSchema);
+ assertEquals(HoodieSchemaType.MAP, mapSchema.getType());
+ HoodieSchema mapValue = mapSchema.getValueType();
+ assertTrue(mapValue instanceof HoodieSchema.Vector);
+ assertEquals(128, ((HoodieSchema.Vector) mapValue).getDimension());
+ }
+
+ @Test
+ void testVectorElementTypes() {
Review Comment:
This seems redundant with `testCreateVectorWithDimensionAndElementType`
##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -1660,6 +1929,86 @@ public void validate(Schema schema) {
}
}
+ static class VectorLogicalType extends LogicalType {
+ private static final String VECTOR_LOGICAL_TYPE_NAME = "vector";
+ private static final String PROP_DIMENSION = "dimension";
+ private static final String PROP_ELEMENT_TYPE = "elementType";
+ private static final String PROP_STORAGE_BACKING = "storageBacking";
+
+ private final int dimension;
+ private final String elementType;
+ private final String storageBacking;
+
+ public VectorLogicalType(int dimension, String elementType, String
storageBacking) {
+ super(VectorLogicalType.VECTOR_LOGICAL_TYPE_NAME);
+ ValidationUtils.checkArgument(dimension > 0,
+ () -> "Vector dimension must be positive: " + dimension);
+ ValidationUtils.checkArgument(elementType != null &&
!elementType.isEmpty(),
+ () -> "Element type cannot be null or empty");
+ ValidationUtils.checkArgument(storageBacking != null &&
!storageBacking.isEmpty(),
+ () -> "Storage backing cannot be null or empty");
+
+ this.dimension = dimension;
+ this.elementType = elementType;
+ this.storageBacking = storageBacking;
+ }
+
+ public int getDimension() {
+ return dimension;
+ }
+
+ public String getElementType() {
+ return elementType;
+ }
+
+ public String getStorageBacking() {
+ return storageBacking;
+ }
+
+ @Override
+ public Schema addToSchema(Schema schema) {
+ super.addToSchema(schema);
+ schema.addProp(PROP_DIMENSION, dimension);
+ schema.addProp(PROP_ELEMENT_TYPE, elementType);
+ schema.addProp(PROP_STORAGE_BACKING, storageBacking);
+ return schema;
+ }
+
+ @Override
+ public void validate(Schema schema) {
+ super.validate(schema);
+ }
Review Comment:
If we don't have any logic to override, I think this can be removed
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]