the-other-tim-brown commented on code in PR #18146:
URL: https://github.com/apache/hudi/pull/18146#discussion_r2805462149


##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -1660,6 +1919,81 @@ public void validate(Schema schema) {
     }
   }
 
+  static class VectorLogicalType extends LogicalType {
+    private static final String VECTOR_LOGICAL_TYPE_NAME = "vector";
+    private static final String PROP_DIMENSION = "dimension";
+    private static final String PROP_ELEMENT_TYPE = "elementType";
+    private static final String PROP_STORAGE_BACKING = "storageBacking";
+
+    private final int dimension;
+    private final String elementType;
+    private final String storageBacking;
+
+    public VectorLogicalType(int dimension, String elementType, String 
storageBacking) {
+      super(VectorLogicalType.VECTOR_LOGICAL_TYPE_NAME);
+      ValidationUtils.checkArgument(dimension > 0,
+          () -> "Vector dimension must be positive: " + dimension);
+      ValidationUtils.checkArgument(elementType != null && 
!elementType.isEmpty(),
+          () -> "Element type cannot be null or empty");
+      ValidationUtils.checkArgument(storageBacking != null && 
!storageBacking.isEmpty(),
+          () -> "Storage backing cannot be null or empty");
+
+      this.dimension = dimension;
+      this.elementType = elementType;
+      this.storageBacking = storageBacking;
+    }
+
+    public int getDimension() {
+      return dimension;
+    }
+
+    public String getElementType() {
+      return elementType;
+    }
+
+    public String getStorageBacking() {
+      return storageBacking;
+    }
+
+    @Override
+    public Schema addToSchema(Schema schema) {
+      super.addToSchema(schema);
+      schema.addProp(PROP_DIMENSION, dimension);
+      schema.addProp(PROP_ELEMENT_TYPE, elementType);
+      schema.addProp(PROP_STORAGE_BACKING, storageBacking);
+      return schema;
+    }
+  }
+
+  /**
+   * Factory for creating VectorLogicalType instances.
+   */
+  private static class VectorLogicalTypeFactory implements 
LogicalTypes.LogicalTypeFactory {
+    @Override
+    public LogicalType fromSchema(Schema schema) {
+      // Extract properties from schema
+      Object dimObj = schema.getObjectProp("dimension");

Review Comment:
   Can you use the constants like `PROP_DIMENSION` in this method?



##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -632,6 +635,52 @@ public static HoodieSchema.Variant 
createVariantShredded(String name, String nam
     return new HoodieSchema.Variant(recordSchema);
   }
 
+  /**
+   * Creates Vector schema with default name and specified dimension.
+   *
+   * @param dimension vector dimension (must be > 0)
+   * @return new HoodieSchema.Vector
+   */
+  public static HoodieSchema.Vector createVector(int dimension) {
+    return createVector(null, dimension);
+  }
+
+  /**
+   * Creates Vector schema with custom name and dimension.
+   *
+   * @param name record name (null uses default "vector")
+   * @param dimension vector dimension (must be > 0)
+   * @return new HoodieSchema.Vector
+   */
+  public static HoodieSchema.Vector createVector(String name, int dimension) {
+    return createVector(name, dimension, Vector.VectorElementType.FLOAT);
+  }
+
+  /**
+   * Creates Vector schema with custom dimension and element type.
+   *
+   * @param dimension vector dimension (must be > 0)
+   * @param elementType element type (use Vector.VectorElementType.FLOAT or 
Vector.VectorElementType.DOUBLE)

Review Comment:
   nit: use a `@link` to link to the enum? The same applies to other javadocs 
like this



##########
hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchema.java:
##########
@@ -838,6 +839,291 @@ void testCreateDecimalSchema() {
     assertEquals(5, decimalFixedSchema.getFixedSize());
   }
 
+  @Test
+  void testCreateVectorWithDimension() {
+    // Create vector with dimension only (defaults to FLOAT)
+    HoodieSchema schema = HoodieSchema.createVector(1536);
+
+    assertTrue(schema instanceof HoodieSchema.Vector);
+    assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+
+    HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+    assertEquals(1536, vectorSchema.getDimension());
+    assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT, 
vectorSchema.getVectorElementType());
+
+    assertTrue(schema.getAvroSchema().getLogicalType() instanceof 
VectorLogicalType);
+
+    // Verify properties are at schema level
+    Schema avroSchema = vectorSchema.getAvroSchema();
+    assertEquals(1536, ((Number) 
avroSchema.getObjectProp("dimension")).intValue());
+    assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getDataType(), 
avroSchema.getProp("elementType"));
+    assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES, 
avroSchema.getProp("storageBacking"));
+
+    // Verify Vector is FIXED type (not RECORD)
+    assertEquals(Schema.Type.FIXED, avroSchema.getType());
+    assertFalse(vectorSchema.hasFields());
+
+    // Verify FIXED size = dimension × elementSize (1536 × 4 bytes for FLOAT)
+    assertEquals(1536 * 4, avroSchema.getFixedSize());
+  }
+
+  @Test
+  void testCreateVectorWithNameAndDimension() {
+    // Create vector with custom name and dimension
+    HoodieSchema schema = HoodieSchema.createVector("embeddings", 768);
+    assertTrue(schema instanceof HoodieSchema.Vector);
+    assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+
+    HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+    assertEquals("embeddings", vectorSchema.getAvroSchema().getName());
+    assertEquals(768, vectorSchema.getDimension());
+    assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT, 
vectorSchema.getVectorElementType());
+  }
+
+  @Test
+  void testCreateVectorWithDimensionAndElementType() {
+    // Create vector with DOUBLE element type
+    HoodieSchema schemaDouble = HoodieSchema.createVector(1536, 
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+    assertTrue(schemaDouble instanceof HoodieSchema.Vector);
+    HoodieSchema.Vector vectorDouble = (HoodieSchema.Vector) schemaDouble;
+    assertEquals(1536, vectorDouble.getDimension());
+    assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE, 
vectorDouble.getVectorElementType());
+    HoodieSchema schemaFloat = HoodieSchema.createVector(512, 
HoodieSchema.Vector.VectorElementType.FLOAT);
+
+    assertTrue(schemaFloat instanceof HoodieSchema.Vector);
+    HoodieSchema.Vector vectorFloat = (HoodieSchema.Vector) schemaFloat;
+    assertEquals(512, vectorFloat.getDimension());
+    assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT, 
vectorFloat.getVectorElementType());

Review Comment:
   Let's move some of these checks into a common helper method to make the test 
leaner



##########
hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchema.java:
##########
@@ -838,6 +839,291 @@ void testCreateDecimalSchema() {
     assertEquals(5, decimalFixedSchema.getFixedSize());
   }
 
+  @Test
+  void testCreateVectorWithDimension() {
+    // Create vector with dimension only (defaults to FLOAT)
+    HoodieSchema schema = HoodieSchema.createVector(1536);
+
+    assertTrue(schema instanceof HoodieSchema.Vector);
+    assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+
+    HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+    assertEquals(1536, vectorSchema.getDimension());
+    assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT, 
vectorSchema.getVectorElementType());
+
+    assertTrue(schema.getAvroSchema().getLogicalType() instanceof 
VectorLogicalType);
+
+    // Verify properties are at schema level
+    Schema avroSchema = vectorSchema.getAvroSchema();
+    assertEquals(1536, ((Number) 
avroSchema.getObjectProp("dimension")).intValue());
+    assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getDataType(), 
avroSchema.getProp("elementType"));
+    assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES, 
avroSchema.getProp("storageBacking"));
+
+    // Verify Vector is FIXED type (not RECORD)
+    assertEquals(Schema.Type.FIXED, avroSchema.getType());
+    assertFalse(vectorSchema.hasFields());
+
+    // Verify FIXED size = dimension × elementSize (1536 × 4 bytes for FLOAT)
+    assertEquals(1536 * 4, avroSchema.getFixedSize());
+  }
+
+  @Test
+  void testCreateVectorWithNameAndDimension() {
+    // Create vector with custom name and dimension
+    HoodieSchema schema = HoodieSchema.createVector("embeddings", 768);
+    assertTrue(schema instanceof HoodieSchema.Vector);
+    assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+
+    HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+    assertEquals("embeddings", vectorSchema.getAvroSchema().getName());
+    assertEquals(768, vectorSchema.getDimension());
+    assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT, 
vectorSchema.getVectorElementType());
+  }
+
+  @Test
+  void testCreateVectorWithDimensionAndElementType() {
+    // Create vector with DOUBLE element type
+    HoodieSchema schemaDouble = HoodieSchema.createVector(1536, 
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+    assertTrue(schemaDouble instanceof HoodieSchema.Vector);
+    HoodieSchema.Vector vectorDouble = (HoodieSchema.Vector) schemaDouble;
+    assertEquals(1536, vectorDouble.getDimension());
+    assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE, 
vectorDouble.getVectorElementType());
+    HoodieSchema schemaFloat = HoodieSchema.createVector(512, 
HoodieSchema.Vector.VectorElementType.FLOAT);
+
+    assertTrue(schemaFloat instanceof HoodieSchema.Vector);
+    HoodieSchema.Vector vectorFloat = (HoodieSchema.Vector) schemaFloat;
+    assertEquals(512, vectorFloat.getDimension());
+    assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT, 
vectorFloat.getVectorElementType());
+  }
+
+  @Test
+  void testCreateVectorWithAllParameters() {
+    // Create vector with all parameters: custom name, dimension, and element 
type
+    HoodieSchema schema = HoodieSchema.createVector("precise_vectors", 512, 
HoodieSchema.Vector.VectorElementType.DOUBLE);
+    assertTrue(schema instanceof HoodieSchema.Vector);
+    HoodieSchema.Vector vectorSchema = (HoodieSchema.Vector) schema;
+
+    assertEquals("precise_vectors", vectorSchema.getAvroSchema().getName());
+    assertEquals(512, vectorSchema.getDimension());
+    assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE, 
vectorSchema.getVectorElementType());
+    assertEquals(HoodieSchemaType.VECTOR, vectorSchema.getType());
+  }
+
+  @Test
+  void testVectorInvalidDimension() {
+    // Test zero dimension
+    IllegalArgumentException ex1 = assertThrows(
+        IllegalArgumentException.class,
+        () -> HoodieSchema.createVector(0)
+    );
+    assertTrue(ex1.getMessage().contains("must be positive"));
+
+    // Test negative dimension
+    IllegalArgumentException ex2 = assertThrows(
+        IllegalArgumentException.class,
+        () -> HoodieSchema.createVector(-1)
+    );
+    assertTrue(ex2.getMessage().contains("must be positive"));
+  }
+
+  @Test
+  void testVectorLogicalTypeDetection() {
+    // Create vector schema
+    HoodieSchema schema = HoodieSchema.createVector(1536);
+    assertTrue(schema.getAvroSchema().getLogicalType() instanceof 
VectorLogicalType);
+    assertEquals(HoodieSchemaType.VECTOR, schema.getType());
+  }
+
+  @Test
+  void testVectorSchemaValidation() {
+    // Create vector and verify FIXED structure
+    HoodieSchema.Vector vectorSchema = HoodieSchema.createVector(768);
+    Schema avroSchema = vectorSchema.getAvroSchema();
+
+    // Verify Vector is FIXED type
+    assertEquals(Schema.Type.FIXED, avroSchema.getType());
+    assertFalse(vectorSchema.hasFields());
+
+    // Verify dimension, elementType, storageBacking are schema properties
+    assertEquals(768, ((Number) 
avroSchema.getObjectProp("dimension")).intValue());
+    assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getDataType(), 
avroSchema.getProp("elementType"));
+    assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES, 
avroSchema.getProp("storageBacking"));
+
+    // Verify FIXED size = dimension × elementSize (768 × 4 bytes for FLOAT)
+    assertEquals(768 * 4, avroSchema.getFixedSize());
+    assertEquals(768 * 4, vectorSchema.getFixedSize());
+  }
+
+  @Test
+  void testVectorFieldAccess() {
+    // Create vector with FLOAT
+    HoodieSchema.Vector vectorFloat = HoodieSchema.createVector(1536);
+    assertEquals(1536, vectorFloat.getDimension());
+    assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT, 
vectorFloat.getVectorElementType());
+
+    HoodieSchema.Vector vectorDouble = HoodieSchema.createVector(768, 
HoodieSchema.Vector.VectorElementType.DOUBLE);
+    assertEquals(768, vectorDouble.getDimension());
+    assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE, 
vectorDouble.getVectorElementType());
+
+    // Verify dimension/elementType/storageBacking are accessible via 
properties
+    assertEquals(1536, ((Number) 
vectorFloat.getAvroSchema().getObjectProp("dimension")).intValue());
+    assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT.getDataType(), 
vectorFloat.getAvroSchema().getProp("elementType"));
+    assertEquals(HoodieSchema.Vector.STORAGE_BACKING_FIXED_BYTES, 
vectorFloat.getAvroSchema().getProp("storageBacking"));
+
+    // Verify FIXED size access
+    assertEquals(1536 * 4, vectorFloat.getFixedSize()); // FLOAT is 4 bytes
+    assertEquals(768 * 8, vectorDouble.getFixedSize()); // DOUBLE is 8 bytes
+  }
+  
+  @Test
+  void testVectorEquality() {
+    HoodieSchema.Vector v1 = HoodieSchema.createVector(1536);
+    HoodieSchema.Vector v2 = HoodieSchema.createVector(1536);
+    HoodieSchema.Vector v3 = HoodieSchema.createVector(768);
+    HoodieSchema.Vector v4 = HoodieSchema.createVector(1536, 
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+    // Same dimension and element type -> equal
+    assertEquals(v1, v2);
+    assertEquals(v1.hashCode(), v2.hashCode());
+
+    // Different dimension -> not equal
+    assertNotEquals(v1, v3);
+
+    // Different element type -> not equal
+    assertNotEquals(v1, v4);
+
+    // Reflexivity
+    assertEquals(v1, v1);
+
+    // Null check
+    assertNotEquals(v1, null);
+
+    // Different class
+    assertNotEquals(v1, "string");
+  }
+  
+  @Test
+  void testVectorSerialization() throws Exception {
+    // Create vector with DOUBLE element type
+    HoodieSchema.Vector original = HoodieSchema.createVector(768, 
HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+    // Java serialize
+    ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
+    ObjectOutputStream out = new ObjectOutputStream(byteOut);
+    out.writeObject(original);
+    out.close();
+
+    // Java deserialize
+    ByteArrayInputStream byteIn = new 
ByteArrayInputStream(byteOut.toByteArray());
+    ObjectInputStream in = new ObjectInputStream(byteIn);
+    HoodieSchema deserialized = (HoodieSchema) in.readObject();
+    in.close();
+
+    // Verify
+    assertTrue(deserialized instanceof HoodieSchema.Vector);
+    assertEquals(original, deserialized);
+
+    HoodieSchema.Vector deserializedVector = (HoodieSchema.Vector) 
deserialized;
+    assertEquals(768, deserializedVector.getDimension());
+    assertEquals(HoodieSchema.Vector.VectorElementType.DOUBLE, 
deserializedVector.getVectorElementType());
+  }
+
+  @Test
+  void testVectorInNestedStructures() throws Exception {
+    // Create vector schema
+    HoodieSchema.Vector vectorSchema = HoodieSchema.createVector(128, 
HoodieSchema.Vector.VectorElementType.FLOAT);
+
+    // Test vector in record - verify it can be used as a field
+    List<HoodieSchemaField> fields = Arrays.asList(
+        HoodieSchemaField.of("id", HoodieSchema.create(HoodieSchemaType.INT)),
+        HoodieSchemaField.of("embedding", vectorSchema)
+    );
+    HoodieSchema recordSchema = HoodieSchema.createRecord("TestRecord", null, 
null, fields);
+    assertEquals(HoodieSchemaType.RECORD, recordSchema.getType());
+
+    // Verify vector field is preserved in the Avro schema
+    Schema.Field embeddingField = 
recordSchema.getAvroSchema().getField("embedding");
+    assertNotNull(embeddingField);
+    HoodieSchema embeddingSchema = 
HoodieSchema.fromAvroSchema(embeddingField.schema());
+    assertTrue(embeddingSchema instanceof HoodieSchema.Vector);
+    assertEquals(128, ((HoodieSchema.Vector) embeddingSchema).getDimension());
+
+    // Round-trip record with vector field through JSON
+    String recordJson = recordSchema.toString();
+    HoodieSchema parsedRecord = HoodieSchema.parse(recordJson);
+    assertEquals(recordSchema, parsedRecord);
+    Schema.Field parsedEmbeddingField = 
parsedRecord.getAvroSchema().getField("embedding");
+    assertNotNull(parsedEmbeddingField);
+    HoodieSchema parsedEmbedding = 
HoodieSchema.fromAvroSchema(parsedEmbeddingField.schema());
+    assertTrue(parsedEmbedding instanceof HoodieSchema.Vector);
+    assertEquals(128, ((HoodieSchema.Vector) parsedEmbedding).getDimension());

Review Comment:
   Let's add validation on the type as well?



##########
hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java:
##########
@@ -1492,6 +1541,216 @@ public int hashCode() {
     }
   }
 
+  public static class Vector extends HoodieSchema {
+    private static final String DEFAULT_NAME = "vector";
+
+    /**
+     * Enum representing vector element data types.
+     */
+    public enum VectorElementType {
+      FLOAT("FLOAT", 4),
+      DOUBLE("DOUBLE", 8),
+      INT8("INT8", 1);
+
+      private final String dataType;
+      private final int elementSize;
+
+      VectorElementType(String dataType, int elementSize) {
+        this.dataType = dataType;
+        this.elementSize = elementSize;
+      }
+
+      /**
+       * Returns the string representation for serialization.
+       *
+       * @return element type name
+       */
+      public String getDataType() {
+        return dataType;
+      }
+
+      /**
+       * Returns the byte size of a single element.
+       *
+       * @return number of bytes per element
+       */
+      public int getElementSize() {
+        return elementSize;
+      }
+
+      /**
+       * Converts a string to VectorElementType enum.
+       *
+       * @param name the element type name (e.g., "FLOAT", "DOUBLE", "INT8")
+       * @return the corresponding enum value
+       * @throws IllegalArgumentException if name is unknown
+       */
+      public static VectorElementType fromString(String name) {
+        for (VectorElementType type : values()) {
+          if (type.dataType.equals(name)) {

Review Comment:
   You may want to use `equalsIgnoreCase` to allow more flexibility



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to