This is an automated email from the ASF dual-hosted git repository.
mchades pushed a commit to branch branch-lance-namepspace-dev
in repository https://gitbox.apache.org/repos/asf/gravitino.git
The following commit(s) were added to refs/heads/branch-lance-namepspace-dev by
this push:
new bc1b77a3bb [#8946] improvement(lance): supports more dataTypes for
lance table creation (#8947)
bc1b77a3bb is described below
commit bc1b77a3bb357dd26bd64f025899b0069ece17b3
Author: mchades <[email protected]>
AuthorDate: Wed Oct 29 23:13:59 2025 +0800
[#8946] improvement(lance): supports more dataTypes for lance table
creation (#8947)
### What changes were proposed in this pull request?
Support more data types for Lance table creation.
### Why are the changes needed?
Fix: #8946
### Does this PR introduce _any_ user-facing change?
Yes, more column data types are supported.
### How was this patch tested?
tests added
---
.../lakehouse/lance/LanceCatalogOperations.java | 37 +--
.../lakehouse/lance/LanceDataTypeConverter.java | 268 +++++++++--------
.../lance/TestLanceDataTypeConverter.java | 327 +++++++++++++++++++++
docs/generic-lakehouse-catalog.md | 140 +++++++++
4 files changed, 615 insertions(+), 157 deletions(-)
diff --git
a/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceCatalogOperations.java
b/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceCatalogOperations.java
index dcfe6bd489..9572c656d2 100644
---
a/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceCatalogOperations.java
+++
b/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceCatalogOperations.java
@@ -38,7 +38,6 @@ import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.arrow.memory.RootAllocator;
-import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.gravitino.Catalog;
@@ -129,7 +128,7 @@ public class LanceCatalogOperations implements
LakehouseCatalogOperations {
Dataset.create(
new RootAllocator(),
location,
- convertColumnsToSchema(columns),
+ convertColumnsToArrowSchema(columns),
new
WriteParams.Builder().withStorageOptions(storageProps).build())) {
GenericLakehouseTable.Builder builder = GenericLakehouseTable.builder();
return builder
@@ -151,39 +150,13 @@ public class LanceCatalogOperations implements
LakehouseCatalogOperations {
}
}
- private org.apache.arrow.vector.types.pojo.Schema
convertColumnsToSchema(Column[] columns) {
- LanceDataTypeConverter converter = new LanceDataTypeConverter();
+ private org.apache.arrow.vector.types.pojo.Schema
convertColumnsToArrowSchema(Column[] columns) {
List<Field> fields =
Arrays.stream(columns)
.map(
- col -> {
- boolean nullable = col.nullable();
- ArrowType parentType =
converter.fromGravitino(col.dataType());
- List<ArrowType> childTypes =
converter.getChildTypes(col.dataType());
- List<Field> childFields =
- childTypes.stream()
- .map(
- childType ->
- new org.apache.arrow.vector.types.pojo.Field(
- "",
-
org.apache.arrow.vector.types.pojo.FieldType.nullable(
- childType),
- null))
- .collect(Collectors.toList());
-
- if (nullable) {
- return new org.apache.arrow.vector.types.pojo.Field(
- col.name(),
-
org.apache.arrow.vector.types.pojo.FieldType.nullable(parentType),
- childFields);
- }
-
- // not nullable
- return new org.apache.arrow.vector.types.pojo.Field(
- col.name(),
-
org.apache.arrow.vector.types.pojo.FieldType.notNullable(parentType),
- childFields);
- })
+ col ->
+ LanceDataTypeConverter.CONVERTER.toArrowField(
+ col.name(), col.dataType(), col.nullable()))
.collect(Collectors.toList());
return new org.apache.arrow.vector.types.pojo.Schema(fields);
}
diff --git
a/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceDataTypeConverter.java
b/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceDataTypeConverter.java
index d7966edd5e..9cd5783de1 100644
---
a/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceDataTypeConverter.java
+++
b/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceDataTypeConverter.java
@@ -19,82 +19,183 @@
package org.apache.gravitino.catalog.lakehouse.lance;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
+import java.util.Arrays;
import java.util.List;
+import org.apache.arrow.vector.complex.MapVector;
import org.apache.arrow.vector.types.DateUnit;
import org.apache.arrow.vector.types.FloatingPointPrecision;
+import org.apache.arrow.vector.types.IntervalUnit;
import org.apache.arrow.vector.types.TimeUnit;
+import org.apache.arrow.vector.types.UnionMode;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.ArrowType.Bool;
import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint;
import org.apache.arrow.vector.types.pojo.ArrowType.Int;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.gravitino.connector.DataTypeConverter;
import org.apache.gravitino.json.JsonUtils;
import org.apache.gravitino.rel.types.Type;
import org.apache.gravitino.rel.types.Types;
import org.apache.gravitino.rel.types.Types.FixedType;
-import org.apache.gravitino.rel.types.Types.UnparsedType;
public class LanceDataTypeConverter implements DataTypeConverter<ArrowType,
ArrowType> {
+ public static final LanceDataTypeConverter CONVERTER = new
LanceDataTypeConverter();
+
+ public Field toArrowField(String name, Type type, boolean nullable) {
+ switch (type.name()) {
+ case LIST:
+ Types.ListType listType = (Types.ListType) type;
+ FieldType listField = new FieldType(nullable, ArrowType.List.INSTANCE,
null);
+ return new Field(
+ name,
+ listField,
+ Lists.newArrayList(
+ toArrowField("element", listType.elementType(),
listType.elementNullable())));
+
+ case STRUCT:
+ Types.StructType structType = (Types.StructType) type;
+ FieldType structField = new FieldType(nullable,
ArrowType.Struct.INSTANCE, null);
+ return new Field(
+ name,
+ structField,
+ Arrays.stream(structType.fields())
+ .map(field -> toArrowField(field.name(), field.type(),
field.nullable()))
+ .toList());
+
+ case MAP:
+ Types.MapType mapType = (Types.MapType) type;
+ FieldType mapField = new FieldType(nullable, new ArrowType.Map(false),
null);
+ return new Field(
+ name,
+ mapField,
+ Lists.newArrayList(
+ toArrowField(
+ MapVector.DATA_VECTOR_NAME,
+ Types.StructType.of(
+ Types.StructType.Field.of(
+ // Note: Arrow MapVector requires key field to be
non-nullable
+ MapVector.KEY_NAME,
+ mapType.keyType(),
+ false /*nullable*/,
+ null /*comment*/),
+ Types.StructType.Field.of(
+ MapVector.VALUE_NAME,
+ mapType.valueType(),
+ mapType.valueNullable(),
+ null)),
+ false /*nullable*/)));
+
+ case UNION:
+ Types.UnionType unionType = (Types.UnionType) type;
+ List<Field> types =
+ Arrays.stream(unionType.types())
+ .map(
+ t ->
+ toArrowField(
+ t.simpleString(), t, true /*nullable*/) // union
members are nullable
+ )
+ .toList();
+ int[] typeIds =
+ types.stream()
+ .mapToInt(
+ f ->
+
org.apache.arrow.vector.types.Types.getMinorTypeForArrowType(f.getType())
+ .ordinal())
+ .toArray();
+ FieldType unionField =
+ new FieldType(nullable, new ArrowType.Union(UnionMode.Sparse,
typeIds), null);
+ return new Field(name, unionField, types);
+
+ case EXTERNAL:
+ Types.ExternalType externalType = (Types.ExternalType) type;
+ Field field;
+ try {
+ field =
JsonUtils.anyFieldMapper().readValue(externalType.catalogString(), Field.class);
+ } catch (JsonProcessingException e) {
+ throw new RuntimeException(
+ "Failed to parse external type catalog string: " +
externalType.catalogString(), e);
+ }
+ Preconditions.checkArgument(
+ name.equals(field.getName()),
+ "expected field name %s but got %s",
+ name,
+ field.getName());
+ Preconditions.checkArgument(
+ nullable == field.isNullable(),
+ "expected field nullable %s but got %s",
+ nullable,
+ field.isNullable());
+ return field;
+
+ default:
+ // non-complex type
+ FieldType fieldType = new FieldType(nullable, fromGravitino(type),
null);
+ return new Field(name, fieldType, null);
+ }
+ }
+
@Override
public ArrowType fromGravitino(Type type) {
switch (type.name()) {
case BOOLEAN:
return Bool.INSTANCE;
case BYTE:
- return new Int(8, true);
+ return new Int(8, ((Types.ByteType) type).signed());
case SHORT:
- return new Int(16, true);
+ return new Int(8 * 2, ((Types.ShortType) type).signed());
case INTEGER:
- return new Int(32, true);
+ return new Int(8 * 4, ((Types.IntegerType) type).signed());
case LONG:
- return new Int(64, true);
+ return new Int(8 * 8, ((Types.LongType) type).signed());
case FLOAT:
return new FloatingPoint(FloatingPointPrecision.SINGLE);
case DOUBLE:
return new FloatingPoint(FloatingPointPrecision.DOUBLE);
+ case STRING:
+ return ArrowType.Utf8.INSTANCE;
+ case BINARY:
+ return ArrowType.Binary.INSTANCE;
case DECIMAL:
- // Lance uses FIXED_SIZE_BINARY for decimal types
- return new ArrowType.FixedSizeBinary(16); // assuming 16 bytes for
decimal
+ Types.DecimalType decimalType = (Types.DecimalType) type;
+ return new ArrowType.Decimal(decimalType.precision(),
decimalType.scale(), 8 * 16);
case DATE:
return new ArrowType.Date(DateUnit.DAY);
- case TIME:
- return new ArrowType.Time(TimeUnit.MILLISECOND, 32);
case TIMESTAMP:
- return new ArrowType.Timestamp(TimeUnit.MILLISECOND, null);
- case VARCHAR:
- case STRING:
- return new ArrowType.Utf8();
+ Types.TimestampType timestampType = (Types.TimestampType) type;
+ TimeUnit timeUnit = TimeUnit.MICROSECOND;
+ if (timestampType.hasPrecisionSet()) {
+ timeUnit =
+ switch (timestampType.precision()) {
+ case 0 -> TimeUnit.SECOND;
+ case 3 -> TimeUnit.MILLISECOND;
+ case 6 -> TimeUnit.MICROSECOND;
+ case 9 -> TimeUnit.NANOSECOND;
+ default -> throw new UnsupportedOperationException(
+ "Expected precision to be one of 0, 3, 6, 9 but got: "
+ + timestampType.precision());
+ };
+ }
+ if (timestampType.hasTimeZone()) {
+ // todo: need timeZoneId for timestamp with time zone
+ return new ArrowType.Timestamp(timeUnit, "UTC");
+ }
+ return new ArrowType.Timestamp(timeUnit, null);
+ case TIME:
+ return new ArrowType.Time(TimeUnit.NANOSECOND, 8 * 8);
+ case NULL:
+ return ArrowType.Null.INSTANCE;
+ case INTERVAL_YEAR:
+ return new ArrowType.Interval(IntervalUnit.YEAR_MONTH);
+ case INTERVAL_DAY:
+ return new ArrowType.Duration(TimeUnit.MICROSECOND);
case FIXED:
FixedType fixedType = (FixedType) type;
return new ArrowType.FixedSizeBinary(fixedType.length());
- case BINARY:
- return new ArrowType.Binary();
- case UNPARSED:
- String typeStr = ((UnparsedType) type).unparsedType().toString();
- try {
- Type t = JsonUtils.anyFieldMapper().readValue(typeStr, Type.class);
- if (t instanceof Types.ListType) {
- return ArrowType.List.INSTANCE;
- } else if (t instanceof Types.MapType) {
- return new ArrowType.Map(false);
- } else if (t instanceof Types.StructType) {
- return ArrowType.Struct.INSTANCE;
- } else {
- throw new UnsupportedOperationException(
- "Unsupported UnparsedType conversion: " + t.simpleString());
- }
- } catch (Exception e) {
- // FixedSizeListArray(integer, 3)
- if (typeStr.startsWith("FixedSizeListArray")) {
- int size =
- Integer.parseInt(
- typeStr.substring(typeStr.indexOf(',') + 1,
typeStr.indexOf(')')).trim());
- return new ArrowType.FixedSizeList(size);
- }
- throw new UnsupportedOperationException("Failed to parse
UnparsedType: " + typeStr, e);
- }
default:
throw new UnsupportedOperationException("Unsupported Gravitino type: "
+ type.name());
}
@@ -102,91 +203,8 @@ public class LanceDataTypeConverter implements
DataTypeConverter<ArrowType, Arro
@Override
public Type toGravitino(ArrowType arrowType) {
- if (arrowType instanceof Bool) {
- return Types.BooleanType.get();
- } else if (arrowType instanceof Int intType) {
- switch (intType.getBitWidth()) {
- case 8 -> {
- return Types.ByteType.get();
- }
- case 16 -> {
- return Types.ShortType.get();
- }
- case 32 -> {
- return Types.IntegerType.get();
- }
- case 64 -> {
- return Types.LongType.get();
- }
- default -> throw new UnsupportedOperationException(
- "Unsupported Int bit width: " + intType.getBitWidth());
- }
- } else if (arrowType instanceof FloatingPoint floatingPoint) {
- switch (floatingPoint.getPrecision()) {
- case SINGLE:
- return Types.FloatType.get();
- case DOUBLE:
- return Types.DoubleType.get();
- default:
- throw new UnsupportedOperationException(
- "Unsupported FloatingPoint precision: " +
floatingPoint.getPrecision());
- }
- } else if (arrowType instanceof ArrowType.FixedSizeBinary) {
- ArrowType.FixedSizeBinary fixedSizeBinary = (ArrowType.FixedSizeBinary)
arrowType;
- return Types.FixedType.of(fixedSizeBinary.getByteWidth());
- } else if (arrowType instanceof ArrowType.Date) {
- return Types.DateType.get();
- } else if (arrowType instanceof ArrowType.Time) {
- return Types.TimeType.get();
- } else if (arrowType instanceof ArrowType.Timestamp) {
- return Types.TimestampType.withoutTimeZone();
- } else if (arrowType instanceof ArrowType.Utf8) {
- return Types.StringType.get();
- } else if (arrowType instanceof ArrowType.Binary) {
- return Types.BinaryType.get();
- // TODO handle complex types like List, Map, Struct
- } else {
- throw new UnsupportedOperationException("Unsupported Arrow type: " +
arrowType);
- }
- }
-
- public List<ArrowType> getChildTypes(Type parentType) {
- if (parentType.name() != Type.Name.UNPARSED) {
- return List.of();
- }
-
- List<ArrowType> arrowTypes = Lists.newArrayList();
- String typeStr = ((UnparsedType) parentType).unparsedType().toString();
- try {
- Type t = JsonUtils.anyFieldMapper().readValue(typeStr, Type.class);
- if (t instanceof Types.ListType listType) {
- arrowTypes.add(fromGravitino(listType.elementType()));
- } else if (t instanceof Types.MapType mapType) {
- arrowTypes.add(fromGravitino(mapType.keyType()));
- arrowTypes.add(fromGravitino(mapType.valueType()));
- } else {
- // TODO support struct type.
- throw new UnsupportedOperationException(
- "Unsupported UnparsedType conversion: " + t.simpleString());
- }
-
- return arrowTypes;
- } catch (Exception e) {
- // FixedSizeListArray(integer, 3)
-
- try {
- if (typeStr.startsWith("FixedSizeListArray")) {
- String type = typeStr.substring(typeStr.indexOf('(') + 1,
typeStr.indexOf(',')).trim();
- Type childType = JsonUtils.anyFieldMapper().readValue("\"" + type +
"\"", Type.class);
- arrowTypes.add(fromGravitino(childType));
-
- return arrowTypes;
- }
- } catch (Exception e1) {
- throw new UnsupportedOperationException("Failed to parse UnparsedType:
" + typeStr, e1);
- }
-
- throw new UnsupportedOperationException("Failed to parse UnparsedType: "
+ typeStr, e);
- }
+ // since the table metadata will load from Gravitino storage directly, we
don't need to
+ // implement this method for now.
+ throw new UnsupportedOperationException("toGravitino is not implemented
yet.");
}
}
diff --git
a/catalogs/catalog-generic-lakehouse/src/test/java/org/apache/gravitino/catalog/lakehouse/lance/TestLanceDataTypeConverter.java
b/catalogs/catalog-generic-lakehouse/src/test/java/org/apache/gravitino/catalog/lakehouse/lance/TestLanceDataTypeConverter.java
new file mode 100644
index 0000000000..cf28ee7434
--- /dev/null
+++
b/catalogs/catalog-generic-lakehouse/src/test/java/org/apache/gravitino/catalog/lakehouse/lance/TestLanceDataTypeConverter.java
@@ -0,0 +1,327 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.catalog.lakehouse.lance;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.function.Consumer;
+import java.util.stream.Stream;
+import org.apache.arrow.vector.complex.MapVector;
+import org.apache.arrow.vector.types.TimeUnit;
+import org.apache.arrow.vector.types.UnionMode;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.gravitino.rel.types.Type;
+import org.apache.gravitino.rel.types.Types;
+import org.junit.jupiter.api.DisplayName;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.CsvSource;
+import org.junit.jupiter.params.provider.MethodSource;
+
+public class TestLanceDataTypeConverter {
+ private static final LanceDataTypeConverter CONVERTER =
LanceDataTypeConverter.CONVERTER;
+
+ // Gravitino complex type definitions for testing
+ private static final Types.StructType SIMPLE_STRUCT =
+ Types.StructType.of(
+ Types.StructType.Field.of("id", Types.LongType.get(), false, null),
+ Types.StructType.Field.of("name", Types.StringType.get(), true,
null));
+
+ private static final Types.StructType NESTED_STRUCT =
+ Types.StructType.of(
+ Types.StructType.Field.of("id", Types.LongType.get(), false, null),
+ Types.StructType.Field.of(
+ "address",
+ Types.StructType.of(
+ Types.StructType.Field.of("street", Types.StringType.get(),
false, null),
+ Types.StructType.Field.of("city", Types.StringType.get(),
false, null)),
+ true,
+ null));
+ private static final String NESTED_STRUCT_JSON =
+
"{\"name\":\"person_nested_json\",\"nullable\":false,\"type\":{\"name\":\"struct\"},\"children\":["
+ +
"{\"name\":\"id\",\"nullable\":false,\"type\":{\"name\":\"int\",\"bitWidth\":64,\"isSigned\":true},\"children\":[]},"
+ +
"{\"name\":\"address\",\"nullable\":true,\"type\":{\"name\":\"struct\"},\"children\":["
+ +
"{\"name\":\"street\",\"nullable\":false,\"type\":{\"name\":\"utf8\"},\"children\":[]},"
+ +
"{\"name\":\"city\",\"nullable\":false,\"type\":{\"name\":\"utf8\"},\"children\":[]}"
+ + "]}"
+ + "]}";
+
+ private static final Types.ListType LIST_OF_STRUCTS =
+ Types.ListType.of(
+ Types.StructType.of(
+ Types.StructType.Field.of("sku", Types.StringType.get(), false,
null),
+ Types.StructType.Field.of("quantity", Types.IntegerType.get(),
false, null)),
+ true);
+
+ // Field validators for Arrow conversion tests
+ private static Consumer<Field> INT_VALIDATOR =
+ field -> assertInstanceOf(ArrowType.Int.class,
field.getFieldType().getType());
+ private static Consumer<Field> STRING_VALIDATOR =
+ field -> assertInstanceOf(ArrowType.Utf8.class,
field.getFieldType().getType());
+ private static Consumer<Field> LARGE_UTF8_VALIDATOR =
+ field -> assertInstanceOf(ArrowType.LargeUtf8.class,
field.getFieldType().getType());
+ private static Consumer<Field> BOOLEAN_VALIDATOR =
+ field -> assertInstanceOf(ArrowType.Bool.class,
field.getFieldType().getType());
+ private static Consumer<Field> DECIMAL_VALIDATOR =
+ field -> {
+ assertInstanceOf(ArrowType.Decimal.class,
field.getFieldType().getType());
+ ArrowType.Decimal decimal = (ArrowType.Decimal)
field.getFieldType().getType();
+
+ assertEquals(10, decimal.getPrecision());
+ assertEquals(2, decimal.getScale());
+ };
+ private static Consumer<Field> LIST_VALIDATOR =
+ field -> {
+ assertInstanceOf(ArrowType.List.class, field.getFieldType().getType());
+ assertEquals(1, field.getChildren().size());
+
+ Field elementField = field.getChildren().get(0);
+ assertEquals("element", elementField.getName());
+ assertTrue(elementField.isNullable());
+ assertInstanceOf(ArrowType.Int.class,
elementField.getFieldType().getType());
+ };
+ private static Consumer<Field> MAP_VALIDATOR =
+ field -> {
+ assertInstanceOf(ArrowType.Map.class, field.getFieldType().getType());
+ assertEquals(1, field.getChildren().size());
+
+ Field structField = field.getChildren().get(0);
+ assertEquals(MapVector.DATA_VECTOR_NAME, structField.getName());
+ assertEquals(2, structField.getChildren().size());
+
+ Field keyField = structField.getChildren().get(0);
+ assertEquals(MapVector.KEY_NAME, keyField.getName());
+ assertFalse(keyField.isNullable());
+ assertInstanceOf(ArrowType.Utf8.class,
keyField.getFieldType().getType());
+
+ Field valueField = structField.getChildren().get(1);
+ assertEquals(MapVector.VALUE_NAME, valueField.getName());
+ assertTrue(valueField.isNullable());
+ assertInstanceOf(ArrowType.Int.class,
valueField.getFieldType().getType());
+ };
+ private static Consumer<Field> STRUCT_VALIDATOR =
+ field -> {
+ assertInstanceOf(ArrowType.Struct.class,
field.getFieldType().getType());
+ assertEquals(2, field.getChildren().size());
+
+ Field idField = field.getChildren().get(0);
+ assertEquals("id", idField.getName());
+ assertFalse(idField.isNullable());
+ assertInstanceOf(ArrowType.Int.class,
idField.getFieldType().getType());
+
+ Field nameField = field.getChildren().get(1);
+ assertEquals("name", nameField.getName());
+ assertTrue(nameField.isNullable());
+ assertInstanceOf(ArrowType.Utf8.class,
nameField.getFieldType().getType());
+ };
+ private static Consumer<Field> NESTED_STRUCT_VALIDATOR =
+ field -> {
+ assertInstanceOf(ArrowType.Struct.class,
field.getFieldType().getType());
+ assertEquals(2, field.getChildren().size());
+
+ Field addressField = field.getChildren().get(1);
+ assertEquals("address", addressField.getName());
+ assertTrue(addressField.isNullable());
+
+ assertInstanceOf(ArrowType.Struct.class,
addressField.getFieldType().getType());
+ assertEquals(2, addressField.getChildren().size());
+ };
+ private static Consumer<Field> LIST_OF_STRUCTS_VALIDATOR =
+ field -> {
+ assertInstanceOf(ArrowType.List.class, field.getFieldType().getType());
+ assertEquals(1, field.getChildren().size());
+
+ Field elementField = field.getChildren().get(0);
+ assertEquals("element", elementField.getName());
+ assertTrue(elementField.isNullable());
+ assertInstanceOf(ArrowType.Struct.class,
elementField.getFieldType().getType());
+ assertEquals(2, elementField.getChildren().size());
+ };
+ private static Consumer<Field> UNION_VALIDATOR =
+ field -> {
+ assertInstanceOf(ArrowType.Union.class,
field.getFieldType().getType());
+ ArrowType.Union unionType = (ArrowType.Union)
field.getFieldType().getType();
+ assertEquals(UnionMode.Sparse, unionType.getMode());
+ assertEquals(2, field.getChildren().size());
+ assertInstanceOf(ArrowType.Int.class,
field.getChildren().get(0).getFieldType().getType());
+ assertInstanceOf(ArrowType.Utf8.class,
field.getChildren().get(1).getFieldType().getType());
+ };
+
+ @ParameterizedTest
+ @DisplayName("Test conversion of Integer types (Byte, Short, Integer, Long)")
+ @CsvSource({"BYTE, 8, true", "SHORT, 16, true", "INTEGER, 32, true", "LONG,
64, true"})
+ public void testFromGravitinoIntegerTypes(
+ String typeName, int expectedBitWidth, boolean expectedSigned) {
+ Type type =
+ switch (typeName) {
+ case "BYTE" -> Types.ByteType.get();
+ case "SHORT" -> Types.ShortType.get();
+ case "INTEGER" -> Types.IntegerType.get();
+ case "LONG" -> Types.LongType.get();
+ default -> throw new IllegalArgumentException("Unknown type: " +
typeName);
+ };
+
+ ArrowType arrowType = CONVERTER.fromGravitino(type);
+ assertInstanceOf(ArrowType.Int.class, arrowType);
+
+ ArrowType.Int intType = (ArrowType.Int) arrowType;
+ assertEquals(expectedBitWidth, intType.getBitWidth());
+ assertEquals(expectedSigned, intType.getIsSigned());
+ }
+
+ @Test
+ public void testFromGravitinoTimestampWithTz() {
+ Types.TimestampType timestampType = Types.TimestampType.withTimeZone();
+ ArrowType arrowType = CONVERTER.fromGravitino(timestampType);
+ assertInstanceOf(ArrowType.Timestamp.class, arrowType);
+
+ ArrowType.Timestamp tsArrow = (ArrowType.Timestamp) arrowType;
+ assertEquals(TimeUnit.MICROSECOND, tsArrow.getUnit());
+ assertEquals("UTC", tsArrow.getTimezone());
+ }
+
+ @Test
+ public void testExternalTypeConversion() {
+ String expectedColumnName = "col_name";
+ boolean expectedNullable = true;
+ Types.ExternalType externalType =
+ Types.ExternalType.of(
+ "{\"name\":\"col_name\",\"nullable\":true,"
+ + "\"type\":{\"name\":\"largeutf8\"},\"children\":[]}");
+ Field arrowField = CONVERTER.toArrowField(expectedColumnName,
externalType, expectedNullable);
+ assertEquals(expectedColumnName, arrowField.getName());
+ assertEquals(expectedNullable, arrowField.isNullable());
+ assertInstanceOf(ArrowType.LargeUtf8.class,
arrowField.getFieldType().getType());
+
+ externalType =
+ Types.ExternalType.of(
+ "{\"name\":\"col_name\",\"nullable\":true,"
+ + "\"type\":{\"name\":\"largebinary\"},\"children\":[]}");
+ arrowField = CONVERTER.toArrowField(expectedColumnName, externalType,
expectedNullable);
+ assertEquals(expectedColumnName, arrowField.getName());
+ assertEquals(expectedNullable, arrowField.isNullable());
+ assertInstanceOf(ArrowType.LargeBinary.class,
arrowField.getFieldType().getType());
+
+ externalType =
+ Types.ExternalType.of(
+ "{\"name\":\"col_name\",\"nullable\":true,"
+ + "\"type\":{\"name\":\"largelist\"},"
+ + "\"children\":["
+ + "{\"name\":\"element\",\"nullable\":true,"
+ + "\"type\":{\"name\":\"int\", \"bitWidth\":32, \"isSigned\":
true},"
+ + "\"children\":[]}]}");
+ arrowField = CONVERTER.toArrowField(expectedColumnName, externalType,
expectedNullable);
+ assertEquals(expectedColumnName, arrowField.getName());
+ assertEquals(expectedNullable, arrowField.isNullable());
+ assertInstanceOf(ArrowType.LargeList.class,
arrowField.getFieldType().getType());
+
+ externalType =
+ Types.ExternalType.of(
+ "{\"name\":\"col_name\",\"nullable\":true,"
+ + "\"type\":{\"name\":\"fixedsizelist\", \"listSize\":10},"
+ + "\"children\":["
+ + "{\"name\":\"element\",\"nullable\":true,"
+ + "\"type\":{\"name\":\"int\", \"bitWidth\":32, \"isSigned\":
true},"
+ + "\"children\":[]}]}");
+ arrowField = CONVERTER.toArrowField(expectedColumnName, externalType,
expectedNullable);
+ assertEquals(expectedColumnName, arrowField.getName());
+ assertEquals(expectedNullable, arrowField.isNullable());
+ assertInstanceOf(ArrowType.FixedSizeList.class,
arrowField.getFieldType().getType());
+ assertEquals(10, ((ArrowType.FixedSizeList)
arrowField.getFieldType().getType()).getListSize());
+ }
+
+ @ParameterizedTest(name = "[{index}] name={0}, type={1}, nullable={2}")
+ @MethodSource("toArrowFieldArguments")
+ @DisplayName("Test toArrowField for various types")
+ public void testToArrowField(
+ String name, Type gravitinoType, boolean nullable, Consumer<Field>
validator) {
+ Field field = CONVERTER.toArrowField(name, gravitinoType, nullable);
+
+ assertEquals(name, field.getName());
+ assertEquals(nullable, field.isNullable());
+ validator.accept(field);
+ }
+
+ @Test
+ void testUnsupportedTypeThrowsException() {
+ Types.UnparsedType unparsedType = Types.UnparsedType.of("UNKNOWN_TYPE");
+ assertThrows(UnsupportedOperationException.class, () ->
CONVERTER.fromGravitino(unparsedType));
+ }
+
+ @Test
+ void testToGravitinoNotImplemented() {
+ assertThrows(
+ UnsupportedOperationException.class, () ->
CONVERTER.toGravitino(ArrowType.Utf8.INSTANCE));
+ }
+
+ private static Stream<Arguments> toArrowFieldArguments() {
+ return Stream.of(
+ // Simple types
+ Arguments.of("age", Types.IntegerType.get(), true, INT_VALIDATOR),
+ Arguments.of("id", Types.LongType.get(), false, INT_VALIDATOR),
+ Arguments.of("name", Types.StringType.get(), true, STRING_VALIDATOR),
+ Arguments.of(
+ "description",
+ Types.ExternalType.of(
+ "{\n"
+ + " \"name\": \"description\",\n"
+ + " \"nullable\": true,\n"
+ + " \"type\": {\n"
+ + " \"name\": \"largeutf8\"\n"
+ + " }\n"
+ + "}"),
+ true,
+ LARGE_UTF8_VALIDATOR),
+ Arguments.of("active", Types.BooleanType.get(), false,
BOOLEAN_VALIDATOR),
+ // Decimal
+ Arguments.of("price", Types.DecimalType.of(10, 2), false,
DECIMAL_VALIDATOR),
+ // List
+ Arguments.of(
+ "numbers", Types.ListType.of(Types.IntegerType.get(), true),
false, LIST_VALIDATOR),
+ // Map
+ Arguments.of(
+ "properties",
+ Types.MapType.of(Types.StringType.get(), Types.IntegerType.get(),
true),
+ true,
+ MAP_VALIDATOR),
+ // Struct
+ Arguments.of("person", SIMPLE_STRUCT, true, STRUCT_VALIDATOR),
+ // Nested Struct
+ Arguments.of("person_nested", NESTED_STRUCT, false,
NESTED_STRUCT_VALIDATOR),
+ Arguments.of(
+ "person_nested_json",
+ Types.ExternalType.of(NESTED_STRUCT_JSON),
+ false,
+ NESTED_STRUCT_VALIDATOR),
+ // List of Structs
+ Arguments.of("items", LIST_OF_STRUCTS, false,
LIST_OF_STRUCTS_VALIDATOR),
+ // Union
+ Arguments.of(
+ "union_field",
+ Types.UnionType.of(Types.IntegerType.get(),
Types.StringType.get()),
+ true,
+ UNION_VALIDATOR));
+ }
+}
diff --git a/docs/generic-lakehouse-catalog.md
b/docs/generic-lakehouse-catalog.md
new file mode 100644
index 0000000000..35eaeb4660
--- /dev/null
+++ b/docs/generic-lakehouse-catalog.md
@@ -0,0 +1,140 @@
+---
+title: "Lakehouse catalog"
+slug: /lakehouse-catalog
+keywords:
+ - lakehouse
+ - lance
+ - metadata
+license: "This software is licensed under the Apache License version 2."
+---
+
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+## Introduction
+
+TBD.
+
+### Requirements and limitations
+
+TBD.
+
+## Catalog
+
+### Catalog capabilities
+
+TBD.
+
+### Catalog properties
+
+TBD.
+
+### Catalog operations
+
+TBD.
+
+## Schema
+
+### Schema capabilities
+
+TBD.
+
+### Schema properties
+
+TBD.
+
+### Schema operations
+
+Please refer to [Manage Relational Metadata Using
Gravitino](./manage-relational-metadata-using-gravitino.md#schema-operations)
for more details.
+
+## Table
+
+### Table capabilities
+
+TBD.
+
+### Table partitions
+
+TBD.
+
+### Table sort orders
+
+TBD.
+
+### Table distributions
+
+TBD.
+
+### Table column types
+
+Since Lance uses Apache Arrow for its table schemas, the following table shows
the mapping between Gravitino types and Arrow types:
+
+| Gravitino Type | Arrow Type |
+|----------------------------------|-----------------------------------------|
+| `Struct` | `Struct` |
+| `Map` | `Map` |
+| `List` | `List` |
+| `Boolean` | `Boolean` |
+| `Byte` | `Int8` |
+| `Short` | `Int16` |
+| `Integer` | `Int32` |
+| `Long` | `Int64` |
+| `Float` | `Float` |
+| `Double` | `Double` |
+| `String` | `Utf8` |
+| `Binary` | `Binary` |
+| `Decimal(p, s)` | `Decimal(p, s)` (128-bit) |
+| `Date` | `Date` |
+| `Timestamp`/`Timestamp(6)` | `TimestampType Microsecond withoutZone` |
+| `Timestamp(0)` | `TimestampType Second withoutZone` |
+| `Timestamp(3)` | `TimestampType Millisecond withoutZone` |
+| `Timestamp(9)` | `TimestampType Nanosecond withoutZone` |
+| `Timestamp_tz`/`Timestamp_tz(6)` | `TimestampType Microsecond withUtc` |
+| `Timestamp_tz(0)` | `TimestampType Second withUtc` |
+| `Timestamp_tz(3)` | `TimestampType Millisecond withUtc` |
+| `Timestamp_tz(9)` | `TimestampType Nanosecond withUtc` |
+| `Time`/`Time(9)` | `Time Nanosecond` |
+| `Null` | `Null` |
+| `Fixed(n)` | `Fixed-Size Binary(n)` |
+| `Interval_year` | `Interval(YearMonth)` |
+| `Interval_day` | `Duration(Microsecond)` |
+| `External(arrow_field_json_str)` | Any Arrow Field (see note below) |
+
+`External(arrow_field_json_str)`:
+
+As the table above shows, Gravitino provides mappings for most common data
types. However,
+in some cases, you may need to use an Arrow data type that is not directly
supported by Gravitino.
+
+To address this, Gravitino introduces the `External(arrow_field_json_str)`
type,
+which allows you to define any Arrow data type by providing the JSON string of
an Arrow `Field`.
+
+The JSON string must conform to the Apache Arrow `Field`
[specification](https://github.com/apache/arrow-java/blob/ed81e5981a2bee40584b3a411ed755cb4cc5b91f/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java#L80C1-L86C68),
+including details such as the field name, data type, and nullability.
+Here are some examples of how to use the `External` type for various Arrow types
that are not natively supported by Gravitino:
+
+| Arrow Type | External type
|
+|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `Large Utf8` |
`External("{\"name\":\"col_name\",\"nullable\":true,\"type\":{\"name\":\"largeutf8\"},\"children\":[]}")`
|
+| `Large Binary` |
`External("{\"name\":\"col_name\",\"nullable\":true,\"type\":{\"name\":\"largebinary\"},\"children\":[]}")`
|
+| `Large List` |
`External("{\"name\":\"col_name\",\"nullable\":true,\"type\":{\"name\":\"largelist\"},\"children\":[{\"name\":\"element\",\"nullable\":true,\"type\":{\"name\":\"int\",
\"bitWidth\":32, \"isSigned\": true},\"children\":[]}]}")`
|
+| `Fixed-Size List` |
`External("{\"name\":\"col_name\",\"nullable\":true,\"type\":{\"name\":\"fixedsizelist\",
\"listSize\":10},\"children\":[{\"name\":\"element\",\"nullable\":true,\"type\":{\"name\":\"int\",
\"bitWidth\":32, \"isSigned\": true},\"children\":[]}]}")` |
+
+**Important considerations:**
+- The `name` attribute and `nullable` attribute in the JSON string must
exactly match the corresponding column's name and nullability in the Gravitino table.
+- The `children` array should be empty for primitive types. For complex types
like `Struct` or `List`, it must contain the definitions of the child fields.
+
+### Table properties
+
+TBD.
+
+### Table indexes
+
+TBD.
+
+### Table operations
+
+Please refer to [Manage Relational Metadata Using
Gravitino](./manage-relational-metadata-using-gravitino.md#table-operations)
for more details.
+
+## Object store configuration
+
+TBD.