twalthr commented on a change in pull request #10503: [FLINK-15137][avro]
Improve schema derivation for Avro format
URL: https://github.com/apache/flink/pull/10503#discussion_r359216053
##########
File path:
flink-formats/flink-avro/src/main/java/org/apache/flink/formats/avro/typeutils/AvroSchemaConverter.java
##########
@@ -157,4 +209,162 @@ private AvroSchemaConverter() {
}
throw new IllegalArgumentException("Unsupported Avro type '" +
schema.getType() + "'.");
}
+
+ private static LogicalType convertToLogicalType(Schema schema) {
+ return convertToDataType(schema).getLogicalType();
+ }
+
+ private static DataType convertToDataType(Schema schema) {
+ switch (schema.getType()) {
+ case RECORD:
+ final List<Schema.Field> fields =
schema.getFields();
+ final DataTypes.Field[] dataTypeFields = new
DataTypes.Field[fields.size()];
+ for (int i = 0; i < fields.size(); i++) {
+ final Schema.Field field =
fields.get(i);
+ dataTypeFields[i] = DataTypes.FIELD(
+ field.name(),
+
convertToDataType(field.schema()));
+ }
+ return DataTypes.ROW(dataTypeFields);
+ case ENUM:
+ case STRING:
+ // convert Avro's Utf8/CharSequence to String
+ return DataTypes.STRING();
+ case ARRAY:
+ // result type might either be
ObjectArrayTypeInfo or BasicArrayTypeInfo for Strings
+ return
DataTypes.ARRAY(convertToDataType(schema.getElementType()));
+ case MAP:
+ return DataTypes.MAP(DataTypes.STRING(),
convertToDataType(schema.getValueType()));
+ case UNION:
+ final Schema actualSchema;
+ if (schema.getTypes().size() == 2 &&
schema.getTypes().get(0).getType() == Schema.Type.NULL) {
+ actualSchema = schema.getTypes().get(1);
+ } else if (schema.getTypes().size() == 2 &&
schema.getTypes().get(1).getType() == Schema.Type.NULL) {
+ actualSchema = schema.getTypes().get(0);
+ } else if (schema.getTypes().size() == 1) {
+ actualSchema = schema.getTypes().get(0);
+ } else {
+ // use Kryo for serialization
+ return
DataTypes.RAW(Types.GENERIC(Object.class));
+ }
+ return convertToDataType(actualSchema);
+ case FIXED:
+ case BYTES:
+ // logical decimal type
+ if (schema.getLogicalType() instanceof
LogicalTypes.Decimal) {
+ LogicalTypes.Decimal decimalType =
(LogicalTypes.Decimal) schema.getLogicalType();
+ return
DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale());
+ }
+ // convert fixed size binary data to primitive
byte arrays
+ return DataTypes.BYTES();
+ case INT:
+ // logical date and time type
+ final org.apache.avro.LogicalType logicalType =
schema.getLogicalType();
+ if (logicalType == LogicalTypes.date()) {
+ return DataTypes.DATE();
+ } else if (logicalType ==
LogicalTypes.timeMillis()) {
+ return DataTypes.TIME(3);
+ }
+ return DataTypes.INT();
+ case LONG:
+ // logical timestamp type
+ if (schema.getLogicalType() ==
LogicalTypes.timestampMillis()) {
+ return DataTypes.TIMESTAMP(3);
+ }
+ return DataTypes.BIGINT();
+ case FLOAT:
+ return DataTypes.FLOAT();
+ case DOUBLE:
+ return DataTypes.DOUBLE();
+ case BOOLEAN:
+ return DataTypes.BOOLEAN();
+ case NULL:
+ return DataTypes.NULL();
+ }
+ throw new IllegalArgumentException("Unsupported Avro type '" +
schema.getType() + "'.");
+ }
+
+ private static Schema convertToSchema(LogicalType logicalType, int
rowTypeCounter) {
+ switch (logicalType.getTypeRoot()) {
+ case NULL:
+ return SchemaBuilder.builder().nullType();
+ case BOOLEAN:
+ return
SchemaBuilder.builder().nullable().booleanType();
+ case INTEGER:
+ return
SchemaBuilder.builder().nullable().intType();
+ case BIGINT:
+ return
SchemaBuilder.builder().nullable().longType();
+ case FLOAT:
+ return
SchemaBuilder.builder().nullable().floatType();
+ case DOUBLE:
+ return
SchemaBuilder.builder().nullable().doubleType();
+ case CHAR:
+ case VARCHAR:
+ return
SchemaBuilder.builder().nullable().stringType();
+ case BINARY:
+ case VARBINARY:
+ return
SchemaBuilder.builder().nullable().bytesType();
+ case TIMESTAMP_WITHOUT_TIME_ZONE:
+ // use long to represents Timestamp
+ return
LogicalTypes.timestampMillis().addToSchema(SchemaBuilder.builder().longType());
+ case DATE:
+ // use int to represents Date
+ return
LogicalTypes.date().addToSchema(SchemaBuilder.builder().intType());
+ case TIME_WITHOUT_TIME_ZONE:
+ // use int to represents Time, we only support
millisecond when deserialization
+ return
LogicalTypes.timeMillis().addToSchema(SchemaBuilder.builder().intType());
Review comment:
Can't we support more than millisecond precision now?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
With regards,
Apache Git Services