This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch parquet-1.14.x
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/parquet-1.14.x by this push:
new 0f3a615ac GH-2956: Use avro SchemaBuilder API to convert record (#2957)
0f3a615ac is described below
commit 0f3a615acb06dd8ab201f37e485aaac619e467ba
Author: Michel Davit <[email protected]>
AuthorDate: Wed Jul 24 16:43:31 2024 +0200
GH-2956: Use avro SchemaBuilder API to convert record (#2957)
The Avro SchemaBuilder API is cleaner and more stable. Using it decreases the
chance of relying on newly introduced Avro APIs when users run with a legacy
Avro version.
As OPTIONAL converted fields set null as the default, increase consistency
by using [] as the default for REPEATED converted fields.
---
.../apache/parquet/avro/AvroSchemaConverter.java | 21 +++---
.../parquet/avro/TestAvroSchemaConverter.java | 77 +++++++++++-----------
2 files changed, 51 insertions(+), 47 deletions(-)
diff --git
a/parquet-avro/src/main/java/org/apache/parquet/avro/AvroSchemaConverter.java
b/parquet-avro/src/main/java/org/apache/parquet/avro/AvroSchemaConverter.java
index d5f85ce44..ffaa07683 100644
---
a/parquet-avro/src/main/java/org/apache/parquet/avro/AvroSchemaConverter.java
+++
b/parquet-avro/src/main/java/org/apache/parquet/avro/AvroSchemaConverter.java
@@ -20,7 +20,6 @@ package org.apache.parquet.avro;
import static java.util.Optional.empty;
import static java.util.Optional.of;
-import static org.apache.avro.JsonProperties.NULL_VALUE;
import static org.apache.parquet.avro.AvroReadSupport.READ_INT96_AS_FIXED;
import static
org.apache.parquet.avro.AvroReadSupport.READ_INT96_AS_FIXED_DEFAULT;
import static org.apache.parquet.avro.AvroWriteSupport.WRITE_FIXED_AS_INT96;
@@ -58,6 +57,7 @@ import java.util.Set;
import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
+import org.apache.avro.SchemaBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.conf.HadoopParquetConfiguration;
import org.apache.parquet.conf.ParquetConfiguration;
@@ -296,21 +296,24 @@ public class AvroSchemaConverter {
}
private Schema convertFields(String name, List<Type> parquetFields,
Map<String, Integer> names) {
- String ns = namespace(name, names);
- List<Schema.Field> fields = new ArrayList<Schema.Field>();
+ SchemaBuilder.FieldAssembler<Schema> builder =
+ SchemaBuilder.builder(namespace(name, names)).record(name).fields();
for (Type parquetType : parquetFields) {
Schema fieldSchema = convertField(parquetType, names);
if (parquetType.isRepetition(REPEATED)) { // If a repeated field is
ungrouped, treat as REQUIRED per spec
- fields.add(new Schema.Field(parquetType.getName(),
Schema.createArray(fieldSchema)));
+ builder.name(parquetType.getName())
+ .type()
+ .array()
+ .items()
+ .type(fieldSchema)
+ .arrayDefault(new ArrayList<>());
} else if (parquetType.isRepetition(Type.Repetition.OPTIONAL)) {
- fields.add(new Schema.Field(parquetType.getName(),
optional(fieldSchema), null, NULL_VALUE));
+
builder.name(parquetType.getName()).type().optional().type(fieldSchema);
} else { // REQUIRED
- fields.add(new Schema.Field(parquetType.getName(), fieldSchema, null,
(Object) null));
+ builder.name(parquetType.getName()).type(fieldSchema).noDefault();
}
}
- Schema schema = Schema.createRecord(name, null, ns, false);
- schema.setFields(fields);
- return schema;
+ return builder.endRecord();
}
private Schema convertField(final Type parquetType, Map<String, Integer>
names) {
diff --git
a/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java
b/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java
index 3b8d5551e..6965c92ff 100644
---
a/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java
+++
b/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java
@@ -72,43 +72,43 @@ public class TestAvroSchemaConverter {
NEW_BEHAVIOR.setBoolean("parquet.avro.write-old-list-structure", false);
}
- public static final String ALL_PARQUET_SCHEMA =
- "message org.apache.parquet.avro.myrecord {\n" + " required boolean
myboolean;\n"
- + " required int32 myint;\n"
- + " required int64 mylong;\n"
- + " required float myfloat;\n"
- + " required double mydouble;\n"
- + " required binary mybytes;\n"
- + " required binary mystring (UTF8);\n"
- + " required group mynestedrecord {\n"
- + " required int32 mynestedint;\n"
- + " }\n"
- + " required binary myenum (ENUM);\n"
- + " required group myarray (LIST) {\n"
- + " repeated int32 array;\n"
- + " }\n"
- + " optional group myoptionalarray (LIST) {\n"
- + " repeated int32 array;\n"
- + " }\n"
- + " required group myarrayofoptional (LIST) {\n"
- + " repeated group list {\n"
- + " optional int32 element;\n"
- + " }\n"
- + " }\n"
- + " required group myrecordarray (LIST) {\n"
- + " repeated group array {\n"
- + " required int32 a;\n"
- + " required int32 b;\n"
- + " }\n"
- + " }\n"
- + " required group mymap (MAP) {\n"
- + " repeated group map (MAP_KEY_VALUE) {\n"
- + " required binary key (UTF8);\n"
- + " required int32 value;\n"
- + " }\n"
- + " }\n"
- + " required fixed_len_byte_array(1) myfixed;\n"
- + "}\n";
+ public static final String ALL_PARQUET_SCHEMA = "message
org.apache.parquet.avro.myrecord {\n"
+ + " required boolean myboolean;\n"
+ + " required int32 myint;\n"
+ + " required int64 mylong;\n"
+ + " required float myfloat;\n"
+ + " required double mydouble;\n"
+ + " required binary mybytes;\n"
+ + " required binary mystring (UTF8);\n"
+ + " required group mynestedrecord {\n"
+ + " required int32 mynestedint;\n"
+ + " }\n"
+ + " required binary myenum (ENUM);\n"
+ + " required group myarray (LIST) {\n"
+ + " repeated int32 array;\n"
+ + " }\n"
+ + " optional group myoptionalarray (LIST) {\n"
+ + " repeated int32 array;\n"
+ + " }\n"
+ + " required group myarrayofoptional (LIST) {\n"
+ + " repeated group list {\n"
+ + " optional int32 element;\n"
+ + " }\n"
+ + " }\n"
+ + " required group myrecordarray (LIST) {\n"
+ + " repeated group array {\n"
+ + " required int32 a;\n"
+ + " required int32 b;\n"
+ + " }\n"
+ + " }\n"
+ + " required group mymap (MAP) {\n"
+ + " repeated group map (MAP_KEY_VALUE) {\n"
+ + " required binary key (UTF8);\n"
+ + " required int32 value;\n"
+ + " }\n"
+ + " }\n"
+ + " required fixed_len_byte_array(1) myfixed;\n"
+ + "}\n";
private void testAvroToParquetConversion(Schema avroSchema, String
schemaString) throws Exception {
testAvroToParquetConversion(new Configuration(false), avroSchema,
schemaString);
@@ -432,7 +432,8 @@ public class TestAvroSchemaConverter {
+ " \"name\": \"SchemaWithRepeatedField\","
+ " \"fields\": [{"
+ " \"name\": \"repeatedField\","
- + " \"type\": {\"type\": \"array\",\"items\": \"int\"}"
+ + " \"type\": {\"type\": \"array\",\"items\": \"int\"},"
+ + " \"default\": []"
+ " }]"
+ "}"),
"message SchemaWithRepeatedField { repeated int32 repeatedField; }");