joyhaldar commented on code in PR #16638:
URL: https://github.com/apache/iceberg/pull/16638#discussion_r3361409871
##########
data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java:
##########
@@ -697,48 +705,420 @@ void testReaderBuilderReuseContainers(FileFormat
fileFormat) throws IOException
@ParameterizedTest
@FieldSource("FILE_FORMATS")
- void testReaderSchemaEvolutionNewColumnWithDefault(FileFormat fileFormat)
throws IOException {
+ void testDefaultValues(FileFormat fileFormat) throws IOException {
+ assumeSupports(fileFormat, FEATURE_READER_DEFAULT);
+
+ Types.NestedField idField = Types.NestedField.required(1, "id",
Types.LongType.get());
+
+ Types.NestedField dataField =
+ Types.NestedField.optional("data")
+ .withId(2)
+ .ofType(Types.StringType.get())
+ .withInitialDefault(Literal.of("wrong!"))
+ .withDoc("Should not produce default value")
+ .build();
+
+ Schema writeSchema = new Schema(idField, dataField);
+
+ List<Record> genericRecords = RandomGenericData.generate(writeSchema, 10,
1L);
+ writeGenericRecords(fileFormat, writeSchema, genericRecords);
+
+ Schema expectedSchema =
+ new Schema(
+ idField,
+ dataField,
+ Types.NestedField.required("missing_str")
+ .withId(6)
+ .ofType(Types.StringType.get())
+ .withInitialDefault(Literal.of("orange"))
+ .build(),
+ Types.NestedField.optional("missing_int")
+ .withId(7)
+ .ofType(Types.IntegerType.get())
+ .withInitialDefault(Literal.of(34))
+ .build());
+
+ readAndAssertEngineRecords(
+ fileFormat,
+ expectedSchema,
+ genericRecords,
+ record -> {
+ Record expected = GenericRecord.create(expectedSchema);
+ expected.setField("id", record.getField("id"));
+ expected.setField("data", record.getField("data"));
+ expected.setField("missing_str", "orange");
+ expected.setField("missing_int", 34);
+
+ return expected;
+ });
+ }
+ @ParameterizedTest
+ @FieldSource("FILE_FORMATS")
+ void testNullDefaultValue(FileFormat fileFormat) throws IOException {
assumeSupports(fileFormat, FEATURE_READER_DEFAULT);
- DataGenerator dataGenerator = new DataGenerators.DefaultSchema();
- Schema writeSchema = dataGenerator.schema();
- List<Record> genericRecords = dataGenerator.generateRecords();
+ Types.NestedField idField = Types.NestedField.required(1, "id",
Types.LongType.get());
+ Types.NestedField dataField =
+ Types.NestedField.optional("data")
+ .withId(2)
+ .ofType(Types.StringType.get())
+ .withInitialDefault(Literal.of("wrong!"))
+ .withDoc("Should not produce default value")
+ .build();
+
+ Schema writeSchema = new Schema(idField, dataField);
+
+ List<Record> genericRecords = RandomGenericData.generate(writeSchema, 10,
1L);
writeGenericRecords(fileFormat, writeSchema, genericRecords);
- String defaultStringValue = "default_value";
- int defaultIntValue = 42;
+ Schema expectedSchema =
+ new Schema(
+ idField,
+ dataField,
+ Types.NestedField.optional("missing_date")
+ .withId(3)
+ .ofType(Types.DateType.get())
+ .build());
- int maxFieldId =
-
writeSchema.columns().stream().mapToInt(Types.NestedField::fieldId).max().orElse(0);
+ readAndAssertEngineRecords(
+ fileFormat,
+ expectedSchema,
+ genericRecords,
+ record -> {
+ Record expected = GenericRecord.create(expectedSchema);
+ expected.setField("id", record.getField("id"));
+ expected.setField("data", record.getField("data"));
+ expected.setField("missing_date", null);
- List<Types.NestedField> evolvedColumns =
Lists.newArrayList(writeSchema.columns());
- evolvedColumns.add(
- Types.NestedField.required("col_f")
- .withId(maxFieldId + 1)
+ return expected;
+ });
+ }
+
+ @ParameterizedTest
+ @FieldSource("FILE_FORMATS")
+ void testNestedDefaultValue(FileFormat fileFormat) throws IOException {
+ assumeSupports(fileFormat, FEATURE_READER_DEFAULT);
+
+ Types.NestedField idField = Types.NestedField.required(1, "id",
Types.LongType.get());
+ Types.NestedField dataField =
+ Types.NestedField.optional("data")
+ .withId(2)
.ofType(Types.StringType.get())
- .withInitialDefault(Literal.of(defaultStringValue))
- .build());
- evolvedColumns.add(
- Types.NestedField.optional("col_g")
- .withId(maxFieldId + 2)
- .ofType(Types.IntegerType.get())
- .withInitialDefault(Literal.of(defaultIntValue))
- .build());
+ .withInitialDefault(Literal.of("wrong!"))
+ .withDoc("Should not produce default value")
+ .build();
+
+ Schema writeSchema =
+ new Schema(
+ idField,
+ dataField,
+ Types.NestedField.optional("nested")
+ .withId(3)
+ .ofType(
+ Types.StructType.of(
+ Types.NestedField.required(4, "inner",
Types.StringType.get())))
+ .withDoc("Used to test nested field defaults")
+ .build());
+
+ List<Record> genericRecords = RandomGenericData.generate(writeSchema, 10,
1L);
+ writeGenericRecords(fileFormat, writeSchema, genericRecords);
+
+ Schema expectedSchema =
+ new Schema(
+ idField,
+ dataField,
+ Types.NestedField.optional("nested")
+ .withId(3)
+ .ofType(
+ Types.StructType.of(
+ Types.NestedField.required(4, "inner",
Types.StringType.get()),
+ Types.NestedField.optional("missing_inner_float")
+ .withId(5)
+ .ofType(Types.FloatType.get())
+ .withInitialDefault(Literal.of(-0.0F))
+ .build()))
+ .withDoc("Used to test nested field defaults")
+ .build());
- Schema evolvedSchema = new Schema(evolvedColumns);
- readAndAssertGenericRecords(
+ readAndAssertEngineRecords(
fileFormat,
- evolvedSchema,
+ expectedSchema,
genericRecords,
record -> {
- Record expected = copy(record, writeSchema, evolvedSchema);
- expected.setField("col_f", defaultStringValue);
- expected.setField("col_g", defaultIntValue);
+ Record expected = copy(record, writeSchema, expectedSchema);
+
+ Record writtenNested = (Record) record.getField("nested");
+ if (writtenNested != null) {
+ Record expectedNested =
+
GenericRecord.create(expectedSchema.findField("nested").type().asStructType());
+ expectedNested.setField("inner", writtenNested.getField("inner"));
+ expectedNested.setField("missing_inner_float", -0.0F);
+ expected.setField("nested", expectedNested);
+ }
+
return expected;
});
}
+ @ParameterizedTest
+ @FieldSource("FILE_FORMATS")
+ void testMapNestedDefaultValue(FileFormat fileFormat) throws IOException {
+ assumeSupports(fileFormat, FEATURE_READER_DEFAULT);
+
+ Types.NestedField idField = Types.NestedField.required(1, "id",
Types.LongType.get());
+ Types.NestedField dataField =
+ Types.NestedField.optional("data")
+ .withId(2)
+ .ofType(Types.StringType.get())
+ .withInitialDefault(Literal.of("wrong!"))
+ .withDoc("Should not produce default value")
+ .build();
+
+ Schema writeSchema =
+ new Schema(
+ idField,
+ dataField,
+ Types.NestedField.optional("nested_map")
+ .withId(3)
+ .ofType(
+ Types.MapType.ofOptional(
+ 4,
+ 5,
+ Types.StringType.get(),
+ Types.StructType.of(
+ Types.NestedField.required(6, "value_str",
Types.StringType.get()))))
+ .withDoc("Used to test nested map value field defaults")
+ .build());
+
+ List<Record> genericRecords = RandomGenericData.generate(writeSchema, 10,
1L);
+ writeGenericRecords(fileFormat, writeSchema, genericRecords);
+
+ Schema expectedSchema =
+ new Schema(
+ idField,
+ dataField,
+ Types.NestedField.optional("nested_map")
+ .withId(3)
+ .ofType(
+ Types.MapType.ofOptional(
+ 4,
+ 5,
+ Types.StringType.get(),
+ Types.StructType.of(
+ Types.NestedField.required(6, "value_str",
Types.StringType.get()),
+ Types.NestedField.optional("value_int")
+ .withId(7)
+ .ofType(Types.IntegerType.get())
+ .withInitialDefault(Literal.of(34))
+ .build())))
+ .withDoc("Used to test nested field defaults")
+ .build());
+
+ readAndAssertEngineRecords(
+ fileFormat,
+ expectedSchema,
+ genericRecords,
+ record -> {
+ Record expected = copy(record, writeSchema, expectedSchema);
+
+ @SuppressWarnings("unchecked")
+ Map<Object, Record> writtenMap = (Map<Object, Record>)
expected.getField("nested_map");
+ if (writtenMap != null) {
+ Types.StructType valueType =
+ expectedSchema
+ .findField("nested_map")
+ .type()
+ .asMapType()
+ .valueType()
+ .asStructType();
+ Map<Object, Record> rebuilt = Maps.newLinkedHashMap();
+ writtenMap.forEach(
+ (key, value) ->
+ rebuilt.put(
+ key,
+ value == null
+ ? null
+ : GenericRecord.create(valueType)
+ .copy("value_str",
value.getField("value_str"), "value_int", 34)));
+ expected.setField("nested_map", rebuilt);
+ }
+
+ return expected;
+ });
+ }
+
+ @ParameterizedTest
+ @FieldSource("FILE_FORMATS")
+ void testListNestedDefaultValue(FileFormat fileFormat) throws IOException {
+ assumeSupports(fileFormat, FEATURE_READER_DEFAULT);
+
+ Types.NestedField idField = Types.NestedField.required(1, "id",
Types.LongType.get());
+ Types.NestedField dataField =
+ Types.NestedField.optional("data")
+ .withId(2)
+ .ofType(Types.StringType.get())
+ .withInitialDefault(Literal.of("wrong!"))
+ .withDoc("Should not produce default value")
+ .build();
+
+ Schema writeSchema =
+ new Schema(
+ idField,
+ dataField,
+ Types.NestedField.optional("nested_list")
+ .withId(3)
+ .ofType(
+ Types.ListType.ofOptional(
+ 4,
+ Types.StructType.of(
+ Types.NestedField.required(5, "element_str",
Types.StringType.get()))))
+ .withDoc("Used to test nested field defaults")
+ .build());
+
+ List<Record> genericRecords = RandomGenericData.generate(writeSchema, 10,
1L);
+ writeGenericRecords(fileFormat, writeSchema, genericRecords);
+
+ Schema expectedSchema =
+ new Schema(
+ idField,
+ dataField,
+ Types.NestedField.optional("nested_list")
+ .withId(3)
+ .ofType(
+ Types.ListType.ofOptional(
+ 4,
+ Types.StructType.of(
+ Types.NestedField.required(5, "element_str",
Types.StringType.get()),
+ Types.NestedField.optional("element_int")
+ .withId(7)
+ .ofType(Types.IntegerType.get())
+ .withInitialDefault(Literal.of(34))
+ .build())))
+ .withDoc("Used to test nested field defaults")
+ .build());
+
+ readAndAssertEngineRecords(
+ fileFormat,
+ expectedSchema,
+ genericRecords,
+ record -> {
+ Record expected = copy(record, writeSchema, expectedSchema);
+
+ @SuppressWarnings("unchecked")
+ List<Record> writtenList = (List<Record>)
expected.getField("nested_list");
+ if (writtenList != null) {
+ Types.StructType elementType =
+ expectedSchema
+ .findField("nested_list")
+ .type()
+ .asListType()
+ .elementType()
+ .asStructType();
+ List<Record> rebuilt =
+ writtenList.stream()
+ .map(
+ element ->
+ element == null
+ ? null
+ : GenericRecord.create(elementType)
+ .copy(
+ "element_str",
+ element.getField("element_str"),
+ "element_int",
+ 34))
+ .collect(Collectors.toList());
+ expected.setField("nested_list", rebuilt);
+ }
+
+ return expected;
+ });
+ }
+
+ @ParameterizedTest
+ @FieldSource("FILE_FORMATS")
+ void testMissingRequiredWithoutDefault(FileFormat fileFormat) throws
IOException {
+ assumeSupports(fileFormat, FEATURE_READER_DEFAULT);
+
+ Schema writeSchema = new Schema(Types.NestedField.required(1, "id",
Types.LongType.get()));
+
+ List<Record> genericRecords = RandomGenericData.generate(writeSchema, 10,
1L);
+ writeGenericRecords(fileFormat, writeSchema, genericRecords);
+
+ Schema expectedSchema =
+ new Schema(
+ Types.NestedField.required(1, "id", Types.LongType.get()),
+ Types.NestedField.required("missing_str")
+ .withId(6)
+ .ofType(Types.StringType.get())
+ .withDoc("Missing required field with no default")
+ .build());
+
+ assertThatThrownBy(
+ () -> readAndAssertGenericRecords(fileFormat, expectedSchema,
genericRecords))
+ .isInstanceOf(IllegalArgumentException.class)
+ .hasMessage("Missing required field: missing_str");
+ }
+
+ @ParameterizedTest
+ @FieldSource("FILE_FORMATS")
+ void testPrimitiveDefaultValues(FileFormat fileFormat) throws IOException {
+ assumeSupports(fileFormat, FEATURE_READER_DEFAULT);
+
+ Schema writeSchema = DataGenerators.PrimitiveDefaults.WRITE_SCHEMA;
+ Schema readSchema =
+ supportsTime()
+ ? DataGenerators.PrimitiveDefaults.READ_SCHEMA
+ : TypeUtil.selectNot(
+ DataGenerators.PrimitiveDefaults.READ_SCHEMA,
+ Set.of(
+ DataGenerators.PrimitiveDefaults.READ_SCHEMA
+ .findField("time_with_default")
+ .fieldId()));
+
+ List<Record> sourceRecords = RandomGenericData.generate(writeSchema, 10,
1L);
+ writeGenericRecords(fileFormat, writeSchema, sourceRecords);
+
+ readAndAssertEngineRecords(
+ fileFormat,
+ readSchema,
+ sourceRecords,
+ record -> {
+ Record expected = GenericRecord.create(readSchema);
+ expected.setField("id", record.getField("id"));
+ for (Types.NestedField field : readSchema.columns()) {
+ if (!field.name().equals("id")) {
+ expected.setField(
+ field.name(),
+ GenericDataUtil.internalToGeneric(field.type(),
field.initialDefault()));
+ }
+ }
+
+ return expected;
+ });
+ }
+
+ @ParameterizedTest
+ @FieldSource("FILE_FORMATS")
+ void testPrimitiveDefaultValuesNotApplied(FileFormat fileFormat) throws
IOException {
+ assumeSupports(fileFormat, FEATURE_READER_DEFAULT);
+
+ Schema readSchema =
+ supportsTime()
+ ? DataGenerators.PrimitiveDefaults.READ_SCHEMA
+ : TypeUtil.selectNot(
+ DataGenerators.PrimitiveDefaults.READ_SCHEMA,
+ Set.of(
+ DataGenerators.PrimitiveDefaults.READ_SCHEMA
+ .findField("time_with_default")
+ .fieldId()));
+
+ List<Record> sourceRecords = RandomGenericData.generate(readSchema, 10,
1L);
+ writeGenericRecords(fileFormat, readSchema, sourceRecords);
+
+ readAndAssertEngineRecords(fileFormat, readSchema, sourceRecords, record
-> record);
Review Comment:
Extracted into `primitiveDefaultsReadSchema()`, shared by both tests.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]