Guosmilesmile commented on code in PR #15633:
URL: https://github.com/apache/iceberg/pull/15633#discussion_r2934787613
##########
data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java:
##########
@@ -317,6 +314,305 @@ void
testPositionDeleteWriterEngineWriteGenericRead(FileFormat fileFormat) throw
DataTestHelpers.assertEquals(positionDeleteSchema.asStruct(), records,
readRecords);
}
+ @ParameterizedTest
+ @FieldSource("FORMAT_AND_GENERATOR")
+ /** Write with Generic Record, read with projected engine type T (narrow
schema) */
+ void testReaderBuilderProjection(FileFormat fileFormat, DataGenerator
dataGenerator)
+ throws IOException {
+ Schema fullSchema = dataGenerator.schema();
+
+ List<Types.NestedField> columns = fullSchema.columns();
+ Schema projectedSchema = new Schema(columns.get(columns.size() - 1));
+
+ List<Record> genericRecords = dataGenerator.generateRecords();
+ writeGenericRecords(fileFormat, fullSchema, genericRecords);
+
+ List<Record> projectedGenericRecords = projectRecords(genericRecords,
projectedSchema);
+ List<T> expectedEngineRecords =
+ convertToEngineRecords(projectedGenericRecords, projectedSchema);
+
+ InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile();
+ List<T> readRecords;
+ try (CloseableIterable<T> reader =
+ FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile)
+ .project(projectedSchema)
+ .engineProjection(engineSchema(projectedSchema))
+ .build()) {
+ readRecords = ImmutableList.copyOf(reader);
+ }
+
+ assertEquals(projectedSchema, expectedEngineRecords, readRecords);
+ }
+
+ @ParameterizedTest
+ @FieldSource("FORMAT_AND_GENERATOR")
+ void testReaderBuilderFilter(FileFormat fileFormat, DataGenerator
dataGenerator)
+ throws IOException {
+
+ // Avro does not support filter push down
+ // Skip this test for Avro to avoid false failures.
+ assumeThat(fileFormat != FileFormat.AVRO).isTrue();
+
+ Schema schema = dataGenerator.schema();
+
+ List<Record> genericRecords = dataGenerator.generateRecords();
+ writeGenericRecords(fileFormat, schema, genericRecords);
+
+ // Construct a filter condition that is smaller than the minimum value to
achieve file-level
+ // filtering.
+ Types.NestedField firstField = schema.columns().get(0);
+ Expression filter = filterFieldExpression(firstField, schema,
genericRecords);
+
+ InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile();
+ List<T> readRecords;
+ try (CloseableIterable<T> reader =
+ FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile)
+ .project(schema)
+ .engineProjection(engineSchema(schema))
+ .filter(filter)
+ .build()) {
+ readRecords = ImmutableList.copyOf(reader);
+ }
+
+ assertThat(readRecords).isEmpty();
+ }
+
+ @ParameterizedTest
+ @FieldSource("FORMAT_AND_GENERATOR")
+ /**
+ * Write with Generic Record, then read using an upper-cased column name in
the filter to verify
+ * caseSensitive behavior.
+ */
+ void testReaderBuilderCaseSensitive(FileFormat fileFormat, DataGenerator
dataGenerator)
+ throws IOException {
+
+ // Avro does not support filter push down; caseSensitive has no effect on
it.
+ // Skip this test for Avro to avoid false failures.
+ assumeThat(fileFormat != FileFormat.AVRO).isTrue();
+
+ Schema schema = dataGenerator.schema();
+
+ List<Record> genericRecords = dataGenerator.generateRecords();
+ writeGenericRecords(fileFormat, schema, genericRecords);
+
+ // Build a filter using the upper-cased name of the first column.
+ Types.NestedField firstField = schema.columns().get(0);
+ Object filterValue = genericRecords.get(0).getField(firstField.name());
+ Expression upperCaseFilter =
Expressions.equal(firstField.name().toUpperCase(), filterValue);
+
+ InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile();
+
+ // caseSensitive=false: upper-cased column name must be resolved correctly.
+ List<T> readRecords;
+ try (CloseableIterable<T> reader =
+ FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile)
+ .project(schema)
+ .engineProjection(engineSchema(schema))
+ .filter(upperCaseFilter)
+ .caseSensitive(false)
+ .build()) {
+ readRecords = ImmutableList.copyOf(reader);
+ }
+
+ assertThat(readRecords).isNotEmpty();
+
+ // caseSensitive=true: upper-cased column name cannot be resolved → must
throw.
+ assertThatThrownBy(
+ () -> {
+ try (CloseableIterable<T> reader =
+ FormatModelRegistry.readBuilder(fileFormat, engineType(),
inputFile)
+ .project(schema)
+ .engineProjection(engineSchema(schema))
+ .filter(upperCaseFilter)
+ .caseSensitive(true)
+ .build()) {
+ ImmutableList.copyOf(reader);
+ }
+ })
+ .isInstanceOf(ValidationException.class);
+ }
+
+ @ParameterizedTest
+ @FieldSource("FORMAT_AND_GENERATOR")
+ /**
+ * Write with Generic Record, then read using split to verify that the split
range is respected.
+ * Reading with a zero-length split at the end of the file should return no
records, while reading
+ * with the full file range should return all records.
+ */
+ void testReaderBuilderSplit(FileFormat fileFormat, DataGenerator
dataGenerator)
+ throws IOException {
+ Schema schema = dataGenerator.schema();
+
+ List<Record> genericRecords = dataGenerator.generateRecords();
+ writeGenericRecords(fileFormat, schema, genericRecords);
+
+ InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile();
+ long fileLength = inputFile.getLength();
+
+ // split(fileLength, 0): empty range at the end of the file → no records
should be returned
+ List<T> emptyReadRecords;
+ try (CloseableIterable<T> reader =
+ FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile)
+ .project(schema)
+ .engineProjection(engineSchema(schema))
+ .split(fileLength, 0)
+ .build()) {
+ emptyReadRecords = ImmutableList.copyOf(reader);
+ }
+
+ assertThat(emptyReadRecords).isEmpty();
+
+ // split(0, fileLength): full file range → all records should be returned
+ List<T> fullReadRecords;
+ try (CloseableIterable<T> reader =
+ FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile)
+ .project(schema)
+ .engineProjection(engineSchema(schema))
+ .split(0, fileLength)
+ .build()) {
+ fullReadRecords = ImmutableList.copyOf(reader);
+ }
+
+ assertEquals(schema, convertToEngineRecords(genericRecords, schema),
fullReadRecords);
+ }
+
+ @ParameterizedTest
+ @FieldSource("FORMAT_AND_GENERATOR")
+ /**
+ * Verifies the contract of recordsPerBatch: recordsPerBatch is a hint for
vectorized readers. The
+ * total number of records returned must be unaffected regardless of the
batch size value.
+ */
+ void testReaderBuilderRecordsPerBatch(FileFormat fileFormat, DataGenerator
dataGenerator)
+ throws IOException {
+
+ // Avro does not support batch reading.
+ assumeThat(fileFormat != FileFormat.AVRO).isTrue();
Review Comment:
https://github.com/apache/iceberg/blob/a50c2d920aadbb79f978bac10d1271bc4aa63a1a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java#L251-L253
Avro doesn't support recordsPerBatch.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]