pvary commented on code in PR #15633:
URL: https://github.com/apache/iceberg/pull/15633#discussion_r2965357952
##########
data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java:
##########
@@ -317,7 +321,395 @@ void
testPositionDeleteWriterEngineWriteGenericRead(FileFormat fileFormat) throw
DataTestHelpers.assertEquals(positionDeleteSchema.asStruct(), records,
readRecords);
}
+ /** Write with Generic Record, read with projected engine type T (narrow
schema) */
+ @ParameterizedTest
+ @FieldSource("FILE_FORMATS")
+ void testReaderBuilderProjection(FileFormat fileFormat) throws IOException {
+ DataGenerator dataGenerator = new DataGenerators.DefaultSchema();
+ Schema fullSchema = dataGenerator.schema();
+
+ List<Types.NestedField> columns = fullSchema.columns();
+ List<Types.NestedField> projectedColumns =
+ IntStream.range(0, columns.size()).filter(i -> i % 2 ==
1).mapToObj(columns::get).toList();
+ if (projectedColumns.isEmpty()) {
+ projectedColumns = ImmutableList.of(columns.get(columns.size() - 1));
+ }
+
+ Schema projectedSchema = new Schema(projectedColumns);
+
+ List<Record> genericRecords = dataGenerator.generateRecords();
+ writeGenericRecords(fileFormat, fullSchema, genericRecords);
+
+ List<Record> projectedGenericRecords = projectRecords(genericRecords,
projectedSchema);
+ List<T> expectedEngineRecords =
+ convertToEngineRecords(projectedGenericRecords, projectedSchema);
+
+ InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile();
+ List<T> readRecords;
+ try (CloseableIterable<T> reader =
+ FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile)
+ .project(projectedSchema)
+ .engineProjection(engineSchema(projectedSchema))
+ .build()) {
+ readRecords = ImmutableList.copyOf(reader);
+ }
+
+ assertEquals(projectedSchema, expectedEngineRecords, readRecords);
+ }
+
+ @ParameterizedTest
+ @FieldSource("FILE_FORMATS")
+ void testReaderBuilderFilter(FileFormat fileFormat) throws IOException {
+
+ assumeSupports(fileFormat, "filter");
+
+ DataGenerator dataGenerator = new DataGenerators.DefaultSchema();
+ Schema schema = dataGenerator.schema();
+
+ List<Record> genericRecords = dataGenerator.generateRecords(10000);
+ writeRecordsForSplit(fileFormat, schema, genericRecords);
+
+    // Construct a filter whose comparison value is smaller than the column's
+    // minimum value, so that file-level filtering prunes every record.
+ Types.NestedField firstField = schema.columns().get(0);
+ Object minValue = minFilterField(firstField, genericRecords);
+ Expression lessThanFilter = Expressions.lessThan(firstField.name(),
minValue);
+
+ InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile();
+ List<T> readRecords;
+ try (CloseableIterable<T> reader =
+ FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile)
+ .project(schema)
+ .engineProjection(engineSchema(schema))
+ .filter(lessThanFilter)
+ .build()) {
+ readRecords = ImmutableList.copyOf(reader);
+ }
+
+ assertThat(readRecords).isEmpty();
+
+ Object maxFilterField = maxFilterField(firstField, genericRecords);
+ Expression greaterThanFilter =
+ Expressions.greaterThanOrEqual(firstField.name(), maxFilterField);
+
+ try (CloseableIterable<T> reader =
+ FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile)
+ .project(schema)
+ .engineProjection(engineSchema(schema))
+ .filter(greaterThanFilter)
+ .build()) {
+ readRecords = ImmutableList.copyOf(reader);
+ }
+
+
assertThat(readRecords).hasSizeGreaterThan(0).hasSizeLessThan(genericRecords.size());
+ }
+
+ /**
+ * Write with Generic Record, then read using an upper-cased column name in
the filter to verify
+ * caseSensitive behavior.
+ */
+ @ParameterizedTest
+ @FieldSource("FILE_FORMATS")
+ void testReaderBuilderCaseSensitive(FileFormat fileFormat) throws
IOException {
+
+ assumeSupports(fileFormat, "caseSensitive");
+
+ DataGenerator dataGenerator = new DataGenerators.DefaultSchema();
+ Schema schema = dataGenerator.schema();
+
+ List<Record> genericRecords = dataGenerator.generateRecords();
+ writeGenericRecords(fileFormat, schema, genericRecords);
+
+ // Build a filter using the upper-cased name of the first column.
+ Types.NestedField firstField = schema.columns().get(0);
+ Object filterValue = genericRecords.get(0).getField(firstField.name());
+ Expression upperCaseFilter =
Expressions.equal(firstField.name().toUpperCase(), filterValue);
+
assertThat(firstField.name()).isNotEqualTo(firstField.name().toUpperCase());
+
+ InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile();
+
+ // caseSensitive=false: upper-cased column name must be resolved correctly.
+ List<T> readRecords;
+ try (CloseableIterable<T> reader =
+ FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile)
+ .project(schema)
+ .engineProjection(engineSchema(schema))
+ .filter(upperCaseFilter)
+ .caseSensitive(false)
+ .build()) {
+ readRecords = ImmutableList.copyOf(reader);
+ }
+
+ assertThat(readRecords).isNotEmpty();
+
+ // caseSensitive=true: upper-cased column name cannot be resolved → must
throw.
+ assertThatThrownBy(
+ () -> {
+ try (CloseableIterable<T> reader =
+ FormatModelRegistry.readBuilder(fileFormat, engineType(),
inputFile)
+ .project(schema)
+ .engineProjection(engineSchema(schema))
+ .filter(upperCaseFilter)
+ .caseSensitive(true)
+ .build()) {
+ ImmutableList.copyOf(reader);
+ }
+ })
+ .isInstanceOf(ValidationException.class)
+ .hasMessageContaining("Cannot find field '%s'",
firstField.name().toUpperCase());
+ }
+
+ /**
+ * Write with Generic Record, then read using split to verify that the split
range is respected.
+ * The test writes enough records to produce multiple splits so that {@link
+ * DataFile#splitOffsets()} contains at least two offsets. It then reads
only the first row-group
+ * split {@code [splitOffsets[0], splitOffsets[1])} and asserts that the
returned record count is
+ * greater than zero but less than the total, confirming that the split
boundary is honoured. A
+ * second read over the full file range {@code [0, fileLength)} must return
all records.
+ */
+ @ParameterizedTest
+ @FieldSource("FILE_FORMATS")
+ void testReaderBuilderSplit(FileFormat fileFormat) throws IOException {
+
+ assumeSupports(fileFormat, "split");
+
+ DataGenerator dataGenerator = new DataGenerators.DefaultSchema();
+ Schema schema = dataGenerator.schema();
+ List<Record> records = dataGenerator.generateRecords(10000);
+
+ DataFile dataFile = writeRecordsForSplit(fileFormat, schema, records);
+
+ InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile();
+ long fileLength = inputFile.getLength();
+
+ List<Long> splitOffsets = dataFile.splitOffsets();
+ long firstSplitStart = splitOffsets.get(0);
+ long firstSplitLength = splitOffsets.get(1) - splitOffsets.get(0);
Review Comment:
Ah — then it's fine.
My mistake, then.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]