geruh commented on code in PR #14081:
URL: https://github.com/apache/iceberg/pull/14081#discussion_r2350470500
##########
data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java:
##########
@@ -988,6 +993,97 @@ public void testTransformFilter() {
         .isTrue();
   }
 
+  @TestTemplate
+  public void testVariantFilterNotNull() throws IOException {
+    assumeThat(format).isEqualTo(FileFormat.PARQUET);
+
+    Schema variantSchema =
+        new Schema(
+            required(1, "id", IntegerType.get()),
+            optional(2, "variant_field", Types.VariantType.get()));
+
+    File parquetFile = new File(tempDir, "test-variant" + System.nanoTime());
+
+    OutputFile outFile = Files.localOutput(parquetFile);
+    try (FileAppender<GenericRecord> appender =
+        Parquet.write(outFile)
+            .schema(variantSchema)
+            .createWriterFunc(GenericParquetWriter::create)
+            .build()) {
+
+      for (int i = 0; i < 10; i++) {
+        GenericRecord record = GenericRecord.create(variantSchema);
+        record.setField("id", i);
+
+        if (i % 2 == 0) {
+          VariantMetadata metadata = Variants.metadata("field");
+          ShreddedObject obj = Variants.object(metadata);
+          obj.put("field", Variants.of("value" + i));
+          Variant variant = Variant.of(metadata, obj);
+          record.setField("variant_field", variant);
+        }
+
+        appender.add(record);
+      }
+    }

Review Comment:
   Thanks for the feedback, Amogh! Good point about avoiding file writing at this level. I initially wrote the tests this way because this test class shares a schema for writing out data files in both ORC and Parquet. We're now in a situation where ORC doesn't have full support for variant types while Parquet does, so adding variant fields to the shared schema would break the existing ORC tests. That said, it probably makes sense to use separate schemas for Parquet and ORC given the different levels of support, and to write the tests to reflect that.
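   A rough sketch of what per-format schemas could look like (purely illustrative: `writeSchemaFor` is a hypothetical helper, and the shared column list and field IDs are assumptions, not code from this class):

   ```java
   import java.util.List;
   import org.apache.iceberg.FileFormat;
   import org.apache.iceberg.Schema;
   import org.apache.iceberg.relocated.com.google.common.collect.Lists;
   import org.apache.iceberg.types.Types;

   import static org.apache.iceberg.types.Types.NestedField.optional;
   import static org.apache.iceberg.types.Types.NestedField.required;

   // Hypothetical helper: derive the write schema from the format under test so
   // the variant column only exists on the Parquet path until ORC supports it.
   private static Schema writeSchemaFor(FileFormat format) {
     // Columns shared by the ORC and Parquet tests (illustrative subset).
     List<Types.NestedField> fields =
         Lists.newArrayList(required(1, "id", Types.IntegerType.get()));
     if (format == FileFormat.PARQUET) {
       // Parquet-only for now: ORC lacks full variant support.
       fields.add(optional(2, "variant_field", Types.VariantType.get()));
     }
     return new Schema(fields);
   }
   ```

   Each test would then build its data file with `writeSchemaFor(format)`, so the existing ORC tests keep the schema they have today while the Parquet runs pick up the variant column.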
##########
data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java:
##########
@@ -988,6 +993,97 @@ public void testTransformFilter() {
         .isTrue();
   }
 
+  @TestTemplate
+  public void testVariantFilterNotNull() throws IOException {
+    assumeThat(format).isEqualTo(FileFormat.PARQUET);
+
+    Schema variantSchema =
+        new Schema(
+            required(1, "id", IntegerType.get()),
+            optional(2, "variant_field", Types.VariantType.get()));
+
+    File parquetFile = new File(tempDir, "test-variant" + System.nanoTime());
+
+    OutputFile outFile = Files.localOutput(parquetFile);
+    try (FileAppender<GenericRecord> appender =
+        Parquet.write(outFile)
+            .schema(variantSchema)
+            .createWriterFunc(GenericParquetWriter::create)
+            .build()) {
+
+      for (int i = 0; i < 10; i++) {
+        GenericRecord record = GenericRecord.create(variantSchema);
+        record.setField("id", i);
+
+        if (i % 2 == 0) {
+          VariantMetadata metadata = Variants.metadata("field");
+          ShreddedObject obj = Variants.object(metadata);
+          obj.put("field", Variants.of("value" + i));
+          Variant variant = Variant.of(metadata, obj);
+          record.setField("variant_field", variant);
+        }
+
+        appender.add(record);
+      }
+    }
+
+    InputFile inFile = Files.localInput(parquetFile);
+    try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
+      assertThat(reader.getRowGroups()).as("Should create only one row group").hasSize(1);
+      BlockMetaData blockMetaData = reader.getRowGroups().get(0);
+      MessageType fileSchema = reader.getFileMetaData().getSchema();
+
+      ParquetMetricsRowGroupFilter rowGroupFilter =
+          new ParquetMetricsRowGroupFilter(variantSchema, notNull("variant_field"), true);
+      boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData);
+      assertThat(shouldRead)
+          .as("Should read: variant notNull filters must be evaluated post scan")
+          .isTrue();
+    }
+    parquetFile.deleteOnExit();
+  }
+
+  @TestTemplate
+  public void testAllNullsVariantNotNull() throws IOException {
+    assumeThat(format).isEqualTo(FileFormat.PARQUET);
+
+    Schema variantSchema =
+        new Schema(
+            required(1, "id", IntegerType.get()),
+            optional(2, "variant_field", Types.VariantType.get()));
+
+    File parquetFile = new File(tempDir, "test-variant-nulls" + System.nanoTime());
+
+    OutputFile outFile = Files.localOutput(parquetFile);
+    try (FileAppender<GenericRecord> appender =
+        Parquet.write(outFile)
+            .schema(variantSchema)
+            .createWriterFunc(GenericParquetWriter::create)
+            .build()) {
+
+      for (int i = 0; i < 10; i++) {
+        GenericRecord record = GenericRecord.create(variantSchema);
+        record.setField("id", i);
+        record.setField("variant_field", null);
+        appender.add(record);
+      }
+    }
+

Review Comment:
   addressed above