amogh-jahagirdar commented on code in PR #14081:
URL: https://github.com/apache/iceberg/pull/14081#discussion_r2349039526
########## data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java:
##########
@@ -988,6 +993,97 @@ public void testTransformFilter() {
         .isTrue();
   }

+  @TestTemplate
+  public void testVariantFilterNotNull() throws IOException {
+    assumeThat(format).isEqualTo(FileFormat.PARQUET);
+
+    Schema variantSchema =
+        new Schema(
+            required(1, "id", IntegerType.get()),
+            optional(2, "variant_field", Types.VariantType.get()));
+
+    File parquetFile = new File(tempDir, "test-variant" + System.nanoTime());
+
+    OutputFile outFile = Files.localOutput(parquetFile);
+    try (FileAppender<GenericRecord> appender =
+        Parquet.write(outFile)
+            .schema(variantSchema)
+            .createWriterFunc(GenericParquetWriter::create)
+            .build()) {
+
+      for (int i = 0; i < 10; i++) {
+        GenericRecord record = GenericRecord.create(variantSchema);
+        record.setField("id", i);
+
+        if (i % 2 == 0) {
+          VariantMetadata metadata = Variants.metadata("field");
+          ShreddedObject obj = Variants.object(metadata);
+          obj.put("field", Variants.of("value" + i));
+          Variant variant = Variant.of(metadata, obj);
+          record.setField("variant_field", variant);
+        }
+
+        appender.add(record);
+      }
+    }

Review Comment:
   Why do we need to write records in these tests? At this level of abstraction, I think we should just create the row group filter with `notNull("variant_field")` and assert that shouldRead is true.

########## parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java:
##########
@@ -157,8 +157,10 @@ public <T> Boolean notNull(BoundReference<T> ref) {

       // When filtering nested types notNull() is implicit filter passed even though complex
       // filters aren't pushed down in Parquet. Leave all nested column type filters to be
-      // evaluated post scan.
-      if (schema.findType(id) instanceof Type.NestedType) {
+      // evaluated post scan. Variant types also need to be evaluated post scan to access
+      // shredded statistics.

Review Comment:
   I'd remove "variant types also need to be evaluated post scan to access shredded statistics" and just update the comment to reflect the current state of things: the first sentence becomes "When filtering nested types or variant...." and the second sentence becomes "Leave these type filters...". For shredded stats pruning, the core library already contains `BoundExtract`; what we need is the translation/plumbing from engines to that extract, and then I think we can do pruning based on the shredded stats. For now though, I'd just leave it out of the comments since I think it's more confusing and a bit inaccurate.
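   For concreteness, a sketch of what the suggested comment wording could look like in `ParquetMetricsRowGroupFilter#notNull`. The variant `instanceof` check and the `ROWS_MIGHT_MATCH` return are assumptions here, since the PR's updated condition isn't quoted in full above:

```java
// When filtering nested types or variant, notNull() is an implicit filter passed
// even though complex filters aren't pushed down in Parquet. Leave these type
// filters to be evaluated post scan.
Type type = schema.findType(id);
if (type instanceof Type.NestedType || type instanceof Types.VariantType) {
  // Assumed: the filter's existing "might match" constant; the PR's actual
  // condition and return are not shown in the quoted hunk.
  return ROWS_MIGHT_MATCH;
}
```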
########## data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java:
##########
@@ -988,6 +993,97 @@ public void testTransformFilter() {
         .isTrue();
   }

+  @TestTemplate
+  public void testVariantFilterNotNull() throws IOException {
+    assumeThat(format).isEqualTo(FileFormat.PARQUET);
+
+    Schema variantSchema =
+        new Schema(
+            required(1, "id", IntegerType.get()),
+            optional(2, "variant_field", Types.VariantType.get()));
+
+    File parquetFile = new File(tempDir, "test-variant" + System.nanoTime());
+
+    OutputFile outFile = Files.localOutput(parquetFile);
+    try (FileAppender<GenericRecord> appender =
+        Parquet.write(outFile)
+            .schema(variantSchema)
+            .createWriterFunc(GenericParquetWriter::create)
+            .build()) {
+
+      for (int i = 0; i < 10; i++) {
+        GenericRecord record = GenericRecord.create(variantSchema);
+        record.setField("id", i);
+
+        if (i % 2 == 0) {
+          VariantMetadata metadata = Variants.metadata("field");
+          ShreddedObject obj = Variants.object(metadata);
+          obj.put("field", Variants.of("value" + i));
+          Variant variant = Variant.of(metadata, obj);
+          record.setField("variant_field", variant);
+        }
+
+        appender.add(record);
+      }
+    }
+
+    InputFile inFile = Files.localInput(parquetFile);
+    try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
+      assertThat(reader.getRowGroups()).as("Should create only one row group").hasSize(1);
+      BlockMetaData blockMetaData = reader.getRowGroups().get(0);
+      MessageType fileSchema = reader.getFileMetaData().getSchema();
+
+      ParquetMetricsRowGroupFilter rowGroupFilter =
+          new ParquetMetricsRowGroupFilter(variantSchema, notNull("variant_field"), true);
+      boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData);
+      assertThat(shouldRead)
+          .as("Should read: variant notNull filters must be evaluated post scan")
+          .isTrue();
+    }
+    parquetFile.deleteOnExit();
+  }
+
+  @TestTemplate
+  public void testAllNullsVariantNotNull() throws IOException {
+    assumeThat(format).isEqualTo(FileFormat.PARQUET);
+
+    Schema variantSchema =
+        new Schema(
+            required(1, "id", IntegerType.get()),
+            optional(2, "variant_field", Types.VariantType.get()));
+
+    File parquetFile = new File(tempDir, "test-variant-nulls" + System.nanoTime());
+
+    OutputFile outFile = Files.localOutput(parquetFile);
+    try (FileAppender<GenericRecord> appender =
+        Parquet.write(outFile)
+            .schema(variantSchema)
+            .createWriterFunc(GenericParquetWriter::create)
+            .build()) {
+
+      for (int i = 0; i < 10; i++) {
+        GenericRecord record = GenericRecord.create(variantSchema);
+        record.setField("id", i);
+        record.setField("variant_field", null);
+        appender.add(record);
+      }
+    }
+

Review Comment:
   Same as above: I don't think we need to actually write parquet files in these tests.
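   A minimal sketch of the simplification both comments ask for, assuming the test fixture already exposes a Parquet `MessageType` and a row group's `BlockMetaData` (`parquetSchema` and `rowGroupMetadata` are hypothetical names, not existing fields of `TestMetricsRowGroupFilter`):

```java
@TestTemplate
public void testVariantFilterNotNull() {
  assumeThat(format).isEqualTo(FileFormat.PARQUET);

  Schema variantSchema =
      new Schema(
          required(1, "id", IntegerType.get()),
          optional(2, "variant_field", Types.VariantType.get()));

  // No file is written: variant notNull filters are always evaluated post scan,
  // so shouldRead must be true regardless of the row group's contents.
  ParquetMetricsRowGroupFilter rowGroupFilter =
      new ParquetMetricsRowGroupFilter(variantSchema, notNull("variant_field"), true);

  // parquetSchema and rowGroupMetadata are assumed fixtures here.
  assertThat(rowGroupFilter.shouldRead(parquetSchema, rowGroupMetadata))
      .as("Should read: variant notNull filters must be evaluated post scan")
      .isTrue();
}
```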