keuin opened a new issue, #3585: URL: https://github.com/apache/parquet-java/issues/3585
### Describe the bug, including details regarding any error messages, version, and platform. num_nulls is a required field in the Parquet v2 data page header (https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift#L729). However, the current implementation tracks this field only when statistics are enabled. When statistics are disabled, this required field is set to -1, the default value in the tracker, which produces an incorrect num_nulls and causes failures in readers that perform explicit sanity checks, such as Rust arrow-rs. This bug can be reproduced using: ```java package com.keuin.testparquet; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.conf.Configuration; import org.apache.parquet.avro.AvroParquetWriter; import org.apache.parquet.column.page.DataPage; import org.apache.parquet.column.page.DataPageV2; import org.apache.parquet.column.page.PageReadStore; import org.apache.parquet.column.page.PageReader; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.io.InputFile; import org.apache.parquet.schema.MessageType; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import java.nio.file.Path; import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0; import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0; import static org.junit.jupiter.api.Assertions.*; /** * Minimal reproduction of parquet-java bug: * * When using WriterVersion.PARQUET_2_0 (which produces DataPageV2 pages) combined with * .withStatisticsEnabled("column", false), the DataPageHeaderV2.num_nulls field is * written as -1 (0xFFFFFFFF). 
* * Root cause: ColumnWriterV2.writePage() calls Math.toIntExact(statistics.getNumNulls()), * and NoopStatistics.getNumNulls() unconditionally returns -1 (meaning "not set"). * But DataPageHeaderV2.num_nulls is a required i32 field in the Thrift schema, * so -1 gets serialized as-is. * * Affected versions: parquet-java 1.15.2 (and likely all versions with NoopStatistics) * Bug location: ColumnWriterV2.java line ~107, NoopStatistics.java line ~97 */ class ParquetV2NumNullsBugTest { /** * Minimal Avro schema: one non-null int field + one nullable string field. */ private static final String SCHEMA_JSON = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" + " \"fields\": [\n" + " {\"name\": \"id\", \"type\": \"int\"},\n" + " {\"name\": \"value\", \"type\": [\"null\", \"string\"], \"default\": null}\n" + " ]\n" + "}"; /** * BUG REPRODUCTION: PARQUET_2_0 + withStatisticsEnabled(false) → num_nulls = -1 * * This test demonstrates the bug. With statistics disabled on a nullable column, * the DataPageV2 header's num_nulls field is incorrectly set to -1. */ @Test void bugRepro_parquet2_statisticsDisabled_numNullsIsMinusOne(@TempDir Path tempDir) throws Exception { Schema schema = new Schema.Parser().parse(SCHEMA_JSON); org.apache.hadoop.fs.Path parquetPath = new org.apache.hadoop.fs.Path(tempDir.resolve("bug.parquet").toUri()); // Write: PARQUET_2_0 + statistics disabled on "value" column try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(parquetPath) .withSchema(schema) .withCompressionCodec(CompressionCodecName.UNCOMPRESSED) .withWriterVersion(PARQUET_2_0) .withStatisticsEnabled("value", false) // ← triggers NoopStatistics .withPageSize(1024 * 1024) // large page to keep all records in one page .build()) { for (int i = 0; i < 10; i++) { GenericRecord record = new GenericData.Record(schema); record.put("id", i); // 4 nulls: i = 0, 3, 6, 9 record.put("value", i % 3 == 0 ? 
null : "hello-" + i); writer.write(record); } } // Read back using low-level page API Configuration conf = new Configuration(); InputFile inputFile = HadoopInputFile.fromPath(parquetPath, conf); try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) { MessageType fileSchema = reader.getFileMetaData().getSchema(); // "value" is the second leaf column in the schema // For union ["null","string"], Avro maps to optional binary field var columns = fileSchema.getColumns(); var valueColumn = columns.stream() .filter(c -> c.getPath()[0].equals("value")) .findFirst() .orElseThrow(() -> new AssertionError("Column 'value' not found")); PageReadStore rowGroup = reader.readNextRowGroup(); assertNotNull(rowGroup, "Should have at least one row group"); PageReader pageReader = rowGroup.getPageReader(valueColumn); DataPage page = pageReader.readPage(); assertNotNull(page, "Should have at least one data page"); // Verify it's a V2 page (because we used PARQUET_2_0) assertInstanceOf(DataPageV2.class, page, "PARQUET_2_0 writer should produce DataPageV2 pages"); DataPageV2 pageV2 = (DataPageV2) page; int numNulls = pageV2.getNullCount(); System.out.println("=== BUG REPRODUCTION ==="); System.out.println("WriterVersion: PARQUET_2_0"); System.out.println("Statistics enabled: false (for 'value' column)"); System.out.println("Expected num_nulls: 4 (records with i=0,3,6,9)"); System.out.println("Actual num_nulls: " + numNulls); System.out.println("Bug present: " + (numNulls == -1 ? "YES" : "NO")); System.out.println(); // This assertion demonstrates the bug exists: // num_nulls should be 4, but is actually -1 assertEquals(-1, numNulls, "BUG: NoopStatistics.getNumNulls() returns -1, which gets written to " + "DataPageHeaderV2.num_nulls. 
This value is the 'required i32 num_nulls' " + "field in the Thrift page header."); } } /** * CONTROL: PARQUET_2_0 + statistics ENABLED → num_nulls is correct * * When statistics are enabled (default), num_nulls correctly reflects the actual null count. */ @Test void control_parquet2_statisticsEnabled_numNullsIsCorrect(@TempDir Path tempDir) throws Exception { Schema schema = new Schema.Parser().parse(SCHEMA_JSON); org.apache.hadoop.fs.Path parquetPath = new org.apache.hadoop.fs.Path(tempDir.resolve("control.parquet").toUri()); // Write: PARQUET_2_0 + statistics enabled (default) try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(parquetPath) .withSchema(schema) .withCompressionCodec(CompressionCodecName.UNCOMPRESSED) .withWriterVersion(PARQUET_2_0) // No withStatisticsEnabled(false) → default statistics are collected .withPageSize(1024 * 1024) .build()) { for (int i = 0; i < 10; i++) { GenericRecord record = new GenericData.Record(schema); record.put("id", i); record.put("value", i % 3 == 0 ? 
null : "hello-" + i); writer.write(record); } } // Read back Configuration conf = new Configuration(); InputFile inputFile = HadoopInputFile.fromPath(parquetPath, conf); try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) { MessageType fileSchema = reader.getFileMetaData().getSchema(); var valueColumn = fileSchema.getColumns().stream() .filter(c -> c.getPath()[0].equals("value")) .findFirst() .orElseThrow(); PageReadStore rowGroup = reader.readNextRowGroup(); PageReader pageReader = rowGroup.getPageReader(valueColumn); DataPage page = pageReader.readPage(); assertInstanceOf(DataPageV2.class, page); DataPageV2 pageV2 = (DataPageV2) page; int numNulls = pageV2.getNullCount(); System.out.println("=== CONTROL (statistics enabled) ==="); System.out.println("WriterVersion: PARQUET_2_0"); System.out.println("Statistics enabled: true (default)"); System.out.println("Expected num_nulls: 4"); System.out.println("Actual num_nulls: " + numNulls); System.out.println(); assertEquals(4, numNulls, "With statistics enabled, num_nulls should correctly be 4"); } } /** * CONTROL: PARQUET_1_0 + statistics disabled → V1 page, no num_nulls in header * * DataPageV1 does not have a required num_nulls field, so this combination * does not trigger the bug. Statistics are simply omitted from the optional field. 
*/ @Test void control_parquet1_statisticsDisabled_noIssue(@TempDir Path tempDir) throws Exception { Schema schema = new Schema.Parser().parse(SCHEMA_JSON); org.apache.hadoop.fs.Path parquetPath = new org.apache.hadoop.fs.Path(tempDir.resolve("v1.parquet").toUri()); // Write: PARQUET_1_0 + statistics disabled try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(parquetPath) .withSchema(schema) .withCompressionCodec(CompressionCodecName.UNCOMPRESSED) .withWriterVersion(PARQUET_1_0) .withStatisticsEnabled("value", false) .withPageSize(1024 * 1024) .build()) { for (int i = 0; i < 10; i++) { GenericRecord record = new GenericData.Record(schema); record.put("id", i); record.put("value", i % 3 == 0 ? null : "hello-" + i); writer.write(record); } } // Read back — V1 pages don't have a separate num_nulls field in the header Configuration conf = new Configuration(); InputFile inputFile = HadoopInputFile.fromPath(parquetPath, conf); try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) { MessageType fileSchema = reader.getFileMetaData().getSchema(); var valueColumn = fileSchema.getColumns().stream() .filter(c -> c.getPath()[0].equals("value")) .findFirst() .orElseThrow(); PageReadStore rowGroup = reader.readNextRowGroup(); PageReader pageReader = rowGroup.getPageReader(valueColumn); DataPage page = pageReader.readPage(); // V1 writer produces DataPageV1, not DataPageV2 assertFalse(page instanceof DataPageV2, "PARQUET_1_0 should NOT produce DataPageV2 pages"); System.out.println("=== CONTROL (PARQUET_1_0) ==="); System.out.println("WriterVersion: PARQUET_1_0"); System.out.println("Statistics enabled: false"); System.out.println("Page type: " + page.getClass().getSimpleName()); System.out.println("Result: No issue — V1 pages don't have required num_nulls"); System.out.println(); } } } ``` ### Component(s) _No response_ -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
