ArnavBalyan commented on code in PR #49909:
URL: https://github.com/apache/arrow/pull/49909#discussion_r3187791355
##########
cpp/src/parquet/metadata_test.cc:
##########
@@ -427,6 +427,38 @@ TEST(Metadata, TestReadPageIndex) {
}
}
+// Regression test: a column with max_definition_level == 0 cannot encode
+// nulls, so any positive null_count in its statistics indicates a malformed
+// file. The reader should reject it when the metadata is loaded.
+TEST(Metadata, RejectsRequiredColumnWithNonZeroNullCount) {
+ schema::NodeVector fields;
+ fields.push_back(schema::Int32("required_col", Repetition::REQUIRED));
+ auto schema_node = std::static_pointer_cast<schema::GroupNode>(
+ schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+
+ SchemaDescriptor schema_descr;
+ schema_descr.Init(schema_node);
+
+ format::ColumnChunk column_chunk;
+ format::ColumnMetaData& column_metadata = column_chunk.meta_data;
+ column_chunk.__isset.meta_data = true;
+
+ column_metadata.type = format::Type::INT32;
+ column_metadata.codec = format::CompressionCodec::UNCOMPRESSED;
+ column_metadata.num_values = 1000;
+ column_metadata.total_uncompressed_size = 4000;
+ column_metadata.total_compressed_size = 4000;
+ column_metadata.data_page_offset = 4;
+ column_metadata.path_in_schema.push_back("required_col");
+
+ column_metadata.statistics.null_count = 105;
+ column_metadata.statistics.__isset.null_count = true;
+ column_metadata.__isset.statistics = true;
+
+ EXPECT_THROW(ColumnChunkMetaData::Make(&column_chunk, schema_descr.Column(0)),
+ ParquetException);
Review Comment:
updated
##########
cpp/src/parquet/metadata.cc:
##########
@@ -307,6 +307,20 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
possible_encoded_stats_ = nullptr;
possible_geo_stats_ = nullptr;
InitKeyValueMetadata();
+
+ // Per the Parquet spec, a column with max_definition_level == 0 cannot
+ // have nulls, so null_count must be 0. Reject inconsistent metadata
+ // from writers that skip ValidatingRecordConsumer checks for missing
Review Comment:
updated
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]