ArnavBalyan commented on code in PR #49909:
URL: https://github.com/apache/arrow/pull/49909#discussion_r3189160017
##########
cpp/src/parquet/metadata.cc:
##########
@@ -307,6 +307,19 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
possible_encoded_stats_ = nullptr;
possible_geo_stats_ = nullptr;
InitKeyValueMetadata();
+
+ // Per the Parquet specification, a column with max_definition_level == 0
+ // is required at every level and therefore cannot contain null values.
+ // Reject inconsistent metadata from writers violating this invariant.
+ if (descr_->max_definition_level() == 0 &&
column_metadata_->__isset.statistics &&
+ column_metadata_->statistics.__isset.null_count &&
+ column_metadata_->statistics.null_count > 0) {
+ std::stringstream ss;
+ ss << "Malformed Parquet file: column '" << descr_->path()->ToDotString()
+ << "' has max_definition_level == 0 but statistics report null_count="
+ << column_metadata_->statistics.null_count;
+ throw ParquetException(ss.str());
Review Comment:
Looks like some legacy versions can get it through with disabled validating
record consumer
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]