This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 2956dbf30f fix: Do not assume missing nullcount stat means zero
nullcount (#9481)
2956dbf30f is described below
commit 2956dbf30fe5b50f8f76e6bad93505a8e7b86eb5
Author: Ryan Johnson <[email protected]>
AuthorDate: Wed Mar 11 12:46:51 2026 -0600
fix: Do not assume missing nullcount stat means zero nullcount (#9481)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/9451
- Closes https://github.com/apache/arrow-rs/issues/6256
# Rationale for this change
A reader might be annoyed (performance wart) if a parquet footer lacks
nullcount stats, but inferring nullcount=0 for missing stats makes the
stats untrustworthy and can lead to incorrect behavior.
# What changes are included in this PR?
If a parquet footer nullcount stat is missing, surface it as None,
reserving `Some(0)` for known-no-null cases.
# Are these changes tested?
Fixed one unit test that broke, added a missing unit test that covers
the other change site.
# Are there any user-facing changes?
The stats API doesn't change signature, but there is a behavior change.
The existing doc that called out the incorrect behavior has been removed
to reflect that the incorrect behavior no longer occurs.
---
parquet/src/file/metadata/thrift/mod.rs | 73 ++++++++++++++++++++++++++-------
parquet/src/file/statistics.rs | 56 ++++++++++++-------------
2 files changed, 85 insertions(+), 44 deletions(-)
diff --git a/parquet/src/file/metadata/thrift/mod.rs
b/parquet/src/file/metadata/thrift/mod.rs
index ddb5aa16b0..88cb96f355 100644
--- a/parquet/src/file/metadata/thrift/mod.rs
+++ b/parquet/src/file/metadata/thrift/mod.rs
@@ -192,20 +192,19 @@ fn convert_stats(
use crate::file::statistics::Statistics as FStatistics;
Ok(match thrift_stats {
Some(stats) => {
- // Number of nulls recorded, when it is not available, we just
mark it as 0.
- // TODO this should be `None` if there is no information about
NULLS.
- // see https://github.com/apache/arrow-rs/pull/6216/files
- let null_count = stats.null_count.unwrap_or(0);
-
- if null_count < 0 {
- return Err(general_err!(
- "Statistics null count is negative {}",
- null_count
- ));
- }
-
// Generic null count.
- let null_count = Some(null_count as u64);
+ let null_count = stats
+ .null_count
+ .map(|null_count| {
+ if null_count < 0 {
+ return Err(general_err!(
+ "Statistics null count is negative {}",
+ null_count
+ ));
+ }
+ Ok(null_count as u64)
+ })
+ .transpose()?;
// Generic distinct count (count of distinct values occurring)
let distinct_count = stats.distinct_count.map(|value| value as
u64);
// Whether or not statistics use deprecated min/max fields.
@@ -1722,6 +1721,7 @@ write_thrift_field!(RustBoundingBox, FieldType::Struct);
#[cfg(test)]
pub(crate) mod tests {
+ use crate::basic::Type as PhysicalType;
use crate::errors::Result;
use crate::file::metadata::thrift::{BoundingBox, SchemaElement,
write_schema};
use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaDataOptions,
RowGroupMetaData};
@@ -1730,7 +1730,8 @@ pub(crate) mod tests {
ElementType, ThriftCompactOutputProtocol, ThriftSliceInputProtocol,
read_thrift_vec,
};
use crate::schema::types::{
- ColumnDescriptor, SchemaDescriptor, TypePtr, num_nodes,
parquet_schema_from_array,
+ ColumnDescriptor, ColumnPath, SchemaDescriptor, TypePtr, num_nodes,
+ parquet_schema_from_array,
};
use std::sync::Arc;
@@ -1828,4 +1829,48 @@ pub(crate) mod tests {
mmax: Some(42.0.into()),
});
}
+
+ #[test]
+ fn test_convert_stats_preserves_missing_null_count() {
+ let primitive =
+ crate::schema::types::Type::primitive_type_builder("col",
PhysicalType::INT32)
+ .build()
+ .unwrap();
+ let column_descr = Arc::new(ColumnDescriptor::new(
+ Arc::new(primitive),
+ 0,
+ 0,
+ ColumnPath::new(vec![]),
+ ));
+
+ let none_null_count = super::Statistics {
+ max: None,
+ min: None,
+ null_count: None,
+ distinct_count: None,
+ max_value: None,
+ min_value: None,
+ is_max_value_exact: None,
+ is_min_value_exact: None,
+ };
+ let decoded_none = super::convert_stats(&column_descr,
Some(none_null_count))
+ .unwrap()
+ .unwrap();
+ assert_eq!(decoded_none.null_count_opt(), None);
+
+ let zero_null_count = super::Statistics {
+ max: None,
+ min: None,
+ null_count: Some(0),
+ distinct_count: None,
+ max_value: None,
+ min_value: None,
+ is_max_value_exact: None,
+ is_min_value_exact: None,
+ };
+ let decoded_zero = super::convert_stats(&column_descr,
Some(zero_null_count))
+ .unwrap()
+ .unwrap();
+ assert_eq!(decoded_zero.null_count_opt(), Some(0));
+ }
}
diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs
index a813e82d13..9682fd54b8 100644
--- a/parquet/src/file/statistics.rs
+++ b/parquet/src/file/statistics.rs
@@ -125,19 +125,18 @@ pub(crate) fn from_thrift_page_stats(
) -> Result<Option<Statistics>> {
Ok(match thrift_stats {
Some(stats) => {
- // Number of nulls recorded, when it is not available, we just
mark it as 0.
- // TODO this should be `None` if there is no information about
NULLS.
- // see https://github.com/apache/arrow-rs/pull/6216/files
- let null_count = stats.null_count.unwrap_or(0);
-
- if null_count < 0 {
- return Err(ParquetError::General(format!(
- "Statistics null count is negative {null_count}",
- )));
- }
-
// Generic null count.
- let null_count = Some(null_count as u64);
+ let null_count = stats
+ .null_count
+ .map(|null_count| {
+ if null_count < 0 {
+ return Err(ParquetError::General(format!(
+ "Statistics null count is negative {null_count}",
+ )));
+ }
+ Ok(null_count as u64)
+ })
+ .transpose()?;
// Generic distinct count (count of distinct values occurring)
let distinct_count = stats.distinct_count.map(|value| value as
u64);
// Whether or not statistics use deprecated min/max fields.
@@ -431,9 +430,20 @@ impl Statistics {
/// Returns number of null values for the column, if known.
/// Note that this includes all nulls when column is part of the complex
type.
///
- /// Note this API returns Some(0) even if the null count was not present
- /// in the statistics.
- /// See <https://github.com/apache/arrow-rs/pull/6216/files>
+ /// Note: Versions of this library prior to `58.1.0` returned `0` if the
null count
+ /// was not available. This method now returns `None` in that case.
+ ///
+ /// Also, versions of this library prior to `53.1.0` did not store a null
count
+ /// statistic when the null count was `0`.
+ ///
+ /// It is unsound to assume that missing nullcount stats mean the column
contains no nulls,
+ /// but code that depends on the old behavior can restore it by defaulting
to zero:
+ ///
+ /// ```no_run
+ /// # use parquet::file::statistics::Statistics;
+ /// # let statistics: Statistics = todo!();
+ /// let null_count = statistics.null_count_opt().unwrap_or(0);
+ /// ```
pub fn null_count_opt(&self) -> Option<u64> {
statistics_enum_func![self, null_count_opt]
}
@@ -1064,21 +1074,7 @@ mod tests {
let round_tripped = from_thrift_page_stats(Type::BOOLEAN,
Some(thrift_stats))
.unwrap()
.unwrap();
- // TODO: remove branch when we no longer support assuming
null_count==None in the thrift
- // means null_count = Some(0)
- if null_count.is_none() {
- assert_ne!(round_tripped, statistics);
- assert!(round_tripped.null_count_opt().is_some());
- assert_eq!(round_tripped.null_count_opt(), Some(0));
- assert_eq!(round_tripped.min_bytes_opt(),
statistics.min_bytes_opt());
- assert_eq!(round_tripped.max_bytes_opt(),
statistics.max_bytes_opt());
- assert_eq!(
- round_tripped.distinct_count_opt(),
- statistics.distinct_count_opt()
- );
- } else {
- assert_eq!(round_tripped, statistics);
- }
+ assert_eq!(round_tripped, statistics);
}
fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>)
-> Statistics {