This is an automated email from the ASF dual-hosted git repository.
maplefu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-format.git
The following commit(s) were added to refs/heads/master by this push:
new db68787 Clarify num-nulls handling in Statistics and ColumnIndex (#449)
db68787 is described below
commit db687874e11d238f25c5dc427a4db8d11146936f
Author: mwish <[email protected]>
AuthorDate: Fri Aug 23 15:30:20 2024 +0800
Clarify num-nulls handling in Statistics and ColumnIndex (#449)
* Clarify num-nulls handling
* Update src/main/thrift/parquet.thrift
Co-authored-by: Gang Wu <[email protected]>
* Add readers behavior
* Apply suggestions from code review
Co-authored-by: Ed Seidl <[email protected]>
* Reader: change SHOULD to MUST
* Add for ColumnIndex
* Apply suggestions from code review
Co-authored-by: Antoine Pitrou <[email protected]>
* Update src/main/thrift/parquet.thrift
Co-authored-by: Gang Wu <[email protected]>
* Update src/main/thrift/parquet.thrift
Co-authored-by: Ed Seidl <[email protected]>
---------
Co-authored-by: Gang Wu <[email protected]>
Co-authored-by: Ed Seidl <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
---
src/main/thrift/parquet.thrift | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index 9e83529..83457fe 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -257,7 +257,14 @@ struct Statistics {
*/
1: optional binary max;
2: optional binary min;
- /** count of null value in the column */
+ /**
+ * Count of null values in the column.
+ *
+ * Writers SHOULD always write this field even if it is zero (i.e. no null value)
+ * or the column is not nullable.
+ * Readers MUST distinguish between null_count not being present and null_count == 0.
+ * If null_count is not present, readers MUST NOT assume null_count == 0.
+ */
3: optional i64 null_count;
/** count of distinct values occurring */
4: optional i64 distinct_count;
@@ -1084,7 +1091,16 @@ struct ColumnIndex {
*/
4: required BoundaryOrder boundary_order
- /** A list containing the number of null values for each page **/
+ /**
+ * A list containing the number of null values for each page
+ *
+ * Writers SHOULD always write this field even if no null values
+ * are present or the column is not nullable.
+ * Readers MUST distinguish between null_counts not being present
+ * and null_count being 0.
+ * If null_counts are not present, readers MUST NOT assume all
+ * null counts are 0.
+ */
5: optional list<i64> null_counts
/**