This is an automated email from the ASF dual-hosted git repository.

maplefu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-format.git


The following commit(s) were added to refs/heads/master by this push:
     new db68787  Clarify num-nulls handling in Statistics and ColumnIndex (#449)
db68787 is described below

commit db687874e11d238f25c5dc427a4db8d11146936f
Author: mwish <[email protected]>
AuthorDate: Fri Aug 23 15:30:20 2024 +0800

    Clarify num-nulls handling in Statistics and ColumnIndex (#449)
    
    * Clarify num-nulls handling
    
    * Update src/main/thrift/parquet.thrift
    
    Co-authored-by: Gang Wu <[email protected]>
    
    * Add readers behavior
    
    * Apply suggestions from code review
    
    Co-authored-by: Ed Seidl <[email protected]>
    
    * Reader: change SHOULD to MUST
    
    * Add for ColumnIndex
    
    * Apply suggestions from code review
    
    Co-authored-by: Antoine Pitrou <[email protected]>
    
    * Update src/main/thrift/parquet.thrift
    
    Co-authored-by: Gang Wu <[email protected]>
    
    * Update src/main/thrift/parquet.thrift
    
    Co-authored-by: Ed Seidl <[email protected]>
    
    ---------
    
    Co-authored-by: Gang Wu <[email protected]>
    Co-authored-by: Ed Seidl <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
---
 src/main/thrift/parquet.thrift | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index 9e83529..83457fe 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -257,7 +257,14 @@ struct Statistics {
     */
    1: optional binary max;
    2: optional binary min;
-   /** count of null value in the column */
+   /** 
+    * Count of null values in the column.
+    *
+    * Writers SHOULD always write this field even if it is zero (i.e. no null value)
+    * or the column is not nullable.
+    * Readers MUST distinguish between null_count not being present and null_count == 0.
+    * If null_count is not present, readers MUST NOT assume null_count == 0.
+    */
    3: optional i64 null_count;
    /** count of distinct values occurring */
    4: optional i64 distinct_count;
@@ -1084,7 +1091,16 @@ struct ColumnIndex {
    */
   4: required BoundaryOrder boundary_order
 
-  /** A list containing the number of null values for each page **/
+  /**
+   * A list containing the number of null values for each page 
+   *
+   * Writers SHOULD always write this field even if no null values
+   * are present or the column is not nullable.
+   * Readers MUST distinguish between null_counts not being present 
+   * and null_count being 0.
+   * If null_counts are not present, readers MUST NOT assume all 
+   * null counts are 0.
+   */
   5: optional list<i64> null_counts
 
   /**

Reply via email to