This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 529d2c0229 Extract `Date32` parquet statistics as `Date32Array`  rather than `Int32Array` (#10593)
529d2c0229 is described below

commit 529d2c022950ef6118ce616c28993a03042925e1
Author: Xin Li <[email protected]>
AuthorDate: Thu May 23 19:16:35 2024 +0800

    Extract `Date32` parquet statistics as `Date32Array`  rather than `Int32Array` (#10593)
    
    * Fixes bug expect `Date32Array` but returns Int32Array
    
    * Add round trip ut
    
    * Update arrow_statistics.rs
    
    * remove unreachable code
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 .../datasource/physical_plan/parquet/statistics.rs | 96 +++++++++++++++++++++-
 datafusion/core/tests/parquet/arrow_statistics.rs  | 38 +++++----
 2 files changed, 116 insertions(+), 18 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 0ebf7dfe23..d0ecb86f9e 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -75,6 +75,12 @@ macro_rules! get_statistic {
                             *scale,
                         ))
                     }
+                    Some(DataType::Date32) => {
+                        Some(ScalarValue::Date32(Some(*s.$func())))
+                    }
+                    Some(DataType::Date64) => {
+                        Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000)))
+                    }
                     _ => Some(ScalarValue::Int32(Some(*s.$func()))),
                 }
             }
@@ -363,10 +369,12 @@ impl<'a> StatisticsConverter<'a> {
 #[cfg(test)]
 mod test {
     use super::*;
+    use arrow::compute::kernels::cast_utils::Parser;
+    use arrow::datatypes::{Date32Type, Date64Type};
     use arrow_array::{
-        new_null_array, Array, BinaryArray, BooleanArray, Decimal128Array, Float32Array,
-        Float64Array, Int32Array, Int64Array, RecordBatch, StringArray, StructArray,
-        TimestampNanosecondArray,
+        new_null_array, Array, BinaryArray, BooleanArray, Date32Array, Date64Array,
+        Decimal128Array, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch,
+        StringArray, StructArray, TimestampNanosecondArray,
     };
     use arrow_schema::{Field, SchemaRef};
     use bytes::Bytes;
@@ -664,6 +672,68 @@ mod test {
         .run()
     }
 
+    #[test]
+    fn roundtrip_date32() {
+        Test {
+            input: date32_array(vec![
+                // row group 1
+                Some("2021-01-01"),
+                None,
+                Some("2021-01-03"),
+                // row group 2
+                Some("2021-01-01"),
+                Some("2021-01-05"),
+                None,
+                // row group 3
+                None,
+                None,
+                None,
+            ]),
+            expected_min: date32_array(vec![
+                Some("2021-01-01"),
+                Some("2021-01-01"),
+                None,
+            ]),
+            expected_max: date32_array(vec![
+                Some("2021-01-03"),
+                Some("2021-01-05"),
+                None,
+            ]),
+        }
+        .run()
+    }
+
+    #[test]
+    fn roundtrip_date64() {
+        Test {
+            input: date64_array(vec![
+                // row group 1
+                Some("2021-01-01"),
+                None,
+                Some("2021-01-03"),
+                // row group 2
+                Some("2021-01-01"),
+                Some("2021-01-05"),
+                None,
+                // row group 3
+                None,
+                None,
+                None,
+            ]),
+            expected_min: date64_array(vec![
+                Some("2021-01-01"),
+                Some("2021-01-01"),
+                None,
+            ]),
+            expected_max: date64_array(vec![
+                Some("2021-01-03"),
+                Some("2021-01-05"),
+                None,
+            ]),
+        }
+        .run()
+    }
+
     #[test]
     fn struct_and_non_struct() {
         // Ensures that statistics for an array that appears *after* a struct
@@ -1069,4 +1139,24 @@ mod test {
         ]);
         Arc::new(struct_array)
     }
+
+    fn date32_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>) -> ArrayRef {
+        let array = Date32Array::from(
+            input
+                .into_iter()
+                .map(|s| Date32Type::parse(s.unwrap_or_default()))
+                .collect::<Vec<_>>(),
+        );
+        Arc::new(array)
+    }
+
+    fn date64_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>) -> ArrayRef {
+        let array = Date64Array::from(
+            input
+                .into_iter()
+                .map(|s| Date64Type::parse(s.unwrap_or_default()))
+                .collect::<Vec<_>>(),
+        );
+        Arc::new(array)
+    }
 }
diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs
index 36fffe5ac4..a08487d3a9 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -21,10 +21,12 @@
 use std::fs::File;
 use std::sync::Arc;
 
+use arrow::compute::kernels::cast_utils::Parser;
+use arrow::datatypes::{Date32Type, Date64Type};
 use arrow_array::{
-    make_array, Array, ArrayRef, BooleanArray, Decimal128Array, FixedSizeBinaryArray,
-    Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch,
-    StringArray, UInt64Array,
+    make_array, Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Decimal128Array,
+    FixedSizeBinaryArray, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
+    RecordBatch, StringArray, UInt64Array,
 };
 use arrow_schema::{DataType, Field, Schema};
 use datafusion::datasource::physical_plan::parquet::{
@@ -638,8 +640,6 @@ async fn test_timestamp_diff_rg_sizes() {
 }
 
 // date with different row group sizes
-// Bug expect `Date32Array` but returns Int32Array
-//  https://github.com/apache/datafusion/issues/10587
 #[tokio::test]
 async fn test_dates_32_diff_rg_sizes() {
     // This creates a parquet files of 3 columns named "date32", "date64", "names"
@@ -654,10 +654,16 @@ async fn test_dates_32_diff_rg_sizes() {
     };
     Test {
         reader: reader.build().await,
-        // mins are [18262, 18565,]
-        expected_min: Arc::new(Int32Array::from(vec![18262, 18565])),
-        // maxes are [18564, 21865,]
-        expected_max: Arc::new(Int32Array::from(vec![18564, 21865])),
+        // mins are [2020-01-01, 2020-10-30]
+        expected_min: Arc::new(Date32Array::from(vec![
+            Date32Type::parse("2020-01-01"),
+            Date32Type::parse("2020-10-30"),
+        ])),
+        // maxes are [2020-10-29, 2029-11-12]
+        expected_max: Arc::new(Date32Array::from(vec![
+            Date32Type::parse("2020-10-29"),
+            Date32Type::parse("2029-11-12"),
+        ])),
         // nulls are [2, 2]
         expected_null_counts: UInt64Array::from(vec![2, 2]),
         // row counts are [13, 7]
@@ -667,10 +673,6 @@ async fn test_dates_32_diff_rg_sizes() {
     .run();
 }
 
-// BUG: same as above. Expect to return Date64Array but returns Int32Array
-// test date with different row group sizes
-// https://github.com/apache/datafusion/issues/10587
-#[ignore]
 #[tokio::test]
 async fn test_dates_64_diff_rg_sizes() {
     // The file is created by 4 record batches (each has a null row), each has 5 rows but then will be split into 2 row groups with size 13, 7
@@ -680,8 +682,14 @@ async fn test_dates_64_diff_rg_sizes() {
     };
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int64Array::from(vec![18262, 18565])), // panic here because the actual data is Int32Array
-        expected_max: Arc::new(Int64Array::from(vec![18564, 21865])),
+        expected_min: Arc::new(Date64Array::from(vec![
+            Date64Type::parse("2020-01-01"),
+            Date64Type::parse("2020-10-30"),
+        ])),
+        expected_max: Arc::new(Date64Array::from(vec![
+            Date64Type::parse("2020-10-29"),
+            Date64Type::parse("2029-11-12"),
+        ])),
         expected_null_counts: UInt64Array::from(vec![2, 2]),
         expected_row_counts: UInt64Array::from(vec![13, 7]),
         column_name: "date64",


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to