This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 6543d59e3f Fix incorrect statistics read for unsigned integers columns in parquet (#10704)
6543d59e3f is described below

commit 6543d59e3f2d86ae2640eb7d62869af1775f7975
Author: Xin Li <[email protected]>
AuthorDate: Wed May 29 17:59:10 2024 +0800

    Fix incorrect statistics read for unsigned integers columns in parquet (#10704)
---
 .../datasource/physical_plan/parquet/statistics.rs | 12 +++++
 datafusion/core/tests/parquet/arrow_statistics.rs  | 60 +++++++++-------------
 2 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index bbdd46af5d..ae8395aef6 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -81,6 +81,15 @@ macro_rules! get_statistic {
                     Some(DataType::Int16) => {
                         Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap())))
                     }
+                    Some(DataType::UInt8) => {
+                        Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap())))
+                    }
+                    Some(DataType::UInt16) => {
+                        Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap())))
+                    }
+                    Some(DataType::UInt32) => {
+                        Some(ScalarValue::UInt32(Some((*s.$func()) as u32)))
+                    }
                     Some(DataType::Date32) => {
                         Some(ScalarValue::Date32(Some(*s.$func())))
                     }
@@ -100,6 +109,9 @@ macro_rules! get_statistic {
                             *scale,
                         ))
                     }
+                    Some(DataType::UInt64) => {
+                        Some(ScalarValue::UInt64(Some((*s.$func()) as u64)))
+                    }
                     _ => Some(ScalarValue::Int64(Some(*s.$func()))),
                 }
             }
diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs
index 93cb7636b8..eebf3447cb 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -26,7 +26,8 @@ use arrow::datatypes::{Date32Type, Date64Type};
 use arrow_array::{
     make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
     Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array,
-    Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, UInt64Array,
+    Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, UInt16Array,
+    UInt32Array, UInt64Array, UInt8Array,
 };
 use arrow_schema::{DataType, Field, Schema};
 use datafusion::datasource::physical_plan::parquet::{
@@ -703,8 +704,6 @@ async fn test_dates_64_diff_rg_sizes() {
     .run();
 }
 
-// BUG:
-// https://github.com/apache/datafusion/issues/10604
 #[tokio::test]
 async fn test_uint() {
     // This creates a parquet files of 4 columns named "u8", "u16", "u32", "u64"
@@ -719,48 +718,40 @@ async fn test_uint() {
         row_per_group: 4,
     };
 
-    // u8
-    // BUG: expect UInt8Array but returns Int32Array
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt8Array
-        expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt8Array
+        expected_min: Arc::new(UInt8Array::from(vec![0, 1, 4, 7, 251])),
+        expected_max: Arc::new(UInt8Array::from(vec![3, 4, 6, 250, 254])),
         expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]),
         expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]),
         column_name: "u8",
     }
     .run();
 
-    // u16
-    // BUG: expect UInt16Array but returns Int32Array
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt16Array
-        expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt16Array
+        expected_min: Arc::new(UInt16Array::from(vec![0, 1, 4, 7, 251])),
+        expected_max: Arc::new(UInt16Array::from(vec![3, 4, 6, 250, 254])),
         expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]),
         expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]),
         column_name: "u16",
     }
     .run();
 
-    // u32
-    // BUG: expect UInt32Array but returns Int32Array
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt32Array
-        expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt32Array
+        expected_min: Arc::new(UInt32Array::from(vec![0, 1, 4, 7, 251])),
+        expected_max: Arc::new(UInt32Array::from(vec![3, 4, 6, 250, 254])),
         expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]),
         expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]),
         column_name: "u32",
     }
     .run();
 
-    // u64
-    // BUG: expect UInt64rray but returns Int64Array
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int64Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt64Array
-        expected_max: Arc::new(Int64Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt64Array
+        expected_min: Arc::new(UInt64Array::from(vec![0, 1, 4, 7, 251])),
+        expected_max: Arc::new(UInt64Array::from(vec![3, 4, 6, 250, 254])),
         expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]),
         expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]),
         column_name: "u64",
@@ -788,8 +779,6 @@ async fn test_int32_range() {
     .run();
 }
 
-// BUG: not convert UInt32Array to Int32Array
-// https://github.com/apache/datafusion/issues/10604
 #[tokio::test]
 async fn test_uint32_range() {
     // This creates a parquet file of 1 column "u"
@@ -801,8 +790,8 @@ async fn test_uint32_range() {
 
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int32Array::from(vec![0])), // should be UInt32Array
-        expected_max: Arc::new(Int32Array::from(vec![300000])), // should be UInt32Array
+        expected_min: Arc::new(UInt32Array::from(vec![0])),
+        expected_max: Arc::new(UInt32Array::from(vec![300000])),
         expected_null_counts: UInt64Array::from(vec![0]),
         expected_row_counts: UInt64Array::from(vec![4]),
         column_name: "u",
@@ -820,44 +809,45 @@ async fn test_numeric_limits_unsigned() {
 
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int8Array::from(vec![i8::MIN, -100])),
-        expected_max: Arc::new(Int8Array::from(vec![100, i8::MAX])),
+        expected_min: Arc::new(UInt8Array::from(vec![u8::MIN, 100])),
+        expected_max: Arc::new(UInt8Array::from(vec![100, u8::MAX])),
         expected_null_counts: UInt64Array::from(vec![0, 0]),
         expected_row_counts: UInt64Array::from(vec![5, 2]),
-        column_name: "i8",
+        column_name: "u8",
     }
     .run();
 
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int16Array::from(vec![i16::MIN, -100])),
-        expected_max: Arc::new(Int16Array::from(vec![100, i16::MAX])),
+        expected_min: Arc::new(UInt16Array::from(vec![u16::MIN, 100])),
+        expected_max: Arc::new(UInt16Array::from(vec![100, u16::MAX])),
         expected_null_counts: UInt64Array::from(vec![0, 0]),
         expected_row_counts: UInt64Array::from(vec![5, 2]),
-        column_name: "i16",
+        column_name: "u16",
     }
     .run();
 
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int32Array::from(vec![i32::MIN, -100])),
-        expected_max: Arc::new(Int32Array::from(vec![100, i32::MAX])),
+        expected_min: Arc::new(UInt32Array::from(vec![u32::MIN, 100])),
+        expected_max: Arc::new(UInt32Array::from(vec![100, u32::MAX])),
         expected_null_counts: UInt64Array::from(vec![0, 0]),
         expected_row_counts: UInt64Array::from(vec![5, 2]),
-        column_name: "i32",
+        column_name: "u32",
     }
     .run();
 
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int64Array::from(vec![i64::MIN, -100])),
-        expected_max: Arc::new(Int64Array::from(vec![100, i64::MAX])),
+        expected_min: Arc::new(UInt64Array::from(vec![u64::MIN, 100])),
+        expected_max: Arc::new(UInt64Array::from(vec![100, u64::MAX])),
         expected_null_counts: UInt64Array::from(vec![0, 0]),
         expected_row_counts: UInt64Array::from(vec![5, 2]),
-        column_name: "i64",
+        column_name: "u64",
     }
     .run();
 }
+
 #[tokio::test]
 async fn test_numeric_limits_signed() {
     // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows.


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to