This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 7db58a900c Add tests for reading numeric limits in parquet statistics
(#10642)
7db58a900c is described below
commit 7db58a900c266f53ed8fd5a0f63a2a4abc0566ed
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon May 27 05:45:02 2024 -0400
Add tests for reading numeric limits in parquet statistics (#10642)
* Add numeric limits tests for statistics reading
* Apply suggestions from code review
Co-authored-by: Oleks V <[email protected]>
* fix
---------
Co-authored-by: Oleks V <[email protected]>
---
datafusion/core/tests/parquet/arrow_statistics.rs | 170 +++++++++++++++++++++-
datafusion/core/tests/parquet/mod.rs | 39 +++++
2 files changed, 201 insertions(+), 8 deletions(-)
diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs
b/datafusion/core/tests/parquet/arrow_statistics.rs
index e5aadf2131..93cb7636b8 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -25,8 +25,8 @@ use arrow::compute::kernels::cast_utils::Parser;
use arrow::datatypes::{Date32Type, Date64Type};
use arrow_array::{
make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array,
Date64Array,
- Decimal128Array, FixedSizeBinaryArray, Float64Array, Int16Array,
Int32Array,
- Int64Array, Int8Array, RecordBatch, StringArray, UInt64Array,
+ Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array,
Int16Array,
+ Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, UInt16Array,
+ UInt32Array, UInt64Array, UInt8Array,
};
use arrow_schema::{DataType, Field, Schema};
use datafusion::datasource::physical_plan::parquet::{
@@ -189,7 +189,10 @@ impl Test {
.extract(reader.metadata())
.unwrap();
- assert_eq!(&min, &expected_min, "Mismatch with expected minimums");
+ assert_eq!(
+ &min, &expected_min,
+ "{column_name}: Mismatch with expected minimums"
+ );
let max = StatisticsConverter::try_new(
column_name,
@@ -199,7 +202,10 @@ impl Test {
.unwrap()
.extract(reader.metadata())
.unwrap();
- assert_eq!(&max, &expected_max, "Mismatch with expected maximum");
+ assert_eq!(
+ &max, &expected_max,
+ "{column_name}: Mismatch with expected maximum"
+ );
let null_counts = StatisticsConverter::try_new(
column_name,
@@ -212,13 +218,15 @@ impl Test {
let expected_null_counts = Arc::new(expected_null_counts) as ArrayRef;
assert_eq!(
&null_counts, &expected_null_counts,
- "Mismatch with expected null counts"
+ "{column_name}: Mismatch with expected null counts. \
+ Actual: {null_counts:?}. Expected: {expected_null_counts:?}"
);
let row_counts =
StatisticsConverter::row_counts(reader.metadata()).unwrap();
assert_eq!(
row_counts, expected_row_counts,
- "Mismatch with expected row counts"
+ "{column_name}: Mismatch with expected row counts. \
+ Actual: {row_counts:?}. Expected: {expected_row_counts:?}"
);
}
@@ -802,6 +810,152 @@ async fn test_uint32_range() {
.run();
}
+#[tokio::test]
+async fn test_numeric_limits_unsigned() {
+ // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows.
+ let reader = TestReader {
+ scenario: Scenario::NumericLimits,
+ row_per_group: 5,
+ };
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(UInt8Array::from(vec![u8::MIN, 100])),
+ expected_max: Arc::new(UInt8Array::from(vec![100, u8::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "u8",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(UInt16Array::from(vec![u16::MIN, 100])),
+ expected_max: Arc::new(UInt16Array::from(vec![100, u16::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "u16",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(UInt32Array::from(vec![u32::MIN, 100])),
+ expected_max: Arc::new(UInt32Array::from(vec![100, u32::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "u32",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(UInt64Array::from(vec![u64::MIN, 100])),
+ expected_max: Arc::new(UInt64Array::from(vec![100, u64::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "u64",
+ }
+ .run();
+}
+#[tokio::test]
+async fn test_numeric_limits_signed() {
+ // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows.
+ let reader = TestReader {
+ scenario: Scenario::NumericLimits,
+ row_per_group: 5,
+ };
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Int8Array::from(vec![i8::MIN, -100])),
+ expected_max: Arc::new(Int8Array::from(vec![100, i8::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "i8",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Int16Array::from(vec![i16::MIN, -100])),
+ expected_max: Arc::new(Int16Array::from(vec![100, i16::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "i16",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Int32Array::from(vec![i32::MIN, -100])),
+ expected_max: Arc::new(Int32Array::from(vec![100, i32::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "i32",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Int64Array::from(vec![i64::MIN, -100])),
+ expected_max: Arc::new(Int64Array::from(vec![100, i64::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "i64",
+ }
+ .run();
+}
+
+#[tokio::test]
+async fn test_numeric_limits_float() {
+ // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows.
+ let reader = TestReader {
+ scenario: Scenario::NumericLimits,
+ row_per_group: 5,
+ };
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Float32Array::from(vec![f32::MIN, -100.0])),
+ expected_max: Arc::new(Float32Array::from(vec![100.0, f32::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "f32",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Float64Array::from(vec![f64::MIN, -100.0])),
+ expected_max: Arc::new(Float64Array::from(vec![100.0, f64::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "f64",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Float32Array::from(vec![-1.0, -100.0])),
+ expected_max: Arc::new(Float32Array::from(vec![100.0, -100.0])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "f32_nan",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Float64Array::from(vec![-1.0, -100.0])),
+ expected_max: Arc::new(Float64Array::from(vec![100.0, -100.0])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "f64_nan",
+ }
+ .run();
+}
+
#[tokio::test]
async fn test_float64() {
// This creates a parquet file of 1 column "f"
@@ -914,8 +1068,8 @@ async fn test_byte() {
Test {
reader: reader.build().await,
- expected_min:
Arc::new(BinaryArray::from(expected_service_binary_min_values)), // Shuld be
BinaryArray
- expected_max:
Arc::new(BinaryArray::from(expected_service_binary_max_values)), // Shuld be
BinaryArray
+ expected_min:
Arc::new(BinaryArray::from(expected_service_binary_min_values)),
+ expected_max:
Arc::new(BinaryArray::from(expected_service_binary_max_values)),
expected_null_counts: UInt64Array::from(vec![0, 0, 0]),
expected_row_counts: UInt64Array::from(vec![5, 5, 5]),
column_name: "service_binary",
diff --git a/datafusion/core/tests/parquet/mod.rs
b/datafusion/core/tests/parquet/mod.rs
index c5d0ad60bc..a0b62d7001 100644
--- a/datafusion/core/tests/parquet/mod.rs
+++ b/datafusion/core/tests/parquet/mod.rs
@@ -73,6 +73,9 @@ enum Scenario {
Int32Range,
UInt,
UInt32Range,
+ /// 7 rows, for each of i8, i16, i32, i64, u8, u16, u32, u64, f32, f64:
+ /// signed columns hold MIN, 100, -1, 0, 1, -100, MAX;
+ /// unsigned columns hold MIN, 100, 1, 0, 1, 100, MAX
+ NumericLimits,
Float64,
Decimal,
DecimalBloomFilterInt32,
@@ -710,6 +713,39 @@ fn make_int_batches_with_null(
.unwrap()
}
+fn make_numeric_limit_batch() -> RecordBatch {
+ let i8 = Int8Array::from(vec![i8::MIN, 100, -1, 0, 1, -100, i8::MAX]);
+ let i16 = Int16Array::from(vec![i16::MIN, 100, -1, 0, 1, -100, i16::MAX]);
+ let i32 = Int32Array::from(vec![i32::MIN, 100, -1, 0, 1, -100, i32::MAX]);
+ let i64 = Int64Array::from(vec![i64::MIN, 100, -1, 0, 1, -100, i64::MAX]);
+ let u8 = UInt8Array::from(vec![u8::MIN, 100, 1, 0, 1, 100, u8::MAX]);
+ let u16 = UInt16Array::from(vec![u16::MIN, 100, 1, 0, 1, 100, u16::MAX]);
+ let u32 = UInt32Array::from(vec![u32::MIN, 100, 1, 0, 1, 100, u32::MAX]);
+ let u64 = UInt64Array::from(vec![u64::MIN, 100, 1, 0, 1, 100, u64::MAX]);
+ let f32 = Float32Array::from(vec![f32::MIN, 100.0, -1.0, 0.0, 1.0, -100.0,
f32::MAX]);
+ let f64 = Float64Array::from(vec![f64::MIN, 100.0, -1.0, 0.0, 1.0, -100.0,
f64::MAX]);
+ let f32_nan =
+ Float32Array::from(vec![f32::NAN, 100.0, -1.0, 0.0, 1.0, -100.0,
f32::NAN]);
+ let f64_nan =
+ Float64Array::from(vec![f64::NAN, 100.0, -1.0, 0.0, 1.0, -100.0,
f64::NAN]);
+
+ RecordBatch::try_from_iter(vec![
+ ("i8", Arc::new(i8) as _),
+ ("i16", Arc::new(i16) as _),
+ ("i32", Arc::new(i32) as _),
+ ("i64", Arc::new(i64) as _),
+ ("u8", Arc::new(u8) as _),
+ ("u16", Arc::new(u16) as _),
+ ("u32", Arc::new(u32) as _),
+ ("u64", Arc::new(u64) as _),
+ ("f32", Arc::new(f32) as _),
+ ("f64", Arc::new(f64) as _),
+ ("f32_nan", Arc::new(f32_nan) as _),
+ ("f64_nan", Arc::new(f64_nan) as _),
+ ])
+ .unwrap()
+}
+
fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
match scenario {
Scenario::Boolean => {
@@ -768,6 +804,9 @@ fn create_data_batch(scenario: Scenario) ->
Vec<RecordBatch> {
Scenario::UInt32Range => {
vec![make_uint32_range(0, 10), make_uint32_range(200000, 300000)]
}
+ Scenario::NumericLimits => {
+ vec![make_numeric_limit_batch()]
+ }
Scenario::Float64 => {
vec![
make_f64_batch(vec![-5.0, -4.0, -3.0, -2.0, -1.0]),
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]