This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 7db58a900c Add tests for reading numeric limits in parquet statistics
(#10642)
7db58a900c is described below
commit 7db58a900c266f53ed8fd5a0f63a2a4abc0566ed
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon May 27 05:45:02 2024 -0400
Add tests for reading numeric limits in parquet statistics (#10642)
* Add numeric limits tests for statistics reading
* Apply suggestions from code review
Co-authored-by: Oleks V <[email protected]>
* fix
---------
Co-authored-by: Oleks V <[email protected]>
---
datafusion/core/tests/parquet/arrow_statistics.rs | 170 +++++++++++++++++++++-
datafusion/core/tests/parquet/mod.rs | 39 +++++
2 files changed, 201 insertions(+), 8 deletions(-)
diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs
b/datafusion/core/tests/parquet/arrow_statistics.rs
index e5aadf2131..93cb7636b8 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -25,8 +25,8 @@ use arrow::compute::kernels::cast_utils::Parser;
use arrow::datatypes::{Date32Type, Date64Type};
use arrow_array::{
make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array,
Date64Array,
- Decimal128Array, FixedSizeBinaryArray, Float64Array, Int16Array,
Int32Array,
- Int64Array, Int8Array, RecordBatch, StringArray, UInt64Array,
+ Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array,
Int16Array,
+ Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, UInt16Array,
+ UInt32Array, UInt64Array, UInt8Array,
};
use arrow_schema::{DataType, Field, Schema};
use datafusion::datasource::physical_plan::parquet::{
@@ -189,7 +189,10 @@ impl Test {
.extract(reader.metadata())
.unwrap();
- assert_eq!(&min, &expected_min, "Mismatch with expected minimums");
+ assert_eq!(
+ &min, &expected_min,
+ "{column_name}: Mismatch with expected minimums"
+ );
let max = StatisticsConverter::try_new(
column_name,
@@ -199,7 +202,10 @@ impl Test {
.unwrap()
.extract(reader.metadata())
.unwrap();
- assert_eq!(&max, &expected_max, "Mismatch with expected maximum");
+ assert_eq!(
+ &max, &expected_max,
+ "{column_name}: Mismatch with expected maximum"
+ );
let null_counts = StatisticsConverter::try_new(
column_name,
@@ -212,13 +218,15 @@ impl Test {
let expected_null_counts = Arc::new(expected_null_counts) as ArrayRef;
assert_eq!(
&null_counts, &expected_null_counts,
- "Mismatch with expected null counts"
+ "{column_name}: Mismatch with expected null counts. \
+ Actual: {null_counts:?}. Expected: {expected_null_counts:?}"
);
let row_counts =
StatisticsConverter::row_counts(reader.metadata()).unwrap();
assert_eq!(
row_counts, expected_row_counts,
- "Mismatch with expected row counts"
+ "{column_name}: Mismatch with expected row counts. \
+ Actual: {row_counts:?}. Expected: {expected_row_counts:?}"
);
}
@@ -802,6 +810,152 @@ async fn test_uint32_range() {
.run();
}
+#[tokio::test]
+async fn test_numeric_limits_unsigned() {
+ // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows.
+ let reader = TestReader {
+ scenario: Scenario::NumericLimits,
+ row_per_group: 5,
+ };
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(UInt8Array::from(vec![u8::MIN, 100])),
+ expected_max: Arc::new(UInt8Array::from(vec![100, u8::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "u8",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(UInt16Array::from(vec![u16::MIN, 100])),
+ expected_max: Arc::new(UInt16Array::from(vec![100, u16::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "u16",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(UInt32Array::from(vec![u32::MIN, 100])),
+ expected_max: Arc::new(UInt32Array::from(vec![100, u32::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "u32",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(UInt64Array::from(vec![u64::MIN, 100])),
+ expected_max: Arc::new(UInt64Array::from(vec![100, u64::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "u64",
+ }
+ .run();
+}
+#[tokio::test]
+async fn test_numeric_limits_signed() {
+ // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows.
+ let reader = TestReader {
+ scenario: Scenario::NumericLimits,
+ row_per_group: 5,
+ };
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Int8Array::from(vec![i8::MIN, -100])),
+ expected_max: Arc::new(Int8Array::from(vec![100, i8::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "i8",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Int16Array::from(vec![i16::MIN, -100])),
+ expected_max: Arc::new(Int16Array::from(vec![100, i16::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "i16",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Int32Array::from(vec![i32::MIN, -100])),
+ expected_max: Arc::new(Int32Array::from(vec![100, i32::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "i32",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Int64Array::from(vec![i64::MIN, -100])),
+ expected_max: Arc::new(Int64Array::from(vec![100, i64::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "i64",
+ }
+ .run();
+}
+
+#[tokio::test]
+async fn test_numeric_limits_float() {
+ // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows.
+ let reader = TestReader {
+ scenario: Scenario::NumericLimits,
+ row_per_group: 5,
+ };
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Float32Array::from(vec![f32::MIN, -100.0])),
+ expected_max: Arc::new(Float32Array::from(vec![100.0, f32::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "f32",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Float64Array::from(vec![f64::MIN, -100.0])),
+ expected_max: Arc::new(Float64Array::from(vec![100.0, f64::MAX])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "f64",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Float32Array::from(vec![-1.0, -100.0])),
+ expected_max: Arc::new(Float32Array::from(vec![100.0, -100.0])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "f32_nan",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Float64Array::from(vec![-1.0, -100.0])),
+ expected_max: Arc::new(Float64Array::from(vec![100.0, -100.0])),
+ expected_null_counts: UInt64Array::from(vec![0, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "f64_nan",
+ }
+ .run();
+}
+
#[tokio::test]
async fn test_float64() {
// This creates a parquet file of 1 column "f"
@@ -914,8 +1068,8 @@ async fn test_byte() {
Test {
reader: reader.build().await,
- expected_min:
Arc::new(BinaryArray::from(expected_service_binary_min_values)), // Shuld be
BinaryArray
- expected_max:
Arc::new(BinaryArray::from(expected_service_binary_max_values)), // Shuld be
BinaryArray
+ expected_min:
Arc::new(BinaryArray::from(expected_service_binary_min_values)),
+ expected_max:
Arc::new(BinaryArray::from(expected_service_binary_max_values)),
expected_null_counts: UInt64Array::from(vec![0, 0, 0]),
expected_row_counts: UInt64Array::from(vec![5, 5, 5]),
column_name: "service_binary",
diff --git a/datafusion/core/tests/parquet/mod.rs
b/datafusion/core/tests/parquet/mod.rs
index c5d0ad60bc..a0b62d7001 100644
--- a/datafusion/core/tests/parquet/mod.rs
+++ b/datafusion/core/tests/parquet/mod.rs
@@ -73,6 +73,9 @@ enum Scenario {
Int32Range,
UInt,
UInt32Range,
+ /// 7 rows, for each of i8, i16, i32, i64, u8, u16, u32, u64, f32, f64:
+ /// signed columns hold MIN, 100, -1, 0, 1, -100, MAX;
+ /// unsigned columns hold MIN, 100, 1, 0, 1, 100, MAX
+ NumericLimits,
Float64,
Decimal,
DecimalBloomFilterInt32,
@@ -710,6 +713,39 @@ fn make_int_batches_with_null(
.unwrap()
}
+fn make_numeric_limit_batch() -> RecordBatch {
+ let i8 = Int8Array::from(vec![i8::MIN, 100, -1, 0, 1, -100, i8::MAX]);
+ let i16 = Int16Array::from(vec![i16::MIN, 100, -1, 0, 1, -100, i16::MAX]);
+ let i32 = Int32Array::from(vec![i32::MIN, 100, -1, 0, 1, -100, i32::MAX]);
+ let i64 = Int64Array::from(vec![i64::MIN, 100, -1, 0, 1, -100, i64::MAX]);
+ let u8 = UInt8Array::from(vec![u8::MIN, 100, 1, 0, 1, 100, u8::MAX]);
+ let u16 = UInt16Array::from(vec![u16::MIN, 100, 1, 0, 1, 100, u16::MAX]);
+ let u32 = UInt32Array::from(vec![u32::MIN, 100, 1, 0, 1, 100, u32::MAX]);
+ let u64 = UInt64Array::from(vec![u64::MIN, 100, 1, 0, 1, 100, u64::MAX]);
+ let f32 = Float32Array::from(vec![f32::MIN, 100.0, -1.0, 0.0, 1.0, -100.0,
f32::MAX]);
+ let f64 = Float64Array::from(vec![f64::MIN, 100.0, -1.0, 0.0, 1.0, -100.0,
f64::MAX]);
+ let f32_nan =
+ Float32Array::from(vec![f32::NAN, 100.0, -1.0, 0.0, 1.0, -100.0,
f32::NAN]);
+ let f64_nan =
+ Float64Array::from(vec![f64::NAN, 100.0, -1.0, 0.0, 1.0, -100.0,
f64::NAN]);
+
+ RecordBatch::try_from_iter(vec![
+ ("i8", Arc::new(i8) as _),
+ ("i16", Arc::new(i16) as _),
+ ("i32", Arc::new(i32) as _),
+ ("i64", Arc::new(i64) as _),
+ ("u8", Arc::new(u8) as _),
+ ("u16", Arc::new(u16) as _),
+ ("u32", Arc::new(u32) as _),
+ ("u64", Arc::new(u64) as _),
+ ("f32", Arc::new(f32) as _),
+ ("f64", Arc::new(f64) as _),
+ ("f32_nan", Arc::new(f32_nan) as _),
+ ("f64_nan", Arc::new(f64_nan) as _),
+ ])
+ .unwrap()
+}
+
fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
match scenario {
Scenario::Boolean => {
@@ -768,6 +804,9 @@ fn create_data_batch(scenario: Scenario) ->
Vec<RecordBatch> {
Scenario::UInt32Range => {
vec![make_uint32_range(0, 10), make_uint32_range(200000, 300000)]
}
+ Scenario::NumericLimits => {
+ vec![make_numeric_limit_batch()]
+ }
Scenario::Float64 => {
vec![
make_f64_batch(vec![-5.0, -4.0, -3.0, -2.0, -1.0]),
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]