This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new a923c659cf feat: Add support for Int8 and Int16 data types in data
page statistics (#10931)
a923c659cf is described below
commit a923c659cf932f6369f2d5257e5b99128b67091a
Author: Alex Huang <[email protected]>
AuthorDate: Mon Jun 17 19:22:55 2024 +0800
feat: Add support for Int8 and Int16 data types in data page statistics
(#10931)
---
.../datasource/physical_plan/parquet/statistics.rs | 30 ++++++++++++++++++++++
datafusion/core/tests/parquet/arrow_statistics.rs | 24 +++--------------
2 files changed, 33 insertions(+), 21 deletions(-)
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 327a516f1a..a2f17ca9b7 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -548,6 +548,8 @@ macro_rules! make_data_page_stats_iterator {
};
}
+make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min,
Index::INT32, i32);
+make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max,
Index::INT32, i32);
make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min,
Index::INT64, i64);
make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max,
Index::INT64, i64);
@@ -555,6 +557,29 @@ macro_rules! get_data_page_statistics {
($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
paste! {
match $data_type {
+ Some(DataType::Int8) => Ok(Arc::new(
+ Int8Array::from_iter(
+ [<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator)
+ .map(|x| {
+ x.into_iter().filter_map(|x| {
+ x.and_then(|x| i8::try_from(x).ok())
+ })
+ })
+ .flatten()
+ )
+ )),
+ Some(DataType::Int16) => Ok(Arc::new(
+ Int16Array::from_iter(
+ [<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator)
+ .map(|x| {
+ x.into_iter().filter_map(|x| {
+ x.and_then(|x| i16::try_from(x).ok())
+ })
+ })
+ .flatten()
+ )
+ )),
+ Some(DataType::Int32) =>
Ok(Arc::new(Int32Array::from_iter([<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator).flatten()))),
Some(DataType::Int64) =>
Ok(Arc::new(Int64Array::from_iter([<$stat_type_prefix
Int64DataPageStatsIterator>]::new($iterator).flatten()))),
_ => unimplemented!()
}
@@ -642,6 +667,11 @@ where
{
let iter = iterator.flat_map(|(len, index)| match index {
Index::NONE => vec![None; len],
+ Index::INT32(native_index) => native_index
+ .indexes
+ .iter()
+ .map(|x| x.null_count.map(|x| x as u64))
+ .collect::<Vec<_>>(),
Index::INT64(native_index) => native_index
.indexes
.iter()
diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs
b/datafusion/core/tests/parquet/arrow_statistics.rs
index 6b8705441d..87bd137222 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -550,16 +550,11 @@ async fn test_int_32() {
// row counts are [5, 5, 5, 5]
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
column_name: "i32",
- check: Check::RowGroup,
+ check: Check::Both,
}
.run();
}
-// BUG: ignore this test for now
-// https://github.com/apache/datafusion/issues/10585
-// Note that the file has 4 columns named "i8", "i16", "i32", "i64".
-// - The tests on column i32 and i64 passed.
-// - The tests on column i8 and i16 failed.
#[tokio::test]
async fn test_int_16() {
// This creates a parquet files of 4 columns named "i8", "i16", "i32",
"i64"
@@ -573,16 +568,6 @@ async fn test_int_16() {
Test {
reader: &reader,
// mins are [-5, -4, 0, 5]
- // BUG: not sure why this returns same data but in Int32Array type
even though I debugged and the columns name is "i16" an its data is Int16
- // My debugging tells me the bug is either at:
- // 1. The new code to get "iter". See the code in this PR with
- // // Get an iterator over the column statistics
- // let iter = row_groups
- // .iter()
- // .map(|x| x.column(parquet_idx).statistics());
- // OR
- // 2. in the function (and/or its marco) `pub(crate) fn
min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>` here
- //
https://github.com/apache/datafusion/blob/ea023e2d4878240eece870cf4b346c7a0667aeed/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs#L179
expected_min: Arc::new(Int16Array::from(vec![-5, -4, 0, 5])), // panic
here because the actual data is Int32Array
// maxes are [-1, 0, 4, 9]
expected_max: Arc::new(Int16Array::from(vec![-1, 0, 4, 9])),
@@ -591,13 +576,11 @@ async fn test_int_16() {
// row counts are [5, 5, 5, 5]
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
column_name: "i16",
- check: Check::RowGroup,
+ check: Check::Both,
}
.run();
}
-// BUG (same as above): ignore this test for now
-// https://github.com/apache/datafusion/issues/10585
#[tokio::test]
async fn test_int_8() {
// This creates a parquet files of 4 columns named "i8", "i16", "i32",
"i64"
@@ -611,7 +594,6 @@ async fn test_int_8() {
Test {
reader: &reader,
// mins are [-5, -4, 0, 5]
- // BUG: not sure why this returns same data but in Int32Array even
though I debugged and the columns name is "i8" an its data is Int8
expected_min: Arc::new(Int8Array::from(vec![-5, -4, 0, 5])), // panic
here because the actual data is Int32Array
// maxes are [-1, 0, 4, 9]
expected_max: Arc::new(Int8Array::from(vec![-1, 0, 4, 9])),
@@ -620,7 +602,7 @@ async fn test_int_8() {
// row counts are [5, 5, 5, 5]
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
column_name: "i8",
- check: Check::RowGroup,
+ check: Check::Both,
}
.run();
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]