This is an automated email from the ASF dual-hosted git repository.
comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 089b232304 Extract Parquet statistics from `Interval` column (#10801)
089b232304 is described below
commit 089b23230468fbcdcf2e2c992b0fc0096321b8ee
Author: Marvin Lanhenke <[email protected]>
AuthorDate: Thu Jun 6 17:59:19 2024 +0200
Extract Parquet statistics from `Interval` column (#10801)
* feat: add make_batch + basic test
---
datafusion/core/tests/parquet/arrow_statistics.rs | 81 ++++++++++++++++++++++-
datafusion/core/tests/parquet/mod.rs | 80 +++++++++++++++++++++-
2 files changed, 159 insertions(+), 2 deletions(-)
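For context, here is a minimal standalone sketch (not part of the patch) of the arrow-rs limitation the new `should_panic` test documents. It assumes the arrow and parquet crates as of this commit: writing an `IntervalMonthDayNano` column to Parquet is not yet implemented, so the unwraps below are expected to turn the writer's error into a panic (see https://github.com/apache/arrow-rs/issues/5847):

    // Sketch of the arrow-rs limitation; expected to panic on the unwraps
    // because writing IntervalMonthDayNano to Parquet is not yet implemented
    // (apache/arrow-rs#5847).
    use std::sync::Arc;

    use arrow_array::types::IntervalMonthDayNanoType;
    use arrow_array::{ArrayRef, IntervalMonthDayNanoArray, RecordBatch};
    use arrow_schema::{DataType, Field, IntervalUnit, Schema};
    use parquet::arrow::ArrowWriter;

    fn main() {
        let schema = Arc::new(Schema::new(vec![Field::new(
            "month_day_nano",
            DataType::Interval(IntervalUnit::MonthDayNano),
            true,
        )]));
        let array: ArrayRef = Arc::new(IntervalMonthDayNanoArray::from(vec![
            Some(IntervalMonthDayNanoType::make_value(1, 10, 100)),
            None,
        ]));
        let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap();

        let mut buf = Vec::new();
        // One of these calls returns a not-yet-implemented error, which the
        // unwrap surfaces as the panic the new test expects.
        let mut writer = ArrowWriter::try_new(&mut buf, schema, None).unwrap();
        writer.write(&batch).unwrap();
        writer.close().unwrap();
    }

The `IntervalYearMonth` and `IntervalDayTime` columns, by contrast, can be written; the test still expects `None` min/max for them because statistics extraction for interval columns is not supported either.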
diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs
index 2f8fbab647..b378b2a6c3 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -30,7 +30,8 @@ use arrow::datatypes::{
 use arrow_array::{
     make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
     Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, Float32Array,
-    Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray,
+    Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray,
+    IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeBinaryArray,
     LargeStringArray, RecordBatch, StringArray, Time32MillisecondArray,
     Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
     TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
@@ -1072,6 +1073,84 @@ async fn test_dates_64_diff_rg_sizes() {
.run();
}
+#[tokio::test]
+#[should_panic]
+// Currently this test `should_panic` since statistics for `Intervals`
+// are not supported and `IntervalMonthDayNano` cannot be written
+// to parquet yet.
+// Refer to issue: https://github.com/apache/arrow-rs/issues/5847
+// and https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_writer/mod.rs#L747
+async fn test_interval_diff_rg_sizes() {
+ // This creates a parquet file with 3 columns:
+ // "year_month" --> IntervalYearMonthArray
+ // "day_time" --> IntervalDayTimeArray
+ // "month_day_nano" --> IntervalMonthDayNanoArray
+ //
+ // The file is created from 4 record batches (each containing a null row)
+ // of 5 rows each, which are then split into 2 row groups of sizes 13 and 7
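+ // Each batch places its null at row index 3, so the nulls land on global
+ // rows 3, 8, 13 and 18: two in the first row group (rows 0-12) and two in
+ // the second (rows 13-19), which is where expected_null_counts [2, 2]
+ // below comes from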
+ let reader = TestReader {
+ scenario: Scenario::Interval,
+ row_per_group: 13,
+ }
+ .build()
+ .await;
+
+ // TODO: expected values need to be changed once issue is resolved
+ // expected_min: Arc::new(IntervalYearMonthArray::from(vec![
+ // IntervalYearMonthType::make_value(1, 10),
+ // IntervalYearMonthType::make_value(4, 13),
+ // ])),
+ // expected_max: Arc::new(IntervalYearMonthArray::from(vec![
+ // IntervalYearMonthType::make_value(6, 51),
+ // IntervalYearMonthType::make_value(8, 53),
+ // ])),
+ Test {
+ reader: &reader,
+ expected_min: Arc::new(IntervalYearMonthArray::from(vec![None, None])),
+ expected_max: Arc::new(IntervalYearMonthArray::from(vec![None, None])),
+ expected_null_counts: UInt64Array::from(vec![2, 2]),
+ expected_row_counts: UInt64Array::from(vec![13, 7]),
+ column_name: "year_month",
+ }
+ .run();
+
+ // expected_min: Arc::new(IntervalDayTimeArray::from(vec![
+ // IntervalDayTimeType::make_value(1, 10),
+ // IntervalDayTimeType::make_value(4, 13),
+ // ])),
+ // expected_max: Arc::new(IntervalDayTimeArray::from(vec![
+ // IntervalDayTimeType::make_value(6, 51),
+ // IntervalDayTimeType::make_value(8, 53),
+ // ])),
+ Test {
+ reader: &reader,
+ expected_min: Arc::new(IntervalDayTimeArray::from(vec![None, None])),
+ expected_max: Arc::new(IntervalDayTimeArray::from(vec![None, None])),
+ expected_null_counts: UInt64Array::from(vec![2, 2]),
+ expected_row_counts: UInt64Array::from(vec![13, 7]),
+ column_name: "day_time",
+ }
+ .run();
+
+ // expected_min: Arc::new(IntervalMonthDayNanoArray::from(vec![
+ // IntervalMonthDayNanoType::make_value(1, 10, 100),
+ // IntervalMonthDayNanoType::make_value(4, 13, 103),
+ // ])),
+ // expected_max: Arc::new(IntervalMonthDayNanoArray::from(vec![
+ // IntervalMonthDayNanoType::make_value(6, 51, 501),
+ // IntervalMonthDayNanoType::make_value(8, 53, 503),
+ // ])),
+ Test {
+ reader: &reader,
+ expected_min: Arc::new(IntervalMonthDayNanoArray::from(vec![None, None])),
+ expected_max: Arc::new(IntervalMonthDayNanoArray::from(vec![None, None])),
+ expected_null_counts: UInt64Array::from(vec![2, 2]),
+ expected_row_counts: UInt64Array::from(vec![13, 7]),
+ column_name: "month_day_nano",
+ }
+ .run();
+}
+
#[tokio::test]
async fn test_uint() {
// This creates a parquet file with 4 columns named "u8", "u16", "u32", "u64"
diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs
index f36a9a194a..99769a3367 100644
--- a/datafusion/core/tests/parquet/mod.rs
+++ b/datafusion/core/tests/parquet/mod.rs
@@ -17,7 +17,9 @@
//! Parquet integration tests
use arrow::array::Decimal128Array;
-use arrow::datatypes::i256;
+use arrow::datatypes::{
+ i256, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType,
+};
use arrow::{
array::{
make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
@@ -33,6 +35,10 @@ use arrow::{
record_batch::RecordBatch,
util::pretty::pretty_format_batches,
};
+use arrow_array::{
+ IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray,
+};
+use arrow_schema::IntervalUnit;
use chrono::{Datelike, Duration, TimeDelta};
use datafusion::{
datasource::{physical_plan::ParquetExec, provider_as_source, TableProvider},
@@ -80,6 +86,7 @@ enum Scenario {
Time32Millisecond,
Time64Nanosecond,
Time64Microsecond,
+ Interval,
/// 7 Rows, for each i8, i16, i32, i64, u8, u16, u32, u64, f32, f64
/// -MIN, -100, -1, 0, 1, 100, MAX
NumericLimits,
@@ -925,6 +932,71 @@ fn make_dict_batch() -> RecordBatch {
.unwrap()
}
+fn make_interval_batch(offset: i32) -> RecordBatch {
+ let schema = Schema::new(vec![
+ Field::new(
+ "year_month",
+ DataType::Interval(IntervalUnit::YearMonth),
+ true,
+ ),
+ Field::new("day_time", DataType::Interval(IntervalUnit::DayTime),
true),
+ Field::new(
+ "month_day_nano",
+ DataType::Interval(IntervalUnit::MonthDayNano),
+ true,
+ ),
+ ]);
+ let schema = Arc::new(schema);
+
+ let ym_arr = IntervalYearMonthArray::from(vec![
+ Some(IntervalYearMonthType::make_value(1 + offset, 10 + offset)),
+ Some(IntervalYearMonthType::make_value(2 + offset, 20 + offset)),
+ Some(IntervalYearMonthType::make_value(3 + offset, 30 + offset)),
+ None,
+ Some(IntervalYearMonthType::make_value(5 + offset, 50 + offset)),
+ ]);
+
+ let dt_arr = IntervalDayTimeArray::from(vec![
+ Some(IntervalDayTimeType::make_value(1 + offset, 10 + offset)),
+ Some(IntervalDayTimeType::make_value(2 + offset, 20 + offset)),
+ Some(IntervalDayTimeType::make_value(3 + offset, 30 + offset)),
+ None,
+ Some(IntervalDayTimeType::make_value(5 + offset, 50 + offset)),
+ ]);
+
+ // Not yet implemented, refer to:
+ // https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_writer/mod.rs#L747
+ let mdn_arr = IntervalMonthDayNanoArray::from(vec![
+ Some(IntervalMonthDayNanoType::make_value(
+ 1 + offset,
+ 10 + offset,
+ 100 + (offset as i64),
+ )),
+ Some(IntervalMonthDayNanoType::make_value(
+ 2 + offset,
+ 20 + offset,
+ 200 + (offset as i64),
+ )),
+ Some(IntervalMonthDayNanoType::make_value(
+ 3 + offset,
+ 30 + offset,
+ 300 + (offset as i64),
+ )),
+ None,
+ Some(IntervalMonthDayNanoType::make_value(
+ 5 + offset,
+ 50 + offset,
+ 500 + (offset as i64),
+ )),
+ ]);
+
+ RecordBatch::try_new(
+ schema,
+ vec![Arc::new(ym_arr), Arc::new(dt_arr), Arc::new(mdn_arr)],
+ )
+ .unwrap()
+}
+
fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
match scenario {
Scenario::Boolean => {
@@ -1346,6 +1418,12 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
]),
]
}
+ Scenario::Interval => vec![
+ make_interval_batch(0),
+ make_interval_batch(1),
+ make_interval_batch(2),
+ make_interval_batch(3),
+ ],
}
}
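
As a usage note: once apache/arrow-rs#5847 is resolved and a file with this layout can actually be written, the row-group split the test relies on can be checked by hand with the parquet crate's metadata API. A minimal sketch, assuming a hypothetical file "interval.parquet" produced from the four batches above with row_per_group = 13:

    // Inspect row-group row counts and per-column statistics presence.
    // "interval.parquet" is a hypothetical file with the layout above.
    use std::fs::File;

    use parquet::file::reader::{FileReader, SerializedFileReader};

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let file = File::open("interval.parquet")?;
        let reader = SerializedFileReader::new(file)?;
        for (i, rg) in reader.metadata().row_groups().iter().enumerate() {
            // Expect two row groups of 13 and 7 rows.
            println!("row group {i}: {} rows", rg.num_rows());
            for col in rg.columns() {
                // Shows whether the writer recorded min/max statistics for
                // each column; DataFusion's extraction currently returns
                // None for interval columns either way.
                println!(
                    "  column {}: has statistics = {}",
                    col.column_path(),
                    col.statistics().is_some()
                );
            }
        }
        Ok(())
    }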
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]