This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new dea4a1b6a6 [minor] make parquet prune tests more readable (#10112)
dea4a1b6a6 is described below
commit dea4a1b6a6320b0c317396e59524279f924971e7
Author: Yang Jiang <[email protected]>
AuthorDate: Thu Apr 18 01:35:18 2024 +0800
[minor] make parquet prune tests more readable (#10112)
* [minor] make parquet prune tests more readable
* typo
---
datafusion/core/tests/parquet/mod.rs | 26 +++++-----
datafusion/core/tests/parquet/page_pruning.rs | 56 ++++++++++++++++++++--
datafusion/core/tests/parquet/row_group_pruning.rs | 13 ++---
3 files changed, 73 insertions(+), 22 deletions(-)
diff --git a/datafusion/core/tests/parquet/mod.rs
b/datafusion/core/tests/parquet/mod.rs
index f90d0e8afb..d92a56d7fa 100644
--- a/datafusion/core/tests/parquet/mod.rs
+++ b/datafusion/core/tests/parquet/mod.rs
@@ -81,8 +81,10 @@ enum Scenario {
}
enum Unit {
- RowGroup,
- Page,
+ // pass max row per row_group in parquet writer
+ RowGroup(usize),
+ // pass max row per page in parquet writer
+ Page(usize),
}
/// Test fixture that has an execution context that has an external
@@ -185,13 +187,13 @@ impl ContextWithParquet {
mut config: SessionConfig,
) -> Self {
let file = match unit {
- Unit::RowGroup => {
+ Unit::RowGroup(row_per_group) => {
config = config.with_parquet_bloom_filter_pruning(true);
- make_test_file_rg(scenario).await
+ make_test_file_rg(scenario, row_per_group).await
}
- Unit::Page => {
+ Unit::Page(row_per_page) => {
config = config.with_parquet_page_index_pruning(true);
- make_test_file_page(scenario).await
+ make_test_file_page(scenario, row_per_page).await
}
};
let parquet_path = file.path().to_string_lossy();
@@ -880,7 +882,7 @@ fn create_data_batch(scenario: Scenario) ->
Vec<RecordBatch> {
}
/// Create a test parquet file with various data types
-async fn make_test_file_rg(scenario: Scenario) -> NamedTempFile {
+async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) ->
NamedTempFile {
let mut output_file = tempfile::Builder::new()
.prefix("parquet_pruning")
.suffix(".parquet")
@@ -888,7 +890,7 @@ async fn make_test_file_rg(scenario: Scenario) ->
NamedTempFile {
.expect("tempfile creation");
let props = WriterProperties::builder()
- .set_max_row_group_size(5)
+ .set_max_row_group_size(row_per_group)
.set_bloom_filter_enabled(true)
.build();
@@ -906,17 +908,17 @@ async fn make_test_file_rg(scenario: Scenario) ->
NamedTempFile {
output_file
}
-async fn make_test_file_page(scenario: Scenario) -> NamedTempFile {
+async fn make_test_file_page(scenario: Scenario, row_per_page: usize) ->
NamedTempFile {
let mut output_file = tempfile::Builder::new()
.prefix("parquet_page_pruning")
.suffix(".parquet")
.tempfile()
.expect("tempfile creation");
- // set row count to 5, should get same result as rowGroup
+ // set row count to row_per_page, should get same result as rowGroup
let props = WriterProperties::builder()
- .set_data_page_row_count_limit(5)
- .set_write_batch_size(5)
+ .set_data_page_row_count_limit(row_per_page)
+ .set_write_batch_size(row_per_page)
.build();
let batches = create_data_batch(scenario);
diff --git a/datafusion/core/tests/parquet/page_pruning.rs
b/datafusion/core/tests/parquet/page_pruning.rs
index 1615a1c576..ccaa65b7ee 100644
--- a/datafusion/core/tests/parquet/page_pruning.rs
+++ b/datafusion/core/tests/parquet/page_pruning.rs
@@ -241,9 +241,10 @@ async fn test_prune(
expected_errors: Option<usize>,
expected_row_pages_pruned: Option<usize>,
expected_results: usize,
+ row_per_page: usize,
) {
let output: crate::parquet::TestOutput =
- ContextWithParquet::new(case_data_type, Page)
+ ContextWithParquet::new(case_data_type, Page(row_per_page))
.await
.query(sql)
.await;
@@ -272,6 +273,7 @@ async fn prune_timestamps_nanos() {
Some(0),
Some(5),
10,
+ 5,
)
.await;
}
@@ -289,6 +291,7 @@ async fn prune_timestamps_micros() {
Some(0),
Some(5),
10,
+ 5,
)
.await;
}
@@ -306,6 +309,7 @@ async fn prune_timestamps_millis() {
Some(0),
Some(5),
10,
+ 5,
)
.await;
}
@@ -324,6 +328,7 @@ async fn prune_timestamps_seconds() {
Some(0),
Some(5),
10,
+ 5,
)
.await;
}
@@ -341,6 +346,7 @@ async fn prune_date32() {
Some(0),
Some(15),
1,
+ 5,
)
.await;
}
@@ -359,7 +365,7 @@ async fn prune_date64() {
.and_time(chrono::NaiveTime::from_hms_opt(0, 0, 0).unwrap());
let date = ScalarValue::Date64(Some(date.and_utc().timestamp_millis()));
- let output = ContextWithParquet::new(Scenario::Dates, Page)
+ let output = ContextWithParquet::new(Scenario::Dates, Page(5))
.await
.query_with_expr(col("date64").lt(lit(date)))
.await;
@@ -387,6 +393,7 @@ macro_rules! int_tests {
Some(0),
Some(5),
11,
+ 5,
)
.await;
// result of sql "SELECT * FROM t where i < 1" is same as
@@ -397,6 +404,7 @@ macro_rules! int_tests {
Some(0),
Some(5),
11,
+ 5,
)
.await;
}
@@ -409,6 +417,7 @@ macro_rules! int_tests {
Some(0),
Some(15),
1,
+ 5,
)
.await;
@@ -418,6 +427,7 @@ macro_rules! int_tests {
Some(0),
Some(15),
1,
+ 5,
)
.await;
}
@@ -430,6 +440,7 @@ macro_rules! int_tests {
Some(0),
Some(15),
1,
+ 5
)
.await;
}
@@ -441,6 +452,7 @@ macro_rules! int_tests {
Some(0),
Some(15),
1,
+ 5
)
.await;
}
@@ -453,6 +465,7 @@ macro_rules! int_tests {
Some(0),
Some(0),
3,
+ 5
)
.await;
}
@@ -465,6 +478,7 @@ macro_rules! int_tests {
Some(0),
Some(0),
2,
+ 5
)
.await;
}
@@ -477,6 +491,7 @@ macro_rules! int_tests {
Some(0),
Some(0),
9,
+ 5
)
.await;
}
@@ -490,6 +505,7 @@ macro_rules! int_tests {
Some(0),
Some(15),
1,
+ 5
)
.await;
}
@@ -503,6 +519,7 @@ macro_rules! int_tests {
Some(0),
Some(0),
19,
+ 5
)
.await;
}
@@ -531,6 +548,7 @@ macro_rules! uint_tests {
Some(0),
Some(5),
11,
+ 5
)
.await;
}
@@ -543,6 +561,7 @@ macro_rules! uint_tests {
Some(0),
Some(15),
1,
+ 5
)
.await;
}
@@ -555,6 +574,7 @@ macro_rules! uint_tests {
Some(0),
Some(15),
1,
+ 5
)
.await;
}
@@ -567,6 +587,7 @@ macro_rules! uint_tests {
Some(0),
Some(15),
1,
+ 5
)
.await;
}
@@ -579,6 +600,7 @@ macro_rules! uint_tests {
Some(0),
Some(0),
2,
+ 5
)
.await;
}
@@ -591,6 +613,7 @@ macro_rules! uint_tests {
Some(0),
Some(0),
2,
+ 5
)
.await;
}
@@ -604,6 +627,7 @@ macro_rules! uint_tests {
Some(0),
Some(15),
1,
+ 5
)
.await;
}
@@ -617,6 +641,7 @@ macro_rules! uint_tests {
Some(0),
Some(0),
19,
+ 5
)
.await;
}
@@ -642,6 +667,7 @@ async fn prune_f64_lt() {
Some(0),
Some(5),
11,
+ 5,
)
.await;
test_prune(
@@ -650,6 +676,7 @@ async fn prune_f64_lt() {
Some(0),
Some(5),
11,
+ 5,
)
.await;
}
@@ -664,6 +691,7 @@ async fn prune_f64_scalar_fun_and_gt() {
Some(0),
Some(10),
1,
+ 5,
)
.await;
}
@@ -677,6 +705,7 @@ async fn prune_f64_scalar_fun() {
Some(0),
Some(0),
1,
+ 5,
)
.await;
}
@@ -690,6 +719,7 @@ async fn prune_f64_complex_expr() {
Some(0),
Some(0),
9,
+ 5,
)
.await;
}
@@ -703,6 +733,7 @@ async fn prune_f64_complex_expr_subtract() {
Some(0),
Some(0),
9,
+ 5,
)
.await;
}
@@ -718,6 +749,7 @@ async fn prune_decimal_lt() {
Some(0),
Some(5),
6,
+ 5,
)
.await;
// compare with the casted decimal value
@@ -727,6 +759,7 @@ async fn prune_decimal_lt() {
Some(0),
Some(5),
8,
+ 5,
)
.await;
@@ -737,6 +770,7 @@ async fn prune_decimal_lt() {
Some(0),
Some(5),
6,
+ 5,
)
.await;
// compare with the casted decimal value
@@ -746,6 +780,7 @@ async fn prune_decimal_lt() {
Some(0),
Some(5),
8,
+ 5,
)
.await;
}
@@ -761,6 +796,7 @@ async fn prune_decimal_eq() {
Some(0),
Some(5),
2,
+ 5,
)
.await;
test_prune(
@@ -769,6 +805,7 @@ async fn prune_decimal_eq() {
Some(0),
Some(5),
2,
+ 5,
)
.await;
@@ -779,6 +816,7 @@ async fn prune_decimal_eq() {
Some(0),
Some(5),
2,
+ 5,
)
.await;
test_prune(
@@ -787,6 +825,7 @@ async fn prune_decimal_eq() {
Some(0),
Some(5),
2,
+ 5,
)
.await;
test_prune(
@@ -795,6 +834,7 @@ async fn prune_decimal_eq() {
Some(0),
Some(10),
2,
+ 5,
)
.await;
}
@@ -810,6 +850,7 @@ async fn prune_decimal_in_list() {
Some(0),
Some(5),
5,
+ 5,
)
.await;
test_prune(
@@ -818,6 +859,7 @@ async fn prune_decimal_in_list() {
Some(0),
Some(5),
6,
+ 5,
)
.await;
@@ -828,6 +870,7 @@ async fn prune_decimal_in_list() {
Some(0),
Some(5),
5,
+ 5,
)
.await;
test_prune(
@@ -836,17 +879,18 @@ async fn prune_decimal_in_list() {
Some(0),
Some(5),
6,
+ 5,
)
.await;
}
#[tokio::test]
async fn without_pushdown_filter() {
- let mut context = ContextWithParquet::new(Scenario::Timestamps,
Page).await;
+ let mut context = ContextWithParquet::new(Scenario::Timestamps,
Page(5)).await;
let output1 = context.query("SELECT * FROM t").await;
- let mut context = ContextWithParquet::new(Scenario::Timestamps,
Page).await;
+ let mut context = ContextWithParquet::new(Scenario::Timestamps,
Page(5)).await;
let output2 = context
.query("SELECT * FROM t where nanos < to_timestamp('2023-01-02
01:01:11Z')")
@@ -887,6 +931,7 @@ async fn test_pages_with_null_values() {
// (row_group1, page2), (row_group4, page2)
Some(10),
22,
+ 5,
)
.await;
@@ -897,6 +942,7 @@ async fn test_pages_with_null_values() {
// expect prune (row_group1, page2) and (row_group4, page2) = 10 rows
Some(10),
29,
+ 5,
)
.await;
@@ -907,6 +953,7 @@ async fn test_pages_with_null_values() {
// expect prune (row_group1, page1), (row_group2, page1+2),
(row_group3, page1), (row_group3, page1) = 25 rows
Some(25),
11,
+ 5,
)
.await;
@@ -918,6 +965,7 @@ async fn test_pages_with_null_values() {
// (row_group1, page1+2), (row_group2, page1), (row_group3, page1)
(row_group4, page1+2) = 30 rows
Some(30),
7,
+ 5,
)
.await;
}
diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs
b/datafusion/core/tests/parquet/row_group_pruning.rs
index b3f1fec175..d6de2b6f8e 100644
--- a/datafusion/core/tests/parquet/row_group_pruning.rs
+++ b/datafusion/core/tests/parquet/row_group_pruning.rs
@@ -100,7 +100,7 @@ impl RowGroupPruningTest {
// Execute the test with the current configuration
async fn test_row_group_prune(self) {
- let output = ContextWithParquet::new(self.scenario, RowGroup)
+ let output = ContextWithParquet::new(self.scenario, RowGroup(5))
.await
.query(&self.query)
.await;
@@ -231,7 +231,7 @@ async fn prune_date64() {
.and_time(chrono::NaiveTime::from_hms_opt(0, 0, 0).unwrap());
let date = ScalarValue::Date64(Some(date.and_utc().timestamp_millis()));
- let output = ContextWithParquet::new(Scenario::Dates, RowGroup)
+ let output = ContextWithParquet::new(Scenario::Dates, RowGroup(5))
.await
.query_with_expr(col("date64").lt(lit(date)))
// .query(
@@ -267,10 +267,11 @@ async fn prune_disabled() {
let expected_rows = 10;
let config = SessionConfig::new().with_parquet_pruning(false);
- let output = ContextWithParquet::with_config(Scenario::Timestamps,
RowGroup, config)
- .await
- .query(query)
- .await;
+ let output =
+ ContextWithParquet::with_config(Scenario::Timestamps, RowGroup(5),
config)
+ .await
+ .query(query)
+ .await;
println!("{}", output.description());
// This should not prune any