alamb commented on code in PR #4131:
URL: https://github.com/apache/arrow-datafusion/pull/4131#discussion_r1016481711
##########
datafusion/core/tests/parquet_filter_pushdown.rs:
##########
@@ -225,6 +230,77 @@ async fn single_file() {
.run()
.await;
}
+
+#[cfg(not(target_family = "windows"))]
+#[tokio::test]
+async fn single_file_small_data_pages() {
+ let tempdir = TempDir::new().unwrap();
+
+ let generator = AccessLogGenerator::new().with_row_limit(Some(NUM_ROWS));
+
+ // set the maximum number of rows per data page to an arbitrary
+ // value (8311) to increase the effectiveness of page filtering
+ let props = WriterProperties::builder()
+ .set_data_page_row_count_limit(8311)
+ .build();
+ let file = tempdir.path().join("data_8311.parquet");
+
+ let start = Instant::now();
+ println!("Writing test data to {:?}", file);
+ let test_parquet_file = TestParquetFile::try_new(file, props,
generator).unwrap();
+ println!(
+ "Completed generating test data in {:?}",
+ Instant::now() - start
+ );
+
+ // The statistics on the 'pod' column are as follows:
+ //
+ // parquet-tools dump -d ~/Downloads/data_8311.parquet
+ //
+ // ...
+ // pod TV=53819 RL=0 DL=0 DS: 8 DE:PLAIN
+ //
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ // page 0: DLE:RLE RLE:RLE
VLE:RLE_DICTIONARY ST:[min: aqcathnxqsphdhgjtgvxsfyiwbmhlmg, max:
bvjjmytpfzdfsvlzfhbunasihjgxpesbmxv, num_nulls not defined] CRC:[none] SZ:7
VC:9216
+ // page 1: DLE:RLE RLE:RLE
VLE:RLE_DICTIONARY ST:[min: bvjjmytpfzdfsvlzfhbunasihjgxpesbmxv, max:
bxyubzxbbmhroqhrdzttngxcpwwgkpaoizvgzd, num_nulls not defined] CRC:[none] SZ:7
VC:9216
+ // page 2: DLE:RLE RLE:RLE
VLE:RLE_DICTIONARY ST:[min: bxyubzxbbmhroqhrdzttngxcpwwgkpaoizvgzd, max:
djzdyiecnumrsrcbizwlqzdhnpoiqdh, num_nulls not defined] CRC:[none] SZ:10 VC:9216
+ // page 3: DLE:RLE RLE:RLE
VLE:RLE_DICTIONARY ST:[min: djzdyiecnumrsrcbizwlqzdhnpoiqdh, max:
fktdcgtmzvoedpwhfevcvvrtaurzgex, num_nulls not defined] CRC:[none] SZ:7 VC:9216
+ // page 4: DLE:RLE RLE:RLE
VLE:RLE_DICTIONARY ST:[min: fktdcgtmzvoedpwhfevcvvrtaurzgex, max:
fwtdpgtxwqkkgtgvthhwycrvjiizdifyp, num_nulls not defined] CRC:[none] SZ:7
VC:9216
+ // page 5: DLE:RLE RLE:RLE
VLE:RLE_DICTIONARY ST:[min: fwtdpgtxwqkkgtgvthhwycrvjiizdifyp, max:
iadnalqpdzthpifrvewossmpqibgtsuin, num_nulls not defined] CRC:[none] SZ:7
VC:7739
+ //
+ // This test currently fails due to
https://github.com/apache/arrow-datafusion/issues/3833
+ // (page index pruning not implemented for byte array)
+
+ // TestCase::new(&test_parquet_file)
+ // .with_name("selective")
+ // // predicate is chosen carefully to prune pages 0, 1, 2, 3, 4
+ // // pod = 'iadnalqpdzthpifrvewossmpqibgtsuin'
+ //
.with_filter(col("pod").eq(lit("iadnalqpdzthpifrvewossmpqibgtsuin")))
+ // .with_pushdown_expected(PushdownExpected::Some)
+ //
.with_page_index_filtering_expected(PageIndexFilteringExpected::Some)
+ // .with_expected_rows(2574)
+ // .run()
+ // .await;
+
+ // time TV=53819 RL=0 DL=0 DS: 7092 DE:PLAIN
+ //
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ // page 0: DLE:RLE RLE:RLE
VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max:
1970-01-01T00:00:00.004133888, num_nulls not defined] CRC:[none] SZ:13844
VC:9216
+ // page 1: DLE:RLE RLE:RLE
VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max:
1970-01-01T00:00:00.006397952, num_nulls not defined] CRC:[none] SZ:14996
VC:9216
+ // page 2: DLE:RLE RLE:RLE
VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max:
1970-01-01T00:00:00.005650432, num_nulls not defined] CRC:[none] SZ:14996
VC:9216
+ // page 3: DLE:RLE RLE:RLE
VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max:
1970-01-01T00:00:00.004269056, num_nulls not defined] CRC:[none] SZ:14996
VC:9216
+ // page 4: DLE:RLE RLE:RLE
VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max:
1970-01-01T00:00:00.007261184, num_nulls not defined] CRC:[none] SZ:14996
VC:9216
+ // page 5: DLE:RLE RLE:RLE
VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max:
1970-01-01T00:00:00.005330944, num_nulls not defined] CRC:[none] SZ:12601
VC:7739
+ TestCase::new(&test_parquet_file)
+ .with_name("selective")
+ // predicate is chosen carefully to prune pages
Review Comment:
Done in https://github.com/apache/arrow-datafusion/pull/4130
c9e676d25
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]