tustvold commented on code in PR #4743:
URL: https://github.com/apache/arrow-datafusion/pull/4743#discussion_r1057672311


##########
datafusion/core/tests/parquet/filter_pushdown.rs:
##########
@@ -223,107 +226,123 @@ async fn single_file() {
             .unwrap(),
         )
         .with_pushdown_expected(PushdownExpected::Some)
-        .with_expected_rows(48919);
-    set.spawn(async move { case.run().await });
-
-    // Join all the cases.
-    while let Some(result) = set.join_next().await {
-        result.unwrap()
-    }
+        .with_expected_rows(3672);
+    case.run().await;
 }
 
 #[cfg(not(target_family = "windows"))]
 #[tokio::test]
 async fn single_file_small_data_pages() {
     let tempdir = TempDir::new().unwrap();
 
-    let generator = AccessLogGenerator::new().with_row_limit(NUM_ROWS);
-
-    // set the max page rows with arbitrary sizes 8311 to increase
-    // effectiveness of page filtering
+    // Set low row count limit to improve page filtering
     let props = WriterProperties::builder()
-        .set_data_page_row_count_limit(8311)
+        .set_max_row_group_size(2048)
+        .set_data_page_row_count_limit(512)
+        .set_write_batch_size(512)
         .build();
-    let file = tempdir.path().join("data_8311.parquet");
-
-    let start = Instant::now();
-    println!("Writing test data to {:?}", file);
-    let test_parquet_file =
-        Arc::new(TestParquetFile::try_new(file, props, generator).unwrap());
-    println!(
-        "Completed generating test data in {:?}",
-        Instant::now() - start
-    );
+    let test_parquet_file = generate_file(&tempdir, props);
 
     // The statistics on the 'pod' column are as follows:
     //
-    // parquet-tools dump -d ~/Downloads/data_8311.parquet
+    // docker run -v /tmp:/tmp nathanhowell/parquet-tools dump -d -c pod -n 
/tmp/.tmppkTohR/data.parquet
+    //
+    // ```
+    // row group 0
+    //     
--------------------------------------------------------------------------------
+    //     pod:  BINARY UNCOMPRESSED DO:782 FPO:1215 SZ:744/744/1.00 VC:2048 
ENC:RLE,RLE_DICTIONARY,PLAIN ST:[min: azvagebjesrqboyqxmgaskvpwddebuptqyy, max: 
zamirxzhihhfqdvhuxeziuukkqyutmczbhfgx, num_nulls not defined]
     //
-    // ...
-    // pod TV=53819 RL=0 DL=0 DS:                 8 DE:PLAIN
-    // 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-    // page 0:                                     DLE:RLE RLE:RLE 
VLE:RLE_DICTIONARY ST:[min: aqcathnxqsphdhgjtgvxsfyiwbmhlmg, max: 
bvjjmytpfzdfsvlzfhbunasihjgxpesbmxv, num_nulls not defined] CRC:[none] SZ:7 
VC:9216
-    // page 1:                                     DLE:RLE RLE:RLE 
VLE:RLE_DICTIONARY ST:[min: bvjjmytpfzdfsvlzfhbunasihjgxpesbmxv, max: 
bxyubzxbbmhroqhrdzttngxcpwwgkpaoizvgzd, num_nulls not defined] CRC:[none] SZ:7 
VC:9216
-    // page 2:                                     DLE:RLE RLE:RLE 
VLE:RLE_DICTIONARY ST:[min: bxyubzxbbmhroqhrdzttngxcpwwgkpaoizvgzd, max: 
djzdyiecnumrsrcbizwlqzdhnpoiqdh, num_nulls not defined] CRC:[none] SZ:10 VC:9216
-    // page 3:                                     DLE:RLE RLE:RLE 
VLE:RLE_DICTIONARY ST:[min: djzdyiecnumrsrcbizwlqzdhnpoiqdh, max: 
fktdcgtmzvoedpwhfevcvvrtaurzgex, num_nulls not defined] CRC:[none] SZ:7 VC:9216
-    // page 4:                                     DLE:RLE RLE:RLE 
VLE:RLE_DICTIONARY ST:[min: fktdcgtmzvoedpwhfevcvvrtaurzgex, max: 
fwtdpgtxwqkkgtgvthhwycrvjiizdifyp, num_nulls not defined] CRC:[none] SZ:7 
VC:9216
-    // page 5:                                     DLE:RLE RLE:RLE 
VLE:RLE_DICTIONARY ST:[min: fwtdpgtxwqkkgtgvthhwycrvjiizdifyp, max: 
iadnalqpdzthpifrvewossmpqibgtsuin, num_nulls not defined] CRC:[none] SZ:7 
VC:7739
-
-    TestCase::new(test_parquet_file.clone())
+    // pod TV=2048 RL=0 DL=0 DS: 11 DE:PLAIN
+    //     
----------------------------------------------------------------------------
+    //     page 0:                    DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: azvagebjesrqboyqxmgaskvpwddebuptqyy, max: 
ksjzzqfxvawhmlkopjsbponfdwsurxff, num_nulls not defined] CRC:[none] SZ:10 VC:512
+    //     page 1:                    DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: azvagebjesrqboyqxmgaskvpwddebuptqyy, max: 
wlftgepiwhnmzqrsyijhqbauhjplru, num_nulls not defined] CRC:[none] SZ:18 VC:1013
+    //     page 2:                    DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: ewzlijvnljqeqhqhftfalqbqfsyidw, max: 
zamirxzhihhfqdvhuxeziuukkqyutmczbhfgx, num_nulls not defined] CRC:[none] SZ:12 
VC:523
+    //
+    // row group 1
+    //     
--------------------------------------------------------------------------------
+    //     pod:  BINARY UNCOMPRESSED DO:249244 FPO:249724 SZ:901/901/1.00 
VC:2048 ENC:RLE,RLE_DICTIONARY,PLAIN ST:[min: csvnvrdcuzoftoidzmczrtqnrzgfpj, 
max: zamirxzhihhfqdvhuxeziuukkqyutmczbhfgx, num_nulls not defined]
+    //
+    // pod TV=2048 RL=0 DL=0 DS: 12 DE:PLAIN
+    //     
----------------------------------------------------------------------------
+    //     page 0:                    DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: dhhqgbsjutqdvqpikmnwqdnrhkqnjyieoviujkj, max: 
zamirxzhihhfqdvhuxeziuukkqyutmczbhfgx, num_nulls not defined] CRC:[none] SZ:12 
VC:512
+    //     page 1:                    DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: dlowgwtqjiifqajbobiuqoflbmsbobwsqtrc, max: 
uipgzhbptpinjcwbdwhkfdjzdfzrlffrifzh, num_nulls not defined] CRC:[none] SZ:12 
VC:671
+    //     page 2:                    DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: csvnvrdcuzoftoidzmczrtqnrzgfpj, max: xacatvakpxztzuucoxhjiofxykryoxc, 
num_nulls not defined] CRC:[none] SZ:16 VC:781
+    //     page 3:                    DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: nxihlfujkdzymexwpqurhawwchvkdrntixjs, max: 
xacatvakpxztzuucoxhjiofxykryoxc, num_nulls not defined] CRC:[none] SZ:9 VC:84
+    // ```
+
+
+    TestCase::new(&test_parquet_file)
         .with_name("selective")
-        // predicate is chosen carefully to prune pages 0, 1, 2, 3, 4
-        // pod = 'iadnalqpdzthpifrvewossmpqibgtsuin'
-        .with_filter(col("pod").eq(lit("iadnalqpdzthpifrvewossmpqibgtsuin")))
+        // predicate is chosen carefully to prune all bar 0-2 and 1-0
+        // pod = 'zamirxzhihhfqdvhuxeziuukkqyutmczbhfgx'
+        
.with_filter(col("pod").eq(lit("zamirxzhihhfqdvhuxeziuukkqyutmczbhfgx")))
         .with_pushdown_expected(PushdownExpected::Some)
         .with_page_index_filtering_expected(PageIndexFilteringExpected::Some)
-        .with_expected_rows(2574)
+        .with_expected_rows(174)
         .run()
         .await;
 
-    // time TV=53819 RL=0 DL=0 DS:                7092 DE:PLAIN
-    // 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-    // page 0:                                     DLE:RLE RLE:RLE 
VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max: 
1970-01-01T00:00:00.004133888, num_nulls not defined] CRC:[none] SZ:13844 
VC:9216
-    // page 1:                                     DLE:RLE RLE:RLE 
VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max: 
1970-01-01T00:00:00.006397952, num_nulls not defined] CRC:[none] SZ:14996 
VC:9216
-    // page 2:                                     DLE:RLE RLE:RLE 
VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max: 
1970-01-01T00:00:00.005650432, num_nulls not defined] CRC:[none] SZ:14996 
VC:9216
-    // page 3:                                     DLE:RLE RLE:RLE 
VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max: 
1970-01-01T00:00:00.004269056, num_nulls not defined] CRC:[none] SZ:14996 
VC:9216
-    // page 4:                                     DLE:RLE RLE:RLE 
VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max: 
1970-01-01T00:00:00.007261184, num_nulls not defined] CRC:[none] SZ:14996 
VC:9216
-    // page 5:                                     DLE:RLE RLE:RLE 
VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max: 
1970-01-01T00:00:00.005330944, num_nulls not defined] CRC:[none] SZ:12601 
VC:7739
-    TestCase::new(test_parquet_file.clone())
+    // row group 0
+    //     
--------------------------------------------------------------------------------
+    //     time:  INT64 UNCOMPRESSED DO:3317 FPO:5334 SZ:4209/4209/1.00 
VC:2048 ENC:RLE,RLE_DICTIONARY,PLAIN ST:[min: 1970-01-01T00:00:00.000000000, 
max: 1970-01-01T00:00:00.000254976, num_nulls not defined]
+    //
+    // time TV=2048 RL=0 DL=0 DS: 250 DE:PLAIN
+    //     
----------------------------------------------------------------------------
+    //     page 0:                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.000203776, 
num_nulls not defined] CRC:[none] SZ:515 VC:512
+    //     page 1:                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.000254976, 
num_nulls not defined] CRC:[none] SZ:1020 VC:1013
+    //     page 2:                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.000216064, 
num_nulls not defined] CRC:[none] SZ:531 VC:523
+    //
+    // row group 1
+    //     
--------------------------------------------------------------------------------
+    //     time:  INT64 UNCOMPRESSED DO:252201 FPO:254186 SZ:4220/4220/1.00 
VC:2048 ENC:RLE,RLE_DICTIONARY,PLAIN ST:[min: 1970-01-01T00:00:00.000000000, 
max: 1970-01-01T00:00:00.000250880, num_nulls not defined]
+    //
+    // time TV=2048 RL=0 DL=0 DS: 246 DE:PLAIN
+    //     
----------------------------------------------------------------------------
+    //     page 0:                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.000231424, 
num_nulls not defined] CRC:[none] SZ:515 VC:512
+    //     page 1:                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.000250880, 
num_nulls not defined] CRC:[none] SZ:675 VC:671
+    //     page 2:                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.000211968, 
num_nulls not defined] CRC:[none] SZ:787 VC:781
+    //     page 3:                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY 
ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.000177152, 
num_nulls not defined] CRC:[none] SZ:90 VC:84
+
+    TestCase::new(&test_parquet_file)
         .with_name("selective")
-        // predicate is chosen carefully to prune pages 1, 2, 4, and 5
-        // time > 1970-01-01T00:00:00.004300000
-        .with_filter(col("time").gt(lit_timestamp_nano(4300000)))
+        // predicate is chosen carefully to prune all bar 0-1, 1-0, 1-1
+        // time > 1970-01-01T00:00:00.000216064
+        .with_filter(col("time").gt(lit_timestamp_nano(000216064)))
         .with_pushdown_expected(PushdownExpected::Some)
         .with_page_index_filtering_expected(PageIndexFilteringExpected::Some)
-        .with_expected_rows(9745)
+        .with_expected_rows(178)

Review Comment:
   This is currently failing: the correct answer is 178, but for some reason we
   return fewer rows.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Reply via email to