adriangb commented on code in PR #15301: URL: https://github.com/apache/datafusion/pull/15301#discussion_r2006767851
########## datafusion/core/src/datasource/physical_plan/parquet.rs: ########## @@ -1655,4 +1656,46 @@ mod tests { assert_eq!(calls.len(), 2); assert_eq!(calls, vec![Some(123), Some(456)]); } + + #[tokio::test] + async fn test_topk_predicate_pushdown() { + let ctx = SessionContext::new(); + let opt = ListingOptions::new(Arc::new(ParquetFormat::default())) + // We need to force 1 partition because TopK predicate pushdown happens on a per-partition basis + // If we had 1 file per partition (as an example) no pushdown would happen + .with_target_partitions(1); + + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().to_str().unwrap().to_string(); + // The point here is that we write many, many files. + // So when we scan after we processed the first one we should be able to skip the rest + // because of the TopK predicate pushdown. + for file in 0..100 { + let name = format!("test{:02}.parquet", file); + write_file(&format!("{path}/{name}")); + } + ctx.register_listing_table("base_table", path, opt, None, None) + .await + .unwrap(); + + let query = "select name from base_table order by id desc limit 3"; + + let batches = ctx.sql(query).await.unwrap().collect().await.unwrap(); + #[rustfmt::skip] + let expected = [ + "+--------+", + "| name |", + "+--------+", + "| test02 |", + "| test02 |", + "| test02 |", + "+--------+", + ]; + assert_batches_eq!(expected, &batches); + + let sql = format!("explain analyze {query}"); + let batches = ctx.sql(&sql).await.unwrap().collect().await.unwrap(); + let explain_plan = format!("{}", pretty_format_batches(&batches).unwrap()); + assert_contains!(explain_plan, "row_groups_pruned_statistics=96"); Review Comment: Yes! More tests! I just tried this in my full system and found a bug with Hive partition columns. I'm making a note to add a test and fix it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org