This is an automated email from the ASF dual-hosted git repository.
dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 6fe00ce2e3 Fix join order for TPCH Q17 & Q18 by improving FilterExec
statistics (#8126)
6fe00ce2e3 is described below
commit 6fe00ce2e30d2af2b64d3a97b877a96109c215f9
Author: Andy Grove <[email protected]>
AuthorDate: Sun Nov 12 01:41:15 2023 -0700
Fix join order for TPCH Q17 & Q18 by improving FilterExec statistics (#8126)
* Assume filters are highly selective if we cannot truly estimate cardinality
* fix regression
* cargo fmt
* simplify code
* Update datafusion/physical-plan/src/filter.rs
Co-authored-by: Daniël Heres <[email protected]>
* add comment with link to follow on issue
* Use default of 20% selectivity
* trigger CI
* remove files
* trigger CI
* address feedback
---------
Co-authored-by: Daniël Heres <[email protected]>
---
datafusion/physical-plan/src/filter.rs | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index d560a219f2..822ddfdf3e 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -194,11 +194,23 @@ impl ExecutionPlan for FilterExec {
fn statistics(&self) -> Result<Statistics> {
let predicate = self.predicate();
+ let input_stats = self.input.statistics()?;
let schema = self.schema();
if !check_support(predicate, &schema) {
- return Ok(Statistics::new_unknown(&schema));
+ // assume filter selects 20% of rows if we cannot do anything smarter
+ // tracking issue for making this configurable:
+ // https://github.com/apache/arrow-datafusion/issues/8133
+ let selectivity = 0.2_f32;
+ let mut stats = input_stats.clone().into_inexact();
+ if let Precision::Inexact(n) = stats.num_rows {
+ stats.num_rows = Precision::Inexact((selectivity * n as f32) as usize);
+ }
+ if let Precision::Inexact(n) = stats.total_byte_size {
+ stats.total_byte_size =
+ Precision::Inexact((selectivity * n as f32) as usize);
+ }
+ return Ok(stats);
}
- let input_stats = self.input.statistics()?;
let num_rows = input_stats.num_rows;
let total_byte_size = input_stats.total_byte_size;