This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 6fe00ce2e3 Fix join order for TPCH Q17 & Q18 by improving FilterExec statistics (#8126)
6fe00ce2e3 is described below

commit 6fe00ce2e30d2af2b64d3a97b877a96109c215f9
Author: Andy Grove <[email protected]>
AuthorDate: Sun Nov 12 01:41:15 2023 -0700

    Fix join order for TPCH Q17 & Q18 by improving FilterExec statistics (#8126)
    
    * Assume filters are highly selective if we cannot truly estimate cardinality
    
    * fix regression
    
    * cargo fmt
    
    * simplify code
    
    * Update datafusion/physical-plan/src/filter.rs
    
    Co-authored-by: Daniël Heres <[email protected]>
    
    * add comment with link to follow on issue
    
    * Use default of 20% selectivity
    
    * trigger CI
    
    * remove files
    
    * trigger CI
    
    * address feedback
    
    ---------
    
    Co-authored-by: Daniël Heres <[email protected]>
---
 datafusion/physical-plan/src/filter.rs | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index d560a219f2..822ddfdf3e 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -194,11 +194,23 @@ impl ExecutionPlan for FilterExec {
     fn statistics(&self) -> Result<Statistics> {
         let predicate = self.predicate();
 
+        let input_stats = self.input.statistics()?;
         let schema = self.schema();
         if !check_support(predicate, &schema) {
-            return Ok(Statistics::new_unknown(&schema));
+            // assume filter selects 20% of rows if we cannot do anything smarter
+            // tracking issue for making this configurable:
+            // https://github.com/apache/arrow-datafusion/issues/8133
+            let selectivity = 0.2_f32;
+            let mut stats = input_stats.clone().into_inexact();
+            if let Precision::Inexact(n) = stats.num_rows {
+                stats.num_rows = Precision::Inexact((selectivity * n as f32) as usize);
+            }
+            if let Precision::Inexact(n) = stats.total_byte_size {
+                stats.total_byte_size =
+                    Precision::Inexact((selectivity * n as f32) as usize);
+            }
+            return Ok(stats);
         }
-        let input_stats = self.input.statistics()?;
 
         let num_rows = input_stats.num_rows;
         let total_byte_size = input_stats.total_byte_size;

Reply via email to