This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 42f5ff3bf Infer cardinality for disjoint inner and outer joins (#3848)
42f5ff3bf is described below

commit 42f5ff3bf01e66f36e2a1328e279fbc160459868
Author: Batuhan Taskaya <[email protected]>
AuthorDate: Tue Oct 18 16:55:05 2022 +0300

    Infer cardinality for disjoint inner and outer joins (#3848)
---
 datafusion/core/src/physical_plan/join_utils.rs | 106 +++++++++++++++++++++---
 1 file changed, 93 insertions(+), 13 deletions(-)

diff --git a/datafusion/core/src/physical_plan/join_utils.rs 
b/datafusion/core/src/physical_plan/join_utils.rs
index d010f4219..4ce72ccc2 100644
--- a/datafusion/core/src/physical_plan/join_utils.rs
+++ b/datafusion/core/src/physical_plan/join_utils.rs
@@ -362,6 +362,7 @@ fn estimate_join_cardinality(
                 right_num_rows,
                 left_col_stats,
                 right_col_stats,
+                left_stats.is_exact && right_stats.is_exact,
             )?;
 
             // The cardinality for inner join can also be used to estimate
@@ -407,6 +408,7 @@ fn estimate_inner_join_cardinality(
     right_num_rows: usize,
     left_col_stats: Vec<ColumnStatistics>,
     right_col_stats: Vec<ColumnStatistics>,
+    is_exact: bool,
 ) -> Option<usize> {
     // The algorithm here is partly based on the non-histogram selectivity 
estimation
     // from Spark's Catalyst optimizer.
@@ -416,12 +418,10 @@ fn estimate_inner_join_cardinality(
         if (left_stat.min_value.clone()? > right_stat.max_value.clone()?)
             || (left_stat.max_value.clone()? < right_stat.min_value.clone()?)
         {
-            // If there is no overlap, then we can not accurately estimate
-            // the join cardinality. We could in theory use this information
-            // to point out the join will not produce any rows, but that would
-            // require some extra information (namely whether the statistics 
are
-            // exact). For now, we just give up.
-            return None;
+            // If there is no overlap in any of the join columns, that means 
the join
+            // itself is disjoint and the cardinality is 0. Though we can only 
assume
+            // this when the statistics are exact (since it is a very strong 
assumption).
+            return if is_exact { Some(0) } else { None };
         }
 
         let left_max_distinct = max_distinct_count(left_num_rows, 
left_stat.clone());
@@ -664,10 +664,12 @@ mod tests {
     fn create_stats(
         num_rows: Option<usize>,
         column_stats: Option<Vec<ColumnStatistics>>,
+        is_exact: bool,
     ) -> Statistics {
         Statistics {
             num_rows,
             column_statistics: column_stats,
+            is_exact,
             ..Default::default()
         }
     }
@@ -788,7 +790,7 @@ mod tests {
                 None,
             ),
             ((10, None, Some(3), None), (10, Some(1), None, None), None),
-            // Non overlapping min/max.
+            // Non overlapping min/max (when exact=False).
             (
                 (10, Some(0), Some(10), None),
                 (10, Some(11), Some(20), None),
@@ -835,6 +837,7 @@ mod tests {
                     right_num_rows,
                     left_col_stats.clone(),
                     right_col_stats.clone(),
+                    false,
                 ),
                 expected_cardinality
             );
@@ -844,8 +847,8 @@ mod tests {
             let join_on = vec![(Column::new("a", 0), Column::new("b", 0))];
             let partial_join_stats = estimate_join_cardinality(
                 &join_type,
-                create_stats(Some(left_num_rows), 
Some(left_col_stats.clone())),
-                create_stats(Some(right_num_rows), 
Some(right_col_stats.clone())),
+                create_stats(Some(left_num_rows), 
Some(left_col_stats.clone()), false),
+                create_stats(Some(right_num_rows), 
Some(right_col_stats.clone()), false),
                 &join_on,
             );
 
@@ -876,7 +879,13 @@ mod tests {
         // We have statistics about 4 columns, where the highest distinct
         // count is 200, so we are going to pick it.
         assert_eq!(
-            estimate_inner_join_cardinality(400, 400, left_col_stats, 
right_col_stats),
+            estimate_inner_join_cardinality(
+                400,
+                400,
+                left_col_stats,
+                right_col_stats,
+                false
+            ),
             Some((400 * 400) / 200)
         );
         Ok(())
@@ -899,7 +908,13 @@ mod tests {
         }];
 
         assert_eq!(
-            estimate_inner_join_cardinality(100, 100, left_col_stats, 
right_col_stats),
+            estimate_inner_join_cardinality(
+                100,
+                100,
+                left_col_stats,
+                right_col_stats,
+                false
+            ),
             None
         );
         Ok(())
@@ -945,8 +960,73 @@ mod tests {
 
             let partial_join_stats = estimate_join_cardinality(
                 &join_type,
-                create_stats(Some(1000), Some(left_col_stats.clone())),
-                create_stats(Some(2000), Some(right_col_stats.clone())),
+                create_stats(Some(1000), Some(left_col_stats.clone()), false),
+                create_stats(Some(2000), Some(right_col_stats.clone()), false),
+                &join_on,
+            )
+            .unwrap();
+            assert_eq!(partial_join_stats.num_rows, expected_num_rows);
+            assert_eq!(
+                partial_join_stats.column_statistics,
+                [left_col_stats.clone(), right_col_stats.clone()].concat()
+            );
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_join_cardinality_when_one_column_is_disjoint() -> Result<()> {
+        // Left table (rows=1000)
+        //   a: min=0, max=100, distinct=100
+        //   b: min=0, max=500, distinct=500
+        //   x: min=1000, max=10000, distinct=None
+        //
+        // Right table (rows=2000)
+        //   c: min=0, max=100, distinct=50
+        //   d: min=0, max=2000, distinct=2500 (how? some inexact statistics)
+        //   y: min=0, max=100, distinct=None
+        //
+        // Join on a=c, x=y (ignores b/d) where x and y does not intersect
+
+        let left_col_stats = vec![
+            create_column_stats(Some(0), Some(100), Some(100)),
+            create_column_stats(Some(0), Some(500), Some(500)),
+            create_column_stats(Some(1000), Some(10000), None),
+        ];
+
+        let right_col_stats = vec![
+            create_column_stats(Some(0), Some(100), Some(50)),
+            create_column_stats(Some(0), Some(2000), Some(2500)),
+            create_column_stats(Some(0), Some(100), None),
+        ];
+
+        let join_on = vec![
+            (Column::new("a", 0), Column::new("c", 0)),
+            (Column::new("x", 2), Column::new("y", 2)),
+        ];
+
+        let cases = vec![
+            // Join type, expected cardinality
+            //
+            // When an inner join is disjoint, that means it won't
+            // produce any rows.
+            (JoinType::Inner, 0),
+            // But left/right outer joins will produce at least
+            // the amount of rows from the left/right side.
+            (JoinType::Left, 1000),
+            (JoinType::Right, 2000),
+            // And a full outer join will produce at least the combination
+            // of the rows above (minus the cardinality of the inner join, 
which
+            // is 0).
+            (JoinType::Full, 3000),
+        ];
+
+        for (join_type, expected_num_rows) in cases {
+            let partial_join_stats = estimate_join_cardinality(
+                &join_type,
+                create_stats(Some(1000), Some(left_col_stats.clone()), true),
+                create_stats(Some(2000), Some(right_col_stats.clone()), true),
                 &join_on,
             )
             .unwrap();

Reply via email to