buraksenn commented on code in PR #20904:
URL: https://github.com/apache/datafusion/pull/20904#discussion_r2930682341


##########
datafusion/physical-plan/src/joins/utils.rs:
##########
@@ -697,6 +710,88 @@ fn estimate_disjoint_inputs(
     None
 }
 
+/// Estimates the number of outer rows that have at least one matching
+/// key on the inner side (i.e. semi join cardinality) using NDV
+/// (Number of Distinct Values) statistics.
+///
+/// Assuming the smaller domain is contained in the larger, the number
+/// of overlapping distinct values is `min(outer_ndv, inner_ndv)`.
+/// Under the uniformity assumption (each distinct value contributes
+/// equally to row counts), the surviving fraction of outer rows is:
+///
+/// ```text
+/// overlap_fraction = min(outer_ndv, inner_ndv) / outer_ndv
+/// ```
+///
+/// For multi-column join keys, each column pair contributes an
+/// independent selectivity factor and the overall selectivity is the
+/// product of these factors:
+///
+/// ```text
+/// selectivity = product_i(min(outer_ndv_i, inner_ndv_i) / outer_ndv_i)
+/// semi_cardinality = outer_rows * selectivity
+/// ```
+///
+/// Anti join cardinality is derived as the complement:
+/// `outer_rows - semi_cardinality`.
+///
+/// Boundary cases:
+/// * `inner_ndv >= outer_ndv` → selectivity = 1.0 (all outer rows match)
+/// * `inner_ndv = 0` → selectivity = 0.0
+/// * Missing NDV statistics → returns `None` (fallback to `outer_rows`)
+///
+/// PostgreSQL uses a similar approach in `eqjoinsel_semi`
+/// (`src/backend/utils/adt/selfuncs.c`). When NDV statistics are
+/// available on both sides it computes selectivity as `nd2 / nd1`,
+/// which is equivalent to `min(outer_ndv, inner_ndv) / outer_ndv`.
+/// If either side lacks statistics it falls back to a default.
+fn estimate_semi_join_cardinality(
+    outer_num_rows: &Precision<usize>,
+    inner_num_rows: &Precision<usize>,
+    outer_col_stats: &[ColumnStatistics],
+    inner_col_stats: &[ColumnStatistics],
+) -> Option<usize> {
+    let outer_rows = *outer_num_rows.get_value()?;
+    if outer_rows == 0 {
+        return Some(0);
+    }
+    let inner_rows = *inner_num_rows.get_value()?;
+    if inner_rows == 0 {
+        return Some(0);
+    }
+
+    let mut selectivity = 1.0_f64;
+    let mut has_selectivity_estimate = false;
+
+    for (outer_stat, inner_stat) in 
outer_col_stats.iter().zip(inner_col_stats.iter()) {
+        let outer_has_stats = outer_stat.distinct_count.get_value().is_some()
+            || (outer_stat.min_value.get_value().is_some()
+                && outer_stat.max_value.get_value().is_some());
+        let inner_has_stats = inner_stat.distinct_count.get_value().is_some()
+            || (inner_stat.min_value.get_value().is_some()
+                && inner_stat.max_value.get_value().is_some());
+        if !outer_has_stats || !inner_has_stats {
+            continue;
+        }
+
+        let outer_ndv = max_distinct_count(outer_num_rows, outer_stat);
+        let inner_ndv = max_distinct_count(inner_num_rows, inner_stat);
+
+        if let (Some(&o), Some(&i)) = (outer_ndv.get_value(), 
inner_ndv.get_value())
+            && o > 0
+        {
+            selectivity *= (o.min(i) as f64) / (o as f64);

Review Comment:
   I've adjusted comment and added test cases



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to