Dandandan commented on code in PR #3787:
URL: https://github.com/apache/arrow-datafusion/pull/3787#discussion_r991943213
##########
datafusion/core/src/physical_plan/join_utils.rs:
##########
@@ -296,6 +299,154 @@ impl<T> Clone for OnceFut<T> {
}
}
+/// A shared state between statistic aggregators for a join
+/// operation.
+#[derive(Clone, Debug, Default)]
+struct PartialJoinStatistics {
+ pub num_rows: usize,
+ pub column_statistics: Vec<ColumnStatistics>,
+}
+
+/// Calculate the statistics for the given join's output.
+pub(crate) fn join_statistics(
+ left: Arc<dyn ExecutionPlan>,
+ right: Arc<dyn ExecutionPlan>,
+ on: JoinOn,
+ join_type: &JoinType,
+) -> Statistics {
+ let left_stats = left.statistics();
+ let right_stats = right.statistics();
+
+ let join_stats = join_cardinality(join_type, left_stats, right_stats, &on);
+ let (num_rows, column_statistics) = match join_stats {
+ Some(stats) => (Some(stats.num_rows), Some(stats.column_statistics)),
+ None => (None, None),
+ };
+ Statistics {
+ num_rows,
+ total_byte_size: None,
+ column_statistics,
+ is_exact: false,
+ }
+}
+
+// Estimate the cardinality for the given join with input statistics.
+fn join_cardinality(
+ join_type: &JoinType,
+ left_stats: Statistics,
+ right_stats: Statistics,
+ on: &JoinOn,
+) -> Option<PartialJoinStatistics> {
+ match join_type {
+ JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => {
+ let left_num_rows = left_stats.num_rows?;
+ let right_num_rows = right_stats.num_rows?;
+
+ // Take the left_col_stats and right_col_stats using the index
+ // obtained from index() method of the each element of 'on'.
+ let all_left_col_stats = left_stats.column_statistics?;
+ let all_right_col_stats = right_stats.column_statistics?;
+ let (left_col_stats, right_col_stats) = on
+ .iter()
+ .map(|(left, right)| {
+ (
+ all_left_col_stats[left.index()].clone(),
+ all_right_col_stats[right.index()].clone(),
+ )
+ })
+ .unzip::<_, _, Vec<_>, Vec<_>>();
+
+ let ij_cardinality = inner_join_cardinality(
+ left_num_rows,
+ right_num_rows,
+ left_col_stats,
+ right_col_stats,
+ )?;
+
+ // The cardinality for inner join can also be used to estimate
+ // the cardinality of left/right/full outer joins as long as
+ // it is greater than the minimum cardinality constraints of these
+ // joins (so that we don't underestimate the cardinality).
+ let cardinality = match join_type {
+ JoinType::Inner => ij_cardinality,
+ JoinType::Left => max(ij_cardinality, left_num_rows),
+ JoinType::Right => max(ij_cardinality, right_num_rows),
+ JoinType::Full => {
+ max(ij_cardinality, left_num_rows)
+ + max(ij_cardinality, right_num_rows)
+ - ij_cardinality
+ }
+ _ => unreachable!(),
Review Comment:
I would prefer not to panic here (even if currently it is unreachable, we
could add join types later).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]