This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new e2b2c160bc Minor: Improve documentation of `need_handle_count_bug` (#15050)
e2b2c160bc is described below
commit e2b2c160bc91f51779f7737a8975cfa7b7d8f8f7
Author: suibianwanwan <[email protected]>
AuthorDate: Sat Mar 8 17:29:11 2025 +0800
Minor: Improve documentation of `need_handle_count_bug` (#15050)
* Minor: Improve documentation of `need_handle_count_bug`
* Add back link
* Improve `with_need_handle_count_bug`
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/optimizer/src/decorrelate.rs | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs
index b192f97404..71ff863b51 100644
--- a/datafusion/optimizer/src/decorrelate.rs
+++ b/datafusion/optimizer/src/decorrelate.rs
@@ -56,10 +56,16 @@ pub struct PullUpCorrelatedExpr {
/// Indicates if we encounter any correlated expression that can not be pulled up
/// above a aggregation without changing the meaning of the query.
can_pull_over_aggregation: bool,
- /// Do we need to handle [the Count bug] during the pull up process.
- /// TODO this parameter should be removed or renamed semantically
+ /// Do we need to handle [the count bug] during the pull up process.
///
- /// [the Count bug]: https://github.com/apache/datafusion/issues/10553
+ /// The "count bug" was described in [Optimization of Nested SQL
+ /// Queries Revisited](https://dl.acm.org/doi/pdf/10.1145/38714.38723). This bug is
+ /// not specific to the COUNT function, and it can occur with any aggregate function,
+ /// such as SUM, AVG, etc. The anomaly arises because aggregates fail to distinguish
+ /// between an empty set and null values when optimizing a correlated query as a join.
+ /// Here, we use "the count bug" to refer to all such cases.
+ ///
+ /// [the count bug]: https://github.com/apache/datafusion/issues/10553
pub need_handle_count_bug: bool,
/// mapping from the plan to its expressions' evaluation result on empty batch
pub collected_count_expr_map: HashMap<LogicalPlan, ExprResultMap>,
@@ -88,10 +94,9 @@ impl PullUpCorrelatedExpr {
}
}
- /// Set if we need to handle [the Count bug] during the pull up process
- /// TODO this should be removed or renamed semantically
+ /// Set if we need to handle [the count bug] during the pull up process
///
- /// [the Count bug]: https://github.com/apache/datafusion/issues/10553
+ /// [the count bug]: https://github.com/apache/datafusion/issues/10553
pub fn with_need_handle_count_bug(mut self, need_handle_count_bug: bool) -> Self {
self.need_handle_count_bug = need_handle_count_bug;
self
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]