This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new e2b2c160bc Minor: Improve documentation of `need_handle_count_bug` (#15050)
e2b2c160bc is described below

commit e2b2c160bc91f51779f7737a8975cfa7b7d8f8f7
Author: suibianwanwan <[email protected]>
AuthorDate: Sat Mar 8 17:29:11 2025 +0800

    Minor: Improve documentation of `need_handle_count_bug` (#15050)
    
    * Minor: Improve documentation of `need_handle_count_bug`
    
    * Add back link
    
    * Improve `with_need_handle_count_bug`
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/optimizer/src/decorrelate.rs | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs
index b192f97404..71ff863b51 100644
--- a/datafusion/optimizer/src/decorrelate.rs
+++ b/datafusion/optimizer/src/decorrelate.rs
@@ -56,10 +56,16 @@ pub struct PullUpCorrelatedExpr {
     /// Indicates if we encounter any correlated expression that can not be pulled up
     /// above a aggregation without changing the meaning of the query.
     can_pull_over_aggregation: bool,
-    /// Do we need to handle [the Count bug] during the pull up process.
-    /// TODO this parameter should be removed or renamed semantically
+    /// Do we need to handle [the count bug] during the pull up process.
     ///
-    /// [the Count bug]: https://github.com/apache/datafusion/issues/10553
+    /// The "count bug" was described in [Optimization of Nested SQL
+    /// Queries Revisited](https://dl.acm.org/doi/pdf/10.1145/38714.38723). This bug is
+    /// not specific to the COUNT function, and it can occur with any aggregate function,
+    /// such as SUM, AVG, etc. The anomaly arises because aggregates fail to distinguish
+    /// between an empty set and null values when optimizing a correlated query as a join.
+    /// Here, we use "the count bug" to refer to all such cases.
+    ///
+    /// [the count bug]: https://github.com/apache/datafusion/issues/10553
     pub need_handle_count_bug: bool,
     /// mapping from the plan to its expressions' evaluation result on empty batch
     pub collected_count_expr_map: HashMap<LogicalPlan, ExprResultMap>,
@@ -88,10 +94,9 @@ impl PullUpCorrelatedExpr {
         }
     }
 
-    /// Set if we need to handle [the Count bug] during the pull up process
-    /// TODO this should be removed or renamed semantically
+    /// Set if we need to handle [the count bug] during the pull up process
     ///
-    /// [the Count bug]: https://github.com/apache/datafusion/issues/10553
+    /// [the count bug]: https://github.com/apache/datafusion/issues/10553
     pub fn with_need_handle_count_bug(mut self, need_handle_count_bug: bool) 
-> Self {
         self.need_handle_count_bug = need_handle_count_bug;
         self


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to