neilconway commented on code in PR #21058:
URL: https://github.com/apache/datafusion/pull/21058#discussion_r3036984058
##########
datafusion/physical-plan/src/aggregates/mod.rs:
##########
@@ -397,6 +397,14 @@ impl PhysicalGroupBy {
self.expr.len() + usize::from(self.has_grouping_set)
}
+ /// Returns the Arrow data type of the `__grouping_id` column.
+ ///
+ /// The type is chosen to be wide enough to hold both the semantic bitmask
+ /// (in the low `n` bits) and the duplicate ordinal (in the high bits).
Review Comment:
Not clear what `n` is here based on the context of the function itself.
##########
datafusion/expr/src/logical_plan/plan.rs:
##########
@@ -3737,6 +3686,24 @@ impl PartialOrd for Aggregate {
}
}
+/// Returns the highest duplicate ordinal across all grouping sets in
`group_expr`.
+///
+/// The ordinal counts how many times a given grouping set pattern has already
+/// appeared before the current occurrence. For example, if the same set
+/// appears three times the ordinals are 0, 1, 2 and this function returns 2.
+/// Returns 0 when no grouping set is duplicated.
+fn max_grouping_set_duplicate_ordinal(group_expr: &[Expr]) -> usize {
+ if let Some(Expr::GroupingSet(GroupingSet::GroupingSets(sets))) =
group_expr.first() {
+ let mut counts: HashMap<&Vec<Expr>, usize> = HashMap::new();
Review Comment:
`HashMap<&[Expr], usize>`
##########
datafusion/optimizer/src/analyzer/resolve_grouping_function.rs:
##########
@@ -204,17 +204,22 @@ fn grouping_function_on_id(
Expr::Literal(ScalarValue::from(value as u64), None)
}
};
-
let grouping_id_column =
Expr::Column(Column::from(Aggregate::INTERNAL_GROUPING_ID));
- // The grouping call is exactly our internal grouping id
+ // The grouping call is exactly our internal grouping id — mask the ordinal
+ // bits (above position `n`) so only the semantic bitmask is visible.
+ let n = group_by_expr_count;
+ // (1 << n) - 1 masks the low n bits.
+ let semantic_mask: u64 = if n >= 64 { u64::MAX } else { (1u64 << n) - 1 };
Review Comment:
As I suggested before, move this inside the `if`
##########
datafusion/sqllogictest/test_files/group_by.slt:
##########
@@ -5206,6 +5203,41 @@ NULL NULL 1
statement ok
drop table t;
+# regression: duplicate grouping sets must not be collapsed into one
Review Comment:
I think it's worth having a test case for the situation where adding the
duplicate ordinal widens the size of the grouping ID field, that's a bit tricky.
##########
datafusion/expr/src/logical_plan/plan.rs:
##########
@@ -3737,6 +3686,24 @@ impl PartialOrd for Aggregate {
}
}
+/// Returns the highest duplicate ordinal across all grouping sets in
`group_expr`.
+///
+/// The ordinal counts how many times a given grouping set pattern has already
+/// appeared before the current occurrence. For example, if the same set
Review Comment:
I mentioned this before but "the current occurrence" is not well-defined in
this context. Something like this instead:
```
/// The ordinal for each occurrence of a grouping set pattern is its 0-based
index among
/// identical entries. For example, if the same set appears three times, the
ordinals are
/// 0, 1, 2 and this function returns 2.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]