cj-zhukov commented on code in PR #21021:
URL: https://github.com/apache/datafusion/pull/21021#discussion_r3128672679
##########
datafusion/core/src/dataframe/mod.rs:
##########
@@ -410,21 +426,128 @@ impl DataFrame {
expr_list: impl IntoIterator<Item = impl Into<SelectExpr>>,
) -> Result<DataFrame> {
let expr_list: Vec<SelectExpr> =
- expr_list.into_iter().map(|e| e.into()).collect::<Vec<_>>();
+ expr_list.into_iter().map(|e| e.into()).collect();
+ // Extract expressions
let expressions = expr_list.iter().filter_map(|e| match e {
SelectExpr::Expression(expr) => Some(expr),
_ => None,
});
- let window_func_exprs = find_window_exprs(expressions);
- let plan = if window_func_exprs.is_empty() {
+ // Apply window functions first
+ let window_func_exprs = find_window_exprs(expressions.clone());
+
+ let mut plan = if window_func_exprs.is_empty() {
self.plan
} else {
LogicalPlanBuilder::window_plan(self.plan, window_func_exprs)?
};
- let project_plan =
LogicalPlanBuilder::from(plan).project(expr_list)?.build()?;
+ // Collect aggregate expressions
+ let aggr_exprs = find_aggregate_exprs(expressions.clone());
+
+ // Check for non-aggregate expressions
+ let has_non_aggregate_expr = expr_list.iter().any(|e| match e {
+ SelectExpr::Expression(expr) => {
+ find_aggregate_exprs(std::iter::once(expr)).is_empty()
+ }
+ SelectExpr::Wildcard(_) | SelectExpr::QualifiedWildcard(_, _) =>
true,
+ });
+
+ if has_non_aggregate_expr && !aggr_exprs.is_empty() {
+ return plan_err!(
+ "Column in SELECT must be in GROUP BY or an aggregate function"
+ );
+ }
+
+ // Fallback to projection
+ if matches!(plan, LogicalPlan::Aggregate(_))
+ || has_non_aggregate_expr
+ || aggr_exprs.is_empty()
+ {
+ let project_plan =
+ LogicalPlanBuilder::from(plan).project(expr_list)?.build()?;
+
+ return Ok(DataFrame {
+ session_state: self.session_state,
+ plan: project_plan,
+ projection_requires_validation: false,
+ });
+ }
+
+ // Unique name generator
Review Comment:
Yes, your understanding is correct - the implementation introduces an
`Aggregate` node with uniquely aliased expressions, followed by a `Projection`
that rewrites the original select expressions to reference those aggregates.
I also considered simplifying this to “Aggregate + Project *”, but that only
works for very simple cases like `select(count(1))`.
As soon as the select contains expressions on top of aggregates (e.g.
`count(1) + 1)`, aliases, or repeated aggregates, we need the rewrite step to
correctly reference the computed aggregate outputs.
The mapping + projection ensures that:
- aggregates are computed exactly once
- expressions on top of aggregates are preserved
- aliases are stable and deterministic
So while the current approach is a bit more involved, it aligns with how
logical plans typically separate aggregation from final projection.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]