notfilippo opened a new issue, #17405: URL: https://github.com/apache/datafusion/issues/17405
### Describe the bug
Related to #15439 (fixed in #15438) and #16063 (fixed in #16064).
While executing some queries with a similar pattern to the ones described
the related issue, I encountered several situations where the physical planner
would get confused when grouping by columns originating from multiple joins
(`Input field name {} does not match with the projection expression {}`). All
pathologic columns didn't pass the check because of a `:1` (or similar) to
their column name.
The suffix is appended from:
https://github.com/apache/datafusion/blob/d83a290d1d534f7db9849b11c39d2b0a289a62e4/datafusion/expr/src/logical_plan/builder.rs#L1528
which gets called when creating a new
[`SubqueryAlias`](https://github.com/apache/datafusion/blob/d83a290d1d534f7db9849b11c39d2b0a289a62e4/datafusion/expr/src/logical_plan/plan.rs#L2226),
which is used in these queries to qualify the `left` and `right` part of the
join so the query can reference the columns indepentely. The qualification
happens by stripping the inner DFSchema of its current qualification and
applying the new one (e.g. `TableReference::bar("left")`). During this
re-qualification, columns with duplicate names are handled in a very hacky way
by appending a `:{N}` where N is a unique number for that column.
The issue lies in the fact that nesting multiple `SubqueryAlias`es together
could lead to information loss, which after an optimization step could lead to
fatal errors in the logical and physical planner.
The fix for #16064 applies only for projections but if instead of projecting
we use the column references in any other way the issue repeats. We would
either have to use the `maybe_fix_physical_column_name` function everywhere,
which is inconvenient or fix this at a systemic level by keeping the
qualification path when creating a `SubqueryAlias` so that no information is
lost and dropping the `:{N}` suffix so that columns can continue to be
identified by name.
### To Reproduce
```
{
"extensionUris": [{
"extensionUriAnchor": 1,
"uri": "/functions_aggregate_generic.yaml"
}, {
"extensionUriAnchor": 2,
"uri": "/functions_comparison.yaml"
}],
"extensions": [{
"extensionFunction": {
"extensionUriReference": 1,
"functionAnchor": 0,
"name": "count:"
}
}, {
"extensionFunction": {
"extensionUriReference": 2,
"functionAnchor": 1,
"name": "equal:any_any"
}
}],
"relations": [{
"root": {
"input": {
"aggregate": {
"common": {
},
"input": {
"join": {
"common": {
"direct": {
}
},
"left": {
"join": {
"common": {
"direct": {
}
},
"left": {
"join": {
"common": {
"direct": {
}
},
"left": {
"aggregate": {
"common": {
"direct": {
}
},
"input": {
"read": {
"common": {
"direct": {
}
},
"baseSchema": {
"names": ["id"],
"struct": {
"types": [{
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}],
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
},
"virtualTable": {
"values": [{
"fields": [{
"i64": "1",
"nullable": true,
"typeVariationReference": 0
}]
}, {
"fields": [{
"i64": "2",
"nullable": true,
"typeVariationReference": 0
}]
}]
}
}
},
"groupings": [{
"groupingExpressions": [{
"selection": {
"directReference": {
"structField": {
"field": 0
}
},
"rootReference": {
}
}
}],
"expressionReferences": []
}],
"measures": [{
"measure": {
"functionReference": 0,
"args": [],
"sorts": [],
"phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT",
"outputType": {
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
},
"invocation": "AGGREGATION_INVOCATION_ALL",
"arguments": [],
"options": []
}
}],
"groupingExpressions": []
}
},
"right": {
"aggregate": {
"common": {
"direct": {
}
},
"input": {
"read": {
"common": {
"direct": {
}
},
"baseSchema": {
"names": ["id", "category"],
"struct": {
"types": [{
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"string": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}],
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
},
"virtualTable": {
"values": [{
"fields": [{
"i64": "1",
"nullable": true,
"typeVariationReference": 0
}, {
"string": "info",
"nullable": true,
"typeVariationReference": 0
}]
}, {
"fields": [{
"i64": "2",
"nullable": true,
"typeVariationReference": 0
}, {
"string": "low",
"nullable": true,
"typeVariationReference": 0
}]
}]
}
}
},
"groupings": [{
"groupingExpressions": [{
"selection": {
"directReference": {
"structField": {
"field": 0
}
},
"rootReference": {
}
}
}, {
"selection": {
"directReference": {
"structField": {
"field": 1
}
},
"rootReference": {
}
}
}],
"expressionReferences": []
}],
"measures": [],
"groupingExpressions": []
}
},
"expression": {
"scalarFunction": {
"functionReference": 1,
"args": [],
"outputType": {
"bool": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
},
"arguments": [{
"value": {
"selection": {
"directReference": {
"structField": {
"field": 0
}
},
"rootReference": {
}
}
}
}, {
"value": {
"selection": {
"directReference": {
"structField": {
"field": 2
}
},
"rootReference": {
}
}
}
}],
"options": []
}
},
"type": "JOIN_TYPE_LEFT"
}
},
"right": {
"aggregate": {
"common": {
"direct": {
}
},
"input": {
"read": {
"common": {
"direct": {
}
},
"baseSchema": {
"names": ["id"],
"struct": {
"types": [{
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}],
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
},
"virtualTable": {
"values": [{
"fields": [{
"i64": "1",
"nullable": true,
"typeVariationReference": 0
}]
}, {
"fields": [{
"i64": "2",
"nullable": true,
"typeVariationReference": 0
}]
}]
}
}
},
"groupings": [{
"groupingExpressions": [{
"selection": {
"directReference": {
"structField": {
"field": 0
}
},
"rootReference": {
}
}
}],
"expressionReferences": []
}],
"measures": [{
"measure": {
"functionReference": 0,
"args": [],
"sorts": [],
"phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT",
"outputType": {
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
},
"invocation": "AGGREGATION_INVOCATION_ALL",
"arguments": [],
"options": []
}
}],
"groupingExpressions": []
}
},
"expression": {
"scalarFunction": {
"functionReference": 1,
"args": [],
"outputType": {
"bool": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
},
"arguments": [{
"value": {
"selection": {
"directReference": {
"structField": {
"field": 0
}
},
"rootReference": {
}
}
}
}, {
"value": {
"selection": {
"directReference": {
"structField": {
"field": 4
}
},
"rootReference": {
}
}
}
}],
"options": []
}
},
"type": "JOIN_TYPE_LEFT"
}
},
"right": {
"aggregate": {
"common": {
"direct": {
}
},
"input": {
"read": {
"common": {
"direct": {
}
},
"baseSchema": {
"names": ["id"],
"struct": {
"types": [{
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}],
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
},
"virtualTable": {
"values": [{
"fields": [{
"i64": "1",
"nullable": true,
"typeVariationReference": 0
}]
}, {
"fields": [{
"i64": "2",
"nullable": true,
"typeVariationReference": 0
}]
}]
}
}
},
"groupings": [{
"groupingExpressions": [{
"selection": {
"directReference": {
"structField": {
"field": 0
}
},
"rootReference": {
}
}
}],
"expressionReferences": []
}],
"measures": [{
"measure": {
"functionReference": 0,
"args": [],
"sorts": [],
"phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT",
"outputType": {
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
},
"invocation": "AGGREGATION_INVOCATION_ALL",
"arguments": [],
"options": []
}
}],
"groupingExpressions": []
}
},
"expression": {
"scalarFunction": {
"functionReference": 1,
"args": [],
"outputType": {
"bool": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
},
"arguments": [{
"value": {
"selection": {
"directReference": {
"structField": {
"field": 0
}
},
"rootReference": {
}
}
}
}, {
"value": {
"selection": {
"directReference": {
"structField": {
"field": 6
}
},
"rootReference": {
}
}
}
}],
"options": []
}
},
"type": "JOIN_TYPE_LEFT"
}
},
"groupings": [{
"groupingExpressions": [{
"selection": {
"directReference": {
"structField": {
"field": 5
}
},
"rootReference": {
}
}
}],
"expressionReferences": []
}],
"measures": []
}
},
"names": ["count_second"]
}
}],
"expectedTypeUrls": [],
"version": {
"majorNumber": 0,
"minorNumber": 52,
"patchNumber": 0,
"gitHash": ""
}
}
```
### Expected behavior
_No response_
### Additional context
_No response_
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
