This is an automated email from the ASF dual-hosted git repository.
cloud-fan pushed a commit to branch branch-4.x
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.x by this push:
new 5f3b1281a33e [SPARK-56944][SQL] Trim aliases from ListAgg expression
subtree
5f3b1281a33e is described below
commit 5f3b1281a33ec55d319528ee715f94fef3afdfeb
Author: Mihailo Aleksic <[email protected]>
AuthorDate: Tue May 26 08:50:46 2026 +0800
[SPARK-56944][SQL] Trim aliases from ListAgg expression subtree
### What changes were proposed in this pull request?
In this PR I propose to trim aliases from `ListAgg` expression subtree in
order to fix a discrepancy between single-pass and fixed-point analyzers.
It is a safe change since it would otherwise be removed in `CleanupAliases`.
### Why are the changes needed?
To fix a dual-run issue.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Added + existing tests.
### Was this patch authored or co-authored using generative AI tooling?
Yes.
Closes #55984 from mihailoale-db/fixlistagg.
Authored-by: Mihailo Aleksic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit d342170f142f6359950c37dd7d169fb0e0a4b854)
Signed-off-by: Wenchen Fan <[email protected]>
---
.../catalyst/expressions/aggregate/collect.scala | 8 +-
.../sql-tests/analyzer-results/listagg.sql.out | 96 ++++++++++++++++++++++
.../test/resources/sql-tests/inputs/listagg.sql | 15 ++++
.../resources/sql-tests/results/listagg.sql.out | 58 +++++++++++++
4 files changed, 174 insertions(+), 3 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
index ee06147b0394..a03491e349a7 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
@@ -394,7 +394,8 @@ case class ListAgg(
inputAggBufferOffset: Int = 0)
extends Collect[mutable.ArrayBuffer[Any]]
with SupportsOrderingWithinGroup
- with ImplicitCastInputTypes {
+ with ImplicitCastInputTypes
+ with AliasHelper {
override def orderingFilled: Boolean = orderExpressions.nonEmpty
@@ -600,7 +601,8 @@ case class ListAgg(
if (someOrder.isEmpty) {
return true
}
- if (someOrder.size == 1 && someOrder.head.child.semanticEquals(child)) {
+ if (someOrder.size == 1 &&
+ trimAliases(someOrder.head.child).semanticEquals(trimAliases(child))) {
return true
}
false
@@ -691,7 +693,7 @@ case class ListAgg(
if (orderExpressions.size != 1) return
OrderDeterminismResult.NonDeterministicMismatch
child match {
case Cast(castChild, castType, _, _)
- if orderExpressions.head.child.semanticEquals(castChild) =>
+ if
trimAliases(orderExpressions.head.child).semanticEquals(trimAliases(castChild))
=>
if (isCastEqualityPreserving(castChild.dataType) &&
isCastTargetEqualityPreserving(castType)) {
OrderDeterminismResult.Deterministic
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
index 146aa9ff6f52..f2e52a1be723 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
@@ -702,3 +702,99 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"inputType" : "\"TIMESTAMP\""
}
}
+
+
+-- !query
+SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string)
FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a":
"y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
+-- !query analysis
+Aggregate [listagg(distinct cast(variant_get(v#x, $.a, VariantType, true,
Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a,
VariantType, true, Some(America/Los_Angeles)) as string) ASC NULLS FIRST, 0, 0)
AS listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP
(ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST)#x]
++- SubqueryAlias __auto_generated_subquery_name
+ +- Union false, false
+ :- Union false, false
+ : :- Project [parse_json({"a": "x"}, true) AS v#x]
+ : : +- OneRowRelation
+ : +- Project [parse_json({"a": "y"}, true) AS parse_json({"a": "y"})#x]
+ : +- OneRowRelation
+ +- Project [parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
+ +- OneRowRelation
+
+
+-- !query
+SELECT listagg(v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM
(SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}')
UNION ALL SELECT parse_json('{"a": "x"}'))
+-- !query analysis
+Aggregate [listagg(cast(variant_get(v#x, $.a, VariantType, true,
Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a,
VariantType, true, Some(America/Los_Angeles)) as string) ASC NULLS FIRST, 0, 0)
AS listagg(CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY
CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST)#x]
++- SubqueryAlias __auto_generated_subquery_name
+ +- Union false, false
+ :- Union false, false
+ : :- Project [parse_json({"a": "x"}, true) AS v#x]
+ : : +- OneRowRelation
+ : +- Project [parse_json({"a": "y"}, true) AS parse_json({"a": "y"})#x]
+ : +- OneRowRelation
+ +- Project [parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
+ +- OneRowRelation
+
+
+-- !query
+SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string
DESC) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT
parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
+-- !query analysis
+Aggregate [listagg(distinct cast(variant_get(v#x, $.a, VariantType, true,
Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a,
VariantType, true, Some(America/Los_Angeles)) as string) DESC NULLS LAST, 0, 0)
AS listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP
(ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) DESC NULLS LAST)#x]
++- SubqueryAlias __auto_generated_subquery_name
+ +- Union false, false
+ :- Union false, false
+ : :- Project [parse_json({"a": "x"}, true) AS v#x]
+ : : +- OneRowRelation
+ : +- Project [parse_json({"a": "y"}, true) AS parse_json({"a": "y"})#x]
+ : +- OneRowRelation
+ +- Project [parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
+ +- OneRowRelation
+
+
+-- !query
+SELECT listagg(DISTINCT v:a.b::string, ',') WITHIN GROUP (ORDER BY
v:a.b::string) FROM (SELECT parse_json('{"a": {"b": "x"}}') v UNION ALL SELECT
parse_json('{"a": {"b": "y"}}') UNION ALL SELECT parse_json('{"a": {"b":
"x"}}'))
+-- !query analysis
+Aggregate [listagg(distinct cast(variant_get(v#x, $.a.b, VariantType, true,
Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a.b,
VariantType, true, Some(America/Los_Angeles)) as string) ASC NULLS FIRST, 0, 0)
AS listagg(DISTINCT CAST(variant_get(v, $.a.b) AS b AS STRING), ,) WITHIN GROUP
(ORDER BY CAST(variant_get(v, $.a.b) AS b AS STRING) ASC NULLS FIRST)#x]
++- SubqueryAlias __auto_generated_subquery_name
+ +- Union false, false
+ :- Union false, false
+ : :- Project [parse_json({"a": {"b": "x"}}, true) AS v#x]
+ : : +- OneRowRelation
+ : +- Project [parse_json({"a": {"b": "y"}}, true) AS parse_json({"a":
{"b": "y"}})#x]
+ : +- OneRowRelation
+ +- Project [parse_json({"a": {"b": "x"}}, true) AS parse_json({"a":
{"b": "x"}})#x]
+ +- OneRowRelation
+
+
+-- !query
+SELECT grp, listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY
v:a::string) FROM (SELECT 1 grp, parse_json('{"a": "x"}') v UNION ALL SELECT 1,
parse_json('{"a": "y"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL
SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 1, parse_json('{"a":
"x"}')) GROUP BY grp
+-- !query analysis
+Aggregate [grp#x], [grp#x, listagg(distinct cast(variant_get(v#x, $.a,
VariantType, true, Some(America/Los_Angeles)) as string), ,,
cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as
string) ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(variant_get(v, $.a) AS
a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS
STRING) ASC NULLS FIRST)#x]
++- SubqueryAlias __auto_generated_subquery_name
+ +- Union false, false
+ :- Union false, false
+ : :- Union false, false
+ : : :- Union false, false
+ : : : :- Project [1 AS grp#x, parse_json({"a": "x"}, true) AS v#x]
+ : : : : +- OneRowRelation
+ : : : +- Project [1 AS 1#x, parse_json({"a": "y"}, true) AS
parse_json({"a": "y"})#x]
+ : : : +- OneRowRelation
+ : : +- Project [2 AS 2#x, parse_json({"a": "x"}, true) AS
parse_json({"a": "x"})#x]
+ : : +- OneRowRelation
+ : +- Project [2 AS 2#x, parse_json({"a": "x"}, true) AS
parse_json({"a": "x"})#x]
+ : +- OneRowRelation
+ +- Project [1 AS 1#x, parse_json({"a": "x"}, true) AS parse_json({"a":
"x"})#x]
+ +- OneRowRelation
+
+
+-- !query
+SELECT listagg(DISTINCT (v:a)::double::string, ',') WITHIN GROUP (ORDER BY
(v:a)::double) FROM (SELECT parse_json('{"a": 1.1}') v UNION ALL SELECT
parse_json('{"a": 2.2}') UNION ALL SELECT parse_json('{"a": 1.1}'))
+-- !query analysis
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" :
"INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST",
+ "sqlState" : "42K0K",
+ "messageParameters" : {
+ "castType" : "\"STRING\"",
+ "funcName" : "`listagg`",
+ "inputType" : "\"DOUBLE\""
+ }
+}
diff --git a/sql/core/src/test/resources/sql-tests/inputs/listagg.sql
b/sql/core/src/test/resources/sql-tests/inputs/listagg.sql
index 3a7401893aaa..b6a7dd1d0801 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/listagg.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/listagg.sql
@@ -61,3 +61,18 @@ SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1,
col2) FROM df;
SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES
(cast(1.1 as double)), (cast(2.2 as double)), (cast(2.2 as double)), (cast(3.3
as double)) AS t(col);
SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES
(cast(1.0 as float)), (cast(2.0 as float)), (cast(2.0 as float)) AS t(col);
SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES
(TIMESTAMP'2024-01-01 10:00:00'), (TIMESTAMP'2024-01-02 12:00:00'),
(TIMESTAMP'2024-01-01 10:00:00') AS t(col);
+
+-- LISTAGG with semi-structured extract (parser wraps v:a in Alias with fresh
ExprId)
+-- Tests that isOrderCompatible strips Alias wrappers before comparing via
semanticEquals
+SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string)
FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a":
"y"}') UNION ALL SELECT parse_json('{"a": "x"}'));
+-- Semi-structured extract without DISTINCT
+SELECT listagg(v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM
(SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}')
UNION ALL SELECT parse_json('{"a": "x"}'));
+-- Semi-structured extract with DESC ordering
+SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string
DESC) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT
parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'));
+-- Semi-structured extract with nested path
+SELECT listagg(DISTINCT v:a.b::string, ',') WITHIN GROUP (ORDER BY
v:a.b::string) FROM (SELECT parse_json('{"a": {"b": "x"}}') v UNION ALL SELECT
parse_json('{"a": {"b": "y"}}') UNION ALL SELECT parse_json('{"a": {"b":
"x"}}'));
+-- Semi-structured extract with GROUP BY
+SELECT grp, listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY
v:a::string) FROM (SELECT 1 grp, parse_json('{"a": "x"}') v UNION ALL SELECT 1,
parse_json('{"a": "y"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL
SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 1, parse_json('{"a":
"x"}')) GROUP BY grp;
+-- Semi-structured extract: DISTINCT with non-equality-preserving cast
(double->string)
+-- Tests that checkOrderValueDeterminism strips Alias wrappers before
comparing via semanticEquals
+SELECT listagg(DISTINCT (v:a)::double::string, ',') WITHIN GROUP (ORDER BY
(v:a)::double) FROM (SELECT parse_json('{"a": 1.1}') v UNION ALL SELECT
parse_json('{"a": 2.2}') UNION ALL SELECT parse_json('{"a": 1.1}'));
diff --git a/sql/core/src/test/resources/sql-tests/results/listagg.sql.out
b/sql/core/src/test/resources/sql-tests/results/listagg.sql.out
index ca387fe70fa1..08a50a6c5c68 100644
--- a/sql/core/src/test/resources/sql-tests/results/listagg.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/listagg.sql.out
@@ -563,3 +563,61 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"inputType" : "\"TIMESTAMP\""
}
}
+
+
+-- !query
+SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string)
FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a":
"y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
+-- !query schema
+struct<listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN
GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS
FIRST):string>
+-- !query output
+x,y
+
+
+-- !query
+SELECT listagg(v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM
(SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}')
UNION ALL SELECT parse_json('{"a": "x"}'))
+-- !query schema
+struct<listagg(CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP
(ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST):string>
+-- !query output
+x,x,y
+
+
+-- !query
+SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string
DESC) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT
parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
+-- !query schema
+struct<listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN
GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) DESC NULLS
LAST):string>
+-- !query output
+y,x
+
+
+-- !query
+SELECT listagg(DISTINCT v:a.b::string, ',') WITHIN GROUP (ORDER BY
v:a.b::string) FROM (SELECT parse_json('{"a": {"b": "x"}}') v UNION ALL SELECT
parse_json('{"a": {"b": "y"}}') UNION ALL SELECT parse_json('{"a": {"b":
"x"}}'))
+-- !query schema
+struct<listagg(DISTINCT CAST(variant_get(v, $.a.b) AS b AS STRING), ,) WITHIN
GROUP (ORDER BY CAST(variant_get(v, $.a.b) AS b AS STRING) ASC NULLS
FIRST):string>
+-- !query output
+x,y
+
+
+-- !query
+SELECT grp, listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY
v:a::string) FROM (SELECT 1 grp, parse_json('{"a": "x"}') v UNION ALL SELECT 1,
parse_json('{"a": "y"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL
SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 1, parse_json('{"a":
"x"}')) GROUP BY grp
+-- !query schema
+struct<grp:int,listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,)
WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS
FIRST):string>
+-- !query output
+1 x,y
+2 x
+
+
+-- !query
+SELECT listagg(DISTINCT (v:a)::double::string, ',') WITHIN GROUP (ORDER BY
(v:a)::double) FROM (SELECT parse_json('{"a": 1.1}') v UNION ALL SELECT
parse_json('{"a": 2.2}') UNION ALL SELECT parse_json('{"a": 1.1}'))
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" :
"INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST",
+ "sqlState" : "42K0K",
+ "messageParameters" : {
+ "castType" : "\"STRING\"",
+ "funcName" : "`listagg`",
+ "inputType" : "\"DOUBLE\""
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]