agubichev commented on code in PR #48145: URL: https://github.com/apache/spark/pull/48145#discussion_r1769003527
########## sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-group-by.sql: ########## @@ -22,16 +22,27 @@ select *, (select count(*) from y where x1 = y1 and cast(y2 as double) = x1 + 1 select *, (select count(*) from y where y2 + 1 = x1 + x2 group by y2 + 1) from x; --- Illegal queries +-- Illegal queries (single join disabled) +set spark.sql.optimizer.scalarSubqueryUseSingleJoin = false; select * from x where (select count(*) from y where y1 > x1 group by y1) = 1; select *, (select count(*) from y where y1 + y2 = x1 group by y1) from x; select *, (select count(*) from y where x1 = y1 and y2 + 10 = x1 + 1 group by y2) from x; +reset spark.sql.optimizer.scalarSubqueryUseSingleJoin; Review Comment: removed ########## sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-group-by.sql: ########## @@ -22,16 +22,27 @@ select *, (select count(*) from y where x1 = y1 and cast(y2 as double) = x1 + 1 select *, (select count(*) from y where y2 + 1 = x1 + x2 group by y2 + 1) from x; --- Illegal queries +-- Illegal queries (single join disabled) +set spark.sql.optimizer.scalarSubqueryUseSingleJoin = false; select * from x where (select count(*) from y where y1 > x1 group by y1) = 1; select *, (select count(*) from y where y1 + y2 = x1 group by y1) from x; select *, (select count(*) from y where x1 = y1 and y2 + 10 = x1 + 1 group by y2) from x; +reset spark.sql.optimizer.scalarSubqueryUseSingleJoin; + +-- Same queries, with LeftSingle join +set spark.sql.optimizer.scalarSubqueryUseSingleJoin = true; +select * from x where (select count(*) from y where y1 > x1 group by y1) = 1; +select *, (select count(*) from y where y1 + y2 = x1 group by y1) from x; +select *, (select count(*) from y where x1 = y1 and y2 + 10 = x1 + 1 group by y2) from x; + -- Certain other operators like OUTER JOIN or UNION between the correlating filter and the group-by also can cause the scalar subquery to return multiple values and hence make the query illegal. select *, (select count(*) from (select * from y where y1 = x1 union all select * from y) sub group by y1) from x; select *, (select count(*) from y left join (select * from z where z1 = x1) sub on y2 = z2 group by z1) from x; -- The correlation below the join is unsupported in Spark anyway, but when we do support it this query should still be disallowed. -- Test legacy behavior conf set spark.sql.legacy.scalarSubqueryAllowGroupByNonEqualityCorrelatedPredicate = true; +set spark.sql.optimizer.scalarSubqueryUseSingleJoin = false; select * from x where (select count(*) from y where y1 > x1 group by y1) = 1; reset spark.sql.legacy.scalarSubqueryAllowGroupByNonEqualityCorrelatedPredicate; +reset spark.sql.optimizer.scalarSubqueryUseSingleJoin; Review Comment: removed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
