Github user dongjoon-hyun commented on a diff in the pull request:
https://github.com/apache/spark/pull/16798#discussion_r99480006
--- Diff:
sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql
---
@@ -0,0 +1,255 @@
+-- A test suite for scalar subquery in predicate context
+
+CREATE OR REPLACE TEMPORARY VIEW p AS VALUES (1, 1) AS T(pk, pv);
+CREATE OR REPLACE TEMPORARY VIEW c AS VALUES (1, 1) AS T(ck, cv);
+
+-- SPARK-18814.1: Simplified version of TPCDS-Q32
+SELECT pk, cv
+FROM p, c
+WHERE p.pk = c.ck
+AND c.cv = (SELECT avg(c1.cv)
+ FROM c c1
+ WHERE c1.ck = p.pk);
+
+-- SPARK-18814.2: Adding stack of aggregates
+SELECT pk, cv
+FROM p, c
+WHERE p.pk = c.ck
+AND c.cv = (SELECT max(avg)
+ FROM (SELECT c1.cv, avg(c1.cv) avg
+ FROM c c1
+ WHERE c1.ck = p.pk
+ GROUP BY c1.cv));
+
+create temporary view t1 as select * from values
+ ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04
00:00:00.000', date '2014-04-04'),
+ ('val1b', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04
01:01:00.000', date '2014-05-04'),
+ ('val1a', 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04
01:02:00.001', date '2014-06-04'),
+ ('val1a', 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04
01:01:00.000', date '2014-07-04'),
+ ('val1c', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04
01:02:00.001', date '2014-05-05'),
+ ('val1d', null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04
01:01:00.000', null),
+ ('val1d', null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04
01:02:00.001', null),
+ ('val1e', 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04
01:01:00.000', date '2014-08-04'),
+ ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04
01:02:00.001', date '2014-09-04'),
+ ('val1d', 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04
01:01:00.000', date '2015-05-04'),
+ ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04
01:02:00.001', date '2014-04-04'),
+ ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04
01:01:00.000', date '2014-05-04')
+ as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i);
+
+create temporary view t2 as select * from values
+ ('val2a', 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04
01:01:00.000', date '2014-04-04'),
+ ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04
01:01:00.000', date '2014-05-04'),
+ ('val1b', 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04
01:01:00.000', date '2015-05-04'),
+ ('val1c', 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04
01:01:00.000', date '2016-05-04'),
+ ('val1b', null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04
01:01:00.000', null),
+ ('val2e', 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04
01:01:00.000', date '2014-06-04'),
+ ('val1f', 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04
01:01:00.000', date '2014-05-04'),
+ ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04
01:01:00.000', date '2014-06-04'),
+ ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04
01:01:00.000', date '2014-07-04'),
+ ('val1c', 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04
01:01:00.000', date '2014-08-05'),
+ ('val1e', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04
01:01:00.000', date '2014-09-04'),
+ ('val1f', 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04
01:01:00.000', date '2014-10-04'),
+ ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04
01:01:00.000', null)
+ as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i);
+
+create temporary view t3 as select * from values
+ ('val3a', 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04
01:02:00.000', date '2014-04-04'),
+ ('val3a', 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04
01:02:00.000', date '2014-05-04'),
+ ('val1b', 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04
01:02:00.000', date '2014-05-04'),
+ ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04
01:02:00.000', date '2014-05-04'),
+ ('val1b', 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04
01:02:00.000', date '2014-06-04'),
+ ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04
01:02:00.000', date '2014-07-04'),
+ ('val3c', 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04
01:02:00.000', date '2014-08-04'),
+ ('val3c', 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04
01:02:00.000', date '2014-09-05'),
+ ('val1b', null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04
01:02:00.000', null),
+ ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04
01:02:00.000', null),
+ ('val3b', 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04
01:02:00.000', date '2014-05-04'),
+ ('val3b', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04
01:02:00.000', date '2015-05-04')
+ as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i);
+
+-- Group 1: scalar subquery in predicate context
+-- no correlation
+-- TC 01.01
+SELECT t1a, t1b
+FROM t1
+WHERE t1c = (SELECT max(t2c)
+ FROM t2);
+
+-- TC 01.02
+SELECT t1a, t1d, t1f
+FROM t1
+WHERE t1c = (SELECT max(t2c)
+ FROM t2)
+AND t1b > (SELECT min(t3b)
+ FROM t3);
+
+-- TC 01.03
+SELECT t1a, t1h
+FROM t1
+WHERE t1c = (SELECT max(t2c)
+ FROM t2)
+OR t1b = (SELECT min(t3b)
+ FROM t3
+ WHERE t3b > 10);
+
+-- TC 01.04
+-- scalar subquery over outer join
+SELECT t1a, t1b, t2d
+FROM t1 LEFT JOIN t2
+ ON t1a = t2a
+WHERE t1b = (SELECT min(t3b)
+ FROM t3);
+
+-- TC 01.05
+-- test casting
+SELECT t1a, t1b, t1g
+FROM t1
+WHERE t1c + 5 = (SELECT max(t2e)
+ FROM t2);
+
+-- TC 01.06
+-- test casting
+SELECT t1a, t1h
+FROM t1
+WHERE date(t1h) = (SELECT min(t2i)
+ FROM t2);
+
+-- TC 01.07
+-- same table, expressions in scalar subquery
+SELECT t2d, t1a
+FROM t1, t2
+WHERE t1b = t2b
+AND t2c + 1 = (SELECT max(t2c) + 1
+ FROM t2, t1
+ WHERE t2b = t1b);
+
+-- TC 01.08
+-- same table
+SELECT DISTINCT t2a, max_t1g
+FROM t2, (SELECT max(t1g) max_t1g, t1a
+ FROM t1
+ GROUP BY t1a) t1
+WHERE t2a = t1a
+AND max_t1g = (SELECT max(t1g)
+ FROM t1);
+
+-- TC 01.09
+-- more than one scalar subquery
+SELECT t3b, t3c
+FROM t3
+WHERE (SELECT max(t3c)
+ FROM t3
+ WHERE t3b > 10) >=
+ (SELECT min(t3b)
+ FROM t3
+ WHERE t3c > 0)
+AND (t3b is null or t3c is null);
+
+-- Group 2: scalar subquery in predicate context
+-- with correlation
+-- TC 02.01
+SELECT t1a
+FROM t1
+WHERE t1a < (SELECT max(t2a)
+ FROM t2
+ WHERE t2c = t1c
+ GROUP BY t2c);
+
+-- TC 02.02
+SELECT t1a, t1c
+FROM t1
+WHERE (SELECT max(t2a)
+ FROM t2
+ WHERE t2c = t1c
+ GROUP BY t2c) IS NULL;
+
+-- TC 02.03
+SELECT t1a
+FROM t1
+WHERE t1a = (SELECT max(t2a)
+ FROM t2
+ WHERE t2c = t1c
+ GROUP BY t2c
+ HAVING count(*) >= 0)
+OR t1i > '2014-12-31';
+
+-- TC 02.04
+-- t1 on the right of an outer join
+-- can be reduced to inner join
+SELECT count(t1a)
+FROM t1 RIGHT JOIN t2
+ON t1d = t2d
+WHERE t1a < (SELECT max(t2a)
+ FROM t2
+ WHERE t2c = t1c
+ GROUP BY t2c);
+
+-- TC 02.05
+SELECT t1a
+FROM t1
+WHERE t1b <= (SELECT max(t2b)
+ FROM t2
+ WHERE t2c = t1c
+ GROUP BY t2c)
+AND t1b >= (SELECT min(t2b)
--- End diff --
Nit. I like this indentation.
For the other examples, `AND` seems to be aligned with `t1b` at line 190.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]