viirya commented on a change in pull request #25371: [SPARK-28393][SQL][PYTHON][TESTS] Convert and port 'pgSQL/join.sql' into UDF test base URL: https://github.com/apache/spark/pull/25371#discussion_r314967707
########## File path: sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-join.sql ########## @@ -0,0 +1,2081 @@ +-- +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- +-- JOIN +-- Test JOIN clauses +-- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/join.sql +-- +-- This test file was converted from pgSQL/join.sql. + +CREATE OR REPLACE TEMPORARY VIEW INT4_TBL AS SELECT * FROM + (VALUES (0), (123456), (-123456), (2147483647), (-2147483647)) + AS v(f1); +CREATE OR REPLACE TEMPORARY VIEW INT8_TBL AS SELECT * FROM + (VALUES + (123, 456), + (123, 4567890123456789), + (4567890123456789, 123), + (4567890123456789, 4567890123456789), + (4567890123456789, -4567890123456789)) + AS v(q1, q2); +CREATE OR REPLACE TEMPORARY VIEW FLOAT8_TBL AS SELECT * FROM + (VALUES (0.0), (1004.30), (-34.84), + (cast('1.2345678901234e+200' as double)), (cast('1.2345678901234e-200' as double))) + AS v(f1); +CREATE OR REPLACE TEMPORARY VIEW TEXT_TBL AS SELECT * FROM + (VALUES ('doh!'), ('hi de ho neighbor')) + AS v(f1); +CREATE OR REPLACE TEMPORARY VIEW tenk2 AS SELECT * FROM tenk1; + +CREATE TABLE J1_TBL ( + i integer, + j integer, + t string +) USING parquet; + +CREATE TABLE J2_TBL ( + i integer, + k integer +) USING parquet; + + +INSERT INTO J1_TBL VALUES (1, 4, 'one'); +INSERT INTO J1_TBL VALUES (2, 3, 'two'); +INSERT INTO J1_TBL VALUES (3, 2, 'three'); +INSERT INTO J1_TBL VALUES (4, 1, 'four'); +INSERT INTO J1_TBL VALUES (5, 0, 'five'); +INSERT INTO J1_TBL VALUES (6, 6, 'six'); +INSERT INTO J1_TBL VALUES (7, 7, 'seven'); +INSERT INTO J1_TBL VALUES (8, 8, 'eight'); +INSERT INTO J1_TBL VALUES (0, NULL, 'zero'); +INSERT INTO J1_TBL VALUES (NULL, NULL, 'null'); +INSERT INTO J1_TBL VALUES (NULL, 0, 'zero'); + +INSERT INTO J2_TBL VALUES (1, -1); +INSERT INTO J2_TBL VALUES (2, 2); +INSERT INTO J2_TBL VALUES (3, -3); +INSERT INTO J2_TBL VALUES (2, 4); +INSERT INTO J2_TBL VALUES (5, -5); +INSERT INTO J2_TBL VALUES (5, -5); +INSERT INTO J2_TBL VALUES (0, NULL); +INSERT INTO J2_TBL VALUES (NULL, NULL); +INSERT INTO J2_TBL VALUES (NULL, 0); + +-- [SPARK-20856] Do not need onerow because it only used for test statement using nested joins +-- useful in some tests below +-- create temp table onerow(); +-- insert into onerow default values; +-- analyze onerow; + + +-- +-- CORRELATION NAMES +-- Make sure that table/column aliases are supported +-- before diving into more complex join syntax. +-- + +SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t) + FROM J1_TBL AS tx; + +SELECT udf(udf('')) AS `xxx`, udf(udf(i)), udf(j), udf(t) + FROM J1_TBL tx; + +SELECT udf('') AS `xxx`, a, udf(udf(b)), c + FROM J1_TBL AS t1 (a, b, c); + +SELECT udf('') AS `xxx`, udf(a), udf(b), udf(udf(c)) + FROM J1_TBL t1 (a, b, c); + +SELECT udf('') AS `xxx`, udf(a), b, udf(c), udf(d), e + FROM J1_TBL t1 (a, b, c), J2_TBL t2 (d, e); + +-- [SPARK-28377] Fully support correlation names in the FROM clause +-- SELECT '' AS "xxx", t1.a, t2.e +-- FROM J1_TBL t1 (a, b, c), J2_TBL t2 (d, e) +-- WHERE t1.a = t2.d; + + +-- +-- CROSS JOIN +-- Qualifications are not allowed on cross joins, +-- which degenerate into a standard unqualified inner join. +-- + +SELECT udf('') AS `xxx`, * + FROM J1_TBL CROSS JOIN J2_TBL; + +-- ambiguous column +SELECT udf('') AS `xxx`, udf(i), udf(k), udf(t) + FROM J1_TBL CROSS JOIN J2_TBL; + +-- resolve previous ambiguity by specifying the table name +SELECT udf('') AS `xxx`, udf(t1.i), udf(k), udf(t) + FROM J1_TBL t1 CROSS JOIN J2_TBL t2; + +SELECT udf(udf('')) AS `xxx`, udf(udf(ii)), udf(udf(tt)), udf(udf(kk)) + FROM (J1_TBL CROSS JOIN J2_TBL) + AS tx (ii, jj, tt, ii2, kk); + +-- [SPARK-28377] Fully support correlation names in the FROM clause +-- SELECT '' AS `xxx`, tx.ii, tx.jj, tx.kk +-- FROM (J1_TBL t1 (a, b, c) CROSS JOIN J2_TBL t2 (d, e)) +-- AS tx (ii, jj, tt, ii2, kk); + +SELECT udf('') AS `xxx`, udf(udf(j1_tbl.i)), udf(j), udf(t), udf(a.i), udf(a.k), udf(b.i), udf(b.k) + FROM J1_TBL CROSS JOIN J2_TBL a CROSS JOIN J2_TBL b; + + +-- +-- +-- Inner joins (equi-joins) +-- +-- + +-- +-- Inner joins (equi-joins) with USING clause +-- The USING syntax changes the shape of the resulting table +-- by including a column in the USING clause only once in the result. +-- + +-- Inner equi-join on specified column +SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k) + FROM J1_TBL INNER JOIN J2_TBL USING (i); + +-- Same as above, slightly different syntax +SELECT udf(udf('')) AS `xxx`, udf(i), udf(j), udf(t), udf(k) + FROM J1_TBL JOIN J2_TBL USING (i); + +SELECT udf('') AS `xxx`, * + FROM J1_TBL t1 (a, b, c) JOIN J2_TBL t2 (a, d) USING (a) + ORDER BY udf(udf(a)), udf(d); + +-- [SPARK-28377] Fully support correlation names in the FROM clause +-- SELECT '' AS `xxx`, * +-- FROM J1_TBL t1 (a, b, c) JOIN J2_TBL t2 (a, b) USING (b) +-- ORDER BY b, t1.a; + + +-- +-- NATURAL JOIN +-- Inner equi-join on all columns with the same name +-- + +SELECT udf(udf('')) AS `xxx`, udf(i), udf(j), udf(t), udf(k) + FROM J1_TBL NATURAL JOIN J2_TBL; + +SELECT udf('') AS `xxx`, udf(udf(udf(a))), udf(b), udf(c), udf(d) + FROM J1_TBL t1 (a, b, c) NATURAL JOIN J2_TBL t2 (a, d); + +SELECT udf('') AS `xxx`, udf(udf(a)), udf(udf(b)), udf(udf(c)), udf(udf(udf(d))) + FROM J1_TBL t1 (a, b, c) NATURAL JOIN J2_TBL t2 (d, a); + +-- [SPARK-28377] Fully support correlation names in the FROM clause +-- mismatch number of columns +-- currently, Postgres will fill in with underlying names +-- SELECT '' AS `xxx`, * +-- FROM J1_TBL t1 (a, b) NATURAL JOIN J2_TBL t2 (a); + + +-- +-- Inner joins (equi-joins) +-- + +SELECT udf('') AS `xxx`, udf(J1_TBL.i), udf(udf(J1_TBL.j)), udf(J1_TBL.t), udf(J2_TBL.i), udf(J2_TBL.k) + FROM J1_TBL JOIN J2_TBL ON (udf(J1_TBL.i) = J2_TBL.i); + +SELECT udf('') AS `xxx`, udf(udf(J1_TBL.i)), udf(udf(J1_TBL.j)), udf(udf(J1_TBL.t)), J2_TBL.i, J2_TBL.k + FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i = udf(J2_TBL.k)); + + +-- +-- Non-equi-joins +-- + +SELECT udf('') AS `xxx`, udf(J1_TBL.i), udf(J1_TBL.j), udf(J1_TBL.t), udf(J2_TBL.i), udf(J2_TBL.k) + FROM J1_TBL JOIN J2_TBL ON (udf(J1_TBL.i) <= udf(udf(J2_TBL.k))); + + +-- +-- Outer joins +-- Note that OUTER is a noise word +-- + +SELECT udf(udf('')) AS `xxx`, udf(i), udf(j), udf(t), udf(k) + FROM J1_TBL LEFT OUTER JOIN J2_TBL USING (i) + ORDER BY udf(udf(i)), udf(k), udf(t); + +SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k) + FROM J1_TBL LEFT JOIN J2_TBL USING (i) + ORDER BY udf(i), udf(udf(k)), udf(t); + +SELECT udf('') AS `xxx`, udf(udf(i)), udf(j), udf(t), udf(k) + FROM J1_TBL RIGHT OUTER JOIN J2_TBL USING (i); + +SELECT udf('') AS `xxx`, udf(i), udf(udf(j)), udf(t), udf(k) + FROM J1_TBL RIGHT JOIN J2_TBL USING (i); + +SELECT udf('') AS `xxx`, udf(i), udf(j), udf(udf(t)), udf(k) + FROM J1_TBL FULL OUTER JOIN J2_TBL USING (i) + ORDER BY udf(udf(i)), udf(k), udf(t); + +SELECT udf('') AS `xxx`, udf(i), udf(j), t, udf(udf(k)) + FROM J1_TBL FULL JOIN J2_TBL USING (i) + ORDER BY udf(udf(i)), udf(k), udf(udf(t)); + +SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(udf(k)) + FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (udf(k) = 1); + +SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k) + FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (udf(udf(i)) = udf(1)); + +-- +-- semijoin selectivity for <> +-- +-- explain (costs off) +-- select * from int4_tbl i4, tenk1 a +-- where exists(select * from tenk1 b +-- where a.twothousand = b.twothousand and a.fivethous <> b.fivethous) +-- and i4.f1 = a.tenthous; + + +-- +-- More complicated constructs +-- + +-- +-- Multiway full join +-- + +CREATE TABLE t1 (name STRING, n INTEGER) USING parquet; +CREATE TABLE t2 (name STRING, n INTEGER) USING parquet; +CREATE TABLE t3 (name STRING, n INTEGER) USING parquet; + +INSERT INTO t1 VALUES ( 'bb', 11 ); +INSERT INTO t2 VALUES ( 'bb', 12 ); +INSERT INTO t2 VALUES ( 'cc', 22 ); +INSERT INTO t2 VALUES ( 'ee', 42 ); +INSERT INTO t3 VALUES ( 'bb', 13 ); +INSERT INTO t3 VALUES ( 'cc', 23 ); +INSERT INTO t3 VALUES ( 'dd', 33 ); + +SELECT * FROM t1 FULL JOIN t2 USING (name) FULL JOIN t3 USING (name); + +-- +-- Test interactions of join syntax and subqueries +-- + +-- Basic cases (we expect planner to pull up the subquery here) +SELECT * FROM +(SELECT udf(name) as name, t2.n FROM t2) as s2 +INNER JOIN +(SELECT udf(udf(name)) as name, t3.n FROM t3) s3 +USING (name); + +SELECT * FROM +(SELECT udf(udf(name)) as name, t2.n FROM t2) as s2 +LEFT JOIN +(SELECT udf(name) as name, t3.n FROM t3) s3 +USING (name); + +SELECT udf(name), udf(udf(s2.n)), udf(s3.n) FROM +(SELECT * FROM t2) as s2 +FULL JOIN +(SELECT * FROM t3) s3 +USING (name); + +-- Cases with non-nullable expressions in subquery results; +-- make sure these go to null as expected +SELECT * FROM +(SELECT udf(udf(name)) as name, udf(n) as s2_n, udf(2) as s2_2 FROM t2) as s2 +NATURAL INNER JOIN +(SELECT udf(name) as name, udf(udf(n)) as s3_n, udf(3) as s3_2 FROM t3) s3; + +SELECT * FROM +(SELECT udf(name) as name, udf(udf(n)) as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL LEFT JOIN +(SELECT udf(udf(name)) as name, udf(n) as s3_n, 3 as s3_2 FROM t3) s3; + +SELECT * FROM +(SELECT udf(name) as name, udf(n) as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL FULL JOIN +(SELECT udf(udf(name)) as name, udf(udf(n)) as s3_n, 3 as s3_2 FROM t3) s3; + +SELECT * FROM +(SELECT udf(udf(name)) as name, udf(n) as s1_n, 1 as s1_1 FROM t1) as s1 +NATURAL INNER JOIN +(SELECT udf(name) as name, udf(n) as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL INNER JOIN +(SELECT udf(udf(udf(name))) as name, udf(n) as s3_n, 3 as s3_2 FROM t3) s3; + +SELECT * FROM +(SELECT udf(name) as name, udf(n) as s1_n, udf(udf(1)) as s1_1 FROM t1) as s1 +NATURAL FULL JOIN +(SELECT udf(name) as name, udf(udf(n)) as s2_n, udf(2) as s2_2 FROM t2) as s2 +NATURAL FULL JOIN +(SELECT udf(udf(name)) as name, udf(n) as s3_n, udf(3) as s3_2 FROM t3) s3; + +SELECT name, udf(udf(s1_n)), udf(s2_n), udf(s3_n) FROM +(SELECT name, udf(udf(n)) as s1_n FROM t1) as s1 +NATURAL FULL JOIN + (SELECT * FROM + (SELECT name, udf(n) as s2_n FROM t2) as s2 + NATURAL FULL JOIN + (SELECT name, udf(udf(n)) as s3_n FROM t3) as s3 + ) ss2; + +SELECT * FROM +(SELECT name, n as s1_n FROM t1) as s1 +NATURAL FULL JOIN + (SELECT * FROM + (SELECT name, udf(udf(n)) as s2_n, 2 as s2_2 FROM t2) as s2 + NATURAL FULL JOIN + (SELECT name, udf(n) as s3_n FROM t3) as s3 + ) ss2; + +-- Constants as join keys can also be problematic +SELECT s1.name, udf(s1_n), s2.name, udf(udf(s2_n)) FROM + (SELECT name, udf(n) as s1_n FROM t1) as s1 +FULL JOIN + (SELECT name, udf(2) as s2_n FROM t2) as s2 Review comment: The purpose of this test, looks like, is to use constants as join key, so I think we should not rewrite `2` to `udf(2)`? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
