allisonwang-db commented on code in PR #41750:
URL: https://github.com/apache/spark/pull/41750#discussion_r1254683987
##########
python/pyspark/sql/tests/test_udtf.py:
##########
@@ -397,6 +397,198 @@ def test_udtf(a: int):
with self.assertRaisesRegex(TypeError, err_msg):
udtf(test_udtf, returnType="a: int")
+ def test_udtf_with_table_argument_query(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+ self.assertEqual(
+ self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM
range(0, 8)))").collect(),
+ [Row(a=6), Row(a=7)],
+ )
+
+ def test_udtf_with_int_and_table_argument_query(self):
+ class TestUDTF:
+ def eval(self, i: int, row: Row):
+ if row["id"] > i:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+ self.assertEqual(
+ self.spark.sql(
+ "SELECT * FROM test_udtf(5, TABLE (SELECT id FROM range(0,
8)))"
+ ).collect(),
+ [Row(a=6), Row(a=7)],
+ )
+
+ def test_udtf_with_table_argument_identifier(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+
+ with self.tempView("v"):
+ self.spark.sql("CREATE OR REPLACE TEMPORARY VIEW v as SELECT id
FROM range(0, 8)")
+ self.assertEqual(
+ self.spark.sql("SELECT * FROM test_udtf(TABLE v)").collect(),
+ [Row(a=6), Row(a=7)],
+ )
+
+ def test_udtf_with_int_and_table_argument_identifier(self):
+ class TestUDTF:
+ def eval(self, i: int, row: Row):
+ if row["id"] > i:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+
+ with self.tempView("v"):
+ self.spark.sql("CREATE OR REPLACE TEMPORARY VIEW v as SELECT id
FROM range(0, 8)")
+ self.assertEqual(
+ self.spark.sql("SELECT * FROM test_udtf(5, TABLE
v)").collect(),
+ [Row(a=6), Row(a=7)],
+ )
+
+ def test_udtf_with_table_argument_unknown_identifier(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+
+ with self.assertRaisesRegex(AnalysisException, "TABLE_OR_VIEW_NOT_FOUND"):
+ self.spark.sql("SELECT * FROM test_udtf(TABLE v)").collect()
+
+ def test_udtf_with_table_argument_malformed_query(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+
+ with self.assertRaisesRegex(AnalysisException, "TABLE_OR_VIEW_NOT_FOUND"):
+ self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT * FROM v))").collect()
+
+ def test_udtf_with_table_argument_cte_inside(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+ self.assertEqual(
+ self.spark.sql(
+ """
+ SELECT * FROM test_udtf(TABLE (
+ WITH t AS (
+ SELECT id FROM range(0, 8)
+ )
+ SELECT * FROM t
+ ))
+ """
+ ).collect(),
+ [Row(a=6), Row(a=7)],
+ )
+
+ def test_udtf_with_table_argument_cte_outside(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+ self.assertEqual(
+ self.spark.sql(
+ """
+ WITH t AS (
+ SELECT id FROM range(0, 8)
+ )
+ SELECT * FROM test_udtf(TABLE (SELECT id FROM t))
+ """
+ ).collect(),
+ [Row(a=6), Row(a=7)],
+ )
+
+ self.assertEqual(
+ self.spark.sql(
+ """
+ WITH t AS (
+ SELECT id FROM range(0, 8)
+ )
+ SELECT * FROM test_udtf(TABLE t)
+ """
+ ).collect(),
+ [Row(a=6), Row(a=7)],
+ )
+
+ # TODO(SPARK-44233): Fix the subquery resolution.
+ @unittest.skip("Fails to resolve the subquery.")
+ def test_udtf_with_table_argument_lateral_join(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+ self.assertEqual(
+ self.spark.sql(
+ """
+ SELECT * FROM
+ range(0, 8) AS t,
+ LATERAL test_udtf(TABLE t)
+ """
+ ).collect(),
+ [Row(a=6), Row(a=7)],
+ )
+
+ def test_udtf_with_table_argument_multiple(self):
+ class TestUDTF:
+ def eval(self, a: Row, b: Row):
+ yield a[0], b[0]
+
+ func = udtf(TestUDTF, returnType="a: int, b: int")
+ self.spark.udtf.register("test_udtf", func)
+
+ query = """
+ SELECT * FROM test_udtf(
+ TABLE (SELECT id FROM range(0, 2)),
+ TABLE (SELECT id FROM range(0, 3)))
+ """
+
+ with self.sql_conf({"spark.sql.tvf.allowMultipleTableArguments.enabled": False}):
Review Comment:
What's the reason why we have this config?
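If it's meant to guard multiple TABLE arguments, I'd guess the two branches would look roughly like this (just my sketch for discussion; the exception check is an assumption, not taken from this diff):

```python
# Rough guess, not from this PR: with the flag off, multiple TABLE
# arguments are expected to fail analysis; with it on, both should resolve.
with self.sql_conf({"spark.sql.tvf.allowMultipleTableArguments.enabled": False}):
    with self.assertRaises(AnalysisException):
        self.spark.sql(query).collect()

with self.sql_conf({"spark.sql.tvf.allowMultipleTableArguments.enabled": True}):
    self.spark.sql(query).collect()
```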
##########
python/pyspark/sql/tests/test_udtf.py:
##########
@@ -397,6 +397,198 @@ def test_udtf(a: int):
with self.assertRaisesRegex(TypeError, err_msg):
udtf(test_udtf, returnType="a: int")
+ def test_udtf_with_table_argument_query(self):
+ class TestUDTF:
+ def eval(self, row: Row):
Review Comment:
To use the TABLE argument, the signature of the UDTF must accept a Row, right?
Maybe we should mention this in the udtf doc.
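For instance, the doc could include a small sketch like this (the `filter_udtf` name is just for illustration):

```python
from pyspark.sql import Row
from pyspark.sql.functions import udtf

class FilterUDTF:
    def eval(self, row: Row):
        # Each row of the TABLE argument is passed to eval as a Row.
        if row["id"] > 5:
            yield row["id"],

spark.udtf.register("filter_udtf", udtf(FilterUDTF, returnType="a: int"))
spark.sql("SELECT * FROM filter_udtf(TABLE (SELECT id FROM range(10)))").show()
```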
##########
python/pyspark/sql/tests/test_udtf.py:
##########
@@ -397,6 +397,138 @@ def test_udtf(a: int):
with self.assertRaisesRegex(TypeError, err_msg):
udtf(test_udtf, returnType="a: int")
+ def test_udtf_with_table_argument_query(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+ self.assertEqual(
+ self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM
range(0, 8)))").collect(),
+ [Row(a=6), Row(a=7)],
+ )
+
+ def test_udtf_with_table_argument_identifier(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+
+ with self.tempView("v"):
+ self.spark.sql("CREATE OR REPLACE TEMPORARY VIEW v as SELECT id
FROM range(0, 8)")
+ self.assertEqual(
+ self.spark.sql("SELECT * FROM test_udtf(TABLE v)").collect(),
+ [Row(a=6), Row(a=7)],
+ )
+
+ def test_udtf_with_table_argument_unknown_identifier(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+
+ with self.assertRaisesRegex(AnalysisException, "TABLE_OR_VIEW_NOT_FOUND"):
+ self.spark.sql("SELECT * FROM test_udtf(TABLE v)").collect()
+
+ def test_udtf_with_table_argument_malformed_query(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+
+ with self.assertRaisesRegex(AnalysisException, "TABLE_OR_VIEW_NOT_FOUND"):
+ self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT * FROM v))").collect()
+
+ def test_udtf_with_table_argument_cte_inside(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+ self.assertEqual(
+ self.spark.sql(
+ """
+ SELECT * FROM test_udtf(TABLE (
+ WITH t AS (
+ SELECT id FROM range(0, 8)
+ )
+ SELECT * FROM t
+ ))
+ """
+ ).collect(),
+ [Row(a=6), Row(a=7)],
+ )
+
+ def test_udtf_with_table_argument_cte_outside(self):
+ class TestUDTF:
+ def eval(self, row: Row):
+ if row["id"] > 5:
+ yield row["id"],
+
+ func = udtf(TestUDTF, returnType="a: int")
+ self.spark.udtf.register("test_udtf", func)
+ self.assertEqual(
+ self.spark.sql(
+ """
+ WITH t AS (
+ SELECT id FROM range(0, 8)
+ )
+ SELECT * FROM test_udtf(TABLE (SELECT id FROM t))
+ """
+ ).collect(),
+ [Row(a=6), Row(a=7)],
+ )
+
+ self.assertEqual(
+ self.spark.sql(
+ """
+ WITH t AS (
+ SELECT id FROM range(0, 8)
+ )
+ SELECT * FROM test_udtf(TABLE t)
Review Comment:
Yea this is not supported. The outer reference must be a column or an alias,
not a table.
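For example, a correlated call would have to pass a column of the outer relation instead of the whole table. A rough sketch (assuming a hypothetical UDTF variant whose eval takes a scalar, and that lateral column references behave as they do for built-in generators):

```python
class FilterUDTF:
    # Hypothetical variant taking a scalar column value instead of a Row.
    def eval(self, id: int):
        if id > 5:
            yield id,

self.spark.udtf.register("filter_udtf", udtf(FilterUDTF, returnType="a: int"))
self.spark.sql(
    """
    SELECT a FROM
        range(0, 8) AS t,
        LATERAL filter_udtf(t.id)
    """
).collect()
```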