[GitHub] [spark] ueshin commented on a diff in pull request #41948: [SPARK-44380][SQL][PYTHON] Support for Python UDTF to analyze in Python

via GitHub Fri, 14 Jul 2023 14:43:54 -0700


ueshin commented on code in PR #41948:
URL: https://github.com/apache/spark/pull/41948#discussion_r1264196125



##########
python/pyspark/sql/tests/test_udtf.py:
##########
@@ -719,6 +726,153 @@ def terminate(self):
         self.assertIn("Evaluate the input row", cls.eval.__doc__)
         self.assertIn("Terminate the UDTF", cls.terminate.__doc__)
 
+    def test_simple_udtf_with_analyze(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze() -> StructType:
+                return StructType().add("c1", StringType()).add("c2", 
StringType())
+
+            def eval(self):
+                yield "hello", "world"
+
+        func = udtf(TestUDTF)
+        rows = func().collect()
+        self.assertEqual(rows, [Row(c1="hello", c2="world")])
+
+    def test_udtf_with_analyze(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze(a) -> StructType:
+                assert isinstance(a, dict)
+                assert isinstance(a["data_type"], DataType)
+                assert a["value"] is not None
+                assert a["is_table"] is False
+                return StructType().add("a", a["data_type"])
+
+            def eval(self, a):
+                yield a,
+
+        func = udtf(TestUDTF)
+
+        df1 = func(lit(1))
+        self.assertEquals(df1.schema, StructType().add("a", IntegerType()))
+        self.assertEqual(df1.collect(), [Row(a=1)])
+
+        df2 = func(lit("x"))
+        self.assertEquals(df2.schema, StructType().add("a", StringType()))
+        self.assertEqual(df2.collect(), [Row(a="x")])
+
+    def test_udtf_with_analyze_multiple_arguments(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze(a, b) -> StructType:
+                return StructType().add("a", a["data_type"]).add("b", 
b["data_type"])
+
+            def eval(self, a, b):
+                yield a, b
+
+        func = udtf(TestUDTF)
+
+        df = func(lit(1), lit("x"))
+        self.assertEquals(df.schema, StructType().add("a", 
IntegerType()).add("b", StringType()))
+        self.assertEqual(df.collect(), [Row(a=1, b="x")])
+
+    def test_udtf_with_analyze_table_argument(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze(a) -> StructType:
+                assert isinstance(a, dict)
+                assert isinstance(a["data_type"], StructType)
+                assert a["value"] is None
+                assert a["is_table"] is True
+                return StructType().add("a", a["data_type"][0].dataType)
+
+            def eval(self, a: Row):
+                if a["id"] > 5:
+                    yield a["id"],
+
+        func = udtf(TestUDTF)
+        self.spark.udtf.register("test_udtf", func)
+
+        df = self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM 
range(0, 8)))")
+        self.assertEqual(df.schema, StructType().add("a", LongType()))
+        self.assertEqual(df.collect(), [Row(a=6), Row(a=7)])
+
+    def test_udtf_with_neither_return_type_nor_analyze(self):
+        class TestUDTF:
+            def eval(self):
+                yield "hello", "world"
+
+        with self.assertRaises(PySparkAttributeError) as e:
+            udtf(TestUDTF)
+
+        self.check_error(
+            exception=e.exception,
+            error_class="INVALID_UDTF_RETURN_TYPE",
+            message_parameters={"name": "TestUDTF"},
+        )
+
+    def test_udtf_with_non_static_analyze(self):
+        class TestUDTF:
+            def analyze(self) -> StructType:
+                return StructType().add("c1", StringType()).add("c2", 
StringType())
+
+            def eval(self):
+                yield "hello", "world"
+
+        with self.assertRaises(PySparkAttributeError) as e:
+            udtf(TestUDTF)
+
+        self.check_error(
+            exception=e.exception,
+            error_class="INVALID_UDTF_RETURN_TYPE",
+            message_parameters={"name": "TestUDTF"},
+        )
+
+    def test_udtf_with_analyze_returning_non_struct(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze():
+                return StringType()
+
+            def eval(self):
+                yield "hello", "world"
+
+        func = udtf(TestUDTF)
+
+        with self.assertRaisesRegex(
+            AnalysisException,
+            "Output of `analyze` static method of Python UDTFs expects a 
StructType "
+            "but got: <class 'pyspark.sql.types.StringType'>",
+        ):
+            func().collect()
+
+    def test_udtf_with_analyze_taking_wrong_number_of_arguments(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze(a, b) -> StructType:
+                return StructType().add("a", a["data_type"]).add("b", 
b["data_type"])
+
+            def eval(self, a):
+                yield a, a + 1
+
+        func = udtf(TestUDTF)
+
+        with self.assertRaisesRegex(
+            AnalysisException, r"analyze\(\) missing 1 required positional 
argument: 'b'"

Review Comment:
   For this one, let me leave it as-is for now, because the error message comes 
   from the Python runtime, which raises a general error type with a generic 
   message. If we capture and modify it, we could also end up modifying an actual 
   error raised from inside the `analyze` method.
   I'll revisit this later.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] ueshin commented on a diff in pull request #41948: [SPARK-44380][SQL][PYTHON] Support for Python UDTF to analyze in Python

Reply via email to