dtenedor commented on code in PR #41948:
URL: https://github.com/apache/spark/pull/41948#discussion_r1264031515
##########
python/pyspark/sql/tests/test_udtf.py:
##########
@@ -719,6 +726,153 @@ def terminate(self):
self.assertIn("Evaluate the input row", cls.eval.__doc__)
self.assertIn("Terminate the UDTF", cls.terminate.__doc__)
+ def test_simple_udtf_with_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze() -> StructType:
+ return StructType().add("c1", StringType()).add("c2", StringType())
+
+ def eval(self):
+ yield "hello", "world"
+
+ func = udtf(TestUDTF)
+ rows = func().collect()
+ self.assertEqual(rows, [Row(c1="hello", c2="world")])
+
+ def test_udtf_with_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a) -> StructType:
+ assert isinstance(a, dict)
+ assert isinstance(a["data_type"], DataType)
+ assert a["value"] is not None
+ assert a["is_table"] is False
+ return StructType().add("a", a["data_type"])
+
+ def eval(self, a):
+ yield a,
+
+ func = udtf(TestUDTF)
+
+ df1 = func(lit(1))
+ self.assertEquals(df1.schema, StructType().add("a", IntegerType()))
+ self.assertEqual(df1.collect(), [Row(a=1)])
+
+ df2 = func(lit("x"))
+ self.assertEquals(df2.schema, StructType().add("a", StringType()))
+ self.assertEqual(df2.collect(), [Row(a="x")])
+
+ def test_udtf_with_analyze_multiple_arguments(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a, b) -> StructType:
+ return StructType().add("a", a["data_type"]).add("b", b["data_type"])
+
+ def eval(self, a, b):
+ yield a, b
+
+ func = udtf(TestUDTF)
+
+ df = func(lit(1), lit("x"))
+ self.assertEquals(df.schema, StructType().add("a", IntegerType()).add("b", StringType()))
+ self.assertEqual(df.collect(), [Row(a=1, b="x")])
+
+ def test_udtf_with_analyze_table_argument(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a) -> StructType:
+ assert isinstance(a, dict)
+ assert isinstance(a["data_type"], StructType)
+ assert a["value"] is None
+ assert a["is_table"] is True
+ return StructType().add("a", a["data_type"][0].dataType)
+
+ def eval(self, a: Row):
+ if a["id"] > 5:
+ yield a["id"],
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ df = self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM range(0, 8)))")
+ self.assertEqual(df.schema, StructType().add("a", LongType()))
+ self.assertEqual(df.collect(), [Row(a=6), Row(a=7)])
+
+ def test_udtf_with_neither_return_type_nor_analyze(self):
Review Comment:
Good question — we might be best off just returning an error if we attempt to
register the UDTF with a static return type and the UDTF class has an
`analyze` method, to make the two mutually exclusive.
##########
python/pyspark/sql/tests/test_udtf.py:
##########
@@ -719,6 +726,153 @@ def terminate(self):
self.assertIn("Evaluate the input row", cls.eval.__doc__)
self.assertIn("Terminate the UDTF", cls.terminate.__doc__)
+ def test_simple_udtf_with_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze() -> StructType:
+ return StructType().add("c1", StringType()).add("c2", StringType())
+
+ def eval(self):
+ yield "hello", "world"
+
+ func = udtf(TestUDTF)
+ rows = func().collect()
+ self.assertEqual(rows, [Row(c1="hello", c2="world")])
+
+ def test_udtf_with_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a) -> StructType:
+ assert isinstance(a, dict)
+ assert isinstance(a["data_type"], DataType)
+ assert a["value"] is not None
+ assert a["is_table"] is False
+ return StructType().add("a", a["data_type"])
+
+ def eval(self, a):
+ yield a,
+
+ func = udtf(TestUDTF)
+
+ df1 = func(lit(1))
+ self.assertEquals(df1.schema, StructType().add("a", IntegerType()))
+ self.assertEqual(df1.collect(), [Row(a=1)])
+
+ df2 = func(lit("x"))
+ self.assertEquals(df2.schema, StructType().add("a", StringType()))
+ self.assertEqual(df2.collect(), [Row(a="x")])
+
+ def test_udtf_with_analyze_multiple_arguments(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a, b) -> StructType:
+ return StructType().add("a", a["data_type"]).add("b", b["data_type"])
+
+ def eval(self, a, b):
+ yield a, b
+
+ func = udtf(TestUDTF)
+
+ df = func(lit(1), lit("x"))
+ self.assertEquals(df.schema, StructType().add("a", IntegerType()).add("b", StringType()))
+ self.assertEqual(df.collect(), [Row(a=1, b="x")])
+
+ def test_udtf_with_analyze_table_argument(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a) -> StructType:
+ assert isinstance(a, dict)
+ assert isinstance(a["data_type"], StructType)
+ assert a["value"] is None
+ assert a["is_table"] is True
+ return StructType().add("a", a["data_type"][0].dataType)
+
+ def eval(self, a: Row):
+ if a["id"] > 5:
+ yield a["id"],
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ df = self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM range(0, 8)))")
+ self.assertEqual(df.schema, StructType().add("a", LongType()))
+ self.assertEqual(df.collect(), [Row(a=6), Row(a=7)])
+
+ def test_udtf_with_neither_return_type_nor_analyze(self):
Review Comment:
Good question — we might be best off just returning an error if we attempt to
register the UDTF with a static return type and the UDTF class has an
`analyze` method, to make the two mutually exclusive.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]