[GitHub] [spark] ueshin commented on a diff in pull request #41867: [SPARK-43964][SQL][PYTHON] Support arrow-optimized Python UDTFs

via GitHub Fri, 21 Jul 2023 16:27:13 -0700


ueshin commented on code in PR #41867:
URL: https://github.com/apache/spark/pull/41867#discussion_r1271166023



##########
python/pyspark/sql/tests/test_udtf.py:
##########
@@ -369,19 +381,115 @@ def eval(self, a: int):
         ):
             TestUDTF(rand(0) * 100).collect()
 
-    def test_udtf_no_eval(self):
-        @udtf(returnType="a: int, b: int")
+    def test_udtf_with_struct_input_type(self):
+        @udtf(returnType="x: string")
         class TestUDTF:
-            def run(self, a: int):
-                yield a, a + 1
+            def eval(self, person):
+                yield f"{person.name}: {person.age}",
+
+        self.spark.udtf.register("test_udtf", TestUDTF)
+        self.assertEqual(
+            self.spark.sql(
+                "select * from test_udtf(named_struct('name', 'Alice', 'age', 1))"
+            ).collect(),
+            [Row(x="Alice: 1")],
+        )
 
+    def test_udtf_with_array_input_type(self):
+        @udtf(returnType="x: string")
+        class TestUDTF:
+            def eval(self, args):
+                yield str(args),
+
+        self.spark.udtf.register("test_udtf", TestUDTF)
+        self.assertEqual(
+            self.spark.sql("select * from test_udtf(array(1, 2, 3))").collect(),
+            [Row(x="[1, 2, 3]")],
+        )
+
+    def test_udtf_with_map_input_type(self):
+        @udtf(returnType="x: string")
+        class TestUDTF:
+            def eval(self, m):
+                yield str(m),
+
+        self.spark.udtf.register("test_udtf", TestUDTF)
+        self.assertEqual(
+            self.spark.sql("select * from test_udtf(map('key', 'value'))").collect(),
+            [Row(x="{'key': 'value'}")],
+        )
+
+    def test_udtf_with_struct_output_types(self):
+        @udtf(returnType="x: struct<a:int,b:int>")
+        class TestUDTF:
+            def eval(self, x: int):
+                yield {"a": x, "b": x + 1},
+
+        self.assertEqual(TestUDTF(lit(1)).collect(), [Row(x=Row(a=1, b=2))])
+
+    def test_udtf_with_array_output_types(self):
+        @udtf(returnType="x: array<int>")
+        class TestUDTF:
+            def eval(self, x: int):
+                yield [x, x + 1, x + 2],
+
+        self.assertEqual(TestUDTF(lit(1)).collect(), [Row(x=[1, 2, 3])])
+
+    def test_udtf_with_map_output_types(self):
+        @udtf(returnType="x: map<int,string>")
+        class TestUDTF:
+            def eval(self, x: int):
+                yield {x: str(x)},
+
+        self.assertEqual(TestUDTF(lit(1)).collect(), [Row(x={1: "1"})])
+
+    def test_udtf_with_pandas_input_type(self):

Review Comment:
   @allisonwang-db This test needs to be skipped when pandas is not available.
   
   ```
   Traceback (most recent call last):
     File "/.../pyspark/sql/tests/test_udtf.py", line 491, in test_udtf_with_pandas_input_type
       import pandas as pd
   ModuleNotFoundError: No module named 'pandas'
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] ueshin commented on a diff in pull request #41867: [SPARK-43964][SQL][PYTHON] Support arrow-optimized Python UDTFs

Reply via email to