ueshin commented on code in PR #41948:
URL: https://github.com/apache/spark/pull/41948#discussion_r1267467229


##########
python/pyspark/sql/tests/test_udtf.py:
##########
@@ -748,6 +769,442 @@ def terminate(self):
         self.assertIn("Evaluate the input row", cls.eval.__doc__)
         self.assertIn("Terminate the UDTF", cls.terminate.__doc__)
 
+    def test_simple_udtf_with_analyze(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze() -> AnalyzeResult:
+                return AnalyzeResult(StructType().add("c1", StringType()).add("c2", StringType()))
+
+            def eval(self):
+                yield "hello", "world"
+
+        func = udtf(TestUDTF)
+        self.spark.udtf.register("test_udtf", func)
+
+        expected = [Row(c1="hello", c2="world")]
+        self.assertEqual(func().collect(), expected)
+        self.assertEqual(self.spark.sql("SELECT * FROM test_udtf()").collect(), expected)
+
+    def test_udtf_with_analyze(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze(a: AnalyzeArgument) -> AnalyzeResult:
+                assert isinstance(a, AnalyzeArgument)
+                assert isinstance(a.data_type, DataType)
+                assert a.value is not None
+                assert a.is_table is False
+                return AnalyzeResult(StructType().add("a", a.data_type))
+
+            def eval(self, a):
+                yield a,
+
+        func = udtf(TestUDTF)
+        self.spark.udtf.register("test_udtf", func)
+
+        for i, (df, expected_schema, expected_results) in enumerate(
+            [
+                (func(lit(1)), StructType().add("a", IntegerType()), [Row(a=1)]),
+                # another data type
+                (func(lit("x")), StructType().add("a", StringType()), [Row(a="x")]),
+                # array type
+                (
+                    func(array(lit(1), lit(2), lit(3))),
+                    StructType().add("a", ArrayType(IntegerType(), containsNull=False)),
+                    [Row(a=[1, 2, 3])],
+                ),
+                # map type
+                (
+                    func(create_map(lit("x"), lit(1), lit("y"), lit(2))),
+                    StructType().add(
+                        "a", MapType(StringType(), IntegerType(), 
valueContainsNull=False)
+                    ),
+                    [Row(a={"x": 1, "y": 2})],
+                ),
+                # struct type
+                (
+                    func(named_struct(lit("x"), lit(1), lit("y"), lit(2))),
+                    StructType().add(
+                        "a",
+                        StructType()
+                        .add("x", IntegerType(), nullable=False)
+                        .add("y", IntegerType(), nullable=False),
+                    ),
+                    [Row(a=Row(x=1, y=2))],
+                ),
+                # use SQL
+                (
+                    self.spark.sql("SELECT * from test_udtf(1)"),
+                    StructType().add("a", IntegerType()),
+                    [Row(a=1)],
+                ),
+            ]
+        ):
+            with self.subTest(query_no=i):
+                self.assertEqual(df.schema, expected_schema)
+                self.assertEqual(df.collect(), expected_results)
+
+    def test_udtf_with_analyze_decorator(self):
+        @udtf
+        class TestUDTF:
+            @staticmethod
+            def analyze() -> AnalyzeResult:
+                return AnalyzeResult(StructType().add("c1", StringType()).add("c2", StringType()))
+
+            def eval(self):
+                yield "hello", "world"
+
+        self.spark.udtf.register("test_udtf", TestUDTF)
+
+        expected = [Row(c1="hello", c2="world")]
+        self.assertEqual(TestUDTF().collect(), expected)
+        self.assertEqual(self.spark.sql("SELECT * FROM test_udtf()").collect(), expected)
+
+    def test_udtf_with_analyze_decorator_parens(self):
+        @udtf()
+        class TestUDTF:
+            @staticmethod
+            def analyze() -> AnalyzeResult:
+                return AnalyzeResult(StructType().add("c1", StringType()).add("c2", StringType()))
+
+            def eval(self):
+                yield "hello", "world"
+
+        self.spark.udtf.register("test_udtf", TestUDTF)
+
+        expected = [Row(c1="hello", c2="world")]
+        self.assertEqual(TestUDTF().collect(), expected)
+        self.assertEqual(self.spark.sql("SELECT * FROM test_udtf()").collect(), expected)
+
+    def test_udtf_with_analyze_multiple_arguments(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze(a: AnalyzeArgument, b: AnalyzeArgument) -> AnalyzeResult:
+                return AnalyzeResult(StructType().add("a", a.data_type).add("b", b.data_type))
+
+            def eval(self, a, b):
+                yield a, b
+
+        func = udtf(TestUDTF)
+        self.spark.udtf.register("test_udtf", func)
+
+        for i, (df, expected_schema, expected_results) in enumerate(
+            [
+                (
+                    func(lit(1), lit("x")),
+                    StructType().add("a", IntegerType()).add("b", StringType()),
+                    [Row(a=1, b="x")],
+                ),
+                (
+                    self.spark.sql("SELECT * FROM test_udtf(1, 'x')"),
+                    StructType().add("a", IntegerType()).add("b", StringType()),
+                    [Row(a=1, b="x")],
+                ),
+            ]
+        ):
+            with self.subTest(query_no=i):
+                self.assertEqual(df.schema, expected_schema)
+                self.assertEqual(df.collect(), expected_results)
+
+    def test_udtf_with_analyze_arbitary_number_arguments(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze(*args: AnalyzeArgument) -> AnalyzeResult:
+                return AnalyzeResult(
+                    StructType([StructField(f"col{i}", a.data_type) for i, a in enumerate(args)])
+                )
+
+            def eval(self, *args):
+                yield args
+
+        func = udtf(TestUDTF)
+        self.spark.udtf.register("test_udtf", func)
+
+        for i, (df, expected_schema, expected_results) in enumerate(
+            [
+                (
+                    func(lit(1)),
+                    StructType().add("col0", IntegerType()),
+                    [Row(a=1)],
+                ),
+                (
+                    self.spark.sql("SELECT * FROM test_udtf(1, 'x')"),
+                    StructType().add("col0", IntegerType()).add("col1", StringType()),
+                    [Row(a=1, b="x")],
+                ),
+                # TODO(SPARK-44479): Support Python UDTFs with empty schema
+                # (func(), StructType(), [Row()]),
+            ]
+        ):
+            with self.subTest(query_no=i):
+                self.assertEqual(df.schema, expected_schema)
+                self.assertEqual(df.collect(), expected_results)
+
+    def test_udtf_with_analyze_table_argument(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze(a: AnalyzeArgument) -> AnalyzeResult:
+                assert isinstance(a, AnalyzeArgument)
+                assert isinstance(a.data_type, StructType)
+                assert a.value is None
+                assert a.is_table is True
+                return AnalyzeResult(StructType().add("a", a.data_type[0].dataType))
+
+            def eval(self, a: Row):
+                if a["id"] > 5:
+                    yield a["id"],
+
+        func = udtf(TestUDTF)
+        self.spark.udtf.register("test_udtf", func)
+
+        df = self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM range(0, 8)))")
+        self.assertEqual(df.schema, StructType().add("a", LongType()))
+        self.assertEqual(df.collect(), [Row(a=6), Row(a=7)])
+
+    def test_udtf_with_analyze_table_argument_adding_columns(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze(a: AnalyzeArgument) -> AnalyzeResult:
+                assert isinstance(a.data_type, StructType)
+                assert a.is_table is True
+                return AnalyzeResult(a.data_type.add("is_even", BooleanType()))
+
+            def eval(self, a: Row):
+                yield a["id"], a["id"] % 2 == 0
+
+        func = udtf(TestUDTF)
+        self.spark.udtf.register("test_udtf", func)
+
+        df = self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM range(0, 4)))")
+        self.assertEqual(
+            df.schema,
+            StructType().add("id", LongType(), nullable=False).add("is_even", BooleanType()),
+        )
+        self.assertEqual(
+            df.collect(),
+            [
+                Row(a=0, is_even=True),
+                Row(a=1, is_even=False),
+                Row(a=2, is_even=True),
+                Row(a=3, is_even=False),
+            ],
+        )
+
+    def test_udtf_with_analyze_table_argument_repeating_rows(self):
+        class TestUDTF:
+            @staticmethod
+            def analyze(n, row) -> AnalyzeResult:
+                if n.value is None or not isinstance(n.value, int) or (n.value < 1 or n.value > 10):
+                    raise Exception("The first argument must be a scalar integer between 1 and 10")
+
+                if row.is_table is False:
+                    raise Exception("The second argument must be a table 
argument")
+
+                assert isinstance(row.data_type, StructType)
+                return AnalyzeResult(row.data_type)
+
+            def eval(self, n: int, row: Row):
+                for _ in range(n):
+                    yield row
+
+        func = udtf(TestUDTF)
+        self.spark.udtf.register("test_udtf", func)
+
+        expected_schema = StructType().add("id", LongType(), nullable=False)
+        expected_results = [
+            Row(a=0),
+            Row(a=0),
+            Row(a=1),
+            Row(a=1),
+            Row(a=2),
+            Row(a=2),
+            Row(a=3),
+            Row(a=3),
+        ]
+        for i, df in enumerate(
+            [
+                self.spark.sql("SELECT * FROM test_udtf(2, TABLE (SELECT id FROM range(0, 4)))"),
+                self.spark.sql(
+                    "SELECT * FROM test_udtf(1 + 1, TABLE (SELECT id FROM range(0, 4)))"

Review Comment:
   It's mentioned in the doc:
   
   
https://github.com/apache/spark/blob/0e268d149ae3b952a4cf2e97d5aeb389766106f0/python/pyspark/sql/functions.py#L15576



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to