ueshin commented on code in PR #41948:
URL: https://github.com/apache/spark/pull/41948#discussion_r1267466437
##########
python/pyspark/sql/tests/test_udtf.py:
##########
@@ -748,6 +769,442 @@ def terminate(self):
self.assertIn("Evaluate the input row", cls.eval.__doc__)
self.assertIn("Terminate the UDTF", cls.terminate.__doc__)
+ def test_simple_udtf_with_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze() -> AnalyzeResult:
+ return AnalyzeResult(StructType().add("c1",
StringType()).add("c2", StringType()))
+
+ def eval(self):
+ yield "hello", "world"
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ expected = [Row(c1="hello", c2="world")]
+ self.assertEqual(func().collect(), expected)
+ self.assertEqual(self.spark.sql("SELECT * FROM
test_udtf()").collect(), expected)
+
+ def test_udtf_with_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a: AnalyzeArgument) -> AnalyzeResult:
+ assert isinstance(a, AnalyzeArgument)
+ assert isinstance(a.data_type, DataType)
+ assert a.value is not None
+ assert a.is_table is False
+ return AnalyzeResult(StructType().add("a", a.data_type))
+
+ def eval(self, a):
+ yield a,
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ for i, (df, expected_schema, expected_results) in enumerate(
+ [
+ (func(lit(1)), StructType().add("a", IntegerType()),
[Row(a=1)]),
+ # another data type
+ (func(lit("x")), StructType().add("a", StringType()),
[Row(a="x")]),
+ # array type
+ (
+ func(array(lit(1), lit(2), lit(3))),
+ StructType().add("a", ArrayType(IntegerType(),
containsNull=False)),
+ [Row(a=[1, 2, 3])],
+ ),
+ # map type
+ (
+ func(create_map(lit("x"), lit(1), lit("y"), lit(2))),
+ StructType().add(
+ "a", MapType(StringType(), IntegerType(),
valueContainsNull=False)
+ ),
+ [Row(a={"x": 1, "y": 2})],
+ ),
+ # struct type
+ (
+ func(named_struct(lit("x"), lit(1), lit("y"), lit(2))),
+ StructType().add(
+ "a",
+ StructType()
+ .add("x", IntegerType(), nullable=False)
+ .add("y", IntegerType(), nullable=False),
+ ),
+ [Row(a=Row(x=1, y=2))],
+ ),
+ # use SQL
+ (
+ self.spark.sql("SELECT * from test_udtf(1)"),
+ StructType().add("a", IntegerType()),
+ [Row(a=1)],
+ ),
+ ]
+ ):
+ with self.subTest(query_no=i):
+ self.assertEqual(df.schema, expected_schema)
+ self.assertEqual(df.collect(), expected_results)
+
+ def test_udtf_with_analyze_decorator(self):
+ @udtf
+ class TestUDTF:
+ @staticmethod
+ def analyze() -> AnalyzeResult:
+ return AnalyzeResult(StructType().add("c1",
StringType()).add("c2", StringType()))
+
+ def eval(self):
+ yield "hello", "world"
+
+ self.spark.udtf.register("test_udtf", TestUDTF)
+
+ expected = [Row(c1="hello", c2="world")]
+ self.assertEqual(TestUDTF().collect(), expected)
+ self.assertEqual(self.spark.sql("SELECT * FROM
test_udtf()").collect(), expected)
+
+ def test_udtf_with_analyze_decorator_parens(self):
+ @udtf()
+ class TestUDTF:
+ @staticmethod
+ def analyze() -> AnalyzeResult:
+ return AnalyzeResult(StructType().add("c1",
StringType()).add("c2", StringType()))
+
+ def eval(self):
+ yield "hello", "world"
+
+ self.spark.udtf.register("test_udtf", TestUDTF)
+
+ expected = [Row(c1="hello", c2="world")]
+ self.assertEqual(TestUDTF().collect(), expected)
+ self.assertEqual(self.spark.sql("SELECT * FROM
test_udtf()").collect(), expected)
+
+ def test_udtf_with_analyze_multiple_arguments(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a: AnalyzeArgument, b: AnalyzeArgument) ->
AnalyzeResult:
+ return AnalyzeResult(StructType().add("a",
a.data_type).add("b", b.data_type))
+
+ def eval(self, a, b):
+ yield a, b
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ for i, (df, expected_schema, expected_results) in enumerate(
+ [
+ (
+ func(lit(1), lit("x")),
+ StructType().add("a", IntegerType()).add("b",
StringType()),
+ [Row(a=1, b="x")],
+ ),
+ (
+ self.spark.sql("SELECT * FROM test_udtf(1, 'x')"),
+ StructType().add("a", IntegerType()).add("b",
StringType()),
+ [Row(a=1, b="x")],
+ ),
+ ]
+ ):
+ with self.subTest(query_no=i):
+ self.assertEqual(df.schema, expected_schema)
+ self.assertEqual(df.collect(), expected_results)
+
+ def test_udtf_with_analyze_arbitary_number_arguments(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(*args: AnalyzeArgument) -> AnalyzeResult:
+ return AnalyzeResult(
+ StructType([StructField(f"col{i}", a.data_type) for i, a
in enumerate(args)])
+ )
+
+ def eval(self, *args):
+ yield args
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ for i, (df, expected_schema, expected_results) in enumerate(
+ [
+ (
+ func(lit(1)),
+ StructType().add("col0", IntegerType()),
+ [Row(a=1)],
+ ),
+ (
+ self.spark.sql("SELECT * FROM test_udtf(1, 'x')"),
+ StructType().add("col0", IntegerType()).add("col1",
StringType()),
+ [Row(a=1, b="x")],
+ ),
+ # TODO(SPARK-44479): Support Python UDTFs with empty schema
+ # (func(), StructType(), [Row()]),
+ ]
+ ):
+ with self.subTest(query_no=i):
+ self.assertEqual(df.schema, expected_schema)
+ self.assertEqual(df.collect(), expected_results)
+
+ def test_udtf_with_analyze_table_argument(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a: AnalyzeArgument) -> AnalyzeResult:
+ assert isinstance(a, AnalyzeArgument)
+ assert isinstance(a.data_type, StructType)
+ assert a.value is None
+ assert a.is_table is True
+ return AnalyzeResult(StructType().add("a",
a.data_type[0].dataType))
+
+ def eval(self, a: Row):
+ if a["id"] > 5:
+ yield a["id"],
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ df = self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM
range(0, 8)))")
+ self.assertEqual(df.schema, StructType().add("a", LongType()))
+ self.assertEqual(df.collect(), [Row(a=6), Row(a=7)])
+
+ def test_udtf_with_analyze_table_argument_adding_columns(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a: AnalyzeArgument) -> AnalyzeResult:
+ assert isinstance(a.data_type, StructType)
+ assert a.is_table is True
+ return AnalyzeResult(a.data_type.add("is_even", BooleanType()))
+
+ def eval(self, a: Row):
+ yield a["id"], a["id"] % 2 == 0
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ df = self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM
range(0, 4)))")
+ self.assertEqual(
+ df.schema,
+ StructType().add("id", LongType(), nullable=False).add("is_even",
BooleanType()),
+ )
+ self.assertEqual(
+ df.collect(),
+ [
+ Row(a=0, is_even=True),
+ Row(a=1, is_even=False),
+ Row(a=2, is_even=True),
+ Row(a=3, is_even=False),
+ ],
+ )
+
+ def test_udtf_with_analyze_table_argument_repeating_rows(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(n, row) -> AnalyzeResult:
+ if n.value is None or not isinstance(n.value, int) or (n.value
< 1 or n.value > 10):
+ raise Exception("The first argument must be a scalar
integer between 1 and 10")
+
+ if row.is_table is False:
+ raise Exception("The second argument must be a table
argument")
+
+ assert isinstance(row.data_type, StructType)
+ return AnalyzeResult(row.data_type)
+
+ def eval(self, n: int, row: Row):
+ for _ in range(n):
+ yield row
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ expected_schema = StructType().add("id", LongType(), nullable=False)
+ expected_results = [
+ Row(a=0),
+ Row(a=0),
+ Row(a=1),
+ Row(a=1),
+ Row(a=2),
+ Row(a=2),
+ Row(a=3),
+ Row(a=3),
+ ]
+ for i, df in enumerate(
+ [
+ self.spark.sql("SELECT * FROM test_udtf(2, TABLE (SELECT id
FROM range(0, 4)))"),
+ self.spark.sql(
+ "SELECT * FROM test_udtf(1 + 1, TABLE (SELECT id FROM
range(0, 4)))"
+ ),
+ ]
+ ):
+ with self.subTest(query_no=i):
+ self.assertEqual(df.schema, expected_schema)
+ self.assertEqual(df.collect(), expected_results)
+
+ with self.assertRaisesRegex(
+ AnalysisException, "The first argument must be a scalar integer
between 1 and 10"
+ ):
+ self.spark.sql(
+ "SELECT * FROM test_udtf(0, TABLE (SELECT id FROM range(0,
4)))"
+ ).collect()
+
+ with self.sql_conf(
+ {"spark.sql.tvf.allowMultipleTableArguments.enabled": True}
+ ), self.assertRaisesRegex(
+ AnalysisException, "The first argument must be a scalar integer
between 1 and 10"
+ ):
+ self.spark.sql(
+ """
+ SELECT * FROM test_udtf(
+ TABLE (SELECT id FROM range(0, 1)),
+ TABLE (SELECT id FROM range(0, 4)))
+ """
+ ).collect()
+
+ with self.assertRaisesRegex(
+ AnalysisException, "The second argument must be a table argument"
+ ):
+ self.spark.sql("SELECT * FROM test_udtf(1, 'x')").collect()
+
+ def test_udtf_with_both_return_type_and_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze() -> AnalyzeResult:
+ return AnalyzeResult(StructType().add("c1",
StringType()).add("c2", StringType()))
+
+ def eval(self):
+ yield "hello", "world"
+
+ with self.assertRaises(PySparkAttributeError) as e:
+ udtf(TestUDTF, returnType="c1: string, c2: string")
+
+ self.check_error(
+ exception=e.exception,
+
error_class="INVALID_UDTF_BOTH_RETURN_TYPE_AND_ANALYZE_STATICMETHOD",
+ message_parameters={"name": "TestUDTF"},
+ )
+
+ def test_udtf_with_neither_return_type_nor_analyze(self):
+ class TestUDTF:
+ def eval(self):
+ yield "hello", "world"
+
+ with self.assertRaises(PySparkAttributeError) as e:
+ udtf(TestUDTF)
+
+ self.check_error(
+ exception=e.exception,
+ error_class="INVALID_UDTF_RETURN_TYPE",
+ message_parameters={"name": "TestUDTF"},
+ )
+
+ def test_udtf_with_analyze_non_staticmethod(self):
+ class TestUDTF:
+ def analyze(self) -> AnalyzeResult:
+ return AnalyzeResult(StructType().add("c1",
StringType()).add("c2", StringType()))
+
+ def eval(self):
+ yield "hello", "world"
+
+ with self.assertRaises(PySparkAttributeError) as e:
+ udtf(TestUDTF)
+
+ self.check_error(
+ exception=e.exception,
+ error_class="INVALID_UDTF_RETURN_TYPE",
Review Comment:
It IS checking whether the UDTF class has an `analyze` method and throws an error
if it is not static, and the error message will be like:
```
The UDTF '<name>' is invalid. It does not specify its return type or
implement the required 'analyze' static method. Please specify the return type
or implement the 'analyze' static method in '<name>' and try again.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]