ueshin commented on code in PR #41948:
URL: https://github.com/apache/spark/pull/41948#discussion_r1267466437
##########
python/pyspark/sql/tests/test_udtf.py:
##########
@@ -748,6 +769,442 @@ def terminate(self):
self.assertIn("Evaluate the input row", cls.eval.__doc__)
self.assertIn("Terminate the UDTF", cls.terminate.__doc__)
+ def test_simple_udtf_with_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze() -> AnalyzeResult:
+ return AnalyzeResult(StructType().add("c1",
StringType()).add("c2", StringType()))
+
+ def eval(self):
+ yield "hello", "world"
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ expected = [Row(c1="hello", c2="world")]
+ self.assertEqual(func().collect(), expected)
+ self.assertEqual(self.spark.sql("SELECT * FROM
test_udtf()").collect(), expected)
+
+ def test_udtf_with_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a: AnalyzeArgument) -> AnalyzeResult:
+ assert isinstance(a, AnalyzeArgument)
+ assert isinstance(a.data_type, DataType)
+ assert a.value is not None
+ assert a.is_table is False
+ return AnalyzeResult(StructType().add("a", a.data_type))
+
+ def eval(self, a):
+ yield a,
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ for i, (df, expected_schema, expected_results) in enumerate(
+ [
+ (func(lit(1)), StructType().add("a", IntegerType()),
[Row(a=1)]),
+ # another data type
+ (func(lit("x")), StructType().add("a", StringType()),
[Row(a="x")]),
+ # array type
+ (
+ func(array(lit(1), lit(2), lit(3))),
+ StructType().add("a", ArrayType(IntegerType(),
containsNull=False)),
+ [Row(a=[1, 2, 3])],
+ ),
+ # map type
+ (
+ func(create_map(lit("x"), lit(1), lit("y"), lit(2))),
+ StructType().add(
+ "a", MapType(StringType(), IntegerType(),
valueContainsNull=False)
+ ),
+ [Row(a={"x": 1, "y": 2})],
+ ),
+ # struct type
+ (
+ func(named_struct(lit("x"), lit(1), lit("y"), lit(2))),
+ StructType().add(
+ "a",
+ StructType()
+ .add("x", IntegerType(), nullable=False)
+ .add("y", IntegerType(), nullable=False),
+ ),
+ [Row(a=Row(x=1, y=2))],
+ ),
+ # use SQL
+ (
+ self.spark.sql("SELECT * from test_udtf(1)"),
+ StructType().add("a", IntegerType()),
+ [Row(a=1)],
+ ),
+ ]
+ ):
+ with self.subTest(query_no=i):
+ self.assertEqual(df.schema, expected_schema)
+ self.assertEqual(df.collect(), expected_results)
+
+ def test_udtf_with_analyze_decorator(self):
+ @udtf
+ class TestUDTF:
+ @staticmethod
+ def analyze() -> AnalyzeResult:
+ return AnalyzeResult(StructType().add("c1",
StringType()).add("c2", StringType()))
+
+ def eval(self):
+ yield "hello", "world"
+
+ self.spark.udtf.register("test_udtf", TestUDTF)
+
+ expected = [Row(c1="hello", c2="world")]
+ self.assertEqual(TestUDTF().collect(), expected)
+ self.assertEqual(self.spark.sql("SELECT * FROM
test_udtf()").collect(), expected)
+
+ def test_udtf_with_analyze_decorator_parens(self):
+ @udtf()
+ class TestUDTF:
+ @staticmethod
+ def analyze() -> AnalyzeResult:
+ return AnalyzeResult(StructType().add("c1",
StringType()).add("c2", StringType()))
+
+ def eval(self):
+ yield "hello", "world"
+
+ self.spark.udtf.register("test_udtf", TestUDTF)
+
+ expected = [Row(c1="hello", c2="world")]
+ self.assertEqual(TestUDTF().collect(), expected)
+ self.assertEqual(self.spark.sql("SELECT * FROM
test_udtf()").collect(), expected)
+
+ def test_udtf_with_analyze_multiple_arguments(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a: AnalyzeArgument, b: AnalyzeArgument) ->
AnalyzeResult:
+ return AnalyzeResult(StructType().add("a",
a.data_type).add("b", b.data_type))
+
+ def eval(self, a, b):
+ yield a, b
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ for i, (df, expected_schema, expected_results) in enumerate(
+ [
+ (
+ func(lit(1), lit("x")),
+ StructType().add("a", IntegerType()).add("b",
StringType()),
+ [Row(a=1, b="x")],
+ ),
+ (
+ self.spark.sql("SELECT * FROM test_udtf(1, 'x')"),
+ StructType().add("a", IntegerType()).add("b",
StringType()),
+ [Row(a=1, b="x")],
+ ),
+ ]
+ ):
+ with self.subTest(query_no=i):
+ self.assertEqual(df.schema, expected_schema)
+ self.assertEqual(df.collect(), expected_results)
+
+ def test_udtf_with_analyze_arbitary_number_arguments(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(*args: AnalyzeArgument) -> AnalyzeResult:
+ return AnalyzeResult(
+ StructType([StructField(f"col{i}", a.data_type) for i, a
in enumerate(args)])
+ )
+
+ def eval(self, *args):
+ yield args
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ for i, (df, expected_schema, expected_results) in enumerate(
+ [
+ (
+ func(lit(1)),
+ StructType().add("col0", IntegerType()),
+ [Row(a=1)],
+ ),
+ (
+ self.spark.sql("SELECT * FROM test_udtf(1, 'x')"),
+ StructType().add("col0", IntegerType()).add("col1",
StringType()),
+ [Row(a=1, b="x")],
+ ),
+ # TODO(SPARK-44479): Support Python UDTFs with empty schema
+ # (func(), StructType(), [Row()]),
+ ]
+ ):
+ with self.subTest(query_no=i):
+ self.assertEqual(df.schema, expected_schema)
+ self.assertEqual(df.collect(), expected_results)
+
+ def test_udtf_with_analyze_table_argument(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a: AnalyzeArgument) -> AnalyzeResult:
+ assert isinstance(a, AnalyzeArgument)
+ assert isinstance(a.data_type, StructType)
+ assert a.value is None
+ assert a.is_table is True
+ return AnalyzeResult(StructType().add("a",
a.data_type[0].dataType))
+
+ def eval(self, a: Row):
+ if a["id"] > 5:
+ yield a["id"],
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ df = self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM
range(0, 8)))")
+ self.assertEqual(df.schema, StructType().add("a", LongType()))
+ self.assertEqual(df.collect(), [Row(a=6), Row(a=7)])
+
+ def test_udtf_with_analyze_table_argument_adding_columns(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a: AnalyzeArgument) -> AnalyzeResult:
+ assert isinstance(a.data_type, StructType)
+ assert a.is_table is True
+ return AnalyzeResult(a.data_type.add("is_even", BooleanType()))
+
+ def eval(self, a: Row):
+ yield a["id"], a["id"] % 2 == 0
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ df = self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM
range(0, 4)))")
+ self.assertEqual(
+ df.schema,
+ StructType().add("id", LongType(), nullable=False).add("is_even",
BooleanType()),
+ )
+ self.assertEqual(
+ df.collect(),
+ [
+ Row(a=0, is_even=True),
+ Row(a=1, is_even=False),
+ Row(a=2, is_even=True),
+ Row(a=3, is_even=False),
+ ],
+ )
+
+ def test_udtf_with_analyze_table_argument_repeating_rows(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(n, row) -> AnalyzeResult:
+ if n.value is None or not isinstance(n.value, int) or (n.value
< 1 or n.value > 10):
+ raise Exception("The first argument must be a scalar
integer between 1 and 10")
+
+ if row.is_table is False:
+ raise Exception("The second argument must be a table
argument")
+
+ assert isinstance(row.data_type, StructType)
+ return AnalyzeResult(row.data_type)
+
+ def eval(self, n: int, row: Row):
+ for _ in range(n):
+ yield row
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ expected_schema = StructType().add("id", LongType(), nullable=False)
+ expected_results = [
+ Row(a=0),
+ Row(a=0),
+ Row(a=1),
+ Row(a=1),
+ Row(a=2),
+ Row(a=2),
+ Row(a=3),
+ Row(a=3),
+ ]
+ for i, df in enumerate(
+ [
+ self.spark.sql("SELECT * FROM test_udtf(2, TABLE (SELECT id
FROM range(0, 4)))"),
+ self.spark.sql(
+ "SELECT * FROM test_udtf(1 + 1, TABLE (SELECT id FROM
range(0, 4)))"
+ ),
+ ]
+ ):
+ with self.subTest(query_no=i):
+ self.assertEqual(df.schema, expected_schema)
+ self.assertEqual(df.collect(), expected_results)
+
+ with self.assertRaisesRegex(
+ AnalysisException, "The first argument must be a scalar integer
between 1 and 10"
+ ):
+ self.spark.sql(
+ "SELECT * FROM test_udtf(0, TABLE (SELECT id FROM range(0,
4)))"
+ ).collect()
+
+ with self.sql_conf(
+ {"spark.sql.tvf.allowMultipleTableArguments.enabled": True}
+ ), self.assertRaisesRegex(
+ AnalysisException, "The first argument must be a scalar integer
between 1 and 10"
+ ):
+ self.spark.sql(
+ """
+ SELECT * FROM test_udtf(
+ TABLE (SELECT id FROM range(0, 1)),
+ TABLE (SELECT id FROM range(0, 4)))
+ """
+ ).collect()
+
+ with self.assertRaisesRegex(
+ AnalysisException, "The second argument must be a table argument"
+ ):
+ self.spark.sql("SELECT * FROM test_udtf(1, 'x')").collect()
+
+ def test_udtf_with_both_return_type_and_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze() -> AnalyzeResult:
+ return AnalyzeResult(StructType().add("c1",
StringType()).add("c2", StringType()))
+
+ def eval(self):
+ yield "hello", "world"
+
+ with self.assertRaises(PySparkAttributeError) as e:
+ udtf(TestUDTF, returnType="c1: string, c2: string")
+
+ self.check_error(
+ exception=e.exception,
+
error_class="INVALID_UDTF_BOTH_RETURN_TYPE_AND_ANALYZE_STATICMETHOD",
+ message_parameters={"name": "TestUDTF"},
+ )
+
+ def test_udtf_with_neither_return_type_nor_analyze(self):
+ class TestUDTF:
+ def eval(self):
+ yield "hello", "world"
+
+ with self.assertRaises(PySparkAttributeError) as e:
+ udtf(TestUDTF)
+
+ self.check_error(
+ exception=e.exception,
+ error_class="INVALID_UDTF_RETURN_TYPE",
+ message_parameters={"name": "TestUDTF"},
+ )
+
+ def test_udtf_with_analyze_non_staticmethod(self):
+ class TestUDTF:
+ def analyze(self) -> AnalyzeResult:
+ return AnalyzeResult(StructType().add("c1",
StringType()).add("c2", StringType()))
+
+ def eval(self):
+ yield "hello", "world"
+
+ with self.assertRaises(PySparkAttributeError) as e:
+ udtf(TestUDTF)
+
+ self.check_error(
+ exception=e.exception,
+ error_class="INVALID_UDTF_RETURN_TYPE",
Review Comment:
It IS checking whether the UDTF class has an `analyze` method and throws an error
if it is not static, and the error message will be like:
```
The UDTF '<name>' is invalid. It does not specify its return type or
implement the required 'analyze' static method. Please specify the return type
or implement the 'analyze' static method in '<name>' and try again.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]