dtenedor commented on code in PR #41948:
URL: https://github.com/apache/spark/pull/41948#discussion_r1264031515
##########
python/pyspark/sql/tests/test_udtf.py:
##########
@@ -719,6 +726,153 @@ def terminate(self):
self.assertIn("Evaluate the input row", cls.eval.__doc__)
self.assertIn("Terminate the UDTF", cls.terminate.__doc__)
+ def test_simple_udtf_with_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze() -> StructType:
+ return StructType().add("c1", StringType()).add("c2", StringType())
+
+ def eval(self):
+ yield "hello", "world"
+
+ func = udtf(TestUDTF)
+ rows = func().collect()
+ self.assertEqual(rows, [Row(c1="hello", c2="world")])
+
+ def test_udtf_with_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a) -> StructType:
+ assert isinstance(a, dict)
+ assert isinstance(a["data_type"], DataType)
+ assert a["value"] is not None
+ assert a["is_table"] is False
+ return StructType().add("a", a["data_type"])
+
+ def eval(self, a):
+ yield a,
+
+ func = udtf(TestUDTF)
+
+ df1 = func(lit(1))
+ self.assertEquals(df1.schema, StructType().add("a", IntegerType()))
+ self.assertEqual(df1.collect(), [Row(a=1)])
+
+ df2 = func(lit("x"))
+ self.assertEquals(df2.schema, StructType().add("a", StringType()))
+ self.assertEqual(df2.collect(), [Row(a="x")])
+
+ def test_udtf_with_analyze_multiple_arguments(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a, b) -> StructType:
+ return StructType().add("a", a["data_type"]).add("b", b["data_type"])
+
+ def eval(self, a, b):
+ yield a, b
+
+ func = udtf(TestUDTF)
+
+ df = func(lit(1), lit("x"))
+ self.assertEquals(df.schema, StructType().add("a", IntegerType()).add("b", StringType()))
+ self.assertEqual(df.collect(), [Row(a=1, b="x")])
+
+ def test_udtf_with_analyze_table_argument(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a) -> StructType:
+ assert isinstance(a, dict)
+ assert isinstance(a["data_type"], StructType)
+ assert a["value"] is None
+ assert a["is_table"] is True
+ return StructType().add("a", a["data_type"][0].dataType)
+
+ def eval(self, a: Row):
+ if a["id"] > 5:
+ yield a["id"],
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ df = self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM range(0, 8)))")
+ self.assertEqual(df.schema, StructType().add("a", LongType()))
+ self.assertEqual(df.collect(), [Row(a=6), Row(a=7)])
+
+ def test_udtf_with_neither_return_type_nor_analyze(self):
Review Comment:
Good question — we might be best off just returning an error if we attempt to
register the UDTF with a static return type and the UDTF class has an
`analyze` method, to make the two mutually exclusive.
##########
python/pyspark/sql/tests/test_udtf.py:
##########
@@ -719,6 +726,153 @@ def terminate(self):
self.assertIn("Evaluate the input row", cls.eval.__doc__)
self.assertIn("Terminate the UDTF", cls.terminate.__doc__)
+ def test_simple_udtf_with_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze() -> StructType:
+ return StructType().add("c1", StringType()).add("c2", StringType())
+
+ def eval(self):
+ yield "hello", "world"
+
+ func = udtf(TestUDTF)
+ rows = func().collect()
+ self.assertEqual(rows, [Row(c1="hello", c2="world")])
+
+ def test_udtf_with_analyze(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a) -> StructType:
+ assert isinstance(a, dict)
+ assert isinstance(a["data_type"], DataType)
+ assert a["value"] is not None
+ assert a["is_table"] is False
+ return StructType().add("a", a["data_type"])
+
+ def eval(self, a):
+ yield a,
+
+ func = udtf(TestUDTF)
+
+ df1 = func(lit(1))
+ self.assertEquals(df1.schema, StructType().add("a", IntegerType()))
+ self.assertEqual(df1.collect(), [Row(a=1)])
+
+ df2 = func(lit("x"))
+ self.assertEquals(df2.schema, StructType().add("a", StringType()))
+ self.assertEqual(df2.collect(), [Row(a="x")])
+
+ def test_udtf_with_analyze_multiple_arguments(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a, b) -> StructType:
+ return StructType().add("a", a["data_type"]).add("b", b["data_type"])
+
+ def eval(self, a, b):
+ yield a, b
+
+ func = udtf(TestUDTF)
+
+ df = func(lit(1), lit("x"))
+ self.assertEquals(df.schema, StructType().add("a", IntegerType()).add("b", StringType()))
+ self.assertEqual(df.collect(), [Row(a=1, b="x")])
+
+ def test_udtf_with_analyze_table_argument(self):
+ class TestUDTF:
+ @staticmethod
+ def analyze(a) -> StructType:
+ assert isinstance(a, dict)
+ assert isinstance(a["data_type"], StructType)
+ assert a["value"] is None
+ assert a["is_table"] is True
+ return StructType().add("a", a["data_type"][0].dataType)
+
+ def eval(self, a: Row):
+ if a["id"] > 5:
+ yield a["id"],
+
+ func = udtf(TestUDTF)
+ self.spark.udtf.register("test_udtf", func)
+
+ df = self.spark.sql("SELECT * FROM test_udtf(TABLE (SELECT id FROM range(0, 8)))")
+ self.assertEqual(df.schema, StructType().add("a", LongType()))
+ self.assertEqual(df.collect(), [Row(a=6), Row(a=7)])
+
+ def test_udtf_with_neither_return_type_nor_analyze(self):
Review Comment:
Good question — we might be best off just returning an error if we attempt to
register the UDTF with a static return type and the UDTF class has an
`analyze` method, to make the two mutually exclusive.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]