ueshin commented on code in PR #52689:
URL: https://github.com/apache/spark/pull/52689#discussion_r2450282483
##########
python/pyspark/sql/tests/test_udf.py:
##########
@@ -1551,6 +1554,98 @@ def check_struct_binary_type(s):
         expected = self.spark.createDataFrame([Row(type_name=expected_type)])
         assertDataFrameEqual(result, expected)
+    @unittest.skipIf(is_remote_only(), "Requires JVM access")
+    def test_udf_with_logging(self):
+        @udf
+        def my_udf():
+            logger = logging.getLogger("test")
+            print("print to stdout ❤", file=sys.stdout)
+            print("print to stderr 😀", file=sys.stderr)
+            try:
+                1 / 0
+            except Exception:
+                logger.exception("exception")
+            return "x"
+
+        # Logging is disabled by default
+        assertDataFrameEqual(
+            self.spark.range(1).select(my_udf().alias("result")), [Row(result="x")]
+        )
+        self.assertEqual(self.spark.table("system.session.python_worker_logs").count(), 0)
+
+        with self.sql_conf({"spark.sql.pyspark.worker.logging.enabled": "true"}):
+            assertDataFrameEqual(
+                self.spark.range(1).select(my_udf().alias("result")), [Row(result="x")]
+            )
+
+            logs = self.spark.table("system.session.python_worker_logs")
+
+            assertDataFrameEqual(
+                logs.select("level", "msg", "context", "logger"),
+                [
+                    Row(
+                        level="INFO",
+                        msg="print to stdout ❤",
+                        context={"func_name": my_udf.__name__},
+                        logger="stdout",
+                    ),
+                    Row(
+                        level="ERROR",
+                        msg="print to stderr 😀",
+                        context={"func_name": my_udf.__name__},
+                        logger="stderr",
+                    ),
+                    Row(
+                        level="ERROR",
+                        msg="exception",
+                        context={"func_name": my_udf.__name__},
+                        logger="test",
+                    ),
+                ],
+            )
+
+            self.assertEqual(logs.where("exception is not null").select("exception").count(), 1)
+
+    @unittest.skipIf(is_remote_only(), "Requires JVM access")
Review Comment:
It's necessary to clean up the logs for each test, as is done here:
https://github.com/apache/spark/blob/0eed72644332e49de682b31cbcbfe13ff4931756/python/pyspark/testing/connectutils.py#L199-L202
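
To make the suggestion concrete, here is a minimal sketch of per-test cleanup; `_clear_worker_logs` is a hypothetical hook, and the cleanup the Connect test base actually performs is at the connectutils.py lines linked above:

```python
class PythonWorkerLoggingCleanupMixin:
    """Sketch only: clear the worker logs around each test so that
    count-based assertions against system.session.python_worker_logs
    are not polluted by logs left over from earlier tests."""

    def setUp(self):
        super().setUp()
        self._clear_worker_logs()

    def tearDown(self):
        self._clear_worker_logs()
        super().tearDown()

    def _clear_worker_logs(self):
        # Hypothetical hook: the concrete call depends on session
        # internals, so this sketch leaves it abstract.
        raise NotImplementedError
```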
##########
python/pyspark/sql/tests/test_udf.py:
##########
@@ -1551,6 +1554,98 @@ def check_struct_binary_type(s):
         expected = self.spark.createDataFrame([Row(type_name=expected_type)])
         assertDataFrameEqual(result, expected)
+    @unittest.skipIf(is_remote_only(), "Requires JVM access")
+    def test_udf_with_logging(self):
+        @udf
+        def my_udf():
+            logger = logging.getLogger("test")
+            print("print to stdout ❤", file=sys.stdout)
+            print("print to stderr 😀", file=sys.stderr)
+            try:
+                1 / 0
+            except Exception:
+                logger.exception("exception")
+            return "x"
+
+        # Logging is disabled by default
+        assertDataFrameEqual(
+            self.spark.range(1).select(my_udf().alias("result")), [Row(result="x")]
+        )
+        self.assertEqual(self.spark.table("system.session.python_worker_logs").count(), 0)
+
+        with self.sql_conf({"spark.sql.pyspark.worker.logging.enabled": "true"}):
+            assertDataFrameEqual(
+                self.spark.range(1).select(my_udf().alias("result")), [Row(result="x")]
+            )
+
+            logs = self.spark.table("system.session.python_worker_logs")
Review Comment:
So far I don't have a plan to make it configurable, but I think we can if necessary.
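
For illustration only, if the table name did become configurable, reading the logs might look like the sketch below; `spark.sql.pyspark.worker.logging.tableName` is a made-up conf key (today the name is fixed), and `spark` is assumed to be an active SparkSession:

```python
# Hypothetical conf key: "spark.sql.pyspark.worker.logging.tableName" does not
# exist today; the table name is fixed to system.session.python_worker_logs,
# which is used here as the default.
table_name = spark.conf.get(
    "spark.sql.pyspark.worker.logging.tableName",
    "system.session.python_worker_logs",
)
logs = spark.table(table_name)
logs.select("level", "msg", "logger").show(truncate=False)
```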
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]