This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 47afefb6679 [SPARK-41980][CONNECT][TESTS] Enable test_functions_broadcast in functions parity test
47afefb6679 is described below
commit 47afefb66794df16399e34791a68bdec9885778e
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Thu Jan 12 10:31:40 2023 +0900
[SPARK-41980][CONNECT][TESTS] Enable test_functions_broadcast in functions parity test
### What changes were proposed in this pull request?
This PR re-enables `test_functions_broadcast` by avoiding `_jdf` access in the original test.
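
For illustration, the replacement pattern works roughly as follows: `DataFrame.explain()` prints the query plans to stdout, so the test can capture that output with `contextlib.redirect_stdout` and inspect the text, instead of reaching into the JVM through `_jdf`. A minimal sketch, assuming an active `SparkSession` bound to a variable named `spark`:

```python
import io
from contextlib import redirect_stdout

from pyspark.sql.functions import broadcast

# Assumption: `spark` is an existing, active SparkSession.
df1 = spark.createDataFrame([(1, "1"), (2, "2")], ("key", "value"))
df2 = spark.createDataFrame([(1, "1"), (2, "2")], ("key", "value"))

# explain(True) prints the parsed, analyzed, optimized, and physical plans
# to stdout; redirect_stdout captures that text so it can be inspected.
with io.StringIO() as buf, redirect_stdout(buf):
    df1.join(broadcast(df2), "key").explain(True)
    # Read the buffer inside the block: io.StringIO is closed as soon as
    # the `with` statement exits.
    plans = buf.getvalue()

assert plans.count("Broadcast") >= 1
```

Note that `buf.getvalue()` must be read before the `with` block exits, since `io.StringIO` closes on exit; the updated test keeps its assertions inside the block for the same reason.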
### Why are the changes needed?
For test coverage and feature parity.
### Does this PR introduce _any_ user-facing change?
No, test-only.
### How was this patch tested?
Fixed unit tests.
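
For reference, a hypothetical way to run just the re-enabled test through the standard `unittest` loader (assuming a Spark dev checkout with `python/` on `PYTHONPATH`, and that `FunctionsTests` is the concrete test class in `test_functions.py`):

```python
import unittest

# Load the single re-enabled test case by its dotted name and run it.
suite = unittest.defaultTestLoader.loadTestsFromName(
    "pyspark.sql.tests.test_functions.FunctionsTests.test_functions_broadcast"
)
unittest.TextTestRunner(verbosity=2).run(suite)
```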
Closes #39500 from HyukjinKwon/SPARK-41980.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
 python/pyspark/sql/tests/connect/test_parity_functions.py |  6 ------
 python/pyspark/sql/tests/test_functions.py                | 14 +++++++++-----
 2 files changed, 9 insertions(+), 11 deletions(-)
diff --git a/python/pyspark/sql/tests/connect/test_parity_functions.py b/python/pyspark/sql/tests/connect/test_parity_functions.py
index 2f6ed05559f..dd7229d158f 100644
--- a/python/pyspark/sql/tests/connect/test_parity_functions.py
+++ b/python/pyspark/sql/tests/connect/test_parity_functions.py
@@ -40,12 +40,6 @@ class FunctionsParityTests(FunctionsTestsMixin, ReusedConnectTestCase):
     def test_function_parity(self):
         super().test_function_parity()
 
-    @unittest.skip(
-        "Spark Connect does not support Spark Context, _jdf but the test depends on that."
-    )
-    def test_functions_broadcast(self):
-        super().test_functions_broadcast()
-
     @unittest.skip("Spark Connect does not support Spark Context but the test depends on that.")
     def test_input_file_name_reset_for_rdd(self):
         super().test_input_file_name_reset_for_rdd()
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 4db1eed1eb1..38a4e3e6644 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -16,6 +16,8 @@
 #
 
 import datetime
+import io
+from contextlib import redirect_stdout
 from inspect import getmembers, isfunction
 from itertools import chain
 import re
@@ -452,15 +454,17 @@ class FunctionsTestsMixin:
         df2 = self.spark.createDataFrame([(1, "1"), (2, "2")], ("key", "value"))
 
         # equijoin - should be converted into broadcast join
-        plan1 = df1.join(broadcast(df2), "key")._jdf.queryExecution().executedPlan()
-        self.assertEqual(1, plan1.toString().count("BroadcastHashJoin"))
+        with io.StringIO() as buf, redirect_stdout(buf):
+            df1.join(broadcast(df2), "key").explain(True)
+            self.assertGreaterEqual(buf.getvalue().count("Broadcast"), 1)
 
         # no join key -- should not be a broadcast join
-        plan2 = df1.crossJoin(broadcast(df2))._jdf.queryExecution().executedPlan()
-        self.assertEqual(0, plan2.toString().count("BroadcastHashJoin"))
+        with io.StringIO() as buf, redirect_stdout(buf):
+            df1.crossJoin(broadcast(df2)).explain(True)
+            self.assertGreaterEqual(buf.getvalue().count("Broadcast"), 1)
 
         # planner should not crash without a join
-        broadcast(df1)._jdf.queryExecution().executedPlan()
+        broadcast(df1).explain(True)
 
     def test_first_last_ignorenulls(self):
         from pyspark.sql import functions
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]