This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
new d0af25f4137 [SPARK-42267][CONNECT][PYTHON] `DataFrame.join` should
standardize the JoinType string
d0af25f4137 is described below
commit d0af25f413787bc3451116a447e0eef488b532bd
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Wed Feb 8 17:28:54 2023 +0900
[SPARK-42267][CONNECT][PYTHON] `DataFrame.join` should standardize the
JoinType string
### What changes were proposed in this pull request?
Standardize the JoinType string passed to `DataFrame.join` so that it is
consistent with PySpark:
https://github.com/apache/spark/blob/05c0fa573881b49d8ead9a5e16071190e5841e1b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala#L25
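### Why are the changes needed?
Join type strings that contain underscores, such as `left_outer`, are
currently rejected even though the error message lists them as supported: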
```
>>> df = spark.range(1)
>>> df2 = spark.range(2)
>>> df.join(df2, how="left_outer")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/xinrong.meng/spark/python/pyspark/sql/connect/dataframe.py",
line 438, in join
plan.Join(left=self._plan, right=other._plan, on=on, how=how),
File "/Users/xinrong.meng/spark/python/pyspark/sql/connect/plan.py", line
730, in __init__
raise NotImplementedError(
NotImplementedError:
Unsupported join type: left_outer. Supported join types include:
"inner", "outer", "full", "fullouter", "full_outer",
"leftouter", "left", "left_outer", "rightouter",
"right", "right_outer", "leftsemi", "left_semi",
"semi", "leftanti", "left_anti", "anti", "cross",
```
### Does this PR introduce _any_ user-facing change?
yes
### How was this patch tested?
Updated the unit tests.
Closes #39938 from zhengruifeng/connect_join_types.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit f24ce658a71fdb57440f0caff1d3369ade4f688a)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/connect/dataframe.py | 2 ++
python/pyspark/sql/tests/connect/test_connect_plan.py | 13 ++++++++++++-
2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/python/pyspark/sql/connect/dataframe.py
b/python/pyspark/sql/connect/dataframe.py
index 0c3eb5b8934..40625f77df5 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -436,6 +436,8 @@ class DataFrame:
raise Exception("Cannot join when self._plan is empty.")
if other._plan is None:
raise Exception("Cannot join when other._plan is empty.")
+ if how is not None and isinstance(how, str):
+ how = how.lower().replace("_", "")
return DataFrame.withPlan(
plan.Join(left=self._plan, right=other._plan, on=on, how=how),
diff --git a/python/pyspark/sql/tests/connect/test_connect_plan.py
b/python/pyspark/sql/tests/connect/test_connect_plan.py
index 9cb7c5dac1e..026f286cc56 100644
--- a/python/pyspark/sql/tests/connect/test_connect_plan.py
+++ b/python/pyspark/sql/tests/connect/test_connect_plan.py
@@ -685,10 +685,21 @@ class SparkConnectPlanTests(PlanOnlyTestFixture):
(None, proto.Join.JoinType.JOIN_TYPE_INNER),
("inner", proto.Join.JoinType.JOIN_TYPE_INNER),
("outer", proto.Join.JoinType.JOIN_TYPE_FULL_OUTER),
+ ("full", proto.Join.JoinType.JOIN_TYPE_FULL_OUTER),
+ ("fullouter", proto.Join.JoinType.JOIN_TYPE_FULL_OUTER),
+ ("full_outer", proto.Join.JoinType.JOIN_TYPE_FULL_OUTER),
+ ("left", proto.Join.JoinType.JOIN_TYPE_LEFT_OUTER),
("leftouter", proto.Join.JoinType.JOIN_TYPE_LEFT_OUTER),
+ ("left_outer", proto.Join.JoinType.JOIN_TYPE_LEFT_OUTER),
+ ("right", proto.Join.JoinType.JOIN_TYPE_RIGHT_OUTER),
("rightouter", proto.Join.JoinType.JOIN_TYPE_RIGHT_OUTER),
- ("leftanti", proto.Join.JoinType.JOIN_TYPE_LEFT_ANTI),
+ ("right_outer", proto.Join.JoinType.JOIN_TYPE_RIGHT_OUTER),
+ ("semi", proto.Join.JoinType.JOIN_TYPE_LEFT_SEMI),
("leftsemi", proto.Join.JoinType.JOIN_TYPE_LEFT_SEMI),
+ ("left_semi", proto.Join.JoinType.JOIN_TYPE_LEFT_SEMI),
+ ("anti", proto.Join.JoinType.JOIN_TYPE_LEFT_ANTI),
+ ("leftanti", proto.Join.JoinType.JOIN_TYPE_LEFT_ANTI),
+ ("left_anti", proto.Join.JoinType.JOIN_TYPE_LEFT_ANTI),
("cross", proto.Join.JoinType.JOIN_TYPE_CROSS),
]:
joined_df = df_left.join(df_right, on=col("name"),
how=join_type_str)._plan.to_proto(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]