This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 474f64a8850 [SPARK-44984][PYTHON][CONNECT] Remove `_get_alias` from
DataFrame
474f64a8850 is described below
commit 474f64a88502fe242654eb85c7cb5a1514c710e9
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Aug 28 19:44:32 2023 +0800
[SPARK-44984][PYTHON][CONNECT] Remove `_get_alias` from DataFrame
### What changes were proposed in this pull request?
Remove `_get_alias` from DataFrame
### Why are the changes needed?
`_get_alias` was added in the [initial
PR](https://github.com/apache/spark/commit/6637bbe2b25ff2877b41a9677ce6d75e6996f968),
but seems unneeded
- field `alias` in `plan.Project` is always `None`;
- `_get_alias` takes no parameter, but is used to replace a specific column
name; the logic is weird when the column name varies;
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
CI
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #42698 from zhengruifeng/py_connect_del_alias.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/connect/dataframe.py | 15 ++-------------
python/pyspark/sql/connect/plan.py | 1 -
2 files changed, 2 insertions(+), 14 deletions(-)
diff --git a/python/pyspark/sql/connect/dataframe.py
b/python/pyspark/sql/connect/dataframe.py
index 365cde59227..94c3ca95956 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -1573,14 +1573,6 @@ class DataFrame:
sampleBy.__doc__ = PySparkDataFrame.sampleBy.__doc__
- def _get_alias(self) -> Optional[str]:
- p = self._plan
- while p is not None:
- if isinstance(p, plan.Project) and p.alias:
- return p.alias
- p = p._child
- return None
-
def __getattr__(self, name: str) -> "Column":
if self._plan is None:
raise SparkConnectException("Cannot analyze on empty plan.")
@@ -1607,9 +1599,8 @@ class DataFrame:
"'%s' object has no attribute '%s'" %
(self.__class__.__name__, name)
)
- alias = self._get_alias()
return _to_col_with_plan_id(
- col=alias if alias is not None else name,
+ col=name,
plan_id=self._plan._plan_id,
)
@@ -1625,8 +1616,6 @@ class DataFrame:
def __getitem__(self, item: Union[int, str, Column, List, Tuple]) ->
Union[Column, "DataFrame"]:
if isinstance(item, str):
- # Check for alias
- alias = self._get_alias()
if self._plan is None:
raise SparkConnectException("Cannot analyze on empty plan.")
@@ -1635,7 +1624,7 @@ class DataFrame:
self.select(item).isLocal()
return _to_col_with_plan_id(
- col=alias if alias is not None else item,
+ col=item,
plan_id=self._plan._plan_id,
)
elif isinstance(item, Column):
diff --git a/python/pyspark/sql/connect/plan.py
b/python/pyspark/sql/connect/plan.py
index 7952d2af999..5e9b4e53dbf 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -464,7 +464,6 @@ class Project(LogicalPlan):
def __init__(self, child: Optional["LogicalPlan"], *columns:
"ColumnOrName") -> None:
super().__init__(child)
self._columns = list(columns)
- self.alias: Optional[str] = None
self._verify_expressions()
def _verify_expressions(self) -> None:
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]