Yikun commented on a change in pull request #35888:
URL: https://github.com/apache/spark/pull/35888#discussion_r830870746
##########
File path: python/pyspark/pandas/frame.py
##########
@@ -10081,6 +10086,10 @@ def all(self, axis: Axis = 0) -> "Series":
* 0 / 'index' : reduce the index, return a Series whose index is
the
original column labels.
+ bool_only : bool, default None
+ Include only boolean columns. If None, will attempt to use
everything,
+ then use only boolean data. Not implemented for Series.
Review comment:
`Not implemented for Series.` I guess this is useless, because `frame.all`
is not in `generic` like it is in pandas.
##########
File path: python/pyspark/pandas/frame.py
##########
@@ -10195,32 +10190,87 @@ def any(self, axis: Axis = 0) -> "Series":
col5 False
col6 True
dtype: bool
- """
- from pyspark.pandas.series import first_series
+ Include only boolean columns when set `bool_only=True`.
+
+ >>> df.any(bool_only=True)
+ col1 False
+ col2 True
+ dtype: bool
+ """
axis = validate_axis(axis)
if axis != 0:
raise NotImplementedError('axis should be either 0 or "index"
currently.')
- applied = []
column_labels = self._internal.column_labels
+ if bool_only:
+ column_labels = self._bool_column_labels(column_labels)
+ if len(column_labels) == 0:
+ return ps.Series([], dtype=bool)
+
+ applied = []
for label in column_labels:
scol = self._internal.spark_column_for(label)
- all_col = F.max(F.coalesce(scol.cast("boolean"), SF.lit(False)))
- applied.append(F.when(all_col.isNull(), False).otherwise(all_col))
+ any_col = F.max(F.coalesce(scol.cast("boolean"), SF.lit(False)))
+ applied.append(F.when(any_col.isNull(), False).otherwise(any_col))
+
+ return self._result_aggregated(column_labels, applied)
+
+ def _bool_column_labels(self, column_labels: List[Label]) -> List[Label]:
+ """
+ Filter column labels of boolean columns (without None).
+ """
+ bool_column_labels = []
+ for label in column_labels:
+ psser = self._psser_for(label)
+ if is_bool_dtype(psser):
+ # Rely on dtype rather than spark type because
+ # columns that consist of bools and Nones should be excluded
+ # if bool_only is True
+ bool_column_labels.append(label)
+ return bool_column_labels
+
+ def _result_aggregated(self, column_labels: List[Label], scols:
List[Column]) -> ps.Series:
Review comment:
```suggestion
def _result_aggregated(self, column_labels: List[Label], scols:
List[Column]) -> "Series":
```
```
pyspark/pandas/tests/test_dataframe.py:None
(pyspark/pandas/tests/test_dataframe.py)
/opt/homebrew/Cellar/[email protected]/3.9.10/Frameworks/Python.framework/Versions/3.9/lib/python3.9/importlib/__init__.py:127:
in import_module
return _bootstrap._gcd_import(name[level:], package, level)
// ...
../__init__.py:58: in <module>
from pyspark.pandas.frame import DataFrame
../frame.py:351: in <module>
class DataFrame(Frame, Generic[T]):
../frame.py:10233: in DataFrame
def _result_aggregated(self, column_labels: List[Label], scols:
List[Column]) -> ps.Series:
E AttributeError: partially initialized module 'pyspark.pandas' has no
attribute 'Series' (most likely due to a circular import)
```
Looks like we need to use `"Series"` instead of `ps.Series` to avoid a circular
import in some cases.
##########
File path: python/pyspark/pandas/frame.py
##########
@@ -10168,6 +10159,10 @@ def any(self, axis: Axis = 0) -> "Series":
* 0 / 'index' : reduce the index, return a Series whose index is
the
original column labels.
+ bool_only : bool, default None
+ Include only boolean columns. If None, will attempt to use
everything,
+ then use only boolean data. Not implemented for Series.
Review comment:
```suggestion
then use only boolean data.
```
Same
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]