ueshin commented on a change in pull request #33929:
URL: https://github.com/apache/spark/pull/33929#discussion_r705826338
##########
File path: python/pyspark/pandas/frame.py
##########
@@ -6695,50 +6706,71 @@ def drop(
x y z w
0 1 3 5 7
1 2 4 6 8
- >>> df.drop('a') # doctest: +NORMALIZE_WHITESPACE
+ >>> df.drop(labels='a', axis=1) # doctest: +NORMALIZE_WHITESPACE
b
z w
0 5 7
1 6 8
Notes
-----
- Currently only axis = 1 is supported in this function,
- axis = 0 is yet to be implemented.
+ Currently, dropping rows of a MultiIndex DataFrame is not supported
yet.
"""
+ internal = self._internal
if labels is not None:
+ if index is not None or columns is not None:
+ raise ValueError("Cannot specify both 'labels' and
'index'/'columns'")
axis = validate_axis(axis)
if axis == 1:
- return self.drop(columns=labels)
- raise NotImplementedError("Drop currently only works for axis=1")
- elif columns is not None:
- if is_name_like_tuple(columns):
- columns = [columns]
- elif is_name_like_value(columns):
- columns = [(columns,)]
+ return self.drop(index=index, columns=labels)
else:
- columns = [col if is_name_like_tuple(col) else (col,) for col
in columns]
- drop_column_labels = set(
- label
- for label in self._internal.column_labels
- for col in columns
- if label[: len(col)] == col
- )
- if len(drop_column_labels) == 0:
- raise KeyError(columns)
- cols, labels = zip(
- *(
- (column, label)
- for column, label in zip(
- self._internal.data_spark_column_names,
self._internal.column_labels
+ return self.drop(index=labels, columns=columns)
+ else:
+ if index is None and columns is None:
+ raise ValueError("Need to specify at least one of 'labels' or
'columns' or 'index'")
+ if index is not None:
+ if is_name_like_tuple(index) or is_name_like_value(index):
+ index = [index]
+
+ index_scols = internal.index_spark_columns
+ if len(index_scols) == 1:
+ col = None
+ for label in index:
+ if col is None:
+ col = index_scols[0] != SF.lit(label)
Review comment:
That's a good point, but if the index is small, an anti-join is too
expensive.
Maybe we can make the threshold configurable via `ps.option` and switch
between `isin` and an anti-join depending on it.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]