This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new d282f8933bc [SPARK-39201][PYTHON][PS] Implement `ignore_index` of
`DataFrame.explode` and `DataFrame.drop_duplicates`
d282f8933bc is described below
commit d282f8933bc80124bd534e1c03c5162ba0803255
Author: Xinrong Meng <[email protected]>
AuthorDate: Thu May 19 19:18:43 2022 +0900
[SPARK-39201][PYTHON][PS] Implement `ignore_index` of `DataFrame.explode`
and `DataFrame.drop_duplicates`
### What changes were proposed in this pull request?
Implement `ignore_index` of `DataFrame.explode` and
`DataFrame.drop_duplicates`.
### Why are the changes needed?
Increase pandas API coverage.
### Does this PR introduce _any_ user-facing change?
Yes. The `ignore_index` parameter of `DataFrame.explode` and `DataFrame.drop_duplicates`
is now supported, as demonstrated in the doctest examples added by this patch.
### How was this patch tested?
Unit tests.
Closes #36569 from xinrong-databricks/explode.ignore_index.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/frame.py | 34 ++++++++++++--
python/pyspark/pandas/tests/test_dataframe.py | 65 ++++++++++++++++++---------
2 files changed, 73 insertions(+), 26 deletions(-)
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index f3ef0b15879..6049249d827 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -9367,6 +9367,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
subset: Optional[Union[Name, List[Name]]] = None,
keep: Union[bool, str] = "first",
inplace: bool = False,
+ ignore_index: bool = False,
) -> Optional["DataFrame"]:
"""
Return DataFrame with duplicate rows removed, optionally only
@@ -9384,6 +9385,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
- False : Drop all duplicates.
inplace : boolean, default False
Whether to drop duplicates in place or to return a copy.
+ ignore_index : boolean, default False
+ If True, the resulting axis will be labeled 0, 1, …, n - 1.
Returns
-------
@@ -9407,6 +9410,13 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
3 2 c
4 3 d
+ >>> df.drop_duplicates(ignore_index=True).sort_index()
+ a b
+ 0 1 a
+ 1 2 a
+ 2 2 c
+ 3 3 d
+
>>> df.drop_duplicates('a').sort_index()
a b
0 1 a
@@ -9439,11 +9449,15 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
sdf = sdf.where(~scol_for(sdf, column)).drop(column)
internal = self._internal.with_new_sdf(sdf)
+ psdf: DataFrame = DataFrame(internal)
+
if inplace:
- self._update_internal_frame(internal)
+ if ignore_index:
+ psdf.reset_index(drop=True, inplace=inplace)
+ self._update_internal_frame(psdf._internal)
return None
else:
- return DataFrame(internal)
+ return psdf.reset_index(drop=True) if ignore_index else psdf
def reindex(
self,
@@ -12146,7 +12160,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
# Returns a frame
return result
- def explode(self, column: Name) -> "DataFrame":
+ def explode(self, column: Name, ignore_index: bool = False) -> "DataFrame":
"""
Transform each element of a list-like to a row, replicating index
values.
@@ -12154,6 +12168,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
----------
column : str or tuple
Column to explode.
+ ignore_index : bool, default False
+ If True, the resulting index will be labeled 0, 1, …, n - 1.
Returns
-------
@@ -12184,6 +12200,15 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
1 NaN 1
2 3.0 1
2 4.0 1
+
+ >>> df.explode('A', ignore_index=True)
+ A B
+ 0 1.0 1
+ 1 2.0 1
+ 2 3.0 1
+ 3 NaN 1
+ 4 3.0 1
+ 5 4.0 1
"""
from pyspark.pandas.series import Series
@@ -12212,7 +12237,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
data_fields[idx] = field.copy(dtype=dtype, spark_type=spark_type,
nullable=True)
internal = psdf._internal.with_new_sdf(sdf, data_fields=data_fields)
- return DataFrame(internal)
+ result_df: DataFrame = DataFrame(internal)
+ return result_df.reset_index(drop=True) if ignore_index else result_df
def mad(self, axis: Axis = 0) -> "Series":
"""
diff --git a/python/pyspark/pandas/tests/test_dataframe.py
b/python/pyspark/pandas/tests/test_dataframe.py
index 8915ec1ca64..2a159423a2d 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -3504,6 +3504,14 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
pdf.drop_duplicates([("x", "a"), ("y", "b")],
keep=keep).sort_index(),
psdf.drop_duplicates([("x", "a"), ("y", "b")],
keep=keep).sort_index(),
)
+ self.assert_eq(
+ pdf.drop_duplicates(
+ [("x", "a"), ("y", "b")], keep=keep, ignore_index=True
+ ).sort_index(),
+ psdf.drop_duplicates(
+ [("x", "a"), ("y", "b")], keep=keep, ignore_index=True
+ ).sort_index(),
+ )
# inplace is True
subset_list = [None, "a", ["a", "b"]]
@@ -3532,7 +3540,9 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
pser = pdf[("x", "a")]
psser = psdf[("x", "a")]
pdf.drop_duplicates(subset=subset, inplace=True)
+ pdf.drop_duplicates(subset=subset, inplace=True, ignore_index=True)
psdf.drop_duplicates(subset=subset, inplace=True)
+ psdf.drop_duplicates(subset=subset, inplace=True,
ignore_index=True)
self.assert_eq(psdf.sort_index(), pdf.sort_index())
self.assert_eq(psser.sort_index(), pser.sort_index())
@@ -5371,18 +5381,25 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
psdf.truncate("C", "B", axis=1)
def test_explode(self):
- pdf = pd.DataFrame({"A": [[-1.0, np.nan], [0.0, np.inf], [1.0,
-np.inf]], "B": 1})
+ pdf = pd.DataFrame(
+ {"A": [[-1.0, np.nan], [0.0, np.inf], [1.0, -np.inf]], "B": 1},
index=["a", "b", "c"]
+ )
pdf.index.name = "index"
pdf.columns.name = "columns"
psdf = ps.from_pandas(pdf)
- expected_result1 = pdf.explode("A")
- expected_result2 = pdf.explode("B")
+ expected_result1, result1 = pdf.explode("A"), psdf.explode("A")
+ expected_result2, result2 = pdf.explode("B"), psdf.explode("B")
+ expected_result3, result3 = pdf.explode("A", ignore_index=True),
psdf.explode(
+ "A", ignore_index=True
+ )
- self.assert_eq(psdf.explode("A"), expected_result1, almost=True)
- self.assert_eq(psdf.explode("B"), expected_result2)
- self.assert_eq(psdf.explode("A").index.name,
expected_result1.index.name)
- self.assert_eq(psdf.explode("A").columns.name,
expected_result1.columns.name)
+ self.assert_eq(result1, expected_result1, almost=True)
+ self.assert_eq(result2, expected_result2)
+ self.assert_eq(result1.index.name, expected_result1.index.name)
+ self.assert_eq(result1.columns.name, expected_result1.columns.name)
+ self.assert_eq(result3, expected_result3, almost=True)
+ self.assert_eq(result3.index, expected_result3.index)
self.assertRaises(TypeError, lambda: psdf.explode(["A", "B"]))
@@ -5393,13 +5410,18 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
pdf.index = midx
psdf = ps.from_pandas(pdf)
- expected_result1 = pdf.explode("A")
- expected_result2 = pdf.explode("B")
+ expected_result1, result1 = pdf.explode("A"), psdf.explode("A")
+ expected_result2, result2 = pdf.explode("B"), psdf.explode("B")
+ expected_result3, result3 = pdf.explode("A", ignore_index=True),
psdf.explode(
+ "A", ignore_index=True
+ )
- self.assert_eq(psdf.explode("A"), expected_result1, almost=True)
- self.assert_eq(psdf.explode("B"), expected_result2)
- self.assert_eq(psdf.explode("A").index.names,
expected_result1.index.names)
- self.assert_eq(psdf.explode("A").columns.name,
expected_result1.columns.name)
+ self.assert_eq(result1, expected_result1, almost=True)
+ self.assert_eq(result2, expected_result2)
+ self.assert_eq(result1.index.names, expected_result1.index.names)
+ self.assert_eq(result1.columns.name, expected_result1.columns.name)
+ self.assert_eq(result3, expected_result3, almost=True)
+ self.assert_eq(result3.index, expected_result3.index)
self.assertRaises(TypeError, lambda: psdf.explode(["A", "B"]))
@@ -5408,16 +5430,15 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
pdf.columns = columns
psdf.columns = columns
- expected_result1 = pdf.explode(("A", "Z"))
- expected_result2 = pdf.explode(("B", "X"))
- expected_result3 = pdf.A.explode("Z")
-
- self.assert_eq(psdf.explode(("A", "Z")), expected_result1, almost=True)
- self.assert_eq(psdf.explode(("B", "X")), expected_result2)
- self.assert_eq(psdf.explode(("A", "Z")).index.names,
expected_result1.index.names)
- self.assert_eq(psdf.explode(("A", "Z")).columns.names,
expected_result1.columns.names)
+ expected_result1, result1 = pdf.explode(("A", "Z")),
psdf.explode(("A", "Z"))
+ expected_result2, result2 = pdf.explode(("B", "X")),
psdf.explode(("B", "X"))
+ expected_result3, result3 = pdf.A.explode("Z"), psdf.A.explode("Z")
- self.assert_eq(psdf.A.explode("Z"), expected_result3, almost=True)
+ self.assert_eq(result1, expected_result1, almost=True)
+ self.assert_eq(result2, expected_result2)
+ self.assert_eq(result1.index.names, expected_result1.index.names)
+ self.assert_eq(result1.columns.names, expected_result1.columns.names)
+ self.assert_eq(result3, expected_result3, almost=True)
self.assertRaises(TypeError, lambda: psdf.explode(["A", "B"]))
self.assertRaises(ValueError, lambda: psdf.explode("A"))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]