This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new e44ff986e20 [SPARK-38989][PYTHON] Implement `ignore_index` of `DataFrame/Series.sample`
e44ff986e20 is described below
commit e44ff986e20b4d7817901b2de5c01a11d0b9fd7d
Author: Xinrong Meng <[email protected]>
AuthorDate: Tue Apr 26 10:55:44 2022 +0900
[SPARK-38989][PYTHON] Implement `ignore_index` of `DataFrame/Series.sample`
### What changes were proposed in this pull request?
Implement `ignore_index` of `DataFrame/Series.sample`
### Why are the changes needed?
To increase pandas API coverage.
### Does this PR introduce _any_ user-facing change?
Yes. The `ignore_index` parameter of `DataFrame.sample` and `Series.sample` is now supported, as shown below.
```py
>>> df = ps.DataFrame({'num_legs': [2, 4, 8, 0],
... 'num_wings': [2, 0, 0, 0],
... 'num_specimen_seen': [10, 2, 1, 8]},
... index=['falcon', 'dog', 'spider', 'fish'],
... columns=['num_legs', 'num_wings', 'num_specimen_seen'])
>>> df
num_legs num_wings num_specimen_seen
falcon 2 2 10
dog 4 0 2
spider 8 0 1
fish 0 0 8
>>> df.sample(frac=0.5, random_state=1, ignore_index=True)
num_legs num_wings num_specimen_seen
0 4 0 2
1 8 0 1
2 0 0 8
```
### How was this patch tested?
Unit tests.
Closes #36306 from xinrong-databricks/sample.ignore_index.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../pandas_on_spark/supported_pandas_api.rst | 3 +--
python/pyspark/pandas/frame.py | 18 +++++++++++++++++-
python/pyspark/pandas/series.py | 9 ++++++++-
python/pyspark/pandas/tests/test_dataframe.py | 16 ++++++++++++++--
4 files changed, 40 insertions(+), 6 deletions(-)
diff --git a/python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst b/python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst
index 937f7a3f179..e3db90b23df 100644
--- a/python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst
+++ b/python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst
@@ -382,8 +382,7 @@ Supported DataFrame APIs
+--------------------------------------------+-------------+--------------------------------------+
| :func:`rtruediv` | P | ``axis``,
``level``, ``fill_value`` |
+--------------------------------------------+-------------+--------------------------------------+
-| :func:`sample` | P | ``weights``,
``axis``, |
-| | | ``ignore_index``
|
+| :func:`sample` | P | ``weights``,
``axis`` |
+--------------------------------------------+-------------+--------------------------------------+
| :func:`select_dtypes` | Y |
|
+--------------------------------------------+-------------+--------------------------------------+
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index c09fe029bd6..16f8e786b0f 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -8620,6 +8620,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
frac: Optional[float] = None,
replace: bool = False,
random_state: Optional[int] = None,
+ ignore_index: bool = False,
) -> "DataFrame":
"""
Return a random sample of items from an axis of object.
@@ -8642,6 +8643,10 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
Sample with or without replacement.
random_state : int, optional
Seed for the random number generator (if int).
+ ignore_index : bool, default False
+ If True, the resulting index will be labeled 0, 1, …, n - 1.
+
+ .. versionadded:: 3.4.0
Returns
-------
@@ -8671,6 +8676,14 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
falcon 2 2 10
fish 0 0 8
+ A random 50% sample of the ``DataFrame``, while ignoring the index.
+
+ >>> df.sample(frac=0.5, random_state=1, ignore_index=True)  # doctest: +SKIP
+ num_legs num_wings num_specimen_seen
+ 0 4 0 2
+ 1 8 0 1
+ 2 0 0 8
+
Extract 25% random elements from the ``Series`` ``df['num_legs']``,
with replacement,
so the same items could appear more than once.
@@ -8701,7 +8714,10 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
sdf = self._internal.resolved_copy.spark_frame.sample(
withReplacement=replace, fraction=frac, seed=random_state
)
- return DataFrame(self._internal.with_new_sdf(sdf))
+ if ignore_index:
+ return DataFrame(sdf.drop(*self._internal.index_spark_column_names))
+ else:
+ return DataFrame(self._internal.with_new_sdf(sdf))
def astype(self, dtype: Union[str, Dtype, Dict[Name, Union[str, Dtype]]]) -> "DataFrame":
"""
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index ef0208b3bbd..f15ba4854f3 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -3524,9 +3524,16 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
frac: Optional[float] = None,
replace: bool = False,
random_state: Optional[int] = None,
+ ignore_index: bool = False,
) -> "Series":
return first_series(
- self.to_frame().sample(n=n, frac=frac, replace=replace, random_state=random_state)
+ self.to_frame().sample(
+ n=n,
+ frac=frac,
+ replace=replace,
+ random_state=random_state,
+ ignore_index=ignore_index,
+ )
).rename(self.name)
sample.__doc__ = DataFrame.sample.__doc__
diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index b99a9a2e807..5fa6919c129 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -2619,8 +2619,7 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: "literal" * psdf["a"])
def test_sample(self):
- pdf = pd.DataFrame({"A": [0, 2, 4]})
- psdf = ps.from_pandas(pdf)
+ psdf = ps.DataFrame({"A": [0, 2, 4]}, index=["x", "y", "z"])
# Make sure the tests run, but we can't check the result because they are non-deterministic.
psdf.sample(frac=0.1)
@@ -2630,6 +2629,19 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
psdf["A"].sample(frac=0.2, replace=True)
psdf["A"].sample(frac=0.2, random_state=5)
+ self.assert_eq(psdf.sample(frac=0.1, ignore_index=True).index.dtype, np.int64)
+ self.assert_eq(psdf.sample(frac=0.2, replace=True, ignore_index=True).index.dtype, np.int64)
+ self.assert_eq(
+ psdf.sample(frac=0.2, random_state=5, ignore_index=True).index.dtype, np.int64
+ )
+ self.assert_eq(psdf["A"].sample(frac=0.2, ignore_index=True).index.dtype, np.int64)
+ self.assert_eq(
+ psdf["A"].sample(frac=0.2, replace=True, ignore_index=True).index.dtype, np.int64
+ )
+ self.assert_eq(
+ psdf["A"].sample(frac=0.2, random_state=5, ignore_index=True).index.dtype, np.int64
+ )
+
with self.assertRaises(ValueError):
psdf.sample()
with self.assertRaises(NotImplementedError):
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]