This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new bda90691222f [SPARK-55472][PS] Raise `AttributeError` from methods
removed in pandas 3
bda90691222f is described below
commit bda90691222f9cdfbada43d15b4c4be4a2c6be43
Author: Takuya Ueshin <[email protected]>
AuthorDate: Wed Feb 11 08:46:39 2026 +0800
[SPARK-55472][PS] Raise `AttributeError` from methods removed in pandas 3
### What changes were proposed in this pull request?
Raises `AttributeError` from methods removed in pandas 3.
- `DataFrame`/`Series`: `backfill`, `pad`, `first`, `last`, `swapaxes`
- `GroupBy`: `fillna`
- `Index`: `holds_integer`
### Why are the changes needed?
There are some methods removed in pandas 3, which should raise
`AttributeError` in pandas API on Spark, too.
### Does this PR introduce _any_ user-facing change?
Yes, the methods removed in pandas 3 will raise `AttributeError` with
pandas 3.
### How was this patch tested?
Updated the related tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54249 from ueshin/issues/SPARK-55472/attribute_error.
Authored-by: Takuya Ueshin <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/pandas/frame.py | 63 ++++++-
python/pyspark/pandas/generic.py | 49 +++++-
python/pyspark/pandas/groupby.py | 25 ++-
python/pyspark/pandas/indexes/base.py | 12 +-
python/pyspark/pandas/series.py | 64 ++++++-
.../pandas/tests/computation/test_missing_data.py | 86 ++++++----
.../tests/diff_frames_ops/test_groupby_fillna.py | 83 ++++++----
.../pyspark/pandas/tests/frame/test_reindexing.py | 43 +++--
.../pandas/tests/groupby/test_missing_data.py | 183 +++++++++++----------
python/pyspark/pandas/tests/indexes/test_basic.py | 35 ++--
python/pyspark/pandas/tests/io/test_io.py | 4 +-
python/pyspark/pandas/tests/series/test_index.py | 19 ++-
.../pandas/tests/series/test_missing_data.py | 57 ++++---
python/pyspark/pandas/tests/series/test_series.py | 31 +++-
14 files changed, 507 insertions(+), 247 deletions(-)
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index bd8802e9a0b9..62156d068c5d 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -68,6 +68,8 @@ from pandas.core.dtypes.common import infer_dtype_from_object
# type: ignore[at
from pandas.core.accessor import CachedAccessor # type: ignore[attr-defined]
from pandas.core.dtypes.inference import is_sequence # type:
ignore[attr-defined]
+from pyspark._globals import _NoValue, _NoValueType
+from pyspark.loose_version import LooseVersion
from pyspark.errors import PySparkValueError
from pyspark import StorageLevel
from pyspark.sql import Column as PySparkColumn, DataFrame as
PySparkDataFrame, functions as F
@@ -6108,7 +6110,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
def fillna(
self,
value: Optional[Union[Any, Dict[Name, Any]]] = None,
- method: Optional[str] = None,
+ method: Union[Optional[str], _NoValueType] = _NoValue,
axis: Optional[Axis] = None,
inplace: bool = False,
limit: Optional[int] = None,
@@ -6178,7 +6180,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
We can also propagate non-null values forward or backward.
- >>> df.fillna(method='ffill')
+ >>> df.fillna(method='ffill') # doctest: +SKIP
A B C D
0 NaN 2.0 NaN 0
1 3.0 4.0 NaN 1
@@ -6196,6 +6198,28 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
2 0.0 1.0 2.0 5
3 0.0 3.0 1.0 4
"""
+ if LooseVersion(pd.__version__) < "3.0.0":
+ if method is _NoValue:
+ method = None
+ else:
+ if method is not _NoValue:
+ raise TypeError(
+ "The `method` parameter is not supported in pandas 3.0.0
and later. "
+ )
+ method = None
+
+ return self._fillna_with_method(
+ value=value, method=method, axis=axis, inplace=inplace,
limit=limit # type: ignore[arg-type]
+ )
+
+ def _fillna_with_method(
+ self,
+ value: Optional[Union[Any, Dict[Name, Any]]] = None,
+ method: Optional[str] = None,
+ axis: Optional[Axis] = None,
+ inplace: bool = False,
+ limit: Optional[int] = None,
+ ) -> Optional["DataFrame"]:
axis = validate_axis(axis)
if axis != 0:
raise NotImplementedError("fillna currently only works for axis=0
or axis='index'")
@@ -6603,7 +6627,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
Get the rows for the last 3 days:
- >>> psdf.last('3D')
+ >>> psdf.last('3D') # doctest: +SKIP
A
2018-04-13 3
2018-04-15 4
@@ -6612,6 +6636,15 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
3 observed days in the dataset, and therefore data for 2018-04-11 was
not returned.
"""
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return self._last(offset)
+ else:
+ raise AttributeError(
+ "The `last` method is not supported in pandas 3.0.0 and later.
"
+ "Please create a mask and filter using `.loc` instead"
+ )
+
+ def _last(self, offset: Union[str, DateOffset]) -> "DataFrame":
warnings.warn(
"last is deprecated and will be removed in a future version. "
"Please create a mask and filter using `.loc` instead",
@@ -6667,7 +6700,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
Get the rows for the last 3 days:
- >>> psdf.first('3D')
+ >>> psdf.first('3D') # doctest: +SKIP
A
2018-04-09 1
2018-04-11 2
@@ -6676,6 +6709,15 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
3 observed days in the dataset, and therefore data for 2018-04-13 was
not returned.
"""
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return self._first(offset)
+ else:
+ raise AttributeError(
+ "The `first` method is not supported in pandas 3.0.0 and
later. "
+ "Please create a mask and filter using `.loc` instead"
+ )
+
+ def _first(self, offset: Union[str, DateOffset]) -> "DataFrame":
warnings.warn(
"first is deprecated and will be removed in a future version. "
"Please create a mask and filter using `.loc` instead",
@@ -8105,17 +8147,26 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
x 1 2 3
y 4 5 6
z 7 8 9
- >>> psdf.swapaxes(i=1, j=0)
+ >>> psdf.swapaxes(i=1, j=0) # doctest: +SKIP
x y z
a 1 4 7
b 2 5 8
c 3 6 9
- >>> psdf.swapaxes(i=1, j=1)
+ >>> psdf.swapaxes(i=1, j=1) # doctest: +SKIP
a b c
x 1 2 3
y 4 5 6
z 7 8 9
"""
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return self._swapaxes(i, j, copy)
+ else:
+ raise AttributeError(
+ "The `swapaxes` method is not supported in pandas 3.0.0 and
later. "
+ "Please use the `transpose` method instead"
+ )
+
+ def _swapaxes(self, i: Axis, j: Axis, copy: bool = True) -> "DataFrame":
assert copy is True
i = validate_axis(i)
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index 6d01d486e64a..f73580444449 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -39,6 +39,8 @@ import numpy as np
import pandas as pd
from pandas.api.types import is_list_like
+from pyspark._globals import _NoValue, _NoValueType
+from pyspark.loose_version import LooseVersion
from pyspark.sql import Column, functions as F
from pyspark.sql.internal import InternalFunction as SF
from pyspark.sql.types import (
@@ -3310,6 +3312,17 @@ class Frame(object, metaclass=ABCMeta):
@abstractmethod
def fillna(
+ self: FrameLike,
+ value: Optional[Any] = None,
+ method: Union[Optional[str], _NoValueType] = _NoValue,
+ axis: Optional[Axis] = None,
+ inplace: bool_type = False,
+ limit: Optional[int] = None,
+ ) -> FrameLike:
+ pass
+
+ @abstractmethod
+ def _fillna_with_method(
self: FrameLike,
value: Optional[Any] = None,
method: Optional[str] = None,
@@ -3394,9 +3407,23 @@ class Frame(object, metaclass=ABCMeta):
3 1.0
dtype: float64
"""
- return self.fillna(method="bfill", axis=axis, inplace=inplace,
limit=limit)
+ return self._fillna_with_method(method="bfill", axis=axis,
inplace=inplace, limit=limit)
- backfill = bfill
+ def backfill(
+ self: FrameLike,
+ axis: Optional[Axis] = None,
+ inplace: bool_type = False,
+ limit: Optional[int] = None,
+ ) -> FrameLike:
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return self.bfill(axis=axis, inplace=inplace, limit=limit)
+ else:
+ raise AttributeError(
+ "The `backfill` method is not supported in pandas 3.0.0 and
later. Use `bfill` instead."
+ )
+
+ if LooseVersion(pd.__version__) < "3.0.0":
+ backfill.__doc__ = bfill.__doc__
# TODO: add 'downcast' when value parameter exists
def ffill(
@@ -3473,9 +3500,23 @@ class Frame(object, metaclass=ABCMeta):
3 3.0
dtype: float64
"""
- return self.fillna(method="ffill", axis=axis, inplace=inplace,
limit=limit)
+ return self._fillna_with_method(method="ffill", axis=axis,
inplace=inplace, limit=limit)
+
+ def pad(
+ self: FrameLike,
+ axis: Optional[Axis] = None,
+ inplace: bool_type = False,
+ limit: Optional[int] = None,
+ ) -> FrameLike:
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return self.ffill(axis=axis, inplace=inplace, limit=limit)
+ else:
+ raise AttributeError(
+ "The `pad` method is not supported in pandas 3.0.0 and later.
Use `ffill` instead."
+ )
- pad = ffill
+ if LooseVersion(pd.__version__) < "3.0.0":
+ pad.__doc__ = ffill.__doc__
# TODO: add 'axis', 'inplace', 'downcast'
def interpolate(
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 5ab4abd6c32f..a70a1ba59ba2 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -2597,20 +2597,37 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
We can also propagate non-null values forward or backward in group.
- >>> df.groupby(['A'])['B'].fillna(method='ffill').sort_index()
+ >>> df.groupby(['A'])['B'].fillna(method='ffill').sort_index() #
doctest: +SKIP
0 2.0
1 4.0
2 NaN
3 3.0
Name: B, dtype: float64
- >>> df.groupby(['A']).fillna(method='bfill').sort_index()
+ >>> df.groupby(['A']).fillna(method='bfill').sort_index() # doctest:
+SKIP
B C D
0 2.0 NaN 0
1 4.0 NaN 1
2 3.0 1.0 5
3 3.0 1.0 4
"""
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return self._fillna(value=value, method=method, axis=axis,
inplace=inplace, limit=limit)
+ else:
+ raise AttributeError(
+ "The `fillna` method is not supported in pandas 3.0.0 and
later. "
+ "Use obj.ffill() or obj.bfill() for forward or backward
filling instead. "
+ "If you want to fill with a single value, use DataFrame.fillna
instead"
+ )
+
+ def _fillna(
+ self,
+ value: Optional[Any] = None,
+ method: Optional[str] = None,
+ axis: Optional[Axis] = None,
+ inplace: bool = False,
+ limit: Optional[int] = None,
+ ) -> FrameLike:
should_resolve = method is not None
if should_resolve:
warnings.warn(
@@ -2673,7 +2690,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
2 3.0 1.0 5
3 3.0 1.0 4
"""
- return self.fillna(method="bfill", limit=limit)
+ return self._fillna(method="bfill", limit=limit)
def ffill(self, limit: Optional[int] = None) -> FrameLike:
"""
@@ -2722,7 +2739,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
2 NaN NaN 5
3 3.0 1.0 4
"""
- return self.fillna(method="ffill", limit=limit)
+ return self._fillna(method="ffill", limit=limit)
def _limit(self, n: int, asc: bool) -> FrameLike:
"""
diff --git a/python/pyspark/pandas/indexes/base.py
b/python/pyspark/pandas/indexes/base.py
index 83fdcb8e58d9..40b432556463 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -45,6 +45,7 @@ from pandas.io.formats.printing import pprint_thing # type:
ignore[import-not-f
from pandas.api.types import CategoricalDtype, is_hashable
from pandas._libs import lib
+from pyspark.loose_version import LooseVersion
from pyspark.sql.column import Column
from pyspark.sql import functions as F
from pyspark.sql.types import (
@@ -2370,16 +2371,21 @@ class Index(IndexOpsMixin):
Returns False for string type.
>>> psidx = ps.Index(["A", "B", "C", "D"])
- >>> psidx.holds_integer()
+ >>> psidx.holds_integer() # doctest: +SKIP
False
Returns False for float type.
>>> psidx = ps.Index([1.1, 2.2, 3.3, 4.4])
- >>> psidx.holds_integer()
+ >>> psidx.holds_integer() # doctest: +SKIP
False
"""
- return isinstance(self.spark.data_type, IntegralType)
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return isinstance(self.spark.data_type, IntegralType)
+ else:
+ raise AttributeError(
+ "The `holds_integer` method is not supported in pandas 3.0.0
and later. "
+ )
def intersection(self, other: Union[DataFrame, Series, "Index", List]) ->
"Index":
"""
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 0d845ea9bde3..72d49574423b 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -57,6 +57,7 @@ from pandas.api.types import (
)
from pandas.tseries.frequencies import DateOffset # type: ignore[attr-defined]
+from pyspark._globals import _NoValue, _NoValueType
from pyspark.loose_version import LooseVersion
from pyspark.sql import (
functions as F,
@@ -2098,7 +2099,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
def fillna(
self,
value: Optional[Any] = None,
- method: Optional[str] = None,
+ method: Union[Optional[str], _NoValueType] = _NoValue,
axis: Optional[Axis] = None,
inplace: bool = False,
limit: Optional[int] = None,
@@ -2167,7 +2168,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
We can also propagate non-null values forward or backward.
- >>> s.fillna(method='ffill')
+ >>> s.fillna(method='ffill') # doctest: +SKIP
0 NaN
1 2.0
2 3.0
@@ -2177,7 +2178,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
Name: x, dtype: float64
>>> s = ps.Series([np.nan, 'a', 'b', 'c', np.nan], name='x')
- >>> s.fillna(method='ffill')
+ >>> s.fillna(method='ffill') # doctest: +SKIP
0 None
1 a
2 b
@@ -2185,6 +2186,28 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
4 c
Name: x, dtype: object
"""
+ if LooseVersion(pd.__version__) < "3.0.0":
+ if method is _NoValue:
+ method = None
+ else:
+ if method is not _NoValue:
+ raise TypeError(
+ "The `method` parameter is not supported in pandas 3.0.0
and later. "
+ )
+ method = None
+
+ return self._fillna_with_method(
+ value=value, method=method, axis=axis, inplace=inplace,
limit=limit # type: ignore[arg-type]
+ )
+
+ def _fillna_with_method(
+ self,
+ value: Optional[Any] = None,
+ method: Optional[str] = None,
+ axis: Optional[Axis] = None,
+ inplace: bool = False,
+ limit: Optional[int] = None,
+ ) -> Optional["Series"]:
psser = self._fillna(value=value, method=method, axis=axis,
limit=limit)
if method is not None:
@@ -2783,7 +2806,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
Get the rows for the last 3 days:
- >>> psser.last('3D')
+ >>> psser.last('3D') # doctest: +SKIP
2018-04-13 3
2018-04-15 4
dtype: int64
@@ -2792,6 +2815,15 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
3 observed days in the dataset, and therefore data for 2018-04-11 was
not returned.
"""
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return self._last(offset)
+ else:
+ raise AttributeError(
+ "The `last` method is not supported in pandas 3.0.0 and later.
"
+ "Please create a mask and filter using `.loc` instead"
+ )
+
+ def _last(self, offset: Union[str, DateOffset]) -> "Series":
warnings.warn(
"last is deprecated and will be removed in a future version. "
"Please create a mask and filter using `.loc` instead",
@@ -2837,7 +2869,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
Get the rows for the first 3 days:
- >>> psser.first('3D')
+ >>> psser.first('3D') # doctest: +SKIP
2018-04-09 1
2018-04-11 2
dtype: int64
@@ -2846,6 +2878,15 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
3 observed days in the dataset, and therefore data for 2018-04-13 was
not returned.
"""
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return self._first(offset)
+ else:
+ raise AttributeError(
+ "The `first` method is not supported in pandas 3.0.0 and
later. "
+ "Please create a mask and filter using `.loc` instead"
+ )
+
+ def _first(self, offset: Union[str, DateOffset]) -> "Series":
warnings.warn(
"first is deprecated and will be removed in a future version. "
"Please create a mask and filter using `.loc` instead",
@@ -3217,12 +3258,21 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
z 3
dtype: int64
>>>
- >>> psser.swapaxes(0, 0)
+ >>> psser.swapaxes(0, 0) # doctest: +SKIP
x 1
y 2
z 3
dtype: int64
"""
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return self._swapaxes(i, j, copy)
+ else:
+ raise AttributeError(
+ "The `swapaxes` method is not supported in pandas 3.0.0 and
later. "
+ "Please use the `transpose` method instead"
+ )
+
+ def _swapaxes(self, i: Axis, j: Axis, copy: bool = True) -> "Series":
warnings.warn(
"'Series.swapaxes' is deprecated and will be removed in a future
version. "
"Please use 'Series.transpose' instead.",
@@ -5121,7 +5171,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
), "If 'regex' is True then 'to_replace' must be a string"
if to_replace is None:
- return self.fillna(method="ffill")
+ return self._fillna_with_method(method="ffill")
if not isinstance(to_replace, (str, list, tuple, dict, int, float)):
raise TypeError("'to_replace' should be one of str, list, tuple,
dict, int, float")
diff --git a/python/pyspark/pandas/tests/computation/test_missing_data.py
b/python/pyspark/pandas/tests/computation/test_missing_data.py
index f43912bd0a77..ae1f68cb9889 100644
--- a/python/pyspark/pandas/tests/computation/test_missing_data.py
+++ b/python/pyspark/pandas/tests/computation/test_missing_data.py
@@ -20,6 +20,7 @@ import numpy as np
import pandas as pd
from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
@@ -39,12 +40,16 @@ class FrameMissingDataMixin:
)
psdf = ps.from_pandas(pdf)
- self.assert_eq(pdf.backfill().sort_index(),
psdf.backfill().sort_index())
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(pdf.backfill().sort_index(),
psdf.backfill().sort_index())
- # Test `inplace=True`
- pdf.backfill(inplace=True)
- psdf.backfill(inplace=True)
- self.assert_eq(pdf.sort_index(), psdf.sort_index())
+ # Test `inplace=True`
+ pdf.backfill(inplace=True)
+ psdf.backfill(inplace=True)
+ self.assert_eq(pdf.sort_index(), psdf.sort_index())
+ else:
+ with self.assertRaises(AttributeError):
+ psdf.backfill()
def _test_dropna(self, pdf, axis):
psdf = ps.from_pandas(pdf)
@@ -188,24 +193,28 @@ class FrameMissingDataMixin:
self.assert_eq(
psdf.fillna({"x": -1, "y": -2, "z": -5}), pdf.fillna({"x": -1,
"y": -2, "z": -5})
)
- self.assert_eq(pdf.fillna(method="ffill"), psdf.fillna(method="ffill"))
- self.assert_eq(pdf.fillna(method="ffill", limit=2),
psdf.fillna(method="ffill", limit=2))
- self.assert_eq(
- pdf.fillna(method="bfill").sort_index(),
psdf.fillna(method="bfill").sort_index()
- )
- self.assert_eq(
- pdf.fillna(method="bfill", limit=2).sort_index(),
- psdf.fillna(method="bfill", limit=2).sort_index(),
- )
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(pdf.fillna(method="ffill"),
psdf.fillna(method="ffill"))
+ self.assert_eq(
+ pdf.fillna(method="ffill", limit=2),
psdf.fillna(method="ffill", limit=2)
+ )
+ self.assert_eq(
+ pdf.fillna(method="bfill").sort_index(),
psdf.fillna(method="bfill").sort_index()
+ )
+ self.assert_eq(
+ pdf.fillna(method="bfill", limit=2).sort_index(),
+ psdf.fillna(method="bfill", limit=2).sort_index(),
+ )
pdf = pdf.set_index(["x", "y"])
psdf = ps.from_pandas(pdf)
# check multi index
self.assert_eq(psdf.fillna(-1), pdf.fillna(-1))
- self.assert_eq(
- pdf.fillna(method="bfill").sort_index(),
psdf.fillna(method="bfill").sort_index()
- )
- self.assert_eq(pdf.fillna(method="ffill"), psdf.fillna(method="ffill"))
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(
+ pdf.fillna(method="bfill").sort_index(),
psdf.fillna(method="bfill").sort_index()
+ )
+ self.assert_eq(pdf.fillna(method="ffill"),
psdf.fillna(method="ffill"))
pser = pdf.z
psser = psdf.z
@@ -234,8 +243,14 @@ class FrameMissingDataMixin:
psdf.fillna(pd.DataFrame({"x": [-1], "y": [-1], "z": [-1]}))
with self.assertRaisesRegex(TypeError, "Unsupported.*int64"):
psdf.fillna({"x": np.int64(-6), "y": np.int64(-4), "z": -5})
- with self.assertRaisesRegex(ValueError, "Expecting 'pad', 'ffill',
'backfill' or 'bfill'."):
- psdf.fillna(method="xxx")
+ if LooseVersion(pd.__version__) < "3.0.0":
+ with self.assertRaisesRegex(
+ ValueError, "Expecting 'pad', 'ffill', 'backfill' or 'bfill'."
+ ):
+ psdf.fillna(method="xxx")
+ else:
+ with self.assertRaises(TypeError):
+ psdf.fillna(method="xxx")
with self.assertRaisesRegex(
ValueError, "Must specify a fillna 'value' or 'method' parameter."
):
@@ -257,12 +272,17 @@ class FrameMissingDataMixin:
psdf.fillna({("x", "a"): -1, ("x", "b"): -2, ("y", "c"): -5}),
pdf.fillna({("x", "a"): -1, ("x", "b"): -2, ("y", "c"): -5}),
)
- self.assert_eq(pdf.fillna(method="ffill"), psdf.fillna(method="ffill"))
- self.assert_eq(pdf.fillna(method="ffill", limit=2),
psdf.fillna(method="ffill", limit=2))
- self.assert_eq(
- pdf.fillna(method="bfill").sort_index(),
psdf.fillna(method="bfill").sort_index()
- )
- self.assert_eq(pdf.fillna(method="bfill", limit=2),
psdf.fillna(method="bfill", limit=2))
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(pdf.fillna(method="ffill"),
psdf.fillna(method="ffill"))
+ self.assert_eq(
+ pdf.fillna(method="ffill", limit=2),
psdf.fillna(method="ffill", limit=2)
+ )
+ self.assert_eq(
+ pdf.fillna(method="bfill").sort_index(),
psdf.fillna(method="bfill").sort_index()
+ )
+ self.assert_eq(
+ pdf.fillna(method="bfill", limit=2),
psdf.fillna(method="bfill", limit=2)
+ )
self.assert_eq(psdf.fillna({"x": -1}), pdf.fillna({"x": -1}))
self.assert_eq(
@@ -455,12 +475,16 @@ class FrameMissingDataMixin:
)
psdf = ps.from_pandas(pdf)
- self.assert_eq(pdf.pad(), psdf.pad())
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(pdf.pad(), psdf.pad())
- # Test `inplace=True`
- pdf.pad(inplace=True)
- psdf.pad(inplace=True)
- self.assert_eq(pdf, psdf)
+ # Test `inplace=True`
+ pdf.pad(inplace=True)
+ psdf.pad(inplace=True)
+ self.assert_eq(pdf, psdf)
+ else:
+ with self.assertRaises(AttributeError):
+ psdf.pad()
class FrameMissingDataTests(
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_fillna.py
b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_fillna.py
index 151f4d7dd0b2..8e2617218904 100644
--- a/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_fillna.py
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_fillna.py
@@ -18,6 +18,7 @@
import pandas as pd
from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.pandas.config import set_option, reset_option
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
@@ -47,41 +48,53 @@ class GroupByFillNAMixin:
psdf = ps.from_pandas(pdf)
kkey = ps.from_pandas(pkey)
- self.assert_eq(
- psdf.groupby(kkey).fillna(0).sort_index(),
pdf.groupby(pkey).fillna(0).sort_index()
- )
- self.assert_eq(
- psdf.groupby(kkey)["C"].fillna(0).sort_index(),
- pdf.groupby(pkey)["C"].fillna(0).sort_index(),
- )
- self.assert_eq(
- psdf.groupby(kkey)[["C"]].fillna(0).sort_index(),
- pdf.groupby(pkey)[["C"]].fillna(0).sort_index(),
- )
- self.assert_eq(
- psdf.groupby(kkey).fillna(method="bfill").sort_index(),
- pdf.groupby(pkey).fillna(method="bfill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby(kkey)["C"].fillna(method="bfill").sort_index(),
- pdf.groupby(pkey)["C"].fillna(method="bfill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby(kkey)[["C"]].fillna(method="bfill").sort_index(),
- pdf.groupby(pkey)[["C"]].fillna(method="bfill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby(kkey).fillna(method="ffill").sort_index(),
- pdf.groupby(pkey).fillna(method="ffill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby(kkey)["C"].fillna(method="ffill").sort_index(),
- pdf.groupby(pkey)["C"].fillna(method="ffill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby(kkey)[["C"]].fillna(method="ffill").sort_index(),
- pdf.groupby(pkey)[["C"]].fillna(method="ffill").sort_index(),
- )
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(
+ psdf.groupby(kkey).fillna(0).sort_index(),
pdf.groupby(pkey).fillna(0).sort_index()
+ )
+ self.assert_eq(
+ psdf.groupby(kkey)["C"].fillna(0).sort_index(),
+ pdf.groupby(pkey)["C"].fillna(0).sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(kkey)[["C"]].fillna(0).sort_index(),
+ pdf.groupby(pkey)[["C"]].fillna(0).sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(kkey).fillna(method="bfill").sort_index(),
+ pdf.groupby(pkey).fillna(method="bfill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(kkey)["C"].fillna(method="bfill").sort_index(),
+ pdf.groupby(pkey)["C"].fillna(method="bfill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(kkey)[["C"]].fillna(method="bfill").sort_index(),
+ pdf.groupby(pkey)[["C"]].fillna(method="bfill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(kkey).fillna(method="ffill").sort_index(),
+ pdf.groupby(pkey).fillna(method="ffill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(kkey)["C"].fillna(method="ffill").sort_index(),
+ pdf.groupby(pkey)["C"].fillna(method="ffill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(kkey)[["C"]].fillna(method="ffill").sort_index(),
+ pdf.groupby(pkey)[["C"]].fillna(method="ffill").sort_index(),
+ )
+ else:
+ with self.assertRaises(AttributeError):
+ psdf.groupby(kkey).fillna(0)
+ with self.assertRaises(AttributeError):
+ psdf.groupby(kkey)["C"].fillna(0)
+ with self.assertRaises(AttributeError):
+ psdf.groupby(kkey)[["C"]].fillna(0)
+ with self.assertRaises(AttributeError):
+ psdf.groupby(kkey).fillna(method="bfill")
+ with self.assertRaises(AttributeError):
+ psdf.groupby(kkey)["C"].fillna(method="bfill")
class GroupByFillNATests(
diff --git a/python/pyspark/pandas/tests/frame/test_reindexing.py
b/python/pyspark/pandas/tests/frame/test_reindexing.py
index 17da37e51ad1..cc1a7b6ed1c1 100644
--- a/python/pyspark/pandas/tests/frame/test_reindexing.py
+++ b/python/pyspark/pandas/tests/frame/test_reindexing.py
@@ -21,6 +21,7 @@ from pandas.tseries.offsets import DateOffset
from pyspark import pandas as ps
from pyspark.errors import PySparkValueError
+from pyspark.loose_version import LooseVersion
from pyspark.pandas.config import option_context
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
@@ -664,19 +665,27 @@ class FrameReindexingMixin:
index = pd.date_range("2018-04-09", periods=4, freq="2D")
pdf = pd.DataFrame([1, 2, 3, 4], index=index)
psdf = ps.from_pandas(pdf)
- self.assert_eq(pdf.last("1D"), psdf.last("1D"))
- self.assert_eq(pdf.last(DateOffset(days=1)),
psdf.last(DateOffset(days=1)))
- with self.assertRaisesRegex(TypeError, "'last' only supports a
DatetimeIndex"):
- ps.DataFrame([1, 2, 3, 4]).last("1D")
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(pdf.last("1D"), psdf.last("1D"))
+ self.assert_eq(pdf.last(DateOffset(days=1)),
psdf.last(DateOffset(days=1)))
+ with self.assertRaisesRegex(TypeError, "'last' only supports a
DatetimeIndex"):
+ ps.DataFrame([1, 2, 3, 4]).last("1D")
+ else:
+ with self.assertRaises(AttributeError):
+ psdf.last("1D")
def test_first(self):
index = pd.date_range("2018-04-09", periods=4, freq="2D")
pdf = pd.DataFrame([1, 2, 3, 4], index=index)
psdf = ps.from_pandas(pdf)
- self.assert_eq(pdf.first("1D"), psdf.first("1D"))
- self.assert_eq(pdf.first(DateOffset(days=1)),
psdf.first(DateOffset(days=1)))
- with self.assertRaisesRegex(TypeError, "'first' only supports a
DatetimeIndex"):
- ps.DataFrame([1, 2, 3, 4]).first("1D")
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(pdf.first("1D"), psdf.first("1D"))
+ self.assert_eq(pdf.first(DateOffset(days=1)),
psdf.first(DateOffset(days=1)))
+ with self.assertRaisesRegex(TypeError, "'first' only supports a
DatetimeIndex"):
+ ps.DataFrame([1, 2, 3, 4]).first("1D")
+ else:
+ with self.assertRaises(AttributeError):
+ psdf.first("1D")
def test_swaplevel(self):
# MultiIndex with two levels
@@ -754,14 +763,18 @@ class FrameReindexingMixin:
)
psdf = ps.from_pandas(pdf)
- self.assert_eq(psdf.swapaxes(0, 1), pdf.swapaxes(0, 1))
- self.assert_eq(psdf.swapaxes(1, 0), pdf.swapaxes(1, 0))
- self.assert_eq(psdf.swapaxes("index", "columns"),
pdf.swapaxes("index", "columns"))
- self.assert_eq(psdf.swapaxes("columns", "index"),
pdf.swapaxes("columns", "index"))
- self.assert_eq((psdf + 1).swapaxes(0, 1), (pdf + 1).swapaxes(0, 1))
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(psdf.swapaxes(0, 1), pdf.swapaxes(0, 1))
+ self.assert_eq(psdf.swapaxes(1, 0), pdf.swapaxes(1, 0))
+ self.assert_eq(psdf.swapaxes("index", "columns"),
pdf.swapaxes("index", "columns"))
+ self.assert_eq(psdf.swapaxes("columns", "index"),
pdf.swapaxes("columns", "index"))
+ self.assert_eq((psdf + 1).swapaxes(0, 1), (pdf + 1).swapaxes(0, 1))
- self.assertRaises(AssertionError, lambda: psdf.swapaxes(0, 1,
copy=False))
- self.assertRaises(ValueError, lambda: psdf.swapaxes(0, -1))
+ self.assertRaises(AssertionError, lambda: psdf.swapaxes(0, 1,
copy=False))
+ self.assertRaises(ValueError, lambda: psdf.swapaxes(0, -1))
+ else:
+ with self.assertRaises(AttributeError):
+ psdf.swapaxes(0, 1)
def test_isin(self):
pdf = pd.DataFrame(
diff --git a/python/pyspark/pandas/tests/groupby/test_missing_data.py
b/python/pyspark/pandas/tests/groupby/test_missing_data.py
index ff481143faf7..c7f653638083 100644
--- a/python/pyspark/pandas/tests/groupby/test_missing_data.py
+++ b/python/pyspark/pandas/tests/groupby/test_missing_data.py
@@ -20,6 +20,7 @@ import numpy as np
import pandas as pd
from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
@@ -36,95 +37,111 @@ class GroupbyMissingDataMixin:
)
psdf = ps.from_pandas(pdf)
- self.assert_eq(
- psdf.groupby("A").fillna(0).sort_index(), pdf.groupby("A").fillna(0).sort_index()
- )
- self.assert_eq(
- psdf.groupby("A")["C"].fillna(0).sort_index(),
- pdf.groupby("A")["C"].fillna(0).sort_index(),
- )
- self.assert_eq(
- psdf.groupby("A")[["C"]].fillna(0).sort_index(),
- pdf.groupby("A")[["C"]].fillna(0).sort_index(),
- )
- self.assert_eq(
- psdf.groupby("A").fillna(method="bfill").sort_index(),
- pdf.groupby("A").fillna(method="bfill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby("A")["C"].fillna(method="bfill").sort_index(),
- pdf.groupby("A")["C"].fillna(method="bfill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby("A")[["C"]].fillna(method="bfill").sort_index(),
- pdf.groupby("A")[["C"]].fillna(method="bfill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby("A").fillna(method="ffill").sort_index(),
- pdf.groupby("A").fillna(method="ffill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby("A")["C"].fillna(method="ffill").sort_index(),
- pdf.groupby("A")["C"].fillna(method="ffill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby("A")[["C"]].fillna(method="ffill").sort_index(),
- pdf.groupby("A")[["C"]].fillna(method="ffill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby(psdf.A // 5).fillna(method="bfill").sort_index(),
- pdf.groupby(pdf.A // 5).fillna(method="bfill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby(psdf.A // 5)["C"].fillna(method="bfill").sort_index(),
- pdf.groupby(pdf.A // 5)["C"].fillna(method="bfill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby(psdf.A // 5)[["C"]].fillna(method="bfill").sort_index(),
- pdf.groupby(pdf.A // 5)[["C"]].fillna(method="bfill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby(psdf.A // 5).fillna(method="ffill").sort_index(),
- pdf.groupby(pdf.A // 5).fillna(method="ffill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby(psdf.A // 5)["C"].fillna(method="ffill").sort_index(),
- pdf.groupby(pdf.A // 5)["C"].fillna(method="ffill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby(psdf.A // 5)[["C"]].fillna(method="ffill").sort_index(),
- pdf.groupby(pdf.A // 5)[["C"]].fillna(method="ffill").sort_index(),
- )
- self.assert_eq(
- psdf.C.rename().groupby(psdf.A).fillna(0).sort_index(),
- pdf.C.rename().groupby(pdf.A).fillna(0).sort_index(),
- )
- self.assert_eq(
- psdf.C.groupby(psdf.A.rename()).fillna(0).sort_index(),
- pdf.C.groupby(pdf.A.rename()).fillna(0).sort_index(),
- )
- self.assert_eq(
- psdf.C.rename().groupby(psdf.A.rename()).fillna(0).sort_index(),
- pdf.C.rename().groupby(pdf.A.rename()).fillna(0).sort_index(),
- )
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(
+ psdf.groupby("A").fillna(0).sort_index(), pdf.groupby("A").fillna(0).sort_index()
+ )
+ self.assert_eq(
+ psdf.groupby("A")["C"].fillna(0).sort_index(),
+ pdf.groupby("A")["C"].fillna(0).sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby("A")[["C"]].fillna(0).sort_index(),
+ pdf.groupby("A")[["C"]].fillna(0).sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby("A").fillna(method="bfill").sort_index(),
+ pdf.groupby("A").fillna(method="bfill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby("A")["C"].fillna(method="bfill").sort_index(),
+ pdf.groupby("A")["C"].fillna(method="bfill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby("A")[["C"]].fillna(method="bfill").sort_index(),
+ pdf.groupby("A")[["C"]].fillna(method="bfill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby("A").fillna(method="ffill").sort_index(),
+ pdf.groupby("A").fillna(method="ffill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby("A")["C"].fillna(method="ffill").sort_index(),
+ pdf.groupby("A")["C"].fillna(method="ffill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby("A")[["C"]].fillna(method="ffill").sort_index(),
+ pdf.groupby("A")[["C"]].fillna(method="ffill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(psdf.A // 5).fillna(method="bfill").sort_index(),
+ pdf.groupby(pdf.A // 5).fillna(method="bfill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(psdf.A // 5)["C"].fillna(method="bfill").sort_index(),
+ pdf.groupby(pdf.A // 5)["C"].fillna(method="bfill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(psdf.A // 5)[["C"]].fillna(method="bfill").sort_index(),
+ pdf.groupby(pdf.A // 5)[["C"]].fillna(method="bfill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(psdf.A // 5).fillna(method="ffill").sort_index(),
+ pdf.groupby(pdf.A // 5).fillna(method="ffill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(psdf.A // 5)["C"].fillna(method="ffill").sort_index(),
+ pdf.groupby(pdf.A // 5)["C"].fillna(method="ffill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(psdf.A // 5)[["C"]].fillna(method="ffill").sort_index(),
+ pdf.groupby(pdf.A // 5)[["C"]].fillna(method="ffill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.C.rename().groupby(psdf.A).fillna(0).sort_index(),
+ pdf.C.rename().groupby(pdf.A).fillna(0).sort_index(),
+ )
+ self.assert_eq(
+ psdf.C.groupby(psdf.A.rename()).fillna(0).sort_index(),
+ pdf.C.groupby(pdf.A.rename()).fillna(0).sort_index(),
+ )
+ self.assert_eq(
+ psdf.C.rename().groupby(psdf.A.rename()).fillna(0).sort_index(),
+ pdf.C.rename().groupby(pdf.A.rename()).fillna(0).sort_index(),
+ )
+ else:
+ with self.assertRaises(AttributeError):
+ psdf.groupby("A").fillna(0)
+ with self.assertRaises(AttributeError):
+ psdf.groupby("A")["C"].fillna(0)
+ with self.assertRaises(AttributeError):
+ psdf.groupby("A")[["C"]].fillna(0)
+ with self.assertRaises(AttributeError):
+ psdf.groupby("A").fillna(method="bfill")
# multi-index columns
columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C"), ("Z", "D")])
pdf.columns = columns
psdf.columns = columns
- self.assert_eq(
- psdf.groupby(("X", "A")).fillna(0).sort_index(),
- pdf.groupby(("X", "A")).fillna(0).sort_index(),
- )
- self.assert_eq(
- psdf.groupby(("X", "A")).fillna(method="bfill").sort_index(),
- pdf.groupby(("X", "A")).fillna(method="bfill").sort_index(),
- )
- self.assert_eq(
- psdf.groupby(("X", "A")).fillna(method="ffill").sort_index(),
- pdf.groupby(("X", "A")).fillna(method="ffill").sort_index(),
- )
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(
+ psdf.groupby(("X", "A")).fillna(0).sort_index(),
+ pdf.groupby(("X", "A")).fillna(0).sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(("X", "A")).fillna(method="bfill").sort_index(),
+ pdf.groupby(("X", "A")).fillna(method="bfill").sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby(("X", "A")).fillna(method="ffill").sort_index(),
+ pdf.groupby(("X", "A")).fillna(method="ffill").sort_index(),
+ )
+ else:
+ with self.assertRaises(AttributeError):
+ psdf.groupby(("X", "A")).fillna(0)
+ with self.assertRaises(AttributeError):
+ psdf.groupby(("X", "A")).fillna(method="bfill")
def test_ffill(self):
idx = np.random.rand(4 * 3)
diff --git a/python/pyspark/pandas/tests/indexes/test_basic.py b/python/pyspark/pandas/tests/indexes/test_basic.py
index 78b0ad838b15..bc9a32e6bbc1 100644
--- a/python/pyspark/pandas/tests/indexes/test_basic.py
+++ b/python/pyspark/pandas/tests/indexes/test_basic.py
@@ -21,6 +21,7 @@ import numpy as np
import pandas as pd
import pyspark.pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.pandas.exceptions import PandasNotImplementedError
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils, SPARK_CONF_ARROW_ENABLED
@@ -73,26 +74,20 @@ class IndexBasicMixin:
self.assert_eq(psdf.index.copy(), pdf.index.copy())
def test_holds_integer(self):
- pidx = pd.Index([1, 2, 3, 4])
- psidx = ps.from_pandas(pidx)
- self.assert_eq(pidx.holds_integer(), psidx.holds_integer())
-
- pidx = pd.Index([1.1, 2.2, 3.3, 4.4])
- psidx = ps.from_pandas(pidx)
- self.assert_eq(pidx.holds_integer(), psidx.holds_integer())
-
- pidx = pd.Index(["A", "B", "C", "D"])
- psidx = ps.from_pandas(pidx)
- self.assert_eq(pidx.holds_integer(), psidx.holds_integer())
-
- # MultiIndex
- pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "a")])
- psmidx = ps.from_pandas(pmidx)
- self.assert_eq(pmidx.holds_integer(), psmidx.holds_integer())
-
- pmidx = pd.MultiIndex.from_tuples([(10, 1), (10, 2), (20, 1)])
- psmidx = ps.from_pandas(pmidx)
- self.assert_eq(pmidx.holds_integer(), psmidx.holds_integer())
+ def check_holds_integer(pidx):
+ psidx = ps.from_pandas(pidx)
+
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(pidx.holds_integer(), psidx.holds_integer())
+ else:
+ with self.assertRaises(AttributeError):
+ psidx.holds_integer()
+
+ check_holds_integer(pd.Index([1, 2, 3, 4]))
+ check_holds_integer(pd.Index([1.1, 2.2, 3.3, 4.4]))
+ check_holds_integer(pd.Index(["A", "B", "C", "D"]))
+ check_holds_integer(pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "a")]))
+ check_holds_integer(pd.MultiIndex.from_tuples([(10, 1), (10, 2), (20, 1)]))
def test_item(self):
pidx = pd.Index([10])
diff --git a/python/pyspark/pandas/tests/io/test_io.py b/python/pyspark/pandas/tests/io/test_io.py
index 4a0ea8198d81..2a05e11c0896 100644
--- a/python/pyspark/pandas/tests/io/test_io.py
+++ b/python/pyspark/pandas/tests/io/test_io.py
@@ -109,8 +109,8 @@ class FrameIOMixin:
def check_style():
# If the value is negative, the text color will be displayed as red.
- pdf_style = pdf.style.applymap(style_negative, props="color:red;")
- psdf_style = psdf.style.applymap(style_negative, props="color:red;")
+ pdf_style = pdf.style.map(style_negative, props="color:red;")
+ psdf_style = psdf.style.map(style_negative, props="color:red;")
# Test whether the same shape as pandas table is created including the color.
self.assert_eq(pdf_style.to_latex(), psdf_style.to_latex())
diff --git a/python/pyspark/pandas/tests/series/test_index.py b/python/pyspark/pandas/tests/series/test_index.py
index 1a8bccd61294..9bdc7ae333ec 100644
--- a/python/pyspark/pandas/tests/series/test_index.py
+++ b/python/pyspark/pandas/tests/series/test_index.py
@@ -19,6 +19,7 @@ import numpy as np
import pandas as pd
from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
@@ -296,13 +297,17 @@ class SeriesIndexMixin:
pser = pd.Series([1, 2, 3], index=["x", "y", "z"], name="ser")
psser = ps.from_pandas(pser)
- self.assert_eq(psser.swapaxes(0, 0), pser.swapaxes(0, 0))
- self.assert_eq(psser.swapaxes("index", "index"), pser.swapaxes("index", "index"))
- self.assert_eq((psser + 1).swapaxes(0, 0), (pser + 1).swapaxes(0, 0))
-
- self.assertRaises(AssertionError, lambda: psser.swapaxes(0, 1, copy=False))
- self.assertRaises(ValueError, lambda: psser.swapaxes(0, 1))
- self.assertRaises(ValueError, lambda: psser.swapaxes("index", "columns"))
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(psser.swapaxes(0, 0), pser.swapaxes(0, 0))
+ self.assert_eq(psser.swapaxes("index", "index"), pser.swapaxes("index", "index"))
+ self.assert_eq((psser + 1).swapaxes(0, 0), (pser + 1).swapaxes(0, 0))
+
+ self.assertRaises(AssertionError, lambda: psser.swapaxes(0, 1, copy=False))
+ self.assertRaises(ValueError, lambda: psser.swapaxes(0, 1))
+ self.assertRaises(ValueError, lambda: psser.swapaxes("index", "columns"))
+ else:
+ with self.assertRaises(AttributeError):
+ psser.swapaxes(0, 0)
def test_droplevel(self):
pser = pd.Series(
diff --git a/python/pyspark/pandas/tests/series/test_missing_data.py b/python/pyspark/pandas/tests/series/test_missing_data.py
index 31a209fa61ca..2f90fdd63914 100644
--- a/python/pyspark/pandas/tests/series/test_missing_data.py
+++ b/python/pyspark/pandas/tests/series/test_missing_data.py
@@ -19,6 +19,7 @@ import numpy as np
import pandas as pd
from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
@@ -58,15 +59,19 @@ class SeriesMissingDataMixin:
pser.loc[3] = np.nan
psser.loc[3] = np.nan
- self.assert_eq(psser.fillna(0), pser.fillna(0))
- self.assert_eq(psser.fillna(method="ffill"), pser.fillna(method="ffill"))
- self.assert_eq(
- psser.fillna(method="bfill").sort_index(), pser.fillna(method="bfill").sort_index()
- )
- self.assert_eq(
- psser.fillna(method="backfill").sort_index(),
- pser.fillna(method="backfill").sort_index(),
- )
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(psser.fillna(0), pser.fillna(0))
+ self.assert_eq(psser.fillna(method="ffill"), pser.fillna(method="ffill"))
+ self.assert_eq(
+ psser.fillna(method="bfill").sort_index(), pser.fillna(method="bfill").sort_index()
+ )
+ self.assert_eq(
+ psser.fillna(method="backfill").sort_index(),
+ pser.fillna(method="backfill").sort_index(),
+ )
+ else:
+ with self.assertRaises(TypeError):
+ psser.fillna(method="ffill")
# inplace fillna on non-nullable column
pdf = pd.DataFrame({"a": [1, 2, None], "b": [1, 2, 3]})
@@ -206,26 +211,34 @@ class SeriesMissingDataMixin:
psdf = ps.from_pandas(pdf)
pser, psser = pdf.x, psdf.x
- self.assert_eq(pser.pad(), psser.pad())
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(pser.pad(), psser.pad())
- # Test `inplace=True`
- pser.pad(inplace=True)
- psser.pad(inplace=True)
- self.assert_eq(pser, psser)
- self.assert_eq(pdf, psdf)
+ # Test `inplace=True`
+ pser.pad(inplace=True)
+ psser.pad(inplace=True)
+ self.assert_eq(pser, psser)
+ self.assert_eq(pdf, psdf)
+ else:
+ with self.assertRaises(AttributeError):
+ psser.pad()
def test_backfill(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6]})
psdf = ps.from_pandas(pdf)
pser, psser = pdf.x, psdf.x
- self.assert_eq(pser.backfill().sort_index(), psser.backfill().sort_index())
-
- # Test `inplace=True`
- pser.backfill(inplace=True)
- psser.backfill(inplace=True)
- self.assert_eq(pser.sort_index(), psser.sort_index())
- self.assert_eq(pdf.sort_index(), psdf.sort_index())
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(pser.backfill().sort_index(), psser.backfill().sort_index())
+
+ # Test `inplace=True`
+ pser.backfill(inplace=True)
+ psser.backfill(inplace=True)
+ self.assert_eq(pser.sort_index(), psser.sort_index())
+ self.assert_eq(pdf.sort_index(), psdf.sort_index())
+ else:
+ with self.assertRaises(AttributeError):
+ psser.backfill()
class SeriesMissingDataTests(
diff --git a/python/pyspark/pandas/tests/series/test_series.py b/python/pyspark/pandas/tests/series/test_series.py
index 85bafe80adf0..d725123304fd 100644
--- a/python/pyspark/pandas/tests/series/test_series.py
+++ b/python/pyspark/pandas/tests/series/test_series.py
@@ -25,6 +25,7 @@ import pandas as pd
from pyspark.ml.linalg import SparseVector
from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.testing.pandasutils import (
PandasOnSparkTestCase,
SPARK_CONF_ARROW_ENABLED,
@@ -148,22 +149,36 @@ class SeriesTestsMixin:
self.assert_eq(psser.head(-10), pser.head(-10))
def test_last(self):
- with self.assertRaises(TypeError):
- self.psser.last("1D")
-
index = pd.date_range("2018-04-09", periods=4, freq="2D")
pser = pd.Series([1, 2, 3, 4], index=index)
psser = ps.from_pandas(pser)
- self.assert_eq(psser.last("1D"), pser.last("1D"))
- def test_first(self):
- with self.assertRaises(TypeError):
- self.psser.first("1D")
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(psser.last("1D"), pser.last("1D"))
+
+ with self.assertRaises(TypeError):
+ self.psser.last("1D")
+ else:
+ with self.assertRaises(AttributeError):
+ psser.last("1D")
+ with self.assertRaises(AttributeError):
+ self.psser.last("1D")
+ def test_first(self):
index = pd.date_range("2018-04-09", periods=4, freq="2D")
pser = pd.Series([1, 2, 3, 4], index=index)
psser = ps.from_pandas(pser)
- self.assert_eq(psser.first("1D"), pser.first("1D"))
+
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(psser.first("1D"), pser.first("1D"))
+
+ with self.assertRaises(TypeError):
+ self.psser.first("1D")
+ else:
+ with self.assertRaises(AttributeError):
+ psser.first("1D")
+ with self.assertRaises(AttributeError):
+ self.psser.first("1D")
def test_rename(self):
pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]