This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c4057bd09b GH-15070: [Python][CI] Compatibility with pandas 2.0
(#34878)
c4057bd09b is described below
commit c4057bd09b369c7c86ffd9c9f4fbc2a9496c0e6a
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Tue Apr 4 16:43:26 2023 +0200
GH-15070: [Python][CI] Compatibility with pandas 2.0 (#34878)
### What changes are included in this PR?
- The issue with numpy 1.25 in the assert equal helper was fixed in pandas
1.5.3 -> removing the skip (in theory one can still run into this error when
using an older pandas version with the latest numpy, but that combination is
not something you should use)
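- Casting tz-aware strings to datetime64[ns] was not fixed in pandas
(https://github.com/pandas-dev/pandas/issues/50140) -> updating our
implementation to work around it (parsing the level with
`pd.to_datetime(level, utc=True)` and then converting to the target timezone)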
- Casting to numpy string dtype
(https://github.com/pandas-dev/pandas/issues/50127) is not yet fixed ->
updating the skip to apply to pandas 2.0.x (>= 2.0.0 and < 2.1.0)
### Are there any user-facing changes?
No
* Closes: #15070
Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
python/pyarrow/pandas_compat.py | 3 +--
python/pyarrow/tests/test_pandas.py | 34 +++++++---------------------------
2 files changed, 8 insertions(+), 29 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 00e8613717..6b9514ea6b 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -1148,8 +1148,7 @@ def _reconstruct_columns_from_metadata(columns,
column_indexes):
if pandas_dtype == "datetimetz":
tz = pa.lib.string_to_tzinfo(
column_indexes[0]['metadata']['timezone'])
- dt = level.astype(numpy_dtype)
- level = dt.tz_localize('utc').tz_convert(tz)
+ level = pd.to_datetime(level, utc=True).tz_convert(tz)
elif level.dtype != dtype:
level = level.astype(dtype)
# ARROW-9096: if original DataFrame was upcast we keep that
diff --git a/python/pyarrow/tests/test_pandas.py
b/python/pyarrow/tests/test_pandas.py
index 989ad1e939..189ab7fa0d 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -187,17 +187,12 @@ class TestConvertMetadata:
_check_pandas_roundtrip(df, preserve_index=True)
def test_column_index_names_with_tz(self):
- if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
- # TODO: regression in pandas, should be fixed before final 2.0.0
- # https://github.com/pandas-dev/pandas/issues/50140
- pytest.skip("Regression in pandas 2.0.0.dev")
# ARROW-13756
# Bug if index is timezone aware DataTimeIndex
df = pd.DataFrame(
np.random.randn(5, 3),
- columns=pd.date_range(
- "2021-01-01", "2021-01-3", freq="D", tz="CET")
+ columns=pd.date_range("2021-01-01", periods=3, freq="50D",
tz="CET")
)
_check_pandas_roundtrip(df, preserve_index=True)
@@ -453,11 +448,11 @@ class TestConvertMetadata:
preserve_index=True)
def test_binary_column_name(self):
- if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
- # TODO: regression in pandas, should be fixed before final 2.0.0
+ if Version("2.0.0") <= Version(pd.__version__) < Version("2.1.0"):
+ # TODO: regression in pandas, hopefully fixed in next version
# https://issues.apache.org/jira/browse/ARROW-18394
# https://github.com/pandas-dev/pandas/issues/50127
- pytest.skip("Regression in pandas 2.0.0.dev")
+ pytest.skip("Regression in pandas 2.0.0")
column_data = ['い']
key = 'あ'.encode()
data = {key: column_data}
@@ -2064,11 +2059,6 @@ class TestConvertListTypes:
assert result3.equals(expected3)
def test_infer_lists(self):
- if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
- (Version(pd.__version__) < Version("2.0.0"))):
- # TODO: regression in pandas with numpy 1.25dev
- # https://github.com/pandas-dev/pandas/issues/50360
- pytest.skip("Regression in pandas with numpy 1.25")
data = OrderedDict([
('nan_ints', [[None, 1], [2, 3]]),
('ints', [[0, 1], [2, 3]]),
@@ -2118,11 +2108,6 @@ class TestConvertListTypes:
_check_pandas_roundtrip(df, expected_schema=expected_schema)
def test_to_list_of_structs_pandas(self):
- if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
- (Version(pd.__version__) < Version("2.0.0"))):
- # TODO: regression in pandas with numpy 1.25dev
- # https://github.com/pandas-dev/pandas/issues/50360
- pytest.skip("Regression in pandas with numpy 1.25")
ints = pa.array([1, 2, 3], pa.int32())
strings = pa.array([['a', 'b'], ['c', 'd'], ['e', 'f']],
pa.list_(pa.string()))
@@ -2192,11 +2177,6 @@ class TestConvertListTypes:
assert result.equals(expected)
def test_nested_large_list(self):
- if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
- (Version(pd.__version__) < Version("2.0.0"))):
- # TODO: regression in pandas with numpy 1.25dev
- # https://github.com/pandas-dev/pandas/issues/50360
- pytest.skip("Regression in pandas with numpy 1.25")
s = (pa.array([[[1, 2, 3], [4]], None],
type=pa.large_list(pa.large_list(pa.int64())))
.to_pandas())
@@ -2950,11 +2930,11 @@ def _fully_loaded_dataframe_example():
@pytest.mark.parametrize('columns', ([b'foo'], ['foo']))
def test_roundtrip_with_bytes_unicode(columns):
- if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
- # TODO: regression in pandas, should be fixed before final 2.0.0
+ if Version("2.0.0") <= Version(pd.__version__) < Version("2.1.0"):
+ # TODO: regression in pandas, hopefully fixed in next version
# https://issues.apache.org/jira/browse/ARROW-18394
# https://github.com/pandas-dev/pandas/issues/50127
- pytest.skip("Regression in pandas 2.0.0.dev")
+ pytest.skip("Regression in pandas 2.0.0")
df = pd.DataFrame(columns=columns)
table1 = pa.Table.from_pandas(df)