[arrow] branch main updated: GH-15070: [Python][CI] Compatibility with pandas 2.0 (#34878)

jorisvandenbossche Tue, 04 Apr 2023 07:43:42 -0700

This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new c4057bd09b GH-15070: [Python][CI] Compatibility with pandas 2.0 (#34878)
c4057bd09b is described below

commit c4057bd09b369c7c86ffd9c9f4fbc2a9496c0e6a
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Tue Apr 4 16:43:26 2023 +0200

    GH-15070: [Python][CI] Compatibility with pandas 2.0 (#34878)
    
    ### What changes are included in this PR?
    
    - The issue with numpy 1.25 in the assert equal helper was fixed in pandas 1.5.3 -> removing the skip (in theory can still run into this error when using an older pandas version with the latest numpy, but that's not something you should do)
    - Casting tz-aware strings to datetime64[ns] was not fixed in pandas (https://github.com/pandas-dev/pandas/issues/50140) -> updating our implementation to work around it
    - Casting to numpy string dtype (https://github.com/pandas-dev/pandas/issues/50127) is not yet fixed -> updating the skip
    
    ### Are there any user-facing changes?
    
    No
    * Closes: #15070
    
    Authored-by: Joris Van den Bossche <[email protected]>
    Signed-off-by: Joris Van den Bossche <[email protected]>
---
 python/pyarrow/pandas_compat.py     |  3 +--
 python/pyarrow/tests/test_pandas.py | 34 +++++++---------------------------
 2 files changed, 8 insertions(+), 29 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 00e8613717..6b9514ea6b 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -1148,8 +1148,7 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
         if pandas_dtype == "datetimetz":
             tz = pa.lib.string_to_tzinfo(
                 column_indexes[0]['metadata']['timezone'])
-            dt = level.astype(numpy_dtype)
-            level = dt.tz_localize('utc').tz_convert(tz)
+            level = pd.to_datetime(level, utc=True).tz_convert(tz)
         elif level.dtype != dtype:
             level = level.astype(dtype)
         # ARROW-9096: if original DataFrame was upcast we keep that
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 989ad1e939..189ab7fa0d 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -187,17 +187,12 @@ class TestConvertMetadata:
         _check_pandas_roundtrip(df, preserve_index=True)
 
     def test_column_index_names_with_tz(self):
-        if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
-            # TODO: regression in pandas, should be fixed before final 2.0.0
-            # https://github.com/pandas-dev/pandas/issues/50140
-            pytest.skip("Regression in pandas 2.0.0.dev")
         # ARROW-13756
         # Bug if index is timezone aware DataTimeIndex
 
         df = pd.DataFrame(
             np.random.randn(5, 3),
-            columns=pd.date_range(
-                "2021-01-01", "2021-01-3", freq="D", tz="CET")
+            columns=pd.date_range("2021-01-01", periods=3, freq="50D", tz="CET")
         )
         _check_pandas_roundtrip(df, preserve_index=True)
 
@@ -453,11 +448,11 @@ class TestConvertMetadata:
                                         preserve_index=True)
 
     def test_binary_column_name(self):
-        if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
-            # TODO: regression in pandas, should be fixed before final 2.0.0
+        if Version("2.0.0") <= Version(pd.__version__) < Version("2.1.0"):
+            # TODO: regression in pandas, hopefully fixed in next version
             # https://issues.apache.org/jira/browse/ARROW-18394
             # https://github.com/pandas-dev/pandas/issues/50127
-            pytest.skip("Regression in pandas 2.0.0.dev")
+            pytest.skip("Regression in pandas 2.0.0")
         column_data = ['い']
         key = 'あ'.encode()
         data = {key: column_data}
@@ -2064,11 +2059,6 @@ class TestConvertListTypes:
         assert result3.equals(expected3)
 
     def test_infer_lists(self):
-        if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
-                (Version(pd.__version__) < Version("2.0.0"))):
-            # TODO: regression in pandas with numpy 1.25dev
-            # https://github.com/pandas-dev/pandas/issues/50360
-            pytest.skip("Regression in pandas with numpy 1.25")
         data = OrderedDict([
             ('nan_ints', [[None, 1], [2, 3]]),
             ('ints', [[0, 1], [2, 3]]),
@@ -2118,11 +2108,6 @@ class TestConvertListTypes:
         _check_pandas_roundtrip(df, expected_schema=expected_schema)
 
     def test_to_list_of_structs_pandas(self):
-        if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
-                (Version(pd.__version__) < Version("2.0.0"))):
-            # TODO: regression in pandas with numpy 1.25dev
-            # https://github.com/pandas-dev/pandas/issues/50360
-            pytest.skip("Regression in pandas with numpy 1.25")
         ints = pa.array([1, 2, 3], pa.int32())
         strings = pa.array([['a', 'b'], ['c', 'd'], ['e', 'f']],
                            pa.list_(pa.string()))
@@ -2192,11 +2177,6 @@ class TestConvertListTypes:
             assert result.equals(expected)
 
     def test_nested_large_list(self):
-        if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
-                (Version(pd.__version__) < Version("2.0.0"))):
-            # TODO: regression in pandas with numpy 1.25dev
-            # https://github.com/pandas-dev/pandas/issues/50360
-            pytest.skip("Regression in pandas with numpy 1.25")
         s = (pa.array([[[1, 2, 3], [4]], None],
                       type=pa.large_list(pa.large_list(pa.int64())))
              .to_pandas())
@@ -2950,11 +2930,11 @@ def _fully_loaded_dataframe_example():
 
 @pytest.mark.parametrize('columns', ([b'foo'], ['foo']))
 def test_roundtrip_with_bytes_unicode(columns):
-    if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
-        # TODO: regression in pandas, should be fixed before final 2.0.0
+    if Version("2.0.0") <= Version(pd.__version__) < Version("2.1.0"):
+        # TODO: regression in pandas, hopefully fixed in next version
         # https://issues.apache.org/jira/browse/ARROW-18394
         # https://github.com/pandas-dev/pandas/issues/50127
-        pytest.skip("Regression in pandas 2.0.0.dev")
+        pytest.skip("Regression in pandas 2.0.0")
 
     df = pd.DataFrame(columns=columns)
     table1 = pa.Table.from_pandas(df)

[arrow] branch main updated: GH-15070: [Python][CI] Compatibility with pandas 2.0 (#34878)

Reply via email to