This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 6eeee3b769 GH-36412: [Python][CI] Fix extra deprecation warnings in
the pandas nightly build (#39609)
6eeee3b769 is described below
commit 6eeee3b769f3ad6c724305b4182526307ab025d5
Author: Alenka Frim <[email protected]>
AuthorDate: Wed Jan 17 11:12:41 2024 +0100
GH-36412: [Python][CI] Fix extra deprecation warnings in the pandas nightly
build (#39609)
Fixes the remaining deprecation warnings coming from the pandas development version
by updating our test code to avoid the deprecated patterns.
* Closes: #36412
Lead-authored-by: AlenkaF <[email protected]>
Co-authored-by: Alenka Frim <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
python/pyarrow/pandas_compat.py | 15 ++--------
python/pyarrow/tests/parquet/test_datetime.py | 4 +--
python/pyarrow/tests/test_compute.py | 6 ++--
python/pyarrow/tests/test_dataset.py | 6 ++--
python/pyarrow/tests/test_pandas.py | 42 +++++++++++++++------------
5 files changed, 35 insertions(+), 38 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 39dee85492..61e6318e29 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -967,20 +967,9 @@ def _extract_index_level(table, result_table, field_name,
# The serialized index column was removed by the user
return result_table, None, None
- pd = _pandas_api.pd
-
col = table.column(i)
- values = col.to_pandas(types_mapper=types_mapper).values
-
- if hasattr(values, 'flags') and not values.flags.writeable:
- # ARROW-1054: in pandas 0.19.2, factorize will reject
- # non-writeable arrays when calling MultiIndex.from_arrays
- values = values.copy()
-
- if isinstance(col.type, pa.lib.TimestampType) and col.type.tz is not None:
- index_level = make_tz_aware(pd.Series(values, copy=False), col.type.tz)
- else:
- index_level = pd.Series(values, dtype=values.dtype, copy=False)
+ index_level = col.to_pandas(types_mapper=types_mapper)
+ index_level.name = None
result_table = result_table.remove_column(
result_table.schema.get_field_index(field_name)
)
diff --git a/python/pyarrow/tests/parquet/test_datetime.py
b/python/pyarrow/tests/parquet/test_datetime.py
index 6a9cbd4f73..0896eb37e6 100644
--- a/python/pyarrow/tests/parquet/test_datetime.py
+++ b/python/pyarrow/tests/parquet/test_datetime.py
@@ -116,7 +116,7 @@ def test_coerce_timestamps(tempdir):
df_expected = df.copy()
for i, x in enumerate(df_expected['datetime64']):
if isinstance(x, np.ndarray):
- df_expected['datetime64'][i] = x.astype('M8[us]')
+ df_expected.loc[i, 'datetime64'] = x.astype('M8[us]')
tm.assert_frame_equal(df_expected, df_read)
@@ -429,7 +429,7 @@ def
test_noncoerced_nanoseconds_written_without_exception(tempdir):
# nanosecond timestamps by default
n = 9
df = pd.DataFrame({'x': range(n)},
- index=pd.date_range('2017-01-01', freq='1n', periods=n))
+ index=pd.date_range('2017-01-01', freq='ns', periods=n))
tb = pa.Table.from_pandas(df)
filename = tempdir / 'written.parquet'
diff --git a/python/pyarrow/tests/test_compute.py
b/python/pyarrow/tests/test_compute.py
index d1eb605c71..34d4da580f 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -2360,10 +2360,10 @@ def _check_temporal_rounding(ts, values, unit):
unit_shorthand = {
"nanosecond": "ns",
"microsecond": "us",
- "millisecond": "L",
+ "millisecond": "ms",
"second": "s",
"minute": "min",
- "hour": "H",
+ "hour": "h",
"day": "D"
}
greater_unit = {
@@ -2371,7 +2371,7 @@ def _check_temporal_rounding(ts, values, unit):
"microsecond": "ms",
"millisecond": "s",
"second": "min",
- "minute": "H",
+ "minute": "h",
"hour": "d",
}
ta = pa.array(ts)
diff --git a/python/pyarrow/tests/test_dataset.py
b/python/pyarrow/tests/test_dataset.py
index ae2146c0bd..d473299f20 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -178,12 +178,14 @@ def multisourcefs(request):
# simply split the dataframe into four chunks to construct a data source
# from each chunk into its own directory
- df_a, df_b, df_c, df_d = np.array_split(df, 4)
+ n = len(df)
+ df_a, df_b, df_c, df_d = [df.iloc[i:i+n//4] for i in range(0, n, n//4)]
# create a directory containing a flat sequence of parquet files without
# any partitioning involved
mockfs.create_dir('plain')
- for i, chunk in enumerate(np.array_split(df_a, 10)):
+ n = len(df_a)
+ for i, chunk in enumerate([df_a.iloc[i:i+n//10] for i in range(0, n,
n//10)]):
path = 'plain/chunk-{}.parquet'.format(i)
with mockfs.open_output_stream(path) as out:
pq.write_table(_table_from_pandas(chunk), out)
diff --git a/python/pyarrow/tests/test_pandas.py
b/python/pyarrow/tests/test_pandas.py
index d15ee82d5d..8106219057 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -113,6 +113,10 @@ def _check_pandas_roundtrip(df, expected=None,
use_threads=False,
if expected is None:
expected = df
+ for col in expected.columns:
+ if expected[col].dtype == 'object':
+ expected[col] = expected[col].replace({np.nan: None})
+
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", "elementwise comparison failed", DeprecationWarning)
@@ -152,6 +156,9 @@ def _check_array_roundtrip(values, expected=None, mask=None,
expected = pd.Series(values).copy()
expected[mask.copy()] = None
+ if expected.dtype == 'object':
+ expected = expected.replace({np.nan: None})
+
tm.assert_series_equal(pd.Series(result), expected, check_names=False)
@@ -478,7 +485,7 @@ class TestConvertMetadata:
preserve_index=True)
def test_binary_column_name(self):
- if Version("2.0.0") <= Version(pd.__version__) < Version("2.3.0"):
+ if Version("2.0.0") <= Version(pd.__version__) < Version("3.0.0"):
# TODO: regression in pandas, hopefully fixed in next version
# https://issues.apache.org/jira/browse/ARROW-18394
# https://github.com/pandas-dev/pandas/issues/50127
@@ -3108,7 +3115,7 @@ def _fully_loaded_dataframe_example():
@pytest.mark.parametrize('columns', ([b'foo'], ['foo']))
def test_roundtrip_with_bytes_unicode(columns):
- if Version("2.0.0") <= Version(pd.__version__) < Version("2.3.0"):
+ if Version("2.0.0") <= Version(pd.__version__) < Version("3.0.0"):
# TODO: regression in pandas, hopefully fixed in next version
# https://issues.apache.org/jira/browse/ARROW-18394
# https://github.com/pandas-dev/pandas/issues/50127
@@ -3491,7 +3498,7 @@ def test_table_from_pandas_schema_field_order_metadata():
# ensure that a different field order in specified schema doesn't
# mangle metadata
df = pd.DataFrame({
- "datetime": pd.date_range("2020-01-01T00:00:00Z", freq="H", periods=2),
+ "datetime": pd.date_range("2020-01-01T00:00:00Z", freq="h", periods=2),
"float": np.random.randn(2)
})
@@ -4181,8 +4188,6 @@ def _Int64Dtype__from_arrow__(self, array):
def test_convert_to_extension_array(monkeypatch):
- import pandas.core.internals as _int
-
# table converted from dataframe with extension types (so pandas_metadata
# has this information)
df = pd.DataFrame(
@@ -4193,16 +4198,15 @@ def test_convert_to_extension_array(monkeypatch):
# Int64Dtype is recognized -> convert to extension block by default
# for a proper roundtrip
result = table.to_pandas()
- assert not isinstance(_get_mgr(result).blocks[0], _int.ExtensionBlock)
assert _get_mgr(result).blocks[0].values.dtype == np.dtype("int64")
- assert isinstance(_get_mgr(result).blocks[1], _int.ExtensionBlock)
+ assert _get_mgr(result).blocks[1].values.dtype == pd.Int64Dtype()
tm.assert_frame_equal(result, df)
# test with missing values
df2 = pd.DataFrame({'a': pd.array([1, 2, None], dtype='Int64')})
table2 = pa.table(df2)
result = table2.to_pandas()
- assert isinstance(_get_mgr(result).blocks[0], _int.ExtensionBlock)
+ assert _get_mgr(result).blocks[0].values.dtype == pd.Int64Dtype()
tm.assert_frame_equal(result, df2)
# monkeypatch pandas Int64Dtype to *not* have the protocol method
@@ -4215,7 +4219,7 @@ def test_convert_to_extension_array(monkeypatch):
# Int64Dtype has no __from_arrow__ -> use normal conversion
result = table.to_pandas()
assert len(_get_mgr(result).blocks) == 1
- assert not isinstance(_get_mgr(result).blocks[0], _int.ExtensionBlock)
+ assert _get_mgr(result).blocks[0].values.dtype == np.dtype("int64")
class MyCustomIntegerType(pa.ExtensionType):
@@ -4233,8 +4237,6 @@ class MyCustomIntegerType(pa.ExtensionType):
def test_conversion_extensiontype_to_extensionarray(monkeypatch):
# converting extension type to linked pandas ExtensionDtype/Array
- import pandas.core.internals as _int
-
storage = pa.array([1, 2, 3, 4], pa.int64())
arr = pa.ExtensionArray.from_storage(MyCustomIntegerType(), storage)
table = pa.table({'a': arr})
@@ -4242,12 +4244,12 @@ def
test_conversion_extensiontype_to_extensionarray(monkeypatch):
# extension type points to Int64Dtype, which knows how to create a
# pandas ExtensionArray
result = arr.to_pandas()
- assert isinstance(_get_mgr(result).blocks[0], _int.ExtensionBlock)
+ assert _get_mgr(result).blocks[0].values.dtype == pd.Int64Dtype()
expected = pd.Series([1, 2, 3, 4], dtype='Int64')
tm.assert_series_equal(result, expected)
result = table.to_pandas()
- assert isinstance(_get_mgr(result).blocks[0], _int.ExtensionBlock)
+ assert _get_mgr(result).blocks[0].values.dtype == pd.Int64Dtype()
expected = pd.DataFrame({'a': pd.array([1, 2, 3, 4], dtype='Int64')})
tm.assert_frame_equal(result, expected)
@@ -4261,7 +4263,7 @@ def
test_conversion_extensiontype_to_extensionarray(monkeypatch):
pd.core.arrays.integer.NumericDtype, "__from_arrow__")
result = arr.to_pandas()
- assert not isinstance(_get_mgr(result).blocks[0], _int.ExtensionBlock)
+ assert _get_mgr(result).blocks[0].values.dtype == np.dtype("int64")
expected = pd.Series([1, 2, 3, 4])
tm.assert_series_equal(result, expected)
@@ -4312,10 +4314,14 @@ def test_array_to_pandas():
def test_roundtrip_empty_table_with_extension_dtype_index():
df = pd.DataFrame(index=pd.interval_range(start=0, end=3))
table = pa.table(df)
- table.to_pandas().index == pd.Index([{'left': 0, 'right': 1},
- {'left': 1, 'right': 2},
- {'left': 2, 'right': 3}],
- dtype='object')
+ if Version(pd.__version__) > Version("1.0"):
+ tm.assert_index_equal(table.to_pandas().index, df.index)
+ else:
+ tm.assert_index_equal(table.to_pandas().index,
+ pd.Index([{'left': 0, 'right': 1},
+ {'left': 1, 'right': 2},
+ {'left': 2, 'right': 3}],
+ dtype='object'))
@pytest.mark.parametrize("index", ["a", ["a", "b"]])