This is an automated email from the ASF dual-hosted git repository.
raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new b90a2b82d8 GH-48314: [Python] Compat with pandas 3.0 changed default
datetime unit (#48319)
b90a2b82d8 is described below
commit b90a2b82d85b1479470b7f1bdd941c9a59ecd3d4
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Mon Jan 12 10:02:10 2026 +0100
GH-48314: [Python] Compat with pandas 3.0 changed default datetime unit
(#48319)
### Rationale for this change
pandas 3.0 changes the default datetime/timedelta resolution from
nanoseconds to microseconds. We already had mostly accounted for that in
previous PRs, but pandas made this change in the last few days in a few
additional places (e.g. `pd.date_range`), uncovering some more issues.
### What changes are included in this PR?
- Don't hardcode the nanosecond unit in the metadata and when recreating a
datetime-tz column's Index
- Update a few tests to account for those changes
### Are these changes tested?
Yes
### Are there any user-facing changes?
No
* GitHub Issue: #48314
Lead-authored-by: Raúl Cumplido <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
---
python/pyarrow/pandas_compat.py | 4 ++--
python/pyarrow/tests/test_pandas.py | 26 ++++++++++++++++++++------
2 files changed, 22 insertions(+), 8 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index dfed76d371..dfca59cbf5 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -140,7 +140,7 @@ def get_extension_dtype_info(column):
physical_dtype = str(cats.codes.dtype)
elif hasattr(dtype, 'tz'):
metadata = {'timezone': pa.lib.tzinfo_to_string(dtype.tz)}
- physical_dtype = 'datetime64[ns]'
+ physical_dtype = f'datetime64[{dtype.unit}]'
else:
metadata = None
physical_dtype = str(dtype)
@@ -1188,7 +1188,7 @@ def _reconstruct_columns_from_metadata(columns,
column_indexes):
if _pandas_api.is_ge_v3():
# with pandas 3+, to_datetime returns a unit depending on the
string
# data, so we restore it to the original unit from the metadata
- level = level.as_unit(np.datetime_data(dtype)[0])
+ level = level.as_unit(np.datetime_data(numpy_dtype)[0])
# GH-41503: if the column index was decimal, restore to decimal
elif pandas_dtype == "decimal":
level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level])
diff --git a/python/pyarrow/tests/test_pandas.py
b/python/pyarrow/tests/test_pandas.py
index daa9c8314a..481292e1e6 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -212,13 +212,14 @@ class TestConvertMetadata:
df.columns.names = ['a']
_check_pandas_roundtrip(df, preserve_index=True)
- def test_column_index_names_with_tz(self):
+ @pytest.mark.parametrize("tz", [None, "Europe/Brussels"])
+ def test_column_index_names_datetime(self, tz):
# ARROW-13756
# Bug if index is timezone aware DataTimeIndex
df = pd.DataFrame(
np.random.randn(5, 3),
- columns=pd.date_range("2021-01-01", periods=3, freq="50D",
tz="CET")
+ columns=pd.date_range("2021-01-01", periods=3, freq="50D", tz=tz)
)
_check_pandas_roundtrip(df, preserve_index=True)
@@ -447,11 +448,16 @@ class TestConvertMetadata:
assert len(md) == 1
assert md['encoding'] == 'UTF-8'
- def test_datetimetz_column_index(self):
+ @pytest.mark.parametrize('unit', ['us', 'ns'])
+ def test_datetimetz_column_index(self, unit):
+ ext_kwargs = {}
+ if Version(pd.__version__) >= Version("2.0.0"):
+ # unit argument not supported on date_range for pandas < 2.0.0
+ ext_kwargs = {'unit': unit}
df = pd.DataFrame(
[(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
columns=pd.date_range(
- start='2017-01-01', periods=3, tz='America/New_York'
+ start='2017-01-01', periods=3, tz='America/New_York',
**ext_kwargs
)
)
t = pa.Table.from_pandas(df, preserve_index=True)
@@ -460,7 +466,10 @@ class TestConvertMetadata:
column_indexes, = js['column_indexes']
assert column_indexes['name'] is None
assert column_indexes['pandas_type'] == 'datetimetz'
- assert column_indexes['numpy_type'] == 'datetime64[ns]'
+ if ext_kwargs:
+ assert column_indexes['numpy_type'] == f'datetime64[{unit}]'
+ else:
+ assert column_indexes['numpy_type'] == 'datetime64[ns]'
md = column_indexes['metadata']
assert md['timezone'] == 'America/New_York'
@@ -709,7 +718,12 @@ class TestConvertMetadata:
# It is possible that the metadata and actual schema is not fully
# matching (eg no timezone information for tz-aware column)
# -> to_pandas() conversion should not fail on that
- df = pd.DataFrame({"datetime": pd.date_range("2020-01-01", periods=3)})
+ ext_kwargs = {}
+ if Version(pd.__version__) >= Version("2.0.0"):
+ # unit argument not supported on date_range for pandas < 2.0.0
+ ext_kwargs = {'unit': 'ns'}
+ df = pd.DataFrame({"datetime": pd.date_range(
+ "2020-01-01", periods=3, **ext_kwargs)})
# OPTION 1: casting after conversion
table = pa.Table.from_pandas(df)